From bfb909d754da8c02a4581a8b6f428efdfab8005e Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Sun, 5 Oct 2025 07:50:55 -0700 Subject: [PATCH 1/2] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20in?= =?UTF-8?q?itial=20version?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Created using spr 1.3.7 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 379 +++++++++++++++--- .../PhaseOrdering/X86/vector-reductions.ll | 9 +- .../Transforms/SLPVectorizer/AArch64/div.ll | 38 +- ...ather-buildvector-with-minbitwidth-user.ll | 92 ++--- .../AArch64/reused-scalar-repeated-in-node.ll | 18 +- .../SLPVectorizer/RISCV/complex-loads.ll | 14 +- .../RISCV/partial-vec-invalid-cost.ll | 16 +- .../RISCV/reordered-buildvector-scalars.ll | 123 +++--- .../Transforms/SLPVectorizer/X86/PR40310.ll | 10 +- .../X86/alternate-int-inseltpoison.ll | 17 +- .../SLPVectorizer/X86/alternate-int.ll | 17 +- .../X86/arith-fp-inseltpoison.ll | 51 ++- .../Transforms/SLPVectorizer/X86/arith-fp.ll | 51 ++- .../SLPVectorizer/X86/buildvector-shuffle.ll | 12 +- .../X86/buildvectors-parent-phi-nodes.ll | 18 +- .../Transforms/SLPVectorizer/X86/c-ray.ll | 326 +++++++++++---- .../X86/delayed-gather-emission.ll | 18 +- .../entry-no-bundle-but-extra-use-on-vec.ll | 19 +- .../SLPVectorizer/X86/gather-with-cmp-user.ll | 13 +- .../original-inst-scheduled-after-copyable.ll | 26 +- .../SLPVectorizer/X86/phi-node-with-cycle.ll | 23 +- .../Transforms/SLPVectorizer/X86/pr46983.ll | 163 ++++++-- .../SLPVectorizer/X86/reduction2.ll | 18 +- .../X86/reschedule-only-scheduled.ll | 32 +- ...same-last-instruction-different-parents.ll | 19 +- .../SLPVectorizer/X86/scalarize-ctlz.ll | 77 ++-- .../X86/split-node-reorder-node-with-ops.ll | 30 +- .../subvector-minbitwidth-unsigned-value.ll | 18 +- .../trunced-buildvector-scalar-extended.ll | 9 +- .../X86/vec3-reorder-reshuffle.ll | 18 +- .../gather_extract_from_vectorbuild.ll | 35 +- .../vectorize-reorder-alt-shuffle.ll | 61 ++- 32 files 
changed, 1160 insertions(+), 610 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index fedca65d241e8..b633dd4d9fdb0 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2005,9 +2005,15 @@ class BoUpSLP { /// holding live values over call sites. InstructionCost getSpillCost(); + /// Calculates the cost of the subtrees, trims non-profitable ones and returns + /// final cost. + InstructionCost + calculateTreeCostAndTrimNonProfitable(ArrayRef VectorizedVals = {}); + /// \returns the vectorization cost of the subtree that starts at \p VL. /// A negative number means that this is profitable. - InstructionCost getTreeCost(ArrayRef VectorizedVals = {}, + InstructionCost getTreeCost(InstructionCost TreeCost, + ArrayRef VectorizedVals = {}, InstructionCost ReductionCost = TTI::TCC_Free); /// Construct a vectorizable tree that starts at \p Roots, ignoring users for @@ -2080,6 +2086,8 @@ class BoUpSLP { void deleteTree() { VectorizableTree.clear(); ScalarToTreeEntries.clear(); + DeletedNodes.clear(); + TransformedToGatherNodes.clear(); OperandsToTreeEntry.clear(); ScalarsInSplitNodes.clear(); MustGather.clear(); @@ -4511,6 +4519,13 @@ class BoUpSLP { /// Maps a specific scalar to its tree entry(ies). SmallDenseMap> ScalarToTreeEntries; + /// List of deleted non-profitable nodes. + SmallPtrSet DeletedNodes; + + /// List of nodes, transformed to gathered, with their conservative + /// gather/buildvector cost estimation. + SmallDenseMap TransformedToGatherNodes; + /// Maps the operand index and entry to the corresponding tree entry. SmallDenseMap, TreeEntry *> OperandsToTreeEntry; @@ -8697,7 +8712,9 @@ void BoUpSLP::buildExternalUses( TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. 
- if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize) + if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize || + DeletedNodes.contains(Entry) || + TransformedToGatherNodes.contains(Entry)) continue; // For each lane: @@ -8744,7 +8761,11 @@ void BoUpSLP::buildExternalUses( // Skip in-tree scalars that become vectors if (ArrayRef UseEntries = getTreeEntries(U); - !UseEntries.empty()) { + !UseEntries.empty() && + any_of(UseEntries, [this](const TreeEntry *UseEntry) { + return !DeletedNodes.contains(UseEntry) && + !TransformedToGatherNodes.contains(UseEntry); + })) { // Some in-tree scalars will remain as scalar in vectorized // instructions. If that is the case, the one in FoundLane will // be used. @@ -8752,6 +8773,9 @@ void BoUpSLP::buildExternalUses( isa(UserInst)) || isa(UserInst)) || all_of(UseEntries, [&](TreeEntry *UseEntry) { + if (DeletedNodes.contains(UseEntry) || + TransformedToGatherNodes.contains(UseEntry)) + return true; return UseEntry->State == TreeEntry::ScatterVectorize || !doesInTreeUserNeedToExtract( Scalar, getRootEntryInstruction(*UseEntry), TLI, @@ -14208,7 +14232,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, unsigned EntryVF = E->getVectorFactor(); auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF); - if (E->isGather()) { + if (E->isGather() || TransformedToGatherNodes.contains(E)) { if (allConstant(VL)) return 0; if (isa(VL[0])) @@ -15892,26 +15916,16 @@ static T *performExtractsShuffleAction( return Prev; } -namespace { -/// Data type for handling buildvector sequences with the reused scalars from -/// other tree entries. -template struct ShuffledInsertData { - /// List of insertelements to be replaced by shuffles. - SmallVector InsertElements; - /// The parent vectors and shuffle mask for the given list of inserts. 
- MapVector> ValueMasks; -}; -} // namespace - -InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals, - InstructionCost ReductionCost) { - InstructionCost Cost = ReductionCost; +InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable( + ArrayRef VectorizedVals) { + SmallDenseMap NodesCosts; + SmallPtrSet CheckedExtracts; + SmallPtrSet GatheredLoadsNodes; LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << VectorizableTree.size() << ".\n"); - - SmallPtrSet CheckedExtracts; - for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { - TreeEntry &TE = *VectorizableTree[I]; + InstructionCost Cost = 0; + for (const std::unique_ptr &Ptr : VectorizableTree) { + TreeEntry &TE = *Ptr; // No need to count the cost for combined entries, they are combined and // just skip their cost. if (TE.State == TreeEntry::CombinedVectorize) { @@ -15919,6 +15933,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals, dbgs() << "SLP: Skipping cost for combined node that starts with " << *TE.Scalars[0] << ".\n"; TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n"); + NodesCosts.try_emplace(&TE); continue; } if (TE.hasState() && @@ -15931,6 +15946,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals, LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle " << shortBundleName(TE.Scalars, TE.Idx) << ".\n" << "SLP: Current total cost = " << Cost << "\n"); + NodesCosts.try_emplace(&TE); continue; } } @@ -15942,11 +15958,202 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals, InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts); Cost += C; + NodesCosts.try_emplace(&TE, C); LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle " << shortBundleName(TE.Scalars, TE.Idx) << ".\n" << "SLP: Current total cost = " << Cost << "\n"); + // Add gathered loads nodes to the set for later processing. 
+ if (TE.Idx > 0 && !TE.UserTreeIndex && TE.hasState() && + TE.getOpcode() == Instruction::Load) + GatheredLoadsNodes.insert(&TE); + } + // Bail out if the cost threshold is negative and cost already below it. + if (SLPCostThreshold.getNumOccurrences() > 0 && SLPCostThreshold < 0 && + Cost < -SLPCostThreshold) + return Cost; + // Bail out, if gathered loads nodes are found. + // TODO: add analysis for gathered load to include their cost correctly into + // the related subtrees. + if (!GatheredLoadsNodes.empty()) + return Cost; + SmallVector>> SubtreeCosts( + VectorizableTree.size()); + for (const std::unique_ptr &Ptr : VectorizableTree) { + TreeEntry &TE = *Ptr; + InstructionCost C = NodesCosts.at(&TE); + SubtreeCosts[TE.Idx].first += C; + const TreeEntry *UserTE = TE.UserTreeIndex.UserTE; + while (UserTE) { + SubtreeCosts[UserTE->Idx].first += C; + SubtreeCosts[UserTE->Idx].second.push_back(TE.Idx); + UserTE = UserTE->UserTreeIndex.UserTE; + } + } + using CostIndicesTy = + std::pair>>; + struct FirstGreater { + bool operator()(const CostIndicesTy &LHS, const CostIndicesTy &RHS) const { + return LHS.second.first < RHS.second.first || + (LHS.second.first == RHS.second.first && + LHS.first->Idx < RHS.first->Idx); + } + }; + PriorityQueue, FirstGreater> + Worklist; + for (const auto [Idx, P] : enumerate(SubtreeCosts)) + Worklist.emplace(VectorizableTree[Idx].get(), P); + + // Narrow store trees with non-profitable immediate values - exit. 
+ if (!UserIgnoreList && VectorizableTree.front()->getVectorFactor() < 4 && + VectorizableTree.front()->hasState() && + VectorizableTree.front()->getOpcode() == Instruction::Store && + (Worklist.top().first->Idx == 0 || Worklist.top().first->Idx == 1)) + return Cost; + + bool Changed = false; + while (!Worklist.empty() && Worklist.top().second.first > 0) { + TreeEntry *TE = Worklist.top().first; + if (TE->isGather() || TE->Idx == 0 || DeletedNodes.contains(TE)) { + Worklist.pop(); + continue; + } + + // Calculate the gather cost of the root node. + InstructionCost SubtreeCost = Worklist.top().second.first; + if (SubtreeCost < TE->Scalars.size()) { + Worklist.pop(); + continue; + } + if (!TransformedToGatherNodes.empty()) { + for (unsigned Idx : Worklist.top().second.second) { + auto It = TransformedToGatherNodes.find(VectorizableTree[Idx].get()); + if (It != TransformedToGatherNodes.end()) { + SubtreeCost -= SubtreeCosts[Idx].first; + SubtreeCost += It->second; + } + } + } + if (SubtreeCost < 0 || SubtreeCost < TE->Scalars.size()) { + Worklist.pop(); + continue; + } + const unsigned Sz = TE->Scalars.size(); + APInt DemandedElts = APInt::getAllOnes(Sz); + for (auto [Idx, V] : enumerate(TE->Scalars)) { + if (isConstant(V)) + DemandedElts.clearBit(Idx); + } + constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + + Type *ScalarTy = getValueType(TE->Scalars.front()); + auto *VecTy = getWidenedType(ScalarTy, Sz); + const unsigned EntryVF = TE->getVectorFactor(); + auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF); + InstructionCost GatherCost = ::getScalarizationOverhead( + *TTI, ScalarTy, VecTy, DemandedElts, + /*Insert=*/true, /*Extract=*/false, CostKind); + SmallVector Mask; + if (!TE->ReorderIndices.empty() && + TE->State != TreeEntry::CompressVectorize && + (TE->State != TreeEntry::StridedVectorize || + !isReverseOrder(TE->ReorderIndices))) { + SmallVector NewMask; + if (TE->getOpcode() == Instruction::Store) { + // For stores the order is actually 
a mask. + NewMask.resize(TE->ReorderIndices.size()); + copy(TE->ReorderIndices, NewMask.begin()); + } else { + inversePermutation(TE->ReorderIndices, NewMask); + } + ::addMask(Mask, NewMask); + } + if (!TE->ReuseShuffleIndices.empty()) + ::addMask(Mask, TE->ReuseShuffleIndices); + if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, EntryVF)) + GatherCost += + ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask); + // If all scalars are reused in gather node(s) or other vector nodes, there + // might be extra cost for inserting them. + if (all_of(TE->Scalars, [&](Value *V) { + return (TE->hasCopyableElements() && TE->isCopyableElement(V)) || + isConstant(V) || isGathered(V) || getTreeEntries(V).size() > 1; + })) + GatherCost *= 2; + // Erase subtree if it is non-profitable. + if (SubtreeCost > GatherCost) { + // If the remaining tree is just a buildvector - exit, it will cause + // endless attempts to vectorize. + if (VectorizableTree.front()->hasState() && + VectorizableTree.front()->getOpcode() == Instruction::InsertElement && + TE->Idx == 1) + return InstructionCost::getInvalid(); + + LLVM_DEBUG(dbgs() << "SLP: Trimming unprofitable subtree at node " + << TE->Idx << " with cost " + << Worklist.top().second.first << " and gather cost " + << GatherCost << ".\n"); + if (TE->UserTreeIndex) { + TransformedToGatherNodes.try_emplace(TE, GatherCost); + NodesCosts.erase(TE); + } else { + DeletedNodes.insert(TE); + TransformedToGatherNodes.erase(TE); + NodesCosts.erase(TE); + } + for (unsigned Idx : Worklist.top().second.second) { + TreeEntry &ChildTE = *VectorizableTree[Idx]; + DeletedNodes.insert(&ChildTE); + TransformedToGatherNodes.erase(&ChildTE); + NodesCosts.erase(&ChildTE); + } + Changed = true; + } + Worklist.pop(); + } + if (!Changed) + return SubtreeCosts.front().first; + + for (std::unique_ptr &TE : VectorizableTree) { + if (DeletedNodes.contains(TE.get())) + continue; + if (TransformedToGatherNodes.contains(TE.get()) && 
!TE->UserTreeIndex) { + assert(TE->getOpcode() == Instruction::Load && "Expected load only."); + continue; + } + if (!NodesCosts.contains(TE.get())) { + InstructionCost C = + getEntryCost(TE.get(), VectorizedVals, CheckedExtracts); + NodesCosts.try_emplace(TE.get(), C); + } } + LLVM_DEBUG(dbgs() << "SLP: Recalculate costs after tree trimming.\n"); + Cost = 0; + for (const auto &P : NodesCosts){ + Cost += P.second; + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << P.second << " for bundle " + << shortBundleName(P.first->Scalars, P.first->Idx) << ".\n" + << "SLP: Current total cost = " << Cost << "\n"); + } + return Cost; +} + +namespace { +/// Data type for handling buildvector sequences with the reused scalars from +/// other tree entries. +template struct ShuffledInsertData { + /// List of insertelements to be replaced by shuffles. + SmallVector InsertElements; + /// The parent vectors and shuffle mask for the given list of inserts. + MapVector> ValueMasks; +}; +} // namespace + +InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost, + ArrayRef VectorizedVals, + InstructionCost ReductionCost) { + InstructionCost Cost = TreeCost + ReductionCost; + if (Cost >= -SLPCostThreshold && none_of(ExternalUses, [](const ExternalUser &EU) { return isa_and_nonnull(EU.User); @@ -16243,8 +16450,15 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals, for (Value *V : ScalarOpsFromCasts) { ExternalUsesAsOriginalScalar.insert(V); if (ArrayRef TEs = getTreeEntries(V); !TEs.empty()) { - ExternalUses.emplace_back(V, nullptr, *TEs.front(), - TEs.front()->findLaneForValue(V)); + const auto *It = find_if_not(TEs, [&](TreeEntry *TE) { + return TransformedToGatherNodes.contains(TE) || + DeletedNodes.contains(TE); + }); + if (It != TEs.end()) { + const TreeEntry *UserTE = *It; + ExternalUses.emplace_back(V, nullptr, *UserTE, + UserTE->findLaneForValue(V)); + } } } // Add reduced value cost, if resized. 
@@ -16710,8 +16924,22 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( continue; // Build a list of tree entries where V is used. SmallPtrSet VToTEs; - for (const TreeEntry *TEPtr : ValueToGatherNodes.lookup(V)) { - if (TEPtr == TE || TEPtr->Idx == 0) + SmallVector GatherNodes( + ValueToGatherNodes.lookup(V).takeVector()); + if (TransformedToGatherNodes.contains(TE)) { + for (TreeEntry *E : getSplitTreeEntries(V)) { + if (TE == E || !TransformedToGatherNodes.contains(E)) + continue; + GatherNodes.push_back(E); + } + for (TreeEntry *E : getTreeEntries(V)) { + if (TE == E || !TransformedToGatherNodes.contains(E)) + continue; + GatherNodes.push_back(E); + } + } + for (const TreeEntry *TEPtr : GatherNodes) { + if (TEPtr == TE || TEPtr->Idx == 0 || DeletedNodes.contains(TEPtr)) continue; assert(any_of(TEPtr->Scalars, [&](Value *V) { return GatheredScalars.contains(V); }) && @@ -16787,8 +17015,10 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( VToTEs.insert(TEPtr); } if (ArrayRef VTEs = getSplitTreeEntries(V); !VTEs.empty()) { - const auto *It = find_if( - VTEs, [&](const TreeEntry *MTE) { return MTE != TEUseEI.UserTE; }); + const auto *It = find_if(VTEs, [&](const TreeEntry *MTE) { + return MTE != TE && MTE != TEUseEI.UserTE && + !DeletedNodes.contains(MTE); + }); if (It != VTEs.end()) { const TreeEntry *VTE = *It; if (none_of(TE->CombinedEntriesWithIndices, @@ -16804,28 +17034,34 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( } } if (ArrayRef VTEs = getTreeEntries(V); !VTEs.empty()) { - const TreeEntry *VTE = VTEs.front(); - if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) && - VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) { - VTEs = VTEs.drop_front(); - // Iterate through all vectorized nodes. 
- const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) { - return MTE->State == TreeEntry::Vectorize; - }); - if (MIt == VTEs.end()) - continue; - VTE = *MIt; - } - if (none_of(TE->CombinedEntriesWithIndices, - [&](const auto &P) { return P.first == VTE->Idx; })) { - Instruction &LastBundleInst = getLastInstructionInBundle(VTE); - if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst)) - continue; + const auto *It = find_if(VTEs, [&, MainTE = TE](const TreeEntry *TE) { + return TE != MainTE && !DeletedNodes.contains(TE) && + !TransformedToGatherNodes.contains(TE); + }); + if (It != VTEs.end()) { + const TreeEntry *VTE = *It; + if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) && + VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) { + VTEs = VTEs.drop_front(); + // Iterate through all vectorized nodes. + const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) { + return MTE->State == TreeEntry::Vectorize; + }); + if (MIt == VTEs.end()) + continue; + VTE = *MIt; + } + if (none_of(TE->CombinedEntriesWithIndices, + [&](const auto &P) { return P.first == VTE->Idx; })) { + Instruction &LastBundleInst = getLastInstructionInBundle(VTE); + if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst)) + continue; + } + // The node is reused - exit. + if (CheckAndUseSameNode(VTE)) + break; + VToTEs.insert(VTE); } - // The node is reused - exit. - if (CheckAndUseSameNode(VTE)) - break; - VToTEs.insert(VTE); } if (VToTEs.empty()) continue; @@ -17658,7 +17894,12 @@ Value *BoUpSLP::gather( CSEBlocks.insert(InsElt->getParent()); // Add to our 'need-to-extract' list. if (isa(V)) { - if (ArrayRef Entries = getTreeEntries(V); !Entries.empty()) { + ArrayRef Entries = getTreeEntries(V); + const auto *It = find_if(Entries, [&](const TreeEntry *E) { + return !TransformedToGatherNodes.contains(E) && + !DeletedNodes.contains(E); + }); + if (It != Entries.end()) { // Find which lane we need to extract. 
User *UserOp = nullptr; if (Scalar != V) { @@ -17690,8 +17931,8 @@ Value *BoUpSLP::gather( UserOp = InsElt; } if (UserOp) { - unsigned FoundLane = Entries.front()->findLaneForValue(V); - ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane); + unsigned FoundLane = (*It)->findLaneForValue(V); + ExternalUses.emplace_back(V, UserOp, **It, FoundLane); } } } @@ -18312,7 +18553,8 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) { template ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params) { - assert(E->isGather() && "Expected gather node."); + assert((E->isGather() || TransformedToGatherNodes.contains(E)) && + "Expected gather node."); unsigned VF = E->getVectorFactor(); bool NeedFreeze = false; @@ -18897,7 +19139,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (E->VectorizedValue) return E->VectorizedValue; auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size()); - if (E->isGather()) { + if (E->isGather() || TransformedToGatherNodes.contains(E)) { // Set insert point for non-reduction initial nodes. if (E->hasState() && E->Idx == 0 && !UserIgnoreList) setInsertPointAfterBundle(E); @@ -19966,7 +20208,7 @@ Value *BoUpSLP::vectorizeTree( // Cache last instructions for the nodes to avoid side effects, which may // appear during vectorization, like extra uses, etc. for (const std::unique_ptr &TE : VectorizableTree) { - if (TE->isGather()) + if (TE->isGather() || DeletedNodes.contains(TE.get())) continue; (void)getLastInstructionInBundle(TE.get()); } @@ -19980,6 +20222,8 @@ Value *BoUpSLP::vectorizeTree( // Vectorize gather operands of the nodes with the external uses only. 
SmallVector> GatherEntries; for (const std::unique_ptr &TE : VectorizableTree) { + if (DeletedNodes.contains(TE.get())) + continue; if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE && TE->UserTreeIndex.UserTE->hasState() && TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize && @@ -20002,6 +20246,8 @@ Value *BoUpSLP::vectorizeTree( // Emit gathered loads first to emit better code for the users of those // gathered loads. for (const std::unique_ptr &TE : VectorizableTree) { + if (DeletedNodes.contains(TE.get())) + continue; if (GatheredLoadsEntriesFirst.has_value() && TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue && (!TE->isGather() || TE->UserTreeIndex)) { @@ -20513,7 +20759,9 @@ Value *BoUpSLP::vectorizeTree( TreeEntry *Entry = TEPtr.get(); // No need to handle users of gathered values. - if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize) + if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize || + DeletedNodes.contains(Entry) || + TransformedToGatherNodes.contains(Entry)) continue; assert(Entry->VectorizedValue && "Can't find vectorizable value"); @@ -22718,14 +22966,15 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, R.reorderBottomToTop(); } R.transformNodes(); - R.buildExternalUses(); - R.computeMinimumValueSizes(); + InstructionCost TreeCost = R.calculateTreeCostAndTrimNonProfitable(); + R.buildExternalUses(); + Size = R.getCanonicalGraphSize(); if (S && S.getOpcode() == Instruction::Load) Size = 2; // cut off masked gather small trees - InstructionCost Cost = R.getTreeCost(); + InstructionCost Cost = R.getTreeCost(TreeCost); LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n"); if (Cost < -SLPCostThreshold) { @@ -23373,10 +23622,11 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, R.reorderBottomToTop(!isa(Ops.front())); } R.transformNodes(); + R.computeMinimumValueSizes(); + InstructionCost TreeCost = 
R.calculateTreeCostAndTrimNonProfitable(); R.buildExternalUses(); - R.computeMinimumValueSizes(); - InstructionCost Cost = R.getTreeCost(); + InstructionCost Cost = R.getTreeCost(TreeCost); CandidateFound = true; MinCost = std::min(MinCost, Cost); @@ -24311,6 +24561,9 @@ class HorizontalReduction { } } V.transformNodes(); + V.computeMinimumValueSizes(); + InstructionCost TreeCost = V.calculateTreeCostAndTrimNonProfitable(VL); + SmallPtrSet VLScalars(llvm::from_range, VL); // Gather externally used values. SmallPtrSet Visited; @@ -24342,12 +24595,10 @@ class HorizontalReduction { LocalExternallyUsedValues.insert(RdxVal); V.buildExternalUses(LocalExternallyUsedValues); - V.computeMinimumValueSizes(); - // Estimate cost. InstructionCost ReductionCost = getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI); - InstructionCost Cost = V.getTreeCost(VL, ReductionCost); + InstructionCost Cost = V.getTreeCost(TreeCost, VL, ReductionCost); LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for reduction\n"); if (!Cost.isValid()) diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll index 07125b43e0575..541f2cbe29702 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll @@ -272,14 +272,13 @@ define i1 @cmp_lt_gt(double %a, double %b, double %c) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[FNEG:%.*]] = fneg double [[B:%.*]] ; CHECK-NEXT: [[MUL:%.*]] = fmul double [[A:%.*]], 2.000000e+00 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[FNEG]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[B]], i64 0 +; CHECK-NEXT: [[C:%.*]] = fsub double [[FNEG]], [[C1:%.*]] +; CHECK-NEXT: [[ADD:%.*]] = fsub double [[C1]], [[B]] +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 
x double> poison, double [[ADD]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[C]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[MUL]], i64 0 ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = fdiv <2 x double> [[TMP3]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = fcmp olt <2 x double> [[TMP7]], splat (double 0x3EB0C6F7A0B5ED8D) ; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> poison, <2 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = and <2 x i1> [[TMP8]], [[SHIFT]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/div.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/div.ll index e9cf1deac8eed..a71afc36a205e 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/div.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/div.ll @@ -527,21 +527,14 @@ define <2 x i32> @sdiv_v2i32_unknown_divisor(<2 x i32> %a, <2 x i32> %x, <2 x i3 ; NO-SVE-NEXT: [[A1:%.*]] = extractelement <2 x i32> [[A]], i64 1 ; NO-SVE-NEXT: [[X0:%.*]] = extractelement <2 x i32> [[X]], i64 0 ; NO-SVE-NEXT: [[X1:%.*]] = extractelement <2 x i32> [[X]], i64 1 -; NO-SVE-NEXT: [[TMP1:%.*]] = sdiv i32 [[A0]], [[X0]] -; NO-SVE-NEXT: [[TMP2:%.*]] = sdiv i32 [[A1]], [[X1]] -; NO-SVE-NEXT: [[TMP3:%.*]] = add i32 [[TMP1]], [[X0]] -; NO-SVE-NEXT: [[TMP4:%.*]] = add i32 [[TMP2]], [[X1]] -; NO-SVE-NEXT: [[Y0:%.*]] = extractelement <2 x i32> [[Y]], i64 0 -; NO-SVE-NEXT: [[Y1:%.*]] = extractelement <2 x i32> [[Y]], i64 1 -; NO-SVE-NEXT: [[TMP5:%.*]] = sub i32 [[TMP3]], [[Y0]] -; NO-SVE-NEXT: [[TMP6:%.*]] = sub i32 [[TMP4]], [[Y1]] -; NO-SVE-NEXT: [[Z0:%.*]] = extractelement <2 x i32> [[Z]], i64 0 -; NO-SVE-NEXT: [[Z1:%.*]] = extractelement <2 x i32> [[Z]], i64 1 -; NO-SVE-NEXT: [[TMP7:%.*]] = mul i32 [[TMP5]], 
[[Z0]] -; NO-SVE-NEXT: [[TMP8:%.*]] = mul i32 [[TMP6]], [[Z1]] +; NO-SVE-NEXT: [[TMP8:%.*]] = sdiv i32 [[A1]], [[X1]] +; NO-SVE-NEXT: [[TMP7:%.*]] = sdiv i32 [[A0]], [[X0]] ; NO-SVE-NEXT: [[RES0:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0 ; NO-SVE-NEXT: [[RES1:%.*]] = insertelement <2 x i32> [[RES0]], i32 [[TMP8]], i32 1 -; NO-SVE-NEXT: ret <2 x i32> [[RES1]] +; NO-SVE-NEXT: [[TMP5:%.*]] = add <2 x i32> [[RES1]], [[X]] +; NO-SVE-NEXT: [[TMP6:%.*]] = sub <2 x i32> [[TMP5]], [[Y]] +; NO-SVE-NEXT: [[TMP9:%.*]] = mul <2 x i32> [[TMP6]], [[Z]] +; NO-SVE-NEXT: ret <2 x i32> [[TMP9]] ; ; SVE-LABEL: define <2 x i32> @sdiv_v2i32_unknown_divisor( ; SVE-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[Z:%.*]]) #[[ATTR0]] { @@ -610,22 +603,13 @@ define <2 x i32> @sdiv_v2i32_Op1_unknown_Op2_const(<2 x i32> %a, <2 x i32> %x, < ; NO-SVE-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[Z:%.*]]) #[[ATTR0]] { ; NO-SVE-NEXT: [[A0:%.*]] = extractelement <2 x i32> [[A]], i64 0 ; NO-SVE-NEXT: [[A1:%.*]] = extractelement <2 x i32> [[A]], i64 1 -; NO-SVE-NEXT: [[TMP1:%.*]] = sdiv i32 [[A0]], [[A0]] ; NO-SVE-NEXT: [[TMP2:%.*]] = sdiv i32 [[A1]], 4 -; NO-SVE-NEXT: [[X0:%.*]] = extractelement <2 x i32> [[X]], i64 0 -; NO-SVE-NEXT: [[X1:%.*]] = extractelement <2 x i32> [[X]], i64 1 -; NO-SVE-NEXT: [[TMP3:%.*]] = add i32 [[TMP1]], [[X0]] -; NO-SVE-NEXT: [[TMP4:%.*]] = add i32 [[TMP2]], [[X1]] -; NO-SVE-NEXT: [[Y0:%.*]] = extractelement <2 x i32> [[Y]], i64 0 -; NO-SVE-NEXT: [[Y1:%.*]] = extractelement <2 x i32> [[Y]], i64 1 -; NO-SVE-NEXT: [[TMP5:%.*]] = sub i32 [[TMP3]], [[Y0]] -; NO-SVE-NEXT: [[TMP6:%.*]] = sub i32 [[TMP4]], [[Y1]] -; NO-SVE-NEXT: [[Z0:%.*]] = extractelement <2 x i32> [[Z]], i64 0 -; NO-SVE-NEXT: [[Z1:%.*]] = extractelement <2 x i32> [[Z]], i64 1 -; NO-SVE-NEXT: [[TMP7:%.*]] = mul i32 [[TMP5]], [[Z0]] -; NO-SVE-NEXT: [[TMP8:%.*]] = mul i32 [[TMP6]], [[Z1]] +; NO-SVE-NEXT: [[TMP7:%.*]] = sdiv i32 
[[A0]], [[A0]] ; NO-SVE-NEXT: [[RES0:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0 -; NO-SVE-NEXT: [[RES1:%.*]] = insertelement <2 x i32> [[RES0]], i32 [[TMP8]], i32 1 +; NO-SVE-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[RES0]], i32 [[TMP2]], i32 1 +; NO-SVE-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[X]] +; NO-SVE-NEXT: [[TMP6:%.*]] = sub <2 x i32> [[TMP5]], [[Y]] +; NO-SVE-NEXT: [[RES1:%.*]] = mul <2 x i32> [[TMP6]], [[Z]] ; NO-SVE-NEXT: ret <2 x i32> [[RES1]] ; ; SVE-LABEL: define <2 x i32> @sdiv_v2i32_Op1_unknown_Op2_const( diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll index f397290299a4f..0ac3323e0a7b3 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll @@ -8,74 +8,40 @@ define void @h(i16 %a, i16 %b, i16 %c, i16 %d, i16 %e, i16 %f, i16 %g, i16 %h, i ; CHECK-NEXT: [[CONV9:%.*]] = zext i16 [[A]] to i32 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr null, i64 16 ; CHECK-NEXT: [[CONV310:%.*]] = zext i16 [[B]] to i32 -; CHECK-NEXT: [[ADD4:%.*]] = or i32 [[CONV310]], [[CONV9]] -; CHECK-NEXT: [[SUB:%.*]] = or i32 [[CONV9]], [[CONV310]] -; CHECK-NEXT: [[CONV15:%.*]] = sext i16 [[C]] to i32 -; CHECK-NEXT: [[SHR:%.*]] = ashr i32 0, 0 -; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr i8, ptr null, i64 24 -; CHECK-NEXT: [[CONV19:%.*]] = sext i16 [[D]] to i32 -; CHECK-NEXT: [[SUB20:%.*]] = or i32 [[SHR]], [[CONV19]] -; CHECK-NEXT: [[SHR29:%.*]] = ashr i32 0, 0 -; CHECK-NEXT: [[ADD30:%.*]] = or i32 [[SHR29]], [[CONV15]] -; CHECK-NEXT: [[SUB39:%.*]] = or i32 [[SUB]], [[SUB20]] -; CHECK-NEXT: [[CONV40:%.*]] = trunc i32 [[SUB39]] to i16 -; CHECK-NEXT: store i16 [[CONV40]], ptr [[ARRAYIDX2]], align 2 -; CHECK-NEXT: [[SUB44:%.*]] = or i32 [[ADD4]], [[ADD30]] -; 
CHECK-NEXT: [[CONV45:%.*]] = trunc i32 [[SUB44]] to i16 -; CHECK-NEXT: store i16 [[CONV45]], ptr [[ARRAYIDX18]], align 2 -; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr i8, ptr null, i64 18 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 [[D]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[G]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i16> [[TMP1]], i16 [[K]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[O]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i16> [[TMP3]], i16 [[C]], i32 4 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i16> [[TMP4]], i16 [[F]], i32 5 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i16> [[TMP5]], i16 [[J]], i32 6 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i16> [[TMP6]], i16 [[N]], i32 7 ; CHECK-NEXT: [[CONV3_112:%.*]] = zext i16 [[E]] to i32 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i16> poison, i16 [[H]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i16> [[TMP8]], i16 [[L]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i16> poison, i16 [[I]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i16> [[TMP10]], i16 [[M]], i32 1 ; CHECK-NEXT: [[ADD4_1:%.*]] = or i32 [[CONV3_112]], 0 -; CHECK-NEXT: [[SUB_1:%.*]] = or i32 0, [[CONV3_112]] -; CHECK-NEXT: [[CONV15_1:%.*]] = sext i16 [[F]] to i32 -; CHECK-NEXT: [[SHR_1:%.*]] = ashr i32 0, 0 -; CHECK-NEXT: [[ARRAYIDX18_1:%.*]] = getelementptr i8, ptr null, i64 26 -; CHECK-NEXT: [[CONV19_1:%.*]] = sext i16 [[G]] to i32 -; CHECK-NEXT: [[SUB20_1:%.*]] = or i32 [[SHR_1]], [[CONV19_1]] -; CHECK-NEXT: [[SHR29_1:%.*]] = ashr i32 0, 0 -; CHECK-NEXT: [[ADD30_1:%.*]] = or i32 [[SHR29_1]], [[CONV15_1]] -; CHECK-NEXT: [[SUB39_1:%.*]] = or i32 [[SUB_1]], [[SUB20_1]] -; CHECK-NEXT: [[CONV40_1:%.*]] = trunc i32 [[SUB39_1]] to i16 -; CHECK-NEXT: store i16 [[CONV40_1]], ptr [[ARRAYIDX2_1]], align 2 -; CHECK-NEXT: [[SUB44_1:%.*]] = or i32 [[ADD4_1]], [[ADD30_1]] -; 
CHECK-NEXT: [[CONV45_1:%.*]] = trunc i32 [[SUB44_1]] to i16 -; CHECK-NEXT: store i16 [[CONV45_1]], ptr [[ARRAYIDX18_1]], align 2 -; CHECK-NEXT: [[CONV_213:%.*]] = zext i16 [[H]] to i32 -; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr i8, ptr null, i64 20 -; CHECK-NEXT: [[CONV3_214:%.*]] = zext i16 [[I]] to i32 -; CHECK-NEXT: [[ADD4_2:%.*]] = or i32 0, [[CONV_213]] -; CHECK-NEXT: [[SUB_2:%.*]] = or i32 0, [[CONV3_214]] -; CHECK-NEXT: [[CONV15_2:%.*]] = sext i16 [[J]] to i32 -; CHECK-NEXT: [[SHR_2:%.*]] = ashr i32 0, 0 -; CHECK-NEXT: [[ARRAYIDX18_2:%.*]] = getelementptr i8, ptr null, i64 28 -; CHECK-NEXT: [[CONV19_2:%.*]] = sext i16 [[K]] to i32 -; CHECK-NEXT: [[SUB20_2:%.*]] = or i32 [[SHR_2]], [[CONV19_2]] -; CHECK-NEXT: [[SHR29_2:%.*]] = ashr i32 0, 0 -; CHECK-NEXT: [[ADD30_2:%.*]] = or i32 [[SHR29_2]], [[CONV15_2]] -; CHECK-NEXT: [[SUB39_2:%.*]] = or i32 [[SUB_2]], [[SUB20_2]] +; CHECK-NEXT: [[SUB39_3:%.*]] = or i32 [[CONV310]], [[CONV9]] +; CHECK-NEXT: [[SUB44_2:%.*]] = or i32 0, [[CONV3_112]] +; CHECK-NEXT: [[SUB39_2:%.*]] = or i32 [[CONV9]], [[CONV310]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i16> zeroinitializer, [[TMP7]] ; CHECK-NEXT: [[CONV40_2:%.*]] = trunc i32 [[SUB39_2]] to i16 -; CHECK-NEXT: store i16 [[CONV40_2]], ptr [[ARRAYIDX2_2]], align 2 -; CHECK-NEXT: [[SUB44_2:%.*]] = or i32 [[ADD4_2]], [[ADD30_2]] +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x i16> poison, i16 [[CONV40_2]], i32 0 ; CHECK-NEXT: [[CONV45_2:%.*]] = trunc i32 [[SUB44_2]] to i16 -; CHECK-NEXT: store i16 [[CONV45_2]], ptr [[ARRAYIDX18_2]], align 2 -; CHECK-NEXT: [[CONV_315:%.*]] = zext i16 [[L]] to i32 -; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr i8, ptr null, i64 22 -; CHECK-NEXT: [[CONV3_316:%.*]] = zext i16 [[M]] to i32 -; CHECK-NEXT: [[ADD4_3:%.*]] = or i32 0, [[CONV_315]] -; CHECK-NEXT: [[SUB_3:%.*]] = or i32 0, [[CONV3_316]] -; CHECK-NEXT: [[CONV15_3:%.*]] = sext i16 [[N]] to i32 -; CHECK-NEXT: [[SHR_3:%.*]] = ashr i32 0, 0 -; CHECK-NEXT: [[ARRAYIDX18_3:%.*]] = 
getelementptr i8, ptr null, i64 30 -; CHECK-NEXT: [[CONV19_3:%.*]] = sext i16 [[O]] to i32 -; CHECK-NEXT: [[SUB20_3:%.*]] = or i32 [[SHR_3]], [[CONV19_3]] -; CHECK-NEXT: [[SHR29_3:%.*]] = ashr i32 0, 0 -; CHECK-NEXT: [[ADD30_3:%.*]] = or i32 [[SHR29_3]], [[CONV15_3]] -; CHECK-NEXT: [[SUB39_3:%.*]] = or i32 [[SUB_3]], [[SUB20_3]] +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x i16> [[TMP14]], i16 [[CONV45_2]], i32 1 +; CHECK-NEXT: [[TMP17:%.*]] = or <2 x i16> zeroinitializer, [[TMP11]] +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x i16> [[TMP17]], <2 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x i16> [[TMP16]], <8 x i16> [[TMP18]], <8 x i32> ; CHECK-NEXT: [[CONV40_3:%.*]] = trunc i32 [[SUB39_3]] to i16 -; CHECK-NEXT: store i16 [[CONV40_3]], ptr [[ARRAYIDX2_3]], align 2 -; CHECK-NEXT: [[SUB44_3:%.*]] = or i32 [[ADD4_3]], [[ADD30_3]] -; CHECK-NEXT: [[CONV45_3:%.*]] = trunc i32 [[SUB44_3]] to i16 -; CHECK-NEXT: store i16 [[CONV45_3]], ptr [[ARRAYIDX18_3]], align 2 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <8 x i16> [[TMP19]], i16 [[CONV40_3]], i32 4 +; CHECK-NEXT: [[TMP22:%.*]] = trunc i32 [[ADD4_1]] to i16 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <8 x i16> [[TMP21]], i16 [[TMP22]], i32 5 +; CHECK-NEXT: [[TMP24:%.*]] = or <2 x i16> zeroinitializer, [[TMP9]] +; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <2 x i16> [[TMP24]], <2 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <8 x i16> [[TMP23]], <8 x i16> [[TMP25]], <8 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = or <8 x i16> [[TMP26]], [[TMP12]] +; CHECK-NEXT: store <8 x i16> [[TMP27]], ptr [[ARRAYIDX2]], align 2 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll index 26ce0fc6e6a3b..ea2e27599161d 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll @@ -40,26 +40,28 @@ define void @test() { ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x float> [[TMP11]], <16 x float> poison, <8 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x float> poison, float [[I70]], i32 0 ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> [[TMP13]], <8 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x float> poison, float [[I70]], i32 1 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x float> [[TMP15]], float [[I68]], i32 2 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x float> [[TMP16]], float [[I66]], i32 3 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <8 x float> [[TMP17]], float [[I67]], i32 6 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <8 x float> [[TMP18]], float [[I69]], i32 7 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> poison, float [[I68]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x float> [[TMP15]], float [[I66]], i32 1 ; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <16 x float> [[TMP0]], <16 x float> poison, <16 x i32> ; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <16 x float> [[TMP20]], <16 x float> [[TMP0]], <16 x i32> ; CHECK-NEXT: br label %[[BB78:.*]] ; CHECK: [[BB78]]: ; CHECK-NEXT: [[TMP22:%.*]] = phi <8 x float> [ [[TMP14]], %[[BB77]] ], [ [[TMP31:%.*]], %[[BB78]] ] -; CHECK-NEXT: [[TMP23:%.*]] = phi <8 x float> [ [[TMP19]], %[[BB77]] ], [ [[TMP32:%.*]], %[[BB78]] ] -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <8 x float> [[TMP23]], <8 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP32:%.*]] = phi <2 x float> [ [[TMP16]], %[[BB77]] ], [ [[TMP37:%.*]], %[[BB78]] ] ; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <8 x float> [[TMP22]], <8 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <8 x float> [[TMP22]], <8 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x float> [[TMP32]], <2 x float> poison, <8 x i32> +; CHECK-NEXT: 
[[TMP39:%.*]] = shufflevector <8 x float> [[TMP38]], <8 x float> [[TMP23]], <8 x i32> +; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <8 x float> [[TMP22]], <8 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP41:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP40]], <8 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <8 x float> [[TMP41]], <8 x float> poison, <16 x i32> ; CHECK-NEXT: [[TMP26:%.*]] = fmul fast <16 x float> [[TMP24]], [[TMP21]] ; CHECK-NEXT: [[TMP27:%.*]] = fmul fast <16 x float> [[TMP25]], [[TMP0]] ; CHECK-NEXT: [[TMP28:%.*]] = fadd fast <16 x float> [[TMP27]], [[TMP26]] ; CHECK-NEXT: [[TMP29:%.*]] = fadd fast <16 x float> [[TMP28]], poison ; CHECK-NEXT: [[TMP30:%.*]] = fadd fast <16 x float> [[TMP29]], poison ; CHECK-NEXT: [[TMP31]] = shufflevector <16 x float> [[TMP30]], <16 x float> poison, <8 x i32> -; CHECK-NEXT: [[TMP32]] = shufflevector <16 x float> [[TMP30]], <16 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP37]] = shufflevector <16 x float> [[TMP30]], <16 x float> poison, <2 x i32> ; CHECK-NEXT: br i1 poison, label %[[BB78]], label %[[BB167]] ; CHECK: [[BB167]]: ; CHECK-NEXT: [[TMP35:%.*]] = phi <16 x float> [ [[TMP11]], %[[BB64]] ], [ [[TMP30]], %[[BB78]] ] diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll index 5ebe44206c702..a31cd4301524d 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll @@ -80,21 +80,23 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP59:%.*]] = add <4 x i32> [[TMP57]], [[TMP58]] ; CHECK-NEXT: [[TMP60:%.*]] = sub <4 x i32> [[TMP57]], [[TMP58]] ; CHECK-NEXT: [[TMP61:%.*]] = shufflevector <4 x i32> [[TMP59]], <4 x i32> [[TMP60]], <4 x i32> -; CHECK-NEXT: [[TMP62:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> splat (i1 true), i32 2) ; 
CHECK-NEXT: [[TMP63:%.*]] = load <4 x i8>, ptr null, align 1 ; CHECK-NEXT: [[TMP64:%.*]] = zext <4 x i8> [[TMP63]] to <4 x i32> ; CHECK-NEXT: [[TMP65:%.*]] = load <4 x i8>, ptr null, align 1 ; CHECK-NEXT: [[TMP66:%.*]] = zext <4 x i8> [[TMP65]] to <4 x i32> ; CHECK-NEXT: [[TMP67:%.*]] = sub <4 x i32> [[TMP64]], [[TMP66]] ; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <4 x i32> [[TMP67]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP69:%.*]] = insertelement <4 x i8> poison, i8 [[TMP115]], i32 0 -; CHECK-NEXT: [[TMP70:%.*]] = insertelement <4 x i8> [[TMP69]], i8 [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP117:%.*]] = shufflevector <2 x i8> [[TMP62]], <2 x i8> poison, <4 x i32> -; CHECK-NEXT: [[TMP71:%.*]] = shufflevector <4 x i8> [[TMP70]], <4 x i8> [[TMP117]], <4 x i32> -; CHECK-NEXT: [[TMP72:%.*]] = zext <4 x i8> [[TMP71]] to <4 x i32> +; CHECK-NEXT: [[TMP71:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> splat (i1 true), i32 2) +; CHECK-NEXT: [[TMP69:%.*]] = insertelement <2 x i8> poison, i8 [[TMP115]], i32 0 +; CHECK-NEXT: [[TMP70:%.*]] = insertelement <2 x i8> [[TMP69]], i8 [[TMP0]], i32 1 ; CHECK-NEXT: [[TMP73:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 ; CHECK-NEXT: [[TMP74:%.*]] = zext <4 x i8> [[TMP73]] to <4 x i32> ; CHECK-NEXT: [[TMP75:%.*]] = shufflevector <4 x i32> [[TMP74]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP117:%.*]] = zext <2 x i8> [[TMP70]] to <2 x i32> +; CHECK-NEXT: [[TMP119:%.*]] = shufflevector <2 x i32> [[TMP117]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP120:%.*]] = zext <2 x i8> [[TMP71]] to <2 x i32> +; CHECK-NEXT: [[TMP121:%.*]] = shufflevector <2 x i32> [[TMP120]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP72:%.*]] = shufflevector <4 x i32> [[TMP119]], <4 x i32> [[TMP121]], <4 x i32> ; CHECK-NEXT: [[TMP76:%.*]] = sub <4 x i32> [[TMP72]], [[TMP75]] ; CHECK-NEXT: [[TMP77:%.*]] = shl <4 x i32> [[TMP76]], splat (i32 16) ; CHECK-NEXT: [[TMP78:%.*]] = add <4 x i32> 
[[TMP77]], [[TMP68]] diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/partial-vec-invalid-cost.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/partial-vec-invalid-cost.ll index 085d7a64fc9ac..2b79ca9429fa3 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/partial-vec-invalid-cost.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/partial-vec-invalid-cost.ll @@ -9,15 +9,13 @@ define void @partial_vec_invalid_cost() #0 { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[LSHR_1:%.*]] = lshr i96 0, 0 ; CHECK-NEXT: [[LSHR_2:%.*]] = lshr i96 0, 0 -; CHECK-NEXT: [[TRUNC_I96_1:%.*]] = trunc i96 [[LSHR_1]] to i32 -; CHECK-NEXT: [[TRUNC_I96_2:%.*]] = trunc i96 [[LSHR_2]] to i32 -; CHECK-NEXT: [[TRUNC_I96_3:%.*]] = trunc i96 0 to i32 -; CHECK-NEXT: [[TRUNC_I96_4:%.*]] = trunc i96 0 to i32 -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> zeroinitializer) -; CHECK-NEXT: [[OP_RDX:%.*]] = or i32 [[TMP1]], [[TRUNC_I96_1]] -; CHECK-NEXT: [[OP_RDX1:%.*]] = or i32 [[TRUNC_I96_2]], [[TRUNC_I96_3]] -; CHECK-NEXT: [[OP_RDX2:%.*]] = or i32 [[OP_RDX]], [[OP_RDX1]] -; CHECK-NEXT: [[OP_RDX3:%.*]] = or i32 [[OP_RDX2]], [[TRUNC_I96_4]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i96> poison, i96 [[LSHR_1]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i96> [[TMP0]], i96 [[LSHR_2]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i96> [[TMP1]], i96 0, i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i96> [[TMP2]], i96 0, i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i96> [[TMP3]] to <4 x i32> +; CHECK-NEXT: [[RDX_OP:%.*]] = or <4 x i32> zeroinitializer, [[TMP4]] +; CHECK-NEXT: [[OP_RDX3:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[RDX_OP]]) ; CHECK-NEXT: [[STORE_THIS:%.*]] = zext i32 [[OP_RDX3]] to i96 ; CHECK-NEXT: store i96 [[STORE_THIS]], ptr null, align 16 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll 
b/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll index d4e323819402c..aaf290ba952f2 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll @@ -101,81 +101,82 @@ define fastcc i32 @test(i32 %0, i32 %add111.i.i, <4 x i32> %PredPel.i.sroa.86.72 ; THRESH-NEXT: [[LOOPARRAY_SROA_24_0_I_I3:%.*]] = ashr i32 [[TMP0]], 1 ; THRESH-NEXT: [[SHR143_5_I_I9:%.*]] = ashr i32 [[TMP0]], 1 ; THRESH-NEXT: [[ADD1392_I:%.*]] = add i32 [[TMP0]], 1 +; THRESH-NEXT: [[PREDPEL_I_SROA_86_80_VEC_EXTRACT59312:%.*]] = extractelement <4 x i32> [[PREDPEL_I_SROA_86_72_VEC_EXTRACT]], i64 0 ; THRESH-NEXT: [[MUL1445_I:%.*]] = shl i32 [[TMP0]], 1 -; THRESH-NEXT: [[ADD2136_I:%.*]] = or i32 [[LOOPARRAY_SROA_24_0_I_I3]], [[TMP0]] -; THRESH-NEXT: [[SHR2137_I:%.*]] = lshr i32 [[ADD2136_I]], 1 -; THRESH-NEXT: [[CONV2138_I:%.*]] = trunc i32 [[SHR2137_I]] to i16 ; THRESH-NEXT: [[ADD2174_I:%.*]] = add i32 [[MUL1445_I]], 2 ; THRESH-NEXT: [[SHR2175_I:%.*]] = lshr i32 [[ADD2174_I]], 2 ; THRESH-NEXT: [[CONV2176_I:%.*]] = trunc i32 [[SHR2175_I]] to i16 -; THRESH-NEXT: [[ADD2190_I:%.*]] = or i32 [[ADD1392_I]], 1 +; THRESH-NEXT: [[ADD2235_I17:%.*]] = or i32 [[TMP0]], 1 +; THRESH-NEXT: [[ADD2323_I:%.*]] = add i32 [[TMP0]], 1 +; THRESH-NEXT: [[ADD2190_I:%.*]] = or i32 [[SHR143_5_I_I9]], 1 ; THRESH-NEXT: [[ADD2191_I:%.*]] = add i32 [[ADD2190_I]], [[TMP0]] ; THRESH-NEXT: [[CONV2193_I:%.*]] = trunc i32 [[ADD2191_I]] to i16 -; THRESH-NEXT: [[ADD2203_I:%.*]] = or i32 [[TMP0]], 1 +; THRESH-NEXT: store i16 [[CONV2193_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8232), align 8 +; THRESH-NEXT: store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8226), align 2 +; THRESH-NEXT: store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8206), align 2 +; THRESH-NEXT: store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr 
@images, i64 8186), align 2 +; THRESH-NEXT: [[ADD2302_I1:%.*]] = add i32 [[TMP0]], 1 +; THRESH-NEXT: [[ADD2203_I:%.*]] = or i32 [[LOOPARRAY_SROA_24_0_I_I3]], 1 +; THRESH-NEXT: [[SHR2303_I1:%.*]] = lshr i32 [[ADD2302_I1]], 1 ; THRESH-NEXT: [[ADD2204_I:%.*]] = add i32 [[ADD2203_I]], [[TMP0]] +; THRESH-NEXT: [[CONV2304_I:%.*]] = trunc i32 [[SHR2303_I1]] to i16 +; THRESH-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32> +; THRESH-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 0 +; THRESH-NEXT: [[TMP4:%.*]] = add <2 x i32> [[TMP3]], splat (i32 1) +; THRESH-NEXT: [[TMP5:%.*]] = lshr <2 x i32> [[TMP4]], splat (i32 1) +; THRESH-NEXT: [[TMP6:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16> ; THRESH-NEXT: [[CONV2206_I:%.*]] = trunc i32 [[ADD2204_I]] to i16 +; THRESH-NEXT: store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8224), align 8 +; THRESH-NEXT: store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8204), align 4 +; THRESH-NEXT: [[TMP9:%.*]] = insertelement <4 x i16> poison, i16 [[CONV2206_I]], i32 0 +; THRESH-NEXT: [[TMP10:%.*]] = shufflevector <2 x i16> [[TMP6]], <2 x i16> poison, <4 x i32> +; THRESH-NEXT: [[TMP11:%.*]] = shufflevector <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i32> +; THRESH-NEXT: [[TMP12:%.*]] = insertelement <4 x i16> [[TMP11]], i16 [[CONV2304_I]], i32 3 +; THRESH-NEXT: store <4 x i16> [[TMP12]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8178), align 2 +; THRESH-NEXT: [[ADD2190_I1:%.*]] = or i32 [[ADD1392_I]], 1 +; THRESH-NEXT: [[ADD2236_I:%.*]] = or i32 [[ADD2323_I]], [[TMP0]] ; THRESH-NEXT: [[ADD2235_I16:%.*]] = or i32 [[TMP0]], 1 -; THRESH-NEXT: [[ADD2236_I:%.*]] = add i32 [[ADD2235_I16]], 1 +; THRESH-NEXT: [[ADD2258_I:%.*]] = add i32 [[SHR143_5_I_I9]], 1 +; THRESH-NEXT: [[ADD2302_I:%.*]] = add i32 [[ADD111_I_I]], 1 +; THRESH-NEXT: [[SHR2325_I:%.*]] = add i32 [[ADD2190_I1]], [[TMP0]] ; THRESH-NEXT: 
[[SHR2237_I:%.*]] = lshr i32 [[ADD2236_I]], 1 -; THRESH-NEXT: [[CONV2238_I:%.*]] = trunc i32 [[SHR2237_I]] to i16 -; THRESH-NEXT: store i16 [[CONV2238_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8196), align 4 -; THRESH-NEXT: store i16 [[CONV2238_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8176), align 8 -; THRESH-NEXT: [[ADD2258_I:%.*]] = or i32 [[ADD111_I_I]], [[TMP0]] +; THRESH-NEXT: [[SHR2343_I:%.*]] = add i32 [[ADD2235_I16]], [[TMP0]] ; THRESH-NEXT: [[SHR2259_I:%.*]] = lshr i32 [[ADD2258_I]], 1 -; THRESH-NEXT: [[CONV2260_I:%.*]] = trunc i32 [[SHR2259_I]] to i16 -; THRESH-NEXT: store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8212), align 4 -; THRESH-NEXT: store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8192), align 8 -; THRESH-NEXT: store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8172), align 4 -; THRESH-NEXT: [[ADD2302_I:%.*]] = add i32 [[TMP0]], 1 ; THRESH-NEXT: [[SHR2303_I:%.*]] = lshr i32 [[ADD2302_I]], 1 -; THRESH-NEXT: [[CONV2304_I:%.*]] = trunc i32 [[SHR2303_I]] to i16 -; THRESH-NEXT: store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8224), align 8 -; THRESH-NEXT: store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8204), align 4 -; THRESH-NEXT: store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8184), align 8 -; THRESH-NEXT: [[ADD2323_I:%.*]] = add i32 [[TMP0]], 1 -; THRESH-NEXT: [[ADD2324_I:%.*]] = or i32 [[ADD2323_I]], [[TMP0]] -; THRESH-NEXT: [[SHR2325_I:%.*]] = lshr i32 [[ADD2324_I]], 1 ; THRESH-NEXT: [[CONV2326_I:%.*]] = trunc i32 [[SHR2325_I]] to i16 -; THRESH-NEXT: store i16 [[CONV2326_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8220), align 4 -; THRESH-NEXT: store i16 [[CONV2326_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8200), align 8 -; THRESH-NEXT: [[ADD2342_I:%.*]] = add i32 [[SHR143_5_I_I9]], 1 -; THRESH-NEXT: 
[[SHR2343_I:%.*]] = lshr i32 [[ADD2342_I]], 1 +; THRESH-NEXT: [[CONV2326_I1:%.*]] = trunc i32 [[SHR2237_I]] to i16 ; THRESH-NEXT: [[CONV2344_I:%.*]] = trunc i32 [[SHR2343_I]] to i16 -; THRESH-NEXT: store i16 [[CONV2344_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8216), align 8 -; THRESH-NEXT: [[ADD2355_I:%.*]] = or i32 [[SHR143_5_I_I9]], 1 -; THRESH-NEXT: [[ADD2356_I:%.*]] = add i32 [[ADD2355_I]], [[TMP0]] -; THRESH-NEXT: [[CONV2358_I:%.*]] = trunc i32 [[ADD2356_I]] to i16 -; THRESH-NEXT: store i16 [[CONV2358_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8232), align 8 -; THRESH-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32> -; THRESH-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 0 -; THRESH-NEXT: [[TMP4:%.*]] = add <2 x i32> [[TMP3]], splat (i32 1) -; THRESH-NEXT: [[TMP5:%.*]] = lshr <2 x i32> [[TMP4]], splat (i32 1) -; THRESH-NEXT: [[TMP6:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16> -; THRESH-NEXT: store <2 x i16> [[TMP6]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8180), align 4 -; THRESH-NEXT: [[ADD2393_I:%.*]] = or i32 [[LOOPARRAY_SROA_24_0_I_I3]], 1 -; THRESH-NEXT: [[ADD2394_I:%.*]] = add i32 [[ADD2393_I]], [[TMP0]] -; THRESH-NEXT: [[CONV2396_I:%.*]] = trunc i32 [[ADD2394_I]] to i16 -; THRESH-NEXT: store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8198), align 2 -; THRESH-NEXT: store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8178), align 2 -; THRESH-NEXT: store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8214), align 2 -; THRESH-NEXT: store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8194), align 2 -; THRESH-NEXT: store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8174), align 2 -; THRESH-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[PREDPEL_I_SROA_86_72_VEC_EXTRACT]], <4 x i32> poison, <2 x i32> 
-; THRESH-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[ADD111_I_I]], i32 0 -; THRESH-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP8]], splat (i32 1) -; THRESH-NEXT: [[TMP10:%.*]] = lshr <2 x i32> [[TMP9]], splat (i32 1) -; THRESH-NEXT: [[TMP11:%.*]] = trunc <2 x i32> [[TMP10]] to <2 x i16> -; THRESH-NEXT: [[TMP12:%.*]] = extractelement <2 x i16> [[TMP11]], i32 1 -; THRESH-NEXT: store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8228), align 4 -; THRESH-NEXT: store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8208), align 8 -; THRESH-NEXT: store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8188), align 4 -; THRESH-NEXT: store i16 [[TMP12]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8170), align 2 -; THRESH-NEXT: store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8226), align 2 -; THRESH-NEXT: store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8206), align 2 -; THRESH-NEXT: store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8186), align 2 -; THRESH-NEXT: store i16 [[CONV2193_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8222), align 2 -; THRESH-NEXT: store i16 [[CONV2193_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8202), align 2 -; THRESH-NEXT: store i16 [[CONV2206_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8218), align 2 +; THRESH-NEXT: [[CONV2344_I1:%.*]] = trunc i32 [[SHR2259_I]] to i16 +; THRESH-NEXT: [[CONV2282_I:%.*]] = trunc i32 [[SHR2303_I]] to i16 +; THRESH-NEXT: store i16 [[CONV2282_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8228), align 4 +; THRESH-NEXT: [[ADD2236_I1:%.*]] = add i32 [[ADD2235_I17]], 1 +; THRESH-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[ADD111_I_I]], i32 0 +; THRESH-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 1 +; THRESH-NEXT: 
[[TMP15:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0 +; THRESH-NEXT: [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <2 x i32> zeroinitializer +; THRESH-NEXT: [[TMP17:%.*]] = or <2 x i32> [[TMP14]], [[TMP16]] +; THRESH-NEXT: [[ADD2157_I:%.*]] = add i32 [[PREDPEL_I_SROA_86_80_VEC_EXTRACT59312]], 1 +; THRESH-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> poison, i32 [[ADD2157_I]], i32 0 +; THRESH-NEXT: [[TMP19:%.*]] = shufflevector <2 x i32> [[TMP17]], <2 x i32> poison, <4 x i32> +; THRESH-NEXT: [[TMP20:%.*]] = shufflevector <4 x i32> [[TMP18]], <4 x i32> [[TMP19]], <4 x i32> +; THRESH-NEXT: [[TMP21:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[ADD2236_I1]], i32 3 +; THRESH-NEXT: [[TMP22:%.*]] = lshr <4 x i32> [[TMP21]], splat (i32 1) +; THRESH-NEXT: [[TMP23:%.*]] = trunc <4 x i32> [[TMP22]] to <4 x i16> +; THRESH-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[TMP23]], i32 0 +; THRESH-NEXT: store i16 [[TMP24]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8230), align 2 +; THRESH-NEXT: store <4 x i16> [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8170), align 2 +; THRESH-NEXT: [[TMP25:%.*]] = insertelement <8 x i16> poison, i16 [[CONV2282_I]], i32 0 +; THRESH-NEXT: [[TMP26:%.*]] = shufflevector <4 x i16> [[TMP23]], <4 x i16> poison, <8 x i32> +; THRESH-NEXT: [[TMP27:%.*]] = shufflevector <8 x i16> [[TMP25]], <8 x i16> [[TMP26]], <8 x i32> +; THRESH-NEXT: [[TMP28:%.*]] = insertelement <8 x i16> [[TMP27]], i16 [[CONV2206_I]], i32 5 +; THRESH-NEXT: [[TMP29:%.*]] = insertelement <8 x i16> [[TMP28]], i16 [[CONV2326_I1]], i32 6 +; THRESH-NEXT: [[TMP30:%.*]] = insertelement <8 x i16> [[TMP29]], i16 [[CONV2326_I]], i32 7 +; THRESH-NEXT: store <8 x i16> [[TMP30]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8188), align 4 +; THRESH-NEXT: [[TMP31:%.*]] = insertelement <8 x i16> [[TMP27]], i16 [[CONV2344_I1]], i32 4 +; THRESH-NEXT: [[TMP32:%.*]] = insertelement <8 x i16> [[TMP31]], i16 [[CONV2344_I]], 
i32 5 +; THRESH-NEXT: [[TMP33:%.*]] = insertelement <8 x i16> [[TMP32]], i16 [[CONV2326_I1]], i32 6 +; THRESH-NEXT: [[TMP34:%.*]] = insertelement <8 x i16> [[TMP33]], i16 [[CONV2326_I]], i32 7 +; THRESH-NEXT: store <8 x i16> [[TMP34]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8208), align 8 ; THRESH-NEXT: ret i32 0 ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll index 194c7021f60f5..c155702c62830 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll @@ -4,20 +4,18 @@ define void @mainTest(i32 %param, ptr %vals, i32 %len) { ; CHECK-LABEL: @mainTest( ; CHECK-NEXT: bci_15.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[PARAM:%.*]], i32 0 ; CHECK-NEXT: br label [[BCI_15:%.*]] ; CHECK: bci_15: -; CHECK-NEXT: [[LOCAL_0_:%.*]] = phi i32 [ [[OP_RDX:%.*]], [[BCI_15]] ], [ [[PARAM]], [[BCI_15_PREHEADER:%.*]] ] +; CHECK-NEXT: [[LOCAL_0_:%.*]] = phi i32 [ [[OP_RDX:%.*]], [[BCI_15]] ], [ [[PARAM:%.*]], [[BCI_15_PREHEADER:%.*]] ] ; CHECK-NEXT: [[LOCAL_4_:%.*]] = phi i32 [ [[V44:%.*]], [[BCI_15]] ], [ 31, [[BCI_15_PREHEADER]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP6:%.*]], [[BCI_15]] ], [ [[TMP0]], [[BCI_15_PREHEADER]] ] -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i32> poison, i32 [[LOCAL_0_]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[LOCAL_4_]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i32> [[TMP2]], ; CHECK-NEXT: store atomic i32 [[LOCAL_0_]], ptr [[VALS:%.*]] unordered, align 4 ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP3]]) ; CHECK-NEXT: [[OP_RDX]] = and i32 [[TMP4]], [[LOCAL_4_]] ; CHECK-NEXT: [[V44]] = add i32 
[[LOCAL_4_]], 16 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX]], i32 0 -; CHECK-NEXT: [[TMP6]] = insertelement <2 x i32> [[TMP5]], i32 [[V44]], i32 1 ; CHECK-NEXT: br i1 true, label [[BCI_15]], label [[LOOPEXIT:%.*]] ; CHECK: loopexit: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll index d02df1ac92b4d..9b45fe6a2804b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll @@ -282,23 +282,26 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) { define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE-LABEL: @ashr_lshr_shl_v8i32( -; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 6 +; SSE-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 4 +; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 ; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; SSE-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6 +; SSE-NEXT: [[B4:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 4 +; SSE-NEXT: [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5 +; SSE-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6 ; SSE-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] ; SSE-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] ; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; SSE-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> -; SSE-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[B]], 
<8 x i32> poison, <2 x i32> -; SSE-NEXT: [[TMP7:%.*]] = lshr <2 x i32> [[TMP6]], [[TMP10]] +; SSE-NEXT: [[AB4:%.*]] = lshr i32 [[A4]], [[B4]] +; SSE-NEXT: [[AB5:%.*]] = lshr i32 [[A5]], [[B5]] ; SSE-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] ; SSE-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] ; SSE-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> -; SSE-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32> -; SSE-NEXT: [[R51:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> +; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[AB4]], i32 4 +; SSE-NEXT: [[R51:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 ; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R51]], i32 [[AB6]], i32 6 ; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 ; SSE-NEXT: ret <8 x i32> [[R7]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll index d9a7586ecd23d..d812cc813c20f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll @@ -282,23 +282,26 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) { define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE-LABEL: @ashr_lshr_shl_v8i32( -; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 6 +; SSE-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 4 +; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 ; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; SSE-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6 +; SSE-NEXT: [[B4:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 4 +; SSE-NEXT: [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5 +; SSE-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6 ; SSE-NEXT: [[B7:%.*]] = 
extractelement <8 x i32> [[B]], i32 7 ; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] ; SSE-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] ; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; SSE-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> -; SSE-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <2 x i32> -; SSE-NEXT: [[TMP7:%.*]] = lshr <2 x i32> [[TMP6]], [[TMP10]] +; SSE-NEXT: [[AB4:%.*]] = lshr i32 [[A4]], [[B4]] +; SSE-NEXT: [[AB5:%.*]] = lshr i32 [[A5]], [[B5]] ; SSE-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] ; SSE-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] ; SSE-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> -; SSE-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32> -; SSE-NEXT: [[R51:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> +; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[AB4]], i32 4 +; SSE-NEXT: [[R51:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 ; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R51]], i32 [[AB6]], i32 6 ; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 ; SSE-NEXT: ret <8 x i32> [[R7]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll index 7f7e77eadc987..57deca1d62516 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll @@ -607,25 +607,38 @@ define <8 x double> @buildvector_div_8f64(<8 x double> %a, <8 x double> %b) { ; SSE-NEXT: ret <8 x double> [[TMP1]] ; ; SLM-LABEL: @buildvector_div_8f64( -; SLM-NEXT: 
[[TMP1:%.*]] = shufflevector <8 x double> [[A:%.*]], <8 x double> poison, <2 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <2 x i32> -; SLM-NEXT: [[TMP3:%.*]] = fdiv <2 x double> [[TMP1]], [[TMP2]] -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> -; SLM-NEXT: [[TMP6:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP5]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> -; SLM-NEXT: [[TMP8:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> -; SLM-NEXT: [[TMP9:%.*]] = fdiv <2 x double> [[TMP7]], [[TMP8]] -; SLM-NEXT: [[TMP10:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> -; SLM-NEXT: [[TMP11:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> -; SLM-NEXT: [[TMP12:%.*]] = fdiv <2 x double> [[TMP10]], [[TMP11]] -; SLM-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <8 x i32> -; SLM-NEXT: [[TMP14:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <8 x i32> -; SLM-NEXT: [[R31:%.*]] = shufflevector <8 x double> [[TMP13]], <8 x double> [[TMP14]], <8 x i32> -; SLM-NEXT: [[TMP15:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> poison, <8 x i32> -; SLM-NEXT: [[R52:%.*]] = shufflevector <8 x double> [[R31]], <8 x double> [[TMP15]], <8 x i32> -; SLM-NEXT: [[TMP16:%.*]] = shufflevector <2 x double> [[TMP12]], <2 x double> poison, <8 x i32> -; SLM-NEXT: [[R73:%.*]] = shufflevector <8 x double> [[R52]], <8 x double> [[TMP16]], <8 x i32> +; SLM-NEXT: [[A0:%.*]] = extractelement <8 x double> [[A:%.*]], i32 0 +; SLM-NEXT: [[A1:%.*]] = extractelement <8 x double> [[A]], i32 1 +; SLM-NEXT: [[A2:%.*]] = extractelement <8 x double> [[A]], i32 2 +; SLM-NEXT: [[A3:%.*]] = extractelement <8 x double> [[A]], i32 3 +; SLM-NEXT: [[A4:%.*]] = extractelement 
<8 x double> [[A]], i32 4 +; SLM-NEXT: [[A5:%.*]] = extractelement <8 x double> [[A]], i32 5 +; SLM-NEXT: [[A6:%.*]] = extractelement <8 x double> [[A]], i32 6 +; SLM-NEXT: [[A7:%.*]] = extractelement <8 x double> [[A]], i32 7 +; SLM-NEXT: [[B0:%.*]] = extractelement <8 x double> [[B:%.*]], i32 0 +; SLM-NEXT: [[B1:%.*]] = extractelement <8 x double> [[B]], i32 1 +; SLM-NEXT: [[B2:%.*]] = extractelement <8 x double> [[B]], i32 2 +; SLM-NEXT: [[B3:%.*]] = extractelement <8 x double> [[B]], i32 3 +; SLM-NEXT: [[B4:%.*]] = extractelement <8 x double> [[B]], i32 4 +; SLM-NEXT: [[B5:%.*]] = extractelement <8 x double> [[B]], i32 5 +; SLM-NEXT: [[B6:%.*]] = extractelement <8 x double> [[B]], i32 6 +; SLM-NEXT: [[B7:%.*]] = extractelement <8 x double> [[B]], i32 7 +; SLM-NEXT: [[C0:%.*]] = fdiv double [[A0]], [[B0]] +; SLM-NEXT: [[C1:%.*]] = fdiv double [[A1]], [[B1]] +; SLM-NEXT: [[C2:%.*]] = fdiv double [[A2]], [[B2]] +; SLM-NEXT: [[C3:%.*]] = fdiv double [[A3]], [[B3]] +; SLM-NEXT: [[C4:%.*]] = fdiv double [[A4]], [[B4]] +; SLM-NEXT: [[C5:%.*]] = fdiv double [[A5]], [[B5]] +; SLM-NEXT: [[C6:%.*]] = fdiv double [[A6]], [[B6]] +; SLM-NEXT: [[C7:%.*]] = fdiv double [[A7]], [[B7]] +; SLM-NEXT: [[R0:%.*]] = insertelement <8 x double> poison, double [[C0]], i32 0 +; SLM-NEXT: [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[C1]], i32 1 +; SLM-NEXT: [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[C2]], i32 2 +; SLM-NEXT: [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[C3]], i32 3 +; SLM-NEXT: [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[C4]], i32 4 +; SLM-NEXT: [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[C5]], i32 5 +; SLM-NEXT: [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[C6]], i32 6 +; SLM-NEXT: [[R73:%.*]] = insertelement <8 x double> [[R6]], double [[C7]], i32 7 ; SLM-NEXT: ret <8 x double> [[R73]] ; ; AVX-LABEL: @buildvector_div_8f64( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll index 8b8bc71c2ceda..d1a5c3bb032e0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll @@ -607,25 +607,38 @@ define <8 x double> @buildvector_div_8f64(<8 x double> %a, <8 x double> %b) { ; SSE-NEXT: ret <8 x double> [[TMP1]] ; ; SLM-LABEL: @buildvector_div_8f64( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[A:%.*]], <8 x double> poison, <2 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <2 x i32> -; SLM-NEXT: [[TMP3:%.*]] = fdiv <2 x double> [[TMP1]], [[TMP2]] -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> -; SLM-NEXT: [[TMP6:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP5]] -; SLM-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> -; SLM-NEXT: [[TMP8:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> -; SLM-NEXT: [[TMP9:%.*]] = fdiv <2 x double> [[TMP7]], [[TMP8]] -; SLM-NEXT: [[TMP10:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> -; SLM-NEXT: [[TMP11:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> -; SLM-NEXT: [[TMP12:%.*]] = fdiv <2 x double> [[TMP10]], [[TMP11]] -; SLM-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <8 x i32> -; SLM-NEXT: [[TMP14:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <8 x i32> -; SLM-NEXT: [[R31:%.*]] = shufflevector <8 x double> [[TMP13]], <8 x double> [[TMP14]], <8 x i32> -; SLM-NEXT: [[TMP15:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> poison, <8 x i32> -; SLM-NEXT: [[R52:%.*]] = shufflevector <8 x double> [[R31]], <8 x double> [[TMP15]], <8 x i32> -; SLM-NEXT: [[TMP16:%.*]] = shufflevector <2 x double> [[TMP12]], <2 x double> poison, <8 x i32> -; SLM-NEXT: 
[[R73:%.*]] = shufflevector <8 x double> [[R52]], <8 x double> [[TMP16]], <8 x i32> +; SLM-NEXT: [[A0:%.*]] = extractelement <8 x double> [[A:%.*]], i32 0 +; SLM-NEXT: [[A1:%.*]] = extractelement <8 x double> [[A]], i32 1 +; SLM-NEXT: [[A2:%.*]] = extractelement <8 x double> [[A]], i32 2 +; SLM-NEXT: [[A3:%.*]] = extractelement <8 x double> [[A]], i32 3 +; SLM-NEXT: [[A4:%.*]] = extractelement <8 x double> [[A]], i32 4 +; SLM-NEXT: [[A5:%.*]] = extractelement <8 x double> [[A]], i32 5 +; SLM-NEXT: [[A6:%.*]] = extractelement <8 x double> [[A]], i32 6 +; SLM-NEXT: [[A7:%.*]] = extractelement <8 x double> [[A]], i32 7 +; SLM-NEXT: [[B0:%.*]] = extractelement <8 x double> [[B:%.*]], i32 0 +; SLM-NEXT: [[B1:%.*]] = extractelement <8 x double> [[B]], i32 1 +; SLM-NEXT: [[B2:%.*]] = extractelement <8 x double> [[B]], i32 2 +; SLM-NEXT: [[B3:%.*]] = extractelement <8 x double> [[B]], i32 3 +; SLM-NEXT: [[B4:%.*]] = extractelement <8 x double> [[B]], i32 4 +; SLM-NEXT: [[B5:%.*]] = extractelement <8 x double> [[B]], i32 5 +; SLM-NEXT: [[B6:%.*]] = extractelement <8 x double> [[B]], i32 6 +; SLM-NEXT: [[B7:%.*]] = extractelement <8 x double> [[B]], i32 7 +; SLM-NEXT: [[C0:%.*]] = fdiv double [[A0]], [[B0]] +; SLM-NEXT: [[C1:%.*]] = fdiv double [[A1]], [[B1]] +; SLM-NEXT: [[C2:%.*]] = fdiv double [[A2]], [[B2]] +; SLM-NEXT: [[C3:%.*]] = fdiv double [[A3]], [[B3]] +; SLM-NEXT: [[C4:%.*]] = fdiv double [[A4]], [[B4]] +; SLM-NEXT: [[C5:%.*]] = fdiv double [[A5]], [[B5]] +; SLM-NEXT: [[C6:%.*]] = fdiv double [[A6]], [[B6]] +; SLM-NEXT: [[C7:%.*]] = fdiv double [[A7]], [[B7]] +; SLM-NEXT: [[R0:%.*]] = insertelement <8 x double> undef, double [[C0]], i32 0 +; SLM-NEXT: [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[C1]], i32 1 +; SLM-NEXT: [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[C2]], i32 2 +; SLM-NEXT: [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[C3]], i32 3 +; SLM-NEXT: [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[C4]], i32 4 +; 
SLM-NEXT: [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[C5]], i32 5 +; SLM-NEXT: [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[C6]], i32 6 +; SLM-NEXT: [[R73:%.*]] = insertelement <8 x double> [[R6]], double [[C7]], i32 7 ; SLM-NEXT: ret <8 x double> [[R73]] ; ; AVX-LABEL: @buildvector_div_8f64( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-shuffle.ll index f8522bc546e6b..3c2472c2ab58d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-shuffle.ll @@ -4,9 +4,15 @@ define void @b() { ; CHECK-LABEL: @b( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> poison, float 0x7FF8000000000000, i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> , <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float 0x7FF8000000000000, i32 3 +; CHECK-NEXT: [[MUL:%.*]] = fmul float undef, 2.000000e+00 +; CHECK-NEXT: [[ADD:%.*]] = fadd float undef, 1.000000e+00 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x float> poison, float [[ADD]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> [[TMP0]], float [[MUL]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = fneg <2 x float> [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> poison, float [[MUL]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP9]], float [[ADD]], i32 3 ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP2]], <4 x float> zeroinitializer, <4 x float> zeroinitializer) ; CHECK-NEXT: [[TMP4:%.*]] = fmul <4 x float> [[TMP3]], ; CHECK-NEXT: [[TMP5:%.*]] = fdiv <4 x float> [[TMP4]], zeroinitializer diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/buildvectors-parent-phi-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvectors-parent-phi-nodes.ll index e3c134b068e04..16f31e3655de1 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/buildvectors-parent-phi-nodes.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvectors-parent-phi-nodes.ll @@ -5,21 +5,21 @@ define void @test(ptr %0, float %1) { ; CHECK-LABEL: define void @test( ; CHECK-SAME: ptr [[TMP0:%.*]], float [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP0]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> , float [[TMP3]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> , float [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: br label %[[BB8:.*]] -; CHECK: [[BB8]]: -; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x float> [ [[TMP15:%.*]], %[[BB8]] ], [ [[TMP5]], [[TMP2:%.*]] ] -; CHECK-NEXT: [[TMP10:%.*]] = phi <2 x float> [ [[TMP7]], %[[BB8]] ], [ [[TMP4]], [[TMP2]] ] -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: br label %[[BB5:.*]] +; CHECK: [[BB5]]: +; CHECK-NEXT: [[TMP6:%.*]] = phi float [ [[TMP1]], %[[BB5]] ], [ [[TMP3]], [[TMP2:%.*]] ] +; CHECK-NEXT: [[TMP7:%.*]] = phi float [ [[TMP1]], %[[BB5]] ], [ 0.000000e+00, [[TMP2]] ] +; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x float> [ [[TMP15:%.*]], %[[BB5]] ], [ [[TMP5]], [[TMP2]] ] ; CHECK-NEXT: [[TMP12:%.*]] = fmul <4 x float> [[TMP9]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = fadd <4 x float> [[TMP12]], zeroinitializer ; CHECK-NEXT: store <4 x float> [[TMP13]], ptr [[TMP0]], align 16 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP6]], i32 
1 +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP17]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = fmul <4 x float> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP15]] = fadd <4 x float> [[TMP14]], zeroinitializer -; CHECK-NEXT: br label %[[BB8]] +; CHECK-NEXT: br label %[[BB5]] ; %3 = load float, ptr %0, align 4 br label %4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll b/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll index f1b094e9bbed4..d23e54f3495bd 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefix=SSE2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX2 %struct.ray = type { %struct.vec3, %struct.vec3 } %struct.vec3 = type { double, double, double } @@ -9,86 +9,246 @@ %struct.material = type { %struct.vec3, double, double } define i32 @ray_sphere(ptr nocapture noundef readonly %sph, ptr nocapture noundef readonly byval(%struct.ray) align 8 %ray, ptr nocapture noundef readnone %sp) { -; CHECK-LABEL: @ray_sphere( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[DIR:%.*]] = getelementptr inbounds [[STRUCT_RAY:%.*]], ptr [[RAY:%.*]], i64 0, i32 1 -; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[DIR]], align 8 -; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_RAY]], ptr [[RAY]], i64 0, i32 1, i32 1 -; CHECK-NEXT: 
[[TMP1:%.*]] = load double, ptr [[Y]], align 8 -; CHECK-NEXT: [[MUL6:%.*]] = fmul double [[TMP1]], [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP0]], double [[TMP0]], double [[MUL6]]) -; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds [[STRUCT_RAY]], ptr [[RAY]], i64 0, i32 1, i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[Z]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP3]], double [[TMP3]], double [[TMP2]]) -; CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP0]], 2.000000e+00 -; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[RAY]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr [[SPH:%.*]], align 8 -; CHECK-NEXT: [[SUB:%.*]] = fsub double [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[MUL17:%.*]] = fmul double [[TMP1]], 2.000000e+00 -; CHECK-NEXT: [[Y19:%.*]] = getelementptr inbounds [[STRUCT_VEC3:%.*]], ptr [[RAY]], i64 0, i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = load double, ptr [[Y19]], align 8 -; CHECK-NEXT: [[Y21:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[SPH]], i64 0, i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = load double, ptr [[Y21]], align 8 -; CHECK-NEXT: [[SUB22:%.*]] = fsub double [[TMP7]], [[TMP8]] -; CHECK-NEXT: [[MUL23:%.*]] = fmul double [[MUL17]], [[SUB22]] -; CHECK-NEXT: [[TMP9:%.*]] = tail call double @llvm.fmuladd.f64(double [[MUL]], double [[SUB]], double [[MUL23]]) -; CHECK-NEXT: [[MUL26:%.*]] = fmul double [[TMP3]], 2.000000e+00 -; CHECK-NEXT: [[Z28:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[RAY]], i64 0, i32 2 -; CHECK-NEXT: [[TMP10:%.*]] = load double, ptr [[Z28]], align 8 -; CHECK-NEXT: [[Z30:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[SPH]], i64 0, i32 2 -; CHECK-NEXT: [[TMP11:%.*]] = load double, ptr [[Z30]], align 8 -; CHECK-NEXT: [[SUB31:%.*]] = fsub double [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP12:%.*]] = tail call double @llvm.fmuladd.f64(double [[MUL26]], double [[SUB31]], double [[TMP9]]) -; CHECK-NEXT: [[MUL42:%.*]] = fmul double 
[[TMP8]], [[TMP8]] -; CHECK-NEXT: [[TMP13:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP6]], double [[TMP6]], double [[MUL42]]) -; CHECK-NEXT: [[TMP14:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP11]], double [[TMP11]], double [[TMP13]]) -; CHECK-NEXT: [[TMP15:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP5]], double [[TMP5]], double [[TMP14]]) -; CHECK-NEXT: [[TMP16:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP7]], double [[TMP7]], double [[TMP15]]) -; CHECK-NEXT: [[TMP17:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP10]], double [[TMP10]], double [[TMP16]]) -; CHECK-NEXT: [[FNEG:%.*]] = fneg double [[TMP6]] -; CHECK-NEXT: [[TMP18:%.*]] = fneg double [[TMP8]] -; CHECK-NEXT: [[NEG:%.*]] = fmul double [[TMP7]], [[TMP18]] -; CHECK-NEXT: [[TMP19:%.*]] = tail call double @llvm.fmuladd.f64(double [[FNEG]], double [[TMP5]], double [[NEG]]) -; CHECK-NEXT: [[NEG78:%.*]] = fneg double [[TMP11]] -; CHECK-NEXT: [[TMP20:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG78]], double [[TMP10]], double [[TMP19]]) -; CHECK-NEXT: [[TMP21:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP20]], double 2.000000e+00, double [[TMP17]]) -; CHECK-NEXT: [[RAD:%.*]] = getelementptr inbounds [[STRUCT_SPHERE:%.*]], ptr [[SPH]], i64 0, i32 1 -; CHECK-NEXT: [[TMP22:%.*]] = load double, ptr [[RAD]], align 8 -; CHECK-NEXT: [[NEG82:%.*]] = fneg double [[TMP22]] -; CHECK-NEXT: [[TMP23:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG82]], double [[TMP22]], double [[TMP21]]) -; CHECK-NEXT: [[TMP24:%.*]] = fmul double [[TMP4]], -4.000000e+00 -; CHECK-NEXT: [[NEG86:%.*]] = fmul double [[TMP24]], [[TMP23]] -; CHECK-NEXT: [[TMP25:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP12]], double [[TMP12]], double [[NEG86]]) -; CHECK-NEXT: [[CMP:%.*]] = fcmp olt double [[TMP25]], 0.000000e+00 -; CHECK-NEXT: br i1 [[CMP]], label [[CLEANUP:%.*]], label [[IF_END:%.*]] -; CHECK: if.end: -; CHECK-NEXT: [[CALL:%.*]] = tail call double @sqrt(double 
noundef [[TMP25]]) -; CHECK-NEXT: [[FNEG87:%.*]] = fneg double [[TMP12]] -; CHECK-NEXT: [[MUL88:%.*]] = fmul double [[TMP4]], 2.000000e+00 -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> poison, double [[FNEG87]], i32 0 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x double> [[TMP26]], double [[CALL]], i32 1 -; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <2 x double> [[TMP27]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP29:%.*]] = insertelement <2 x double> [[TMP28]], double [[TMP12]], i32 1 -; CHECK-NEXT: [[TMP30:%.*]] = fsub <2 x double> [[TMP27]], [[TMP29]] -; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x double> poison, double [[MUL88]], i32 0 -; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <2 x double> [[TMP31]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP33:%.*]] = fdiv <2 x double> [[TMP30]], [[TMP32]] -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x double> [[TMP33]], i32 1 -; CHECK-NEXT: [[CMP93:%.*]] = fcmp olt double [[TMP34]], 0x3EB0C6F7A0B5ED8D -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <2 x double> [[TMP33]], i32 0 -; CHECK-NEXT: [[CMP94:%.*]] = fcmp olt double [[TMP35]], 0x3EB0C6F7A0B5ED8D -; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[CMP93]], i1 [[CMP94]], i1 false -; CHECK-NEXT: br i1 [[OR_COND]], label [[CLEANUP]], label [[LOR_LHS_FALSE:%.*]] -; CHECK: lor.lhs.false: -; CHECK-NEXT: [[TMP36:%.*]] = fcmp ule <2 x double> [[TMP33]], splat (double 1.000000e+00) -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <2 x i1> [[TMP36]], i32 0 -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <2 x i1> [[TMP36]], i32 1 -; CHECK-NEXT: [[OR_COND106:%.*]] = select i1 [[TMP38]], i1 true, i1 [[TMP37]] -; CHECK-NEXT: [[SPEC_SELECT:%.*]] = zext i1 [[OR_COND106]] to i32 -; CHECK-NEXT: br label [[CLEANUP]] -; CHECK: cleanup: -; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ 0, [[IF_END]] ], [ [[SPEC_SELECT]], [[LOR_LHS_FALSE]] ] -; CHECK-NEXT: ret i32 [[RETVAL_0]] +; SSE2-LABEL: @ray_sphere( +; SSE2-NEXT: entry: +; 
SSE2-NEXT: [[DIR:%.*]] = getelementptr inbounds [[STRUCT_RAY:%.*]], ptr [[RAY:%.*]], i64 0, i32 1 +; SSE2-NEXT: [[TMP0:%.*]] = load double, ptr [[DIR]], align 8 +; SSE2-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_RAY]], ptr [[RAY]], i64 0, i32 1, i32 1 +; SSE2-NEXT: [[TMP1:%.*]] = load double, ptr [[Y]], align 8 +; SSE2-NEXT: [[MUL6:%.*]] = fmul double [[TMP1]], [[TMP1]] +; SSE2-NEXT: [[TMP2:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP0]], double [[TMP0]], double [[MUL6]]) +; SSE2-NEXT: [[Z:%.*]] = getelementptr inbounds [[STRUCT_RAY]], ptr [[RAY]], i64 0, i32 1, i32 2 +; SSE2-NEXT: [[TMP3:%.*]] = load double, ptr [[Z]], align 8 +; SSE2-NEXT: [[TMP4:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP3]], double [[TMP3]], double [[TMP2]]) +; SSE2-NEXT: [[MUL:%.*]] = fmul double [[TMP0]], 2.000000e+00 +; SSE2-NEXT: [[TMP5:%.*]] = load double, ptr [[RAY]], align 8 +; SSE2-NEXT: [[TMP6:%.*]] = load double, ptr [[SPH:%.*]], align 8 +; SSE2-NEXT: [[SUB:%.*]] = fsub double [[TMP5]], [[TMP6]] +; SSE2-NEXT: [[MUL17:%.*]] = fmul double [[TMP1]], 2.000000e+00 +; SSE2-NEXT: [[Y19:%.*]] = getelementptr inbounds [[STRUCT_VEC3:%.*]], ptr [[RAY]], i64 0, i32 1 +; SSE2-NEXT: [[TMP7:%.*]] = load double, ptr [[Y19]], align 8 +; SSE2-NEXT: [[Y21:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[SPH]], i64 0, i32 1 +; SSE2-NEXT: [[TMP8:%.*]] = load double, ptr [[Y21]], align 8 +; SSE2-NEXT: [[SUB22:%.*]] = fsub double [[TMP7]], [[TMP8]] +; SSE2-NEXT: [[MUL23:%.*]] = fmul double [[MUL17]], [[SUB22]] +; SSE2-NEXT: [[TMP9:%.*]] = tail call double @llvm.fmuladd.f64(double [[MUL]], double [[SUB]], double [[MUL23]]) +; SSE2-NEXT: [[MUL26:%.*]] = fmul double [[TMP3]], 2.000000e+00 +; SSE2-NEXT: [[Z28:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[RAY]], i64 0, i32 2 +; SSE2-NEXT: [[TMP10:%.*]] = load double, ptr [[Z28]], align 8 +; SSE2-NEXT: [[Z30:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[SPH]], i64 0, i32 2 +; SSE2-NEXT: [[TMP11:%.*]] = load 
double, ptr [[Z30]], align 8 +; SSE2-NEXT: [[SUB31:%.*]] = fsub double [[TMP10]], [[TMP11]] +; SSE2-NEXT: [[TMP12:%.*]] = tail call double @llvm.fmuladd.f64(double [[MUL26]], double [[SUB31]], double [[TMP9]]) +; SSE2-NEXT: [[MUL42:%.*]] = fmul double [[TMP8]], [[TMP8]] +; SSE2-NEXT: [[TMP13:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP6]], double [[TMP6]], double [[MUL42]]) +; SSE2-NEXT: [[TMP14:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP11]], double [[TMP11]], double [[TMP13]]) +; SSE2-NEXT: [[TMP15:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP5]], double [[TMP5]], double [[TMP14]]) +; SSE2-NEXT: [[TMP16:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP7]], double [[TMP7]], double [[TMP15]]) +; SSE2-NEXT: [[TMP17:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP10]], double [[TMP10]], double [[TMP16]]) +; SSE2-NEXT: [[FNEG:%.*]] = fneg double [[TMP6]] +; SSE2-NEXT: [[TMP18:%.*]] = fneg double [[TMP8]] +; SSE2-NEXT: [[NEG:%.*]] = fmul double [[TMP7]], [[TMP18]] +; SSE2-NEXT: [[TMP19:%.*]] = tail call double @llvm.fmuladd.f64(double [[FNEG]], double [[TMP5]], double [[NEG]]) +; SSE2-NEXT: [[NEG78:%.*]] = fneg double [[TMP11]] +; SSE2-NEXT: [[TMP20:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG78]], double [[TMP10]], double [[TMP19]]) +; SSE2-NEXT: [[TMP21:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP20]], double 2.000000e+00, double [[TMP17]]) +; SSE2-NEXT: [[RAD:%.*]] = getelementptr inbounds [[STRUCT_SPHERE:%.*]], ptr [[SPH]], i64 0, i32 1 +; SSE2-NEXT: [[TMP22:%.*]] = load double, ptr [[RAD]], align 8 +; SSE2-NEXT: [[NEG82:%.*]] = fneg double [[TMP22]] +; SSE2-NEXT: [[TMP23:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG82]], double [[TMP22]], double [[TMP21]]) +; SSE2-NEXT: [[TMP24:%.*]] = fmul double [[TMP4]], -4.000000e+00 +; SSE2-NEXT: [[NEG86:%.*]] = fmul double [[TMP24]], [[TMP23]] +; SSE2-NEXT: [[TMP25:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP12]], double [[TMP12]], 
double [[NEG86]]) +; SSE2-NEXT: [[CMP:%.*]] = fcmp olt double [[TMP25]], 0.000000e+00 +; SSE2-NEXT: br i1 [[CMP]], label [[CLEANUP:%.*]], label [[IF_END:%.*]] +; SSE2: if.end: +; SSE2-NEXT: [[CALL:%.*]] = tail call double @sqrt(double noundef [[TMP25]]) +; SSE2-NEXT: [[FNEG87:%.*]] = fneg double [[TMP12]] +; SSE2-NEXT: [[MUL88:%.*]] = fmul double [[TMP4]], 2.000000e+00 +; SSE2-NEXT: [[TMP26:%.*]] = insertelement <2 x double> poison, double [[FNEG87]], i32 0 +; SSE2-NEXT: [[TMP27:%.*]] = insertelement <2 x double> [[TMP26]], double [[CALL]], i32 1 +; SSE2-NEXT: [[TMP28:%.*]] = shufflevector <2 x double> [[TMP27]], <2 x double> poison, <2 x i32> +; SSE2-NEXT: [[TMP29:%.*]] = insertelement <2 x double> [[TMP28]], double [[TMP12]], i32 1 +; SSE2-NEXT: [[TMP30:%.*]] = fsub <2 x double> [[TMP27]], [[TMP29]] +; SSE2-NEXT: [[TMP31:%.*]] = insertelement <2 x double> poison, double [[MUL88]], i32 0 +; SSE2-NEXT: [[TMP32:%.*]] = shufflevector <2 x double> [[TMP31]], <2 x double> poison, <2 x i32> zeroinitializer +; SSE2-NEXT: [[TMP33:%.*]] = fdiv <2 x double> [[TMP30]], [[TMP32]] +; SSE2-NEXT: [[TMP34:%.*]] = extractelement <2 x double> [[TMP33]], i32 1 +; SSE2-NEXT: [[CMP93:%.*]] = fcmp olt double [[TMP34]], 0x3EB0C6F7A0B5ED8D +; SSE2-NEXT: [[TMP35:%.*]] = extractelement <2 x double> [[TMP33]], i32 0 +; SSE2-NEXT: [[CMP94:%.*]] = fcmp olt double [[TMP35]], 0x3EB0C6F7A0B5ED8D +; SSE2-NEXT: [[OR_COND:%.*]] = select i1 [[CMP93]], i1 [[CMP94]], i1 false +; SSE2-NEXT: br i1 [[OR_COND]], label [[CLEANUP]], label [[LOR_LHS_FALSE:%.*]] +; SSE2: lor.lhs.false: +; SSE2-NEXT: [[TMP36:%.*]] = fcmp ule <2 x double> [[TMP33]], splat (double 1.000000e+00) +; SSE2-NEXT: [[TMP37:%.*]] = extractelement <2 x i1> [[TMP36]], i32 0 +; SSE2-NEXT: [[TMP38:%.*]] = extractelement <2 x i1> [[TMP36]], i32 1 +; SSE2-NEXT: [[OR_COND106:%.*]] = select i1 [[TMP38]], i1 true, i1 [[TMP37]] +; SSE2-NEXT: [[SPEC_SELECT:%.*]] = zext i1 [[OR_COND106]] to i32 +; SSE2-NEXT: br label [[CLEANUP]] +; SSE2: cleanup: 
+; SSE2-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ 0, [[IF_END]] ], [ [[SPEC_SELECT]], [[LOR_LHS_FALSE]] ] +; SSE2-NEXT: ret i32 [[RETVAL_0]] +; +; AVX-LABEL: @ray_sphere( +; AVX-NEXT: entry: +; AVX-NEXT: [[DIR:%.*]] = getelementptr inbounds [[STRUCT_RAY:%.*]], ptr [[RAY:%.*]], i64 0, i32 1 +; AVX-NEXT: [[TMP0:%.*]] = load double, ptr [[DIR]], align 8 +; AVX-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_RAY]], ptr [[RAY]], i64 0, i32 1, i32 1 +; AVX-NEXT: [[TMP1:%.*]] = load double, ptr [[Y]], align 8 +; AVX-NEXT: [[MUL6:%.*]] = fmul double [[TMP1]], [[TMP1]] +; AVX-NEXT: [[TMP2:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP0]], double [[TMP0]], double [[MUL6]]) +; AVX-NEXT: [[Z:%.*]] = getelementptr inbounds [[STRUCT_RAY]], ptr [[RAY]], i64 0, i32 1, i32 2 +; AVX-NEXT: [[TMP3:%.*]] = load double, ptr [[Z]], align 8 +; AVX-NEXT: [[TMP4:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP3]], double [[TMP3]], double [[TMP2]]) +; AVX-NEXT: [[MUL:%.*]] = fmul double [[TMP0]], 2.000000e+00 +; AVX-NEXT: [[TMP5:%.*]] = load double, ptr [[RAY]], align 8 +; AVX-NEXT: [[TMP6:%.*]] = load double, ptr [[SPH:%.*]], align 8 +; AVX-NEXT: [[SUB:%.*]] = fsub double [[TMP5]], [[TMP6]] +; AVX-NEXT: [[MUL17:%.*]] = fmul double [[TMP1]], 2.000000e+00 +; AVX-NEXT: [[Y19:%.*]] = getelementptr inbounds [[STRUCT_VEC3:%.*]], ptr [[RAY]], i64 0, i32 1 +; AVX-NEXT: [[TMP7:%.*]] = load double, ptr [[Y19]], align 8 +; AVX-NEXT: [[Y21:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[SPH]], i64 0, i32 1 +; AVX-NEXT: [[TMP8:%.*]] = load double, ptr [[Y21]], align 8 +; AVX-NEXT: [[SUB22:%.*]] = fsub double [[TMP7]], [[TMP8]] +; AVX-NEXT: [[MUL23:%.*]] = fmul double [[MUL17]], [[SUB22]] +; AVX-NEXT: [[TMP9:%.*]] = tail call double @llvm.fmuladd.f64(double [[MUL]], double [[SUB]], double [[MUL23]]) +; AVX-NEXT: [[MUL26:%.*]] = fmul double [[TMP3]], 2.000000e+00 +; AVX-NEXT: [[Z28:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[RAY]], i64 0, i32 2 +; AVX-NEXT: 
[[TMP10:%.*]] = load double, ptr [[Z28]], align 8 +; AVX-NEXT: [[Z30:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[SPH]], i64 0, i32 2 +; AVX-NEXT: [[TMP11:%.*]] = load double, ptr [[Z30]], align 8 +; AVX-NEXT: [[SUB31:%.*]] = fsub double [[TMP10]], [[TMP11]] +; AVX-NEXT: [[TMP12:%.*]] = tail call double @llvm.fmuladd.f64(double [[MUL26]], double [[SUB31]], double [[TMP9]]) +; AVX-NEXT: [[MUL42:%.*]] = fmul double [[TMP8]], [[TMP8]] +; AVX-NEXT: [[TMP13:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP6]], double [[TMP6]], double [[MUL42]]) +; AVX-NEXT: [[TMP14:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP11]], double [[TMP11]], double [[TMP13]]) +; AVX-NEXT: [[TMP15:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP5]], double [[TMP5]], double [[TMP14]]) +; AVX-NEXT: [[TMP16:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP7]], double [[TMP7]], double [[TMP15]]) +; AVX-NEXT: [[TMP17:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP10]], double [[TMP10]], double [[TMP16]]) +; AVX-NEXT: [[FNEG:%.*]] = fneg double [[TMP6]] +; AVX-NEXT: [[TMP18:%.*]] = fneg double [[TMP8]] +; AVX-NEXT: [[NEG:%.*]] = fmul double [[TMP7]], [[TMP18]] +; AVX-NEXT: [[TMP19:%.*]] = tail call double @llvm.fmuladd.f64(double [[FNEG]], double [[TMP5]], double [[NEG]]) +; AVX-NEXT: [[NEG78:%.*]] = fneg double [[TMP11]] +; AVX-NEXT: [[TMP20:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG78]], double [[TMP10]], double [[TMP19]]) +; AVX-NEXT: [[TMP21:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP20]], double 2.000000e+00, double [[TMP17]]) +; AVX-NEXT: [[RAD:%.*]] = getelementptr inbounds [[STRUCT_SPHERE:%.*]], ptr [[SPH]], i64 0, i32 1 +; AVX-NEXT: [[TMP22:%.*]] = load double, ptr [[RAD]], align 8 +; AVX-NEXT: [[NEG82:%.*]] = fneg double [[TMP22]] +; AVX-NEXT: [[TMP23:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG82]], double [[TMP22]], double [[TMP21]]) +; AVX-NEXT: [[TMP24:%.*]] = fmul double [[TMP4]], -4.000000e+00 +; AVX-NEXT: 
[[NEG86:%.*]] = fmul double [[TMP24]], [[TMP23]] +; AVX-NEXT: [[TMP25:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP12]], double [[TMP12]], double [[NEG86]]) +; AVX-NEXT: [[CMP:%.*]] = fcmp olt double [[TMP25]], 0.000000e+00 +; AVX-NEXT: br i1 [[CMP]], label [[CLEANUP:%.*]], label [[IF_END:%.*]] +; AVX: if.end: +; AVX-NEXT: [[CALL:%.*]] = tail call double @sqrt(double noundef [[TMP25]]) +; AVX-NEXT: [[FNEG87:%.*]] = fneg double [[TMP12]] +; AVX-NEXT: [[MUL88:%.*]] = fmul double [[TMP4]], 2.000000e+00 +; AVX-NEXT: [[ADD:%.*]] = fsub double [[CALL]], [[TMP12]] +; AVX-NEXT: [[SUB90:%.*]] = fsub double [[FNEG87]], [[CALL]] +; AVX-NEXT: [[TMP26:%.*]] = insertelement <2 x double> poison, double [[SUB90]], i32 0 +; AVX-NEXT: [[TMP27:%.*]] = insertelement <2 x double> [[TMP26]], double [[ADD]], i32 1 +; AVX-NEXT: [[TMP28:%.*]] = insertelement <2 x double> poison, double [[MUL88]], i32 0 +; AVX-NEXT: [[TMP29:%.*]] = shufflevector <2 x double> [[TMP28]], <2 x double> poison, <2 x i32> zeroinitializer +; AVX-NEXT: [[TMP30:%.*]] = fdiv <2 x double> [[TMP27]], [[TMP29]] +; AVX-NEXT: [[TMP31:%.*]] = extractelement <2 x double> [[TMP30]], i32 1 +; AVX-NEXT: [[CMP93:%.*]] = fcmp olt double [[TMP31]], 0x3EB0C6F7A0B5ED8D +; AVX-NEXT: [[TMP32:%.*]] = extractelement <2 x double> [[TMP30]], i32 0 +; AVX-NEXT: [[CMP94:%.*]] = fcmp olt double [[TMP32]], 0x3EB0C6F7A0B5ED8D +; AVX-NEXT: [[OR_COND:%.*]] = select i1 [[CMP93]], i1 [[CMP94]], i1 false +; AVX-NEXT: br i1 [[OR_COND]], label [[CLEANUP]], label [[LOR_LHS_FALSE:%.*]] +; AVX: lor.lhs.false: +; AVX-NEXT: [[TMP33:%.*]] = fcmp ule <2 x double> [[TMP30]], splat (double 1.000000e+00) +; AVX-NEXT: [[TMP34:%.*]] = extractelement <2 x i1> [[TMP33]], i32 0 +; AVX-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP33]], i32 1 +; AVX-NEXT: [[OR_COND106:%.*]] = select i1 [[TMP35]], i1 true, i1 [[TMP34]] +; AVX-NEXT: [[SPEC_SELECT:%.*]] = zext i1 [[OR_COND106]] to i32 +; AVX-NEXT: br label [[CLEANUP]] +; AVX: cleanup: +; AVX-NEXT: 
[[RETVAL_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ 0, [[IF_END]] ], [ [[SPEC_SELECT]], [[LOR_LHS_FALSE]] ] +; AVX-NEXT: ret i32 [[RETVAL_0]] +; +; AVX2-LABEL: @ray_sphere( +; AVX2-NEXT: entry: +; AVX2-NEXT: [[DIR:%.*]] = getelementptr inbounds [[STRUCT_RAY:%.*]], ptr [[RAY:%.*]], i64 0, i32 1 +; AVX2-NEXT: [[TMP0:%.*]] = load double, ptr [[DIR]], align 8 +; AVX2-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_RAY]], ptr [[RAY]], i64 0, i32 1, i32 1 +; AVX2-NEXT: [[TMP1:%.*]] = load double, ptr [[Y]], align 8 +; AVX2-NEXT: [[MUL6:%.*]] = fmul double [[TMP1]], [[TMP1]] +; AVX2-NEXT: [[TMP2:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP0]], double [[TMP0]], double [[MUL6]]) +; AVX2-NEXT: [[Z:%.*]] = getelementptr inbounds [[STRUCT_RAY]], ptr [[RAY]], i64 0, i32 1, i32 2 +; AVX2-NEXT: [[TMP3:%.*]] = load double, ptr [[Z]], align 8 +; AVX2-NEXT: [[TMP4:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP3]], double [[TMP3]], double [[TMP2]]) +; AVX2-NEXT: [[MUL:%.*]] = fmul double [[TMP0]], 2.000000e+00 +; AVX2-NEXT: [[TMP5:%.*]] = load double, ptr [[RAY]], align 8 +; AVX2-NEXT: [[TMP6:%.*]] = load double, ptr [[SPH:%.*]], align 8 +; AVX2-NEXT: [[SUB:%.*]] = fsub double [[TMP5]], [[TMP6]] +; AVX2-NEXT: [[MUL17:%.*]] = fmul double [[TMP1]], 2.000000e+00 +; AVX2-NEXT: [[Y19:%.*]] = getelementptr inbounds [[STRUCT_VEC3:%.*]], ptr [[RAY]], i64 0, i32 1 +; AVX2-NEXT: [[TMP7:%.*]] = load double, ptr [[Y19]], align 8 +; AVX2-NEXT: [[Y21:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[SPH]], i64 0, i32 1 +; AVX2-NEXT: [[TMP8:%.*]] = load double, ptr [[Y21]], align 8 +; AVX2-NEXT: [[SUB22:%.*]] = fsub double [[TMP7]], [[TMP8]] +; AVX2-NEXT: [[MUL23:%.*]] = fmul double [[MUL17]], [[SUB22]] +; AVX2-NEXT: [[TMP9:%.*]] = tail call double @llvm.fmuladd.f64(double [[MUL]], double [[SUB]], double [[MUL23]]) +; AVX2-NEXT: [[MUL26:%.*]] = fmul double [[TMP3]], 2.000000e+00 +; AVX2-NEXT: [[Z28:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[RAY]], i64 0, i32 2 +; 
AVX2-NEXT: [[TMP10:%.*]] = load double, ptr [[Z28]], align 8 +; AVX2-NEXT: [[Z30:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[SPH]], i64 0, i32 2 +; AVX2-NEXT: [[TMP11:%.*]] = load double, ptr [[Z30]], align 8 +; AVX2-NEXT: [[SUB31:%.*]] = fsub double [[TMP10]], [[TMP11]] +; AVX2-NEXT: [[TMP12:%.*]] = tail call double @llvm.fmuladd.f64(double [[MUL26]], double [[SUB31]], double [[TMP9]]) +; AVX2-NEXT: [[MUL42:%.*]] = fmul double [[TMP8]], [[TMP8]] +; AVX2-NEXT: [[TMP13:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP6]], double [[TMP6]], double [[MUL42]]) +; AVX2-NEXT: [[TMP14:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP11]], double [[TMP11]], double [[TMP13]]) +; AVX2-NEXT: [[TMP15:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP5]], double [[TMP5]], double [[TMP14]]) +; AVX2-NEXT: [[TMP16:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP7]], double [[TMP7]], double [[TMP15]]) +; AVX2-NEXT: [[TMP17:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP10]], double [[TMP10]], double [[TMP16]]) +; AVX2-NEXT: [[FNEG:%.*]] = fneg double [[TMP6]] +; AVX2-NEXT: [[TMP18:%.*]] = fneg double [[TMP8]] +; AVX2-NEXT: [[NEG:%.*]] = fmul double [[TMP7]], [[TMP18]] +; AVX2-NEXT: [[TMP19:%.*]] = tail call double @llvm.fmuladd.f64(double [[FNEG]], double [[TMP5]], double [[NEG]]) +; AVX2-NEXT: [[NEG78:%.*]] = fneg double [[TMP11]] +; AVX2-NEXT: [[TMP20:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG78]], double [[TMP10]], double [[TMP19]]) +; AVX2-NEXT: [[TMP21:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP20]], double 2.000000e+00, double [[TMP17]]) +; AVX2-NEXT: [[RAD:%.*]] = getelementptr inbounds [[STRUCT_SPHERE:%.*]], ptr [[SPH]], i64 0, i32 1 +; AVX2-NEXT: [[TMP22:%.*]] = load double, ptr [[RAD]], align 8 +; AVX2-NEXT: [[NEG82:%.*]] = fneg double [[TMP22]] +; AVX2-NEXT: [[TMP23:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG82]], double [[TMP22]], double [[TMP21]]) +; AVX2-NEXT: [[TMP24:%.*]] = fmul double 
[[TMP4]], -4.000000e+00 +; AVX2-NEXT: [[NEG86:%.*]] = fmul double [[TMP24]], [[TMP23]] +; AVX2-NEXT: [[TMP25:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP12]], double [[TMP12]], double [[NEG86]]) +; AVX2-NEXT: [[CMP:%.*]] = fcmp olt double [[TMP25]], 0.000000e+00 +; AVX2-NEXT: br i1 [[CMP]], label [[CLEANUP:%.*]], label [[IF_END:%.*]] +; AVX2: if.end: +; AVX2-NEXT: [[CALL:%.*]] = tail call double @sqrt(double noundef [[TMP25]]) +; AVX2-NEXT: [[FNEG87:%.*]] = fneg double [[TMP12]] +; AVX2-NEXT: [[MUL88:%.*]] = fmul double [[TMP4]], 2.000000e+00 +; AVX2-NEXT: [[ADD:%.*]] = fsub double [[CALL]], [[TMP12]] +; AVX2-NEXT: [[SUB90:%.*]] = fsub double [[FNEG87]], [[CALL]] +; AVX2-NEXT: [[TMP26:%.*]] = insertelement <2 x double> poison, double [[SUB90]], i32 0 +; AVX2-NEXT: [[TMP27:%.*]] = insertelement <2 x double> [[TMP26]], double [[ADD]], i32 1 +; AVX2-NEXT: [[TMP28:%.*]] = insertelement <2 x double> poison, double [[MUL88]], i32 0 +; AVX2-NEXT: [[TMP29:%.*]] = shufflevector <2 x double> [[TMP28]], <2 x double> poison, <2 x i32> zeroinitializer +; AVX2-NEXT: [[TMP30:%.*]] = fdiv <2 x double> [[TMP27]], [[TMP29]] +; AVX2-NEXT: [[TMP31:%.*]] = extractelement <2 x double> [[TMP30]], i32 1 +; AVX2-NEXT: [[CMP93:%.*]] = fcmp olt double [[TMP31]], 0x3EB0C6F7A0B5ED8D +; AVX2-NEXT: [[TMP32:%.*]] = extractelement <2 x double> [[TMP30]], i32 0 +; AVX2-NEXT: [[CMP94:%.*]] = fcmp olt double [[TMP32]], 0x3EB0C6F7A0B5ED8D +; AVX2-NEXT: [[OR_COND:%.*]] = select i1 [[CMP93]], i1 [[CMP94]], i1 false +; AVX2-NEXT: br i1 [[OR_COND]], label [[CLEANUP]], label [[LOR_LHS_FALSE:%.*]] +; AVX2: lor.lhs.false: +; AVX2-NEXT: [[TMP33:%.*]] = fcmp ule <2 x double> [[TMP30]], splat (double 1.000000e+00) +; AVX2-NEXT: [[TMP34:%.*]] = extractelement <2 x i1> [[TMP33]], i32 0 +; AVX2-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP33]], i32 1 +; AVX2-NEXT: [[OR_COND106:%.*]] = select i1 [[TMP35]], i1 true, i1 [[TMP34]] +; AVX2-NEXT: [[SPEC_SELECT:%.*]] = zext i1 [[OR_COND106]] to i32 +; 
AVX2-NEXT: br label [[CLEANUP]] +; AVX2: cleanup: +; AVX2-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ 0, [[IF_END]] ], [ [[SPEC_SELECT]], [[LOR_LHS_FALSE]] ] +; AVX2-NEXT: ret i32 [[RETVAL_0]] ; entry: %dir = getelementptr inbounds %struct.ray, ptr %ray, i64 0, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/delayed-gather-emission.ll b/llvm/test/Transforms/SLPVectorizer/X86/delayed-gather-emission.ll index 249b3f9329319..9fea27e4faeff 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/delayed-gather-emission.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/delayed-gather-emission.ll @@ -12,26 +12,26 @@ define void @test() { ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[DOTPRE_PRE:%.*]] = load float, ptr poison, align 4 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x float> , float [[DOTPRE_PRE]], i32 0 ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x float> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP10:%.*]], [[BB2:%.*]] ] +; CHECK-NEXT: [[DOTPRE:%.*]] = phi float [ [[DOTPRE_PRE]], [[ENTRY:%.*]] ], [ [[I2:%.*]], [[BB2:%.*]] ] +; CHECK-NEXT: [[FOXTROT_0:%.*]] = phi float [ undef, [[ENTRY]] ], [ [[GULF_0:%.*]], [[BB2]] ] ; CHECK-NEXT: br label [[BB2]] ; CHECK: bb2: -; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x float> [ [[TMP1]], [[BB1]] ], [ [[TMP9:%.*]], [[BB2]] ] +; CHECK-NEXT: [[I:%.*]] = phi float [ [[DOTPRE]], [[BB1]] ], [ [[I2]], [[BB2]] ] +; CHECK-NEXT: [[GULF_0]] = phi float [ [[FOXTROT_0]], [[BB1]] ], [ [[TMP6:%.*]], [[BB2]] ] ; CHECK-NEXT: [[I1:%.*]] = load float, ptr poison, align 4 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x float> poison, float [[I]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP0]], float [[GULF_0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[GULF_0]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x 
float> [[TMP3]], float [[I1]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = fdiv <2 x float> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP6]] = extractelement <2 x float> [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP5]], i32 1 ; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP6]], [[TMP7]] ; CHECK-NEXT: tail call void @foo(float [[MUL]]) -; CHECK-NEXT: [[I2:%.*]] = load float, ptr poison, align 4 +; CHECK-NEXT: [[I2]] = load float, ptr poison, align 4 ; CHECK-NEXT: [[TOBOOL:%.*]] = fcmp une float [[I2]], 0.000000e+00 -; CHECK-NEXT: [[TMP10]] = insertelement <2 x float> [[TMP2]], float [[I2]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP9]] = insertelement <2 x float> [[TMP8]], float [[I2]], i32 0 ; CHECK-NEXT: br i1 [[TOBOOL]], label [[BB1]], label [[BB2]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll b/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll index bfb623ac5a9b9..ddcc29a8739d0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll @@ -5,27 +5,28 @@ define void @test(ptr %nExp, float %0, i1 %cmp, float %1) { ; CHECK-LABEL: define void @test( ; CHECK-SAME: ptr [[NEXP:%.*]], float [[TMP0:%.*]], i1 [[CMP:%.*]], float [[TMP1:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> , float [[TMP1]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP0]], i32 3 ; CHECK-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] ; CHECK: [[IF_THEN]]: ; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[NEXP]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <2 x i32> -; CHECK-NEXT: 
[[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x float> [[TMP6]], zeroinitializer +; CHECK-NEXT: [[DIV_1_I_I:%.*]] = fmul float [[TMP4]], 0.000000e+00 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x float> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> , float [[TMP1]], i32 3 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[DIV_1_I_I]], i32 0 ; CHECK-NEXT: br label %[[IF_END]] ; CHECK: [[IF_END]]: -; CHECK-NEXT: [[TMP12:%.*]] = phi <4 x float> [ [[TMP11]], %[[IF_THEN]] ], [ [[TMP3]], %[[ENTRY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = phi float [ [[TMP1]], %[[IF_THEN]] ], [ [[TMP0]], %[[ENTRY]] ] +; CHECK-NEXT: [[TMP9:%.*]] = phi float [ 0.000000e+00, %[[IF_THEN]] ], [ [[TMP1]], %[[ENTRY]] ] ; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x float> [ [[TMP8]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ] ; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x float> [ zeroinitializer, %[[IF_THEN]] ], [ , %[[ENTRY]] ] ; CHECK-NEXT: [[TMP15:%.*]] = phi <2 x float> [ [[TMP7]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ] ; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x float> [[TMP14]], <2 x float> , <2 x i32> ; CHECK-NEXT: [[TMP17:%.*]] = fmul <2 x float> [[TMP15]], [[TMP16]] ; CHECK-NEXT: [[TMP18:%.*]] = fmul <2 x float> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <2 x float> [[TMP13]], <2 x float> [[TMP14]], <4 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x float> 
[[TMP29]], float [[TMP9]], i32 2 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x float> [[TMP30]], float [[TMP10]], i32 3 ; CHECK-NEXT: [[TMP19:%.*]] = fmul <4 x float> [[TMP12]], zeroinitializer ; CHECK-NEXT: [[CALL25:%.*]] = load volatile ptr, ptr null, align 8 ; CHECK-NEXT: [[TMP20:%.*]] = fadd <2 x float> [[TMP18]], [[TMP17]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-with-cmp-user.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-with-cmp-user.ll index 3ac0d01cf9a2c..aca0d7e1c7d8c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/gather-with-cmp-user.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-with-cmp-user.ll @@ -7,14 +7,17 @@ define i1 @test(i32 %g, i16 %d) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP0:%.*]] = and i16 [[D]], 1 ; CHECK-NEXT: [[XOR_I_I:%.*]] = xor i32 [[G]], 1 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[G]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[XOR_I_I]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = trunc <2 x i32> [[TMP9]] to <2 x i8> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i8> [[TMP4]], <2 x i8> poison, <4 x i32> +; CHECK-NEXT: [[CONV1_I_I:%.*]] = trunc i32 [[XOR_I_I]] to i8 +; CHECK-NEXT: [[CONV1_1_I_I:%.*]] = trunc i32 [[G]] to i8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i8> poison, i8 [[CONV1_1_I_I]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i8> [[TMP1]], i8 [[CONV1_I_I]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i8> [[TMP5]], ; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt <4 x i8> [[TMP6]], splat (i8 -3) ; CHECK-NEXT: [[TMP8:%.*]] = zext <4 x i1> [[TMP7]] to <4 x i8> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> poison, i32 [[G]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[XOR_I_I]], i32 1 +; 
CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = zext <4 x i8> [[TMP8]] to <4 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = icmp sgt <4 x i32> [[TMP10]], [[TMP11]] ; CHECK-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP12]]) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/original-inst-scheduled-after-copyable.ll b/llvm/test/Transforms/SLPVectorizer/X86/original-inst-scheduled-after-copyable.ll index 19eb7bf4dfc94..6656f34e415a3 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/original-inst-scheduled-after-copyable.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/original-inst-scheduled-after-copyable.ll @@ -12,32 +12,34 @@ define void @test(ptr %0, i32 %1, i32 %2) { ; CHECK-NEXT: [[TMP10:%.*]] = lshr i32 [[TMP7]], 1 ; CHECK-NEXT: [[TMP18:%.*]] = zext i32 [[ADD_NARROWED_I_I]] to i64 ; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[TMP18]], -1 -; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP19]] to i32 -; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x i32> poison, i32 [[TMP21]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP28]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = and <2 x i32> [[TMP11]], splat (i32 -2) -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> , i32 [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP14:%.*]] = or <2 x i32> [[TMP13]], [[TMP12]] -; CHECK-NEXT: [[TMP15:%.*]] = xor <2 x i32> [[TMP13]], [[TMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], <2 x i32> ; CHECK-NEXT: [[TMP17:%.*]] = load <2 x i32>, ptr [[TMP5]], align 8 +; CHECK-NEXT: call void @llvm.stackrestore.p0(ptr null) ; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x i32> , i32 [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP33:%.*]] = and <2 x i32> [[TMP17]], [[TMP32]] -; CHECK-NEXT: call void @llvm.stackrestore.p0(ptr null) +; CHECK-NEXT: [[TMP23:%.*]] = zext <2 x i32> [[TMP33]] to <2 x i64> ; CHECK-NEXT: [[TMP20:%.*]] = 
shufflevector <2 x i32> [[TMP33]], <2 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP34:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP22:%.*]] = zext <2 x i32> [[TMP34]] to <2 x i64> -; CHECK-NEXT: [[TMP23:%.*]] = zext <2 x i32> [[TMP33]] to <2 x i64> ; CHECK-NEXT: [[TMP35:%.*]] = shl <2 x i64> [[TMP23]], splat (i64 1) ; CHECK-NEXT: [[TMP25:%.*]] = or <2 x i64> [[TMP35]], [[TMP22]] ; CHECK-NEXT: [[TMP26:%.*]] = trunc <2 x i64> [[TMP25]] to <2 x i32> ; CHECK-NEXT: [[TMP27:%.*]] = trunc <2 x i64> [[TMP25]] to <2 x i32> ; CHECK-NEXT: [[TMP24:%.*]] = tail call i32 asm sideeffect "", "=r,0,~{dirflag},~{fpsr},~{flags}"(i32 0) -; CHECK-NEXT: store <2 x i32> [[TMP16]], ptr [[TMP3]], align 16 ; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <2 x i32> [[TMP32]], <2 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP30:%.*]] = and <2 x i32> [[TMP29]], [[TMP26]] ; CHECK-NEXT: [[TMP31:%.*]] = or <2 x i32> [[TMP30]], [[TMP27]] -; CHECK-NEXT: store <2 x i32> [[TMP31]], ptr [[TMP5]], align 8 +; CHECK-NEXT: [[TMP36:%.*]] = trunc i64 [[TMP19]] to i32 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <2 x i32> poison, i32 [[TMP36]], i32 0 +; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <2 x i32> [[TMP37]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP39:%.*]] = and <2 x i32> [[TMP38]], splat (i32 -2) +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x i32> , i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP40:%.*]] = or <2 x i32> [[TMP28]], [[TMP39]] +; CHECK-NEXT: [[TMP41:%.*]] = xor <2 x i32> [[TMP28]], [[TMP39]] +; CHECK-NEXT: [[TMP42:%.*]] = shufflevector <2 x i32> [[TMP40]], <2 x i32> [[TMP41]], <2 x i32> +; CHECK-NEXT: [[TMP43:%.*]] = shufflevector <2 x i32> [[TMP42]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <2 x i32> [[TMP31]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <4 x i32> [[TMP43]], <4 x i32> [[TMP44]], <4 x i32> +; CHECK-NEXT: store <4 x i32> [[TMP45]], ptr [[TMP3]], align 16 ; 
CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi-node-with-cycle.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi-node-with-cycle.ll index af165de293005..a9e9ff14a5202 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/phi-node-with-cycle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi-node-with-cycle.ll @@ -4,21 +4,22 @@ define void @test(float %0) { ; CHECK-LABEL: define void @test( ; CHECK-SAME: float [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP2:%.*]] = fdiv float 0.000000e+00, 0.000000e+00 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> , float [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> , <2 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = fdiv <2 x float> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = fdiv <2 x float> [[TMP6]], zeroinitializer -; CHECK-NEXT: br label %[[BB6:.*]] -; CHECK: [[BB6]]: +; CHECK-NEXT: br label %[[BB5:.*]] +; CHECK: [[BB5]]: ; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x float> [[TMP5]], zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = fsub <2 x float> zeroinitializer, [[TMP7]] -; CHECK-NEXT: br label %[[BB10:.*]] -; CHECK: [[BB9:.*]]: -; CHECK-NEXT: br label %[[BB10]] -; CHECK: [[BB10]]: -; CHECK-NEXT: [[TMP11:%.*]] = phi <2 x float> [ [[TMP8]], %[[BB6]] ], [ poison, %[[BB9]] ] -; CHECK-NEXT: br label %[[BB12:.*]] -; CHECK: [[BB12]]: +; CHECK-NEXT: br label %[[BB9:.*]] +; CHECK: [[BB8:.*]]: +; CHECK-NEXT: br label %[[BB9]] +; CHECK: [[BB9]]: +; CHECK-NEXT: [[TMP11:%.*]] = phi <2 x float> [ [[TMP8]], %[[BB5]] ], [ poison, %[[BB8]] ] +; CHECK-NEXT: br label %[[BB11:.*]] +; CHECK: [[BB11]]: +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = fmul <2 x float> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = fsub <2 x float> [[TMP11]], [[TMP13]] ; CHECK-NEXT: [[TMP15:%.*]] = 
extractelement <2 x float> [[TMP14]], i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll index 9c8ba07734b87..d3919f6883950 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 -; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 -slp-threshold=-1 | FileCheck %s -; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s -; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s -; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s -; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw,+avx512vl | FileCheck %s +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 -slp-threshold=-1 | FileCheck %s --check-prefixes=CHECK,CHECK-SSE2 +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,CHECK-SSE42 +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefixes=CHECK,CHECK-AVX +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,CHECK-AVX2 +; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512 define void @store_i32(ptr nocapture %0, i32 %1, i32 %2) { ; CHECK-LABEL: define void @store_i32( @@ -102,20 +102,99 @@ define void @store_i8(ptr nocapture %0, i32 %1, i32 %2) { } define void @store_i64(ptr nocapture %0, i32 %1, i32 %2) { -; CHECK-LABEL: define void @store_i64( -; 
CHECK-SAME: ptr captures(none) [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0]], align 8, !tbaa [[LONG_TBAA5:![0-9]+]] -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[TMP5]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], splat (i64 15) -; CHECK-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], splat (i32 255) -; CHECK-NEXT: [[TMP12:%.*]] = and <4 x i64> [[TMP9]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i64> [[TMP12]], <4 x i64> splat (i64 255) -; CHECK-NEXT: store <4 x i64> [[TMP13]], ptr [[TMP0]], align 8, !tbaa [[LONG_TBAA5]] -; CHECK-NEXT: ret void +; CHECK-SSE2-LABEL: define void @store_i64( +; CHECK-SSE2-SAME: ptr captures(none) [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]]) #[[ATTR0]] { +; CHECK-SSE2-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1]] to i64 +; CHECK-SSE2-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0]], align 8, !tbaa [[LONG_TBAA5:![0-9]+]] +; CHECK-SSE2-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 +; CHECK-SSE2-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-SSE2-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[TMP5]], [[TMP7]] +; CHECK-SSE2-NEXT: [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], splat (i64 15) +; CHECK-SSE2-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32> +; CHECK-SSE2-NEXT: [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], splat (i32 255) +; CHECK-SSE2-NEXT: [[TMP12:%.*]] = and <4 x i64> [[TMP9]], splat (i64 4294967295) +; CHECK-SSE2-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i64> [[TMP12]], <4 x i64> 
splat (i64 255) +; CHECK-SSE2-NEXT: store <4 x i64> [[TMP13]], ptr [[TMP0]], align 8, !tbaa [[LONG_TBAA5]] +; CHECK-SSE2-NEXT: ret void +; +; CHECK-SSE42-LABEL: define void @store_i64( +; CHECK-SSE42-SAME: ptr captures(none) [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]]) #[[ATTR0]] { +; CHECK-SSE42-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1]] to i64 +; CHECK-SSE42-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0]], align 8, !tbaa [[LONG_TBAA5:![0-9]+]] +; CHECK-SSE42-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 +; CHECK-SSE42-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-SSE42-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[TMP5]], [[TMP7]] +; CHECK-SSE42-NEXT: [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], splat (i64 15) +; CHECK-SSE42-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32> +; CHECK-SSE42-NEXT: [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], splat (i32 255) +; CHECK-SSE42-NEXT: [[TMP12:%.*]] = and <4 x i64> [[TMP9]], splat (i64 4294967295) +; CHECK-SSE42-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i64> [[TMP12]], <4 x i64> splat (i64 255) +; CHECK-SSE42-NEXT: store <4 x i64> [[TMP13]], ptr [[TMP0]], align 8, !tbaa [[LONG_TBAA5]] +; CHECK-SSE42-NEXT: ret void +; +; CHECK-AVX-LABEL: define void @store_i64( +; CHECK-AVX-SAME: ptr captures(none) [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]]) #[[ATTR0]] { +; CHECK-AVX-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1]] to i64 +; CHECK-AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 1 +; CHECK-AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 2 +; CHECK-AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 3 +; CHECK-AVX-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP7]], align 8, !tbaa [[LONG_TBAA5:![0-9]+]] +; CHECK-AVX-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP6]], align 8, !tbaa [[LONG_TBAA5]] +; CHECK-AVX-NEXT: [[TMP10:%.*]] = mul i64 [[TMP8]], [[TMP4]] +; CHECK-AVX-NEXT: 
[[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP4]] +; CHECK-AVX-NEXT: [[TMP12:%.*]] = lshr i64 [[TMP10]], 15 +; CHECK-AVX-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 15 +; CHECK-AVX-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP12]] to i32 +; CHECK-AVX-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP13]] to i32 +; CHECK-AVX-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP5]], align 8, !tbaa [[LONG_TBAA5]] +; CHECK-AVX-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP0]], align 8, !tbaa [[LONG_TBAA5]] +; CHECK-AVX-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP4]] +; CHECK-AVX-NEXT: [[TMP19:%.*]] = mul i64 [[TMP17]], [[TMP4]] +; CHECK-AVX-NEXT: [[TMP20:%.*]] = insertelement <2 x i64> poison, i64 [[TMP19]], i32 0 +; CHECK-AVX-NEXT: [[TMP21:%.*]] = insertelement <2 x i64> [[TMP20]], i64 [[TMP18]], i32 1 +; CHECK-AVX-NEXT: [[TMP22:%.*]] = lshr <2 x i64> [[TMP21]], splat (i64 15) +; CHECK-AVX-NEXT: [[TMP23:%.*]] = trunc <2 x i64> [[TMP22]] to <2 x i32> +; CHECK-AVX-NEXT: [[TMP24:%.*]] = shufflevector <2 x i32> [[TMP23]], <2 x i32> poison, <4 x i32> +; CHECK-AVX-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> [[TMP24]], i32 [[TMP15]], i32 2 +; CHECK-AVX-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP14]], i32 3 +; CHECK-AVX-NEXT: [[TMP27:%.*]] = icmp ult <4 x i32> [[TMP26]], splat (i32 255) +; CHECK-AVX-NEXT: [[TMP28:%.*]] = shufflevector <2 x i64> [[TMP22]], <2 x i64> poison, <4 x i32> +; CHECK-AVX-NEXT: [[TMP29:%.*]] = insertelement <4 x i64> [[TMP28]], i64 [[TMP13]], i32 2 +; CHECK-AVX-NEXT: [[TMP30:%.*]] = insertelement <4 x i64> [[TMP29]], i64 [[TMP12]], i32 3 +; CHECK-AVX-NEXT: [[TMP31:%.*]] = and <4 x i64> [[TMP30]], splat (i64 4294967295) +; CHECK-AVX-NEXT: [[TMP32:%.*]] = select <4 x i1> [[TMP27]], <4 x i64> [[TMP31]], <4 x i64> splat (i64 255) +; CHECK-AVX-NEXT: store <4 x i64> [[TMP32]], ptr [[TMP0]], align 8, !tbaa [[LONG_TBAA5]] +; CHECK-AVX-NEXT: ret void +; +; CHECK-AVX2-LABEL: define void @store_i64( +; CHECK-AVX2-SAME: ptr captures(none) [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 
[[TMP2:%.*]]) #[[ATTR0]] { +; CHECK-AVX2-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1]] to i64 +; CHECK-AVX2-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0]], align 8, !tbaa [[LONG_TBAA5:![0-9]+]] +; CHECK-AVX2-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 +; CHECK-AVX2-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-AVX2-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[TMP5]], [[TMP7]] +; CHECK-AVX2-NEXT: [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], splat (i64 15) +; CHECK-AVX2-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32> +; CHECK-AVX2-NEXT: [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], splat (i32 255) +; CHECK-AVX2-NEXT: [[TMP12:%.*]] = and <4 x i64> [[TMP9]], splat (i64 4294967295) +; CHECK-AVX2-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i64> [[TMP12]], <4 x i64> splat (i64 255) +; CHECK-AVX2-NEXT: store <4 x i64> [[TMP13]], ptr [[TMP0]], align 8, !tbaa [[LONG_TBAA5]] +; CHECK-AVX2-NEXT: ret void +; +; CHECK-AVX512-LABEL: define void @store_i64( +; CHECK-AVX512-SAME: ptr captures(none) [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]]) #[[ATTR0]] { +; CHECK-AVX512-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1]] to i64 +; CHECK-AVX512-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0]], align 8, !tbaa [[LONG_TBAA5:![0-9]+]] +; CHECK-AVX512-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0 +; CHECK-AVX512-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-AVX512-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[TMP5]], [[TMP7]] +; CHECK-AVX512-NEXT: [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], splat (i64 15) +; CHECK-AVX512-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32> +; CHECK-AVX512-NEXT: [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], splat (i32 255) +; CHECK-AVX512-NEXT: [[TMP12:%.*]] = and <4 x i64> [[TMP9]], splat (i64 4294967295) +; CHECK-AVX512-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i64> 
[[TMP12]], <4 x i64> splat (i64 255) +; CHECK-AVX512-NEXT: store <4 x i64> [[TMP13]], ptr [[TMP0]], align 8, !tbaa [[LONG_TBAA5]] +; CHECK-AVX512-NEXT: ret void ; %4 = zext i32 %1 to i64 %5 = load i64, ptr %0, align 8, !tbaa !7 @@ -164,11 +243,43 @@ define void @store_i64(ptr nocapture %0, i32 %1, i32 %2) { !7 = !{!8, !8, i64 0} !8 = !{!"long", !4, i64 0} ;. -; CHECK: [[INT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} -; CHECK: [[META1]] = !{!"int", [[META2:![0-9]+]], i64 0} -; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} -; CHECK: [[META3]] = !{!"Simple C++ TBAA"} -; CHECK: [[CHAR_TBAA4]] = !{[[META2]], [[META2]], i64 0} -; CHECK: [[LONG_TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} -; CHECK: [[META6]] = !{!"long", [[META2]], i64 0} +; CHECK-SSE2: [[INT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK-SSE2: [[META1]] = !{!"int", [[META2:![0-9]+]], i64 0} +; CHECK-SSE2: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; CHECK-SSE2: [[META3]] = !{!"Simple C++ TBAA"} +; CHECK-SSE2: [[CHAR_TBAA4]] = !{[[META2]], [[META2]], i64 0} +; CHECK-SSE2: [[LONG_TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} +; CHECK-SSE2: [[META6]] = !{!"long", [[META2]], i64 0} +;. +; CHECK-SSE42: [[INT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK-SSE42: [[META1]] = !{!"int", [[META2:![0-9]+]], i64 0} +; CHECK-SSE42: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; CHECK-SSE42: [[META3]] = !{!"Simple C++ TBAA"} +; CHECK-SSE42: [[CHAR_TBAA4]] = !{[[META2]], [[META2]], i64 0} +; CHECK-SSE42: [[LONG_TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} +; CHECK-SSE42: [[META6]] = !{!"long", [[META2]], i64 0} +;. 
+; CHECK-AVX: [[INT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK-AVX: [[META1]] = !{!"int", [[META2:![0-9]+]], i64 0} +; CHECK-AVX: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; CHECK-AVX: [[META3]] = !{!"Simple C++ TBAA"} +; CHECK-AVX: [[CHAR_TBAA4]] = !{[[META2]], [[META2]], i64 0} +; CHECK-AVX: [[LONG_TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} +; CHECK-AVX: [[META6]] = !{!"long", [[META2]], i64 0} +;. +; CHECK-AVX2: [[INT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK-AVX2: [[META1]] = !{!"int", [[META2:![0-9]+]], i64 0} +; CHECK-AVX2: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; CHECK-AVX2: [[META3]] = !{!"Simple C++ TBAA"} +; CHECK-AVX2: [[CHAR_TBAA4]] = !{[[META2]], [[META2]], i64 0} +; CHECK-AVX2: [[LONG_TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} +; CHECK-AVX2: [[META6]] = !{!"long", [[META2]], i64 0} +;. +; CHECK-AVX512: [[INT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK-AVX512: [[META1]] = !{!"int", [[META2:![0-9]+]], i64 0} +; CHECK-AVX512: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; CHECK-AVX512: [[META3]] = !{!"Simple C++ TBAA"} +; CHECK-AVX512: [[CHAR_TBAA4]] = !{[[META2]], [[META2]], i64 0} +; CHECK-AVX512: [[LONG_TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} +; CHECK-AVX512: [[META6]] = !{!"long", [[META2]], i64 0} ;. 
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll index 29a8a229980e9..bd804c46ebabc 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll @@ -87,11 +87,10 @@ define i1 @fcmp_lt_gt(double %a, double %b, double %c) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[FNEG:%.*]] = fneg double [[B:%.*]] ; CHECK-NEXT: [[MUL:%.*]] = fmul double [[A:%.*]], 2.000000e+00 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[FNEG]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[ADD:%.*]] = fsub double [[C:%.*]], [[B]] +; CHECK-NEXT: [[SUB:%.*]] = fsub double [[FNEG]], [[C]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[SUB]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP0]], double [[ADD]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[MUL]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP6]] @@ -136,11 +135,10 @@ define i1 @fcmp_lt(double %a, double %b, double %c) { ; CHECK-LABEL: @fcmp_lt( ; CHECK-NEXT: [[FNEG:%.*]] = fneg double [[B:%.*]] ; CHECK-NEXT: [[MUL:%.*]] = fmul double [[A:%.*]], 2.000000e+00 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[FNEG]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: 
[[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[B]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[ADD:%.*]] = fsub double [[C:%.*]], [[B]] +; CHECK-NEXT: [[SUB:%.*]] = fsub double [[FNEG]], [[C]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[SUB]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP1]], double [[ADD]], i32 1 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[MUL]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = fdiv <2 x double> [[TMP5]], [[TMP7]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll b/llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll index 4a5dd2a63723e..a63b34cb3079c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll @@ -4,36 +4,32 @@ define i16 @test() { ; CHECK-LABEL: define i16 @test() { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = lshr i32 0, 0 -; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 0 ; CHECK-NEXT: [[CALL99_I:%.*]] = call i32 @llvm.bswap.i32(i32 0) +; CHECK-NEXT: [[TMP0:%.*]] = lshr i32 0, 0 ; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[CALL99_I]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = shl i32 0, 0 -; CHECK-NEXT: [[UNSCLEAR186_I:%.*]] = and i32 [[TMP6]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = shl i32 0, 0 +; CHECK-NEXT: [[TMP10:%.*]] = shl i32 [[TMP0]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP2]], i32 1 +; 
CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> , <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> , <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i32> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[CALL7_I45:%.*]] = tail call i32 null(i32 0) ; CHECK-NEXT: [[TMP8:%.*]] = lshr i32 [[CALL7_I45]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP8]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = and <2 x i32> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = and i32 [[TMP8]], 0 ; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 0, 0 ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> , i32 [[TMP12]], i32 0 ; CHECK-NEXT: [[TMP14:%.*]] = shl <2 x i32> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = and <2 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <24 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <24 x i32> , <24 x i32> [[TMP16]], <24 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <24 x i32> [[TMP17]], <24 x i32> , <24 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <24 x i32> [[TMP18]], i32 [[UNSCLEAR186_I]], i32 10 -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> poison, <24 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <24 x i32> [[TMP19]], <24 x i32> [[TMP20]], <24 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> poison, <24 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <24 x i32> , <24 x i32> [[TMP16]], <24 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <24 x i32> [[TMP17]], i32 [[TMP9]], i32 13 ; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <24 x i32> ; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <24 x i32> [[TMP21]], <24 x i32> [[TMP22]], <24 x i32> ; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <24 x i32> 
[[TMP23]], <24 x i32> , <24 x i32> ; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <24 x i32> [[TMP24]], <24 x i32> , <24 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <24 x i32> [[TMP25]], i32 [[UNSCLEAR186_I]], i32 11 +; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> poison, <24 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> poison, <24 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <24 x i32> [[TMP25]], <24 x i32> [[TMP36]], <24 x i32> ; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <24 x i32> ; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <24 x i32> [[TMP26]], <24 x i32> [[TMP27]], <24 x i32> ; CHECK-NEXT: [[TMP29:%.*]] = icmp ne <24 x i32> [[TMP24]], [[TMP28]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/same-last-instruction-different-parents.ll b/llvm/test/Transforms/SLPVectorizer/X86/same-last-instruction-different-parents.ll index ef75a8dd99169..210f59688d59e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/same-last-instruction-different-parents.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/same-last-instruction-different-parents.ll @@ -4,26 +4,27 @@ define i32 @test(i32 %0, i1 %1) { ; CHECK-LABEL: define i32 @test( ; CHECK-SAME: i32 [[TMP0:%.*]], i1 [[TMP1:%.*]]) { +; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP0]] to double ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = sitofp <2 x i32> [[TMP4]] to <2 x double> ; CHECK-NEXT: [[TMP6:%.*]] = sitofp <2 x i32> [[TMP4]] to <2 x double> ; CHECK-NEXT: br i1 [[TMP1]], label %[[BB7:.*]], label %[[BB9:.*]] ; CHECK: [[BB7]]: ; CHECK-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP6]], <2 x double> zeroinitializer, <2 x double> zeroinitializer) -; CHECK-NEXT: br label %[[BB16:.*]] +; CHECK-NEXT: br label 
%[[BB17:.*]] ; CHECK: [[BB9]]: -; CHECK-NEXT: br i1 false, label %[[BB14:.*]], label %[[BB10:.*]] +; CHECK-NEXT: br i1 false, label %[[BB15:.*]], label %[[BB10:.*]] ; CHECK: [[BB10]]: +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP6]], double [[TMP7]], i32 1 ; CHECK-NEXT: [[TMP11:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> zeroinitializer, <2 x double> [[TMP5]]) ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> , <2 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x double> zeroinitializer) -; CHECK-NEXT: br label %[[BB14]] -; CHECK: [[BB14]]: -; CHECK-NEXT: [[TMP15:%.*]] = phi <2 x double> [ [[TMP13]], %[[BB10]] ], [ zeroinitializer, %[[BB9]] ] -; CHECK-NEXT: br label %[[BB16]] -; CHECK: [[BB16]]: -; CHECK-NEXT: [[TMP17:%.*]] = phi <2 x double> [ [[TMP15]], %[[BB14]] ], [ [[TMP8]], %[[BB7]] ] +; CHECK-NEXT: br label %[[BB15]] +; CHECK: [[BB15]]: +; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x double> [ [[TMP13]], %[[BB10]] ], [ zeroinitializer, %[[BB9]] ] +; CHECK-NEXT: br label %[[BB17]] +; CHECK: [[BB17]]: +; CHECK-NEXT: [[TMP17:%.*]] = phi <2 x double> [ [[TMP16]], %[[BB15]] ], [ [[TMP8]], %[[BB7]] ] ; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x double> [[TMP17]], i32 0 ; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x double> [[TMP17]], i32 1 ; CHECK-NEXT: [[TMP20:%.*]] = fmul double [[TMP19]], [[TMP18]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scalarize-ctlz.ll b/llvm/test/Transforms/SLPVectorizer/X86/scalarize-ctlz.ll index 0f9b2e9ba86fd..e67589426bcc6 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/scalarize-ctlz.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/scalarize-ctlz.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-- -mcpu=x86-64 %s | FileCheck %s --check-prefixes=SSE,SSE2 -; RUN: opt 
-S --passes=slp-vectorizer -mtriple=x86_64-- -mcpu=x86-64-v2 %s | FileCheck %s --check-prefixes=SSE,SSE4 +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-- -mcpu=x86-64 %s | FileCheck %s --check-prefixes=SSE +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-- -mcpu=x86-64-v2 %s | FileCheck %s --check-prefixes=SSE ; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-- -mcpu=x86-64-v3 %s | FileCheck %s --check-prefixes=AVX2 ; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-- -mcpu=x86-64-v4 %s | FileCheck %s --check-prefixes=AVX512 @@ -95,52 +95,33 @@ define <4 x i64> @scalarize_ctlz_v4i64(<4 x i64> %v) { } define <8 x i64> @scalarize_ctlz_v8i64(<8 x i64> %v) { -; SSE2-LABEL: define <8 x i64> @scalarize_ctlz_v8i64( -; SSE2-SAME: <8 x i64> [[V:%.*]]) #[[ATTR0]] { -; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[V]], <8 x i64> poison, <2 x i32> -; SSE2-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> [[TMP1]], i1 false) -; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> [[V]], <8 x i64> poison, <2 x i32> -; SSE2-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> [[TMP3]], i1 false) -; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[V]], <8 x i64> poison, <2 x i32> -; SSE2-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> [[TMP5]], i1 false) -; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <8 x i64> [[V]], <8 x i64> poison, <2 x i32> -; SSE2-NEXT: [[TMP8:%.*]] = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> [[TMP7]], i1 false) -; SSE2-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <8 x i32> -; SSE2-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> poison, <8 x i32> -; SSE2-NEXT: [[R31:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> [[TMP10]], <8 x i32> -; SSE2-NEXT: [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> poison, <8 x i32> -; SSE2-NEXT: [[R52:%.*]] = shufflevector <8 x i64> [[R31]], <8 x i64> [[TMP11]], <8 x i32> -; SSE2-NEXT: [[TMP12:%.*]] = 
shufflevector <2 x i64> [[TMP8]], <2 x i64> poison, <8 x i32> -; SSE2-NEXT: [[R73:%.*]] = shufflevector <8 x i64> [[R52]], <8 x i64> [[TMP12]], <8 x i32> -; SSE2-NEXT: ret <8 x i64> [[R73]] -; -; SSE4-LABEL: define <8 x i64> @scalarize_ctlz_v8i64( -; SSE4-SAME: <8 x i64> [[V:%.*]]) #[[ATTR0]] { -; SSE4-NEXT: [[V0:%.*]] = extractelement <8 x i64> [[V]], i64 0 -; SSE4-NEXT: [[V1:%.*]] = extractelement <8 x i64> [[V]], i64 1 -; SSE4-NEXT: [[V2:%.*]] = extractelement <8 x i64> [[V]], i64 2 -; SSE4-NEXT: [[V3:%.*]] = extractelement <8 x i64> [[V]], i64 3 -; SSE4-NEXT: [[V4:%.*]] = extractelement <8 x i64> [[V]], i64 4 -; SSE4-NEXT: [[V5:%.*]] = extractelement <8 x i64> [[V]], i64 5 -; SSE4-NEXT: [[V6:%.*]] = extractelement <8 x i64> [[V]], i64 6 -; SSE4-NEXT: [[V7:%.*]] = extractelement <8 x i64> [[V]], i64 7 -; SSE4-NEXT: [[C0:%.*]] = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[V0]], i1 false) -; SSE4-NEXT: [[C1:%.*]] = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[V1]], i1 false) -; SSE4-NEXT: [[C2:%.*]] = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[V2]], i1 false) -; SSE4-NEXT: [[C3:%.*]] = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[V3]], i1 false) -; SSE4-NEXT: [[C4:%.*]] = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[V4]], i1 false) -; SSE4-NEXT: [[C5:%.*]] = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[V5]], i1 false) -; SSE4-NEXT: [[C6:%.*]] = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[V6]], i1 false) -; SSE4-NEXT: [[C7:%.*]] = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[V7]], i1 false) -; SSE4-NEXT: [[R0:%.*]] = insertelement <8 x i64> poison, i64 [[C0]], i64 0 -; SSE4-NEXT: [[R1:%.*]] = insertelement <8 x i64> [[R0]], i64 [[C1]], i64 1 -; SSE4-NEXT: [[R2:%.*]] = insertelement <8 x i64> [[R1]], i64 [[C2]], i64 2 -; SSE4-NEXT: [[R3:%.*]] = insertelement <8 x i64> [[R2]], i64 [[C3]], i64 3 -; SSE4-NEXT: [[R4:%.*]] = insertelement <8 x i64> [[R3]], i64 [[C4]], i64 4 -; SSE4-NEXT: [[R5:%.*]] = 
insertelement <8 x i64> [[R4]], i64 [[C5]], i64 5 -; SSE4-NEXT: [[R6:%.*]] = insertelement <8 x i64> [[R5]], i64 [[C6]], i64 6 -; SSE4-NEXT: [[R7:%.*]] = insertelement <8 x i64> [[R6]], i64 [[C7]], i64 7 -; SSE4-NEXT: ret <8 x i64> [[R7]] +; SSE-LABEL: define <8 x i64> @scalarize_ctlz_v8i64( +; SSE-SAME: <8 x i64> [[V:%.*]]) #[[ATTR0]] { +; SSE-NEXT: [[V0:%.*]] = extractelement <8 x i64> [[V]], i64 0 +; SSE-NEXT: [[V1:%.*]] = extractelement <8 x i64> [[V]], i64 1 +; SSE-NEXT: [[V2:%.*]] = extractelement <8 x i64> [[V]], i64 2 +; SSE-NEXT: [[V3:%.*]] = extractelement <8 x i64> [[V]], i64 3 +; SSE-NEXT: [[V4:%.*]] = extractelement <8 x i64> [[V]], i64 4 +; SSE-NEXT: [[V5:%.*]] = extractelement <8 x i64> [[V]], i64 5 +; SSE-NEXT: [[V6:%.*]] = extractelement <8 x i64> [[V]], i64 6 +; SSE-NEXT: [[V7:%.*]] = extractelement <8 x i64> [[V]], i64 7 +; SSE-NEXT: [[C0:%.*]] = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[V0]], i1 false) +; SSE-NEXT: [[C1:%.*]] = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[V1]], i1 false) +; SSE-NEXT: [[C2:%.*]] = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[V2]], i1 false) +; SSE-NEXT: [[C3:%.*]] = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[V3]], i1 false) +; SSE-NEXT: [[C4:%.*]] = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[V4]], i1 false) +; SSE-NEXT: [[C5:%.*]] = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[V5]], i1 false) +; SSE-NEXT: [[C6:%.*]] = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[V6]], i1 false) +; SSE-NEXT: [[C7:%.*]] = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[V7]], i1 false) +; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i64> poison, i64 [[C0]], i64 0 +; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i64> [[R0]], i64 [[C1]], i64 1 +; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i64> [[R1]], i64 [[C2]], i64 2 +; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i64> [[R2]], i64 [[C3]], i64 3 +; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i64> [[R3]], i64 [[C4]], i64 4 +; SSE-NEXT: 
[[R5:%.*]] = insertelement <8 x i64> [[R4]], i64 [[C5]], i64 5 +; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i64> [[R5]], i64 [[C6]], i64 6 +; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i64> [[R6]], i64 [[C7]], i64 7 +; SSE-NEXT: ret <8 x i64> [[R7]] ; ; AVX2-LABEL: define <8 x i64> @scalarize_ctlz_v8i64( ; AVX2-SAME: <8 x i64> [[V:%.*]]) #[[ATTR0]] { diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-node-reorder-node-with-ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-node-reorder-node-with-ops.ll index cfff11758a37a..2b3f00dc21769 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/split-node-reorder-node-with-ops.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-node-reorder-node-with-ops.ll @@ -6,11 +6,17 @@ define void @test(i32 %0, i8 %1, i64 %2, float %3) { ; CHECK-SAME: i32 [[TMP0:%.*]], i8 [[TMP1:%.*]], i64 [[TMP2:%.*]], float [[TMP3:%.*]]) { ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = and <2 x i64> [[TMP6]], -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> , <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP7]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP9]], -; CHECK-NEXT: [[TMP11:%.*]] = trunc <2 x i64> [[TMP10]] to <2 x i8> +; CHECK-NEXT: [[TMP7:%.*]] = and <2 x i64> [[TMP6]], +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP2]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP52:%.*]] = add i64 [[TMP10]], 1 +; CHECK-NEXT: [[TMP53:%.*]] = lshr i64 [[TMP9]], 16 +; CHECK-NEXT: [[TMP58:%.*]] = lshr i64 [[TMP52]], 1 +; CHECK-NEXT: [[TMP90:%.*]] = trunc i64 [[TMP53]] to i8 +; CHECK-NEXT: [[TMP91:%.*]] = trunc i64 [[TMP58]] to i8 +; CHECK-NEXT: [[TMP92:%.*]] = insertelement <2 x i8> poison, i8 [[TMP91]], 
i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i8> [[TMP92]], i8 [[TMP90]], i32 1 ; CHECK-NEXT: [[TMP12:%.*]] = call <2 x i8> @llvm.smax.v2i8(<2 x i8> [[TMP11]], <2 x i8> zeroinitializer) ; CHECK-NEXT: [[TMP13:%.*]] = uitofp <2 x i8> [[TMP12]] to <2 x float> ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i32 0 @@ -25,7 +31,7 @@ define void @test(i32 %0, i8 %1, i64 %2, float %3) { ; CHECK-NEXT: [[TMP23:%.*]] = sub <2 x i32> zeroinitializer, [[TMP22]] ; CHECK-NEXT: [[TMP24:%.*]] = ashr <2 x i32> [[TMP23]], splat (i32 1) ; CHECK-NEXT: [[TMP25:%.*]] = sitofp <2 x i32> [[TMP24]] to <2 x float> -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x float> [[TMP26]], <4 x float> poison, <8 x i32> ; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <4 x float> [[TMP16]], <4 x float> poison, <8 x i32> ; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <8 x float> [[TMP27]], <8 x float> [[TMP51]], <8 x i32> @@ -51,12 +57,12 @@ define void @test(i32 %0, i8 %1, i64 %2, float %3) { ; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[TMP47]], i64 0, i64 8388608 ; CHECK-NEXT: [[TMP49:%.*]] = extractelement <8 x i1> [[TMP32]], i32 1 ; CHECK-NEXT: [[TMP50:%.*]] = select i1 [[TMP49]], i64 0, i64 32768 -; CHECK-NEXT: br label %[[BB53:.*]] -; CHECK: [[BB52:.*]]: +; CHECK-NEXT: br label %[[BB59:.*]] +; CHECK: [[BB58:.*]]: ; CHECK-NEXT: unreachable -; CHECK: [[BB53]]: -; CHECK-NEXT: br label %[[BB54:.*]] -; CHECK: [[BB54]]: +; CHECK: [[BB59]]: +; CHECK-NEXT: br label %[[BB60:.*]] +; CHECK: [[BB60]]: ; CHECK-NEXT: [[TMP54:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[TMP17]]) ; CHECK-NEXT: [[TMP55:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[TMP21]]) ; CHECK-NEXT: [[TMP56:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0 @@ -96,7 +102,7 @@ define void 
@test(i32 %0, i8 %1, i64 %2, float %3) { ; CHECK-NEXT: [[TMP85:%.*]] = or i64 [[TMP84]], [[TMP48]] ; CHECK-NEXT: [[TMP86:%.*]] = or i64 [[TMP85]], [[TMP81]] ; CHECK-NEXT: store i64 [[TMP86]], ptr null, align 1 -; CHECK-NEXT: br label %[[BB52]] +; CHECK-NEXT: br label %[[BB58]] ; %5 = and i64 %2, 255 %6 = and i64 %2, -65536 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll b/llvm/test/Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll index 3bafc3c6552f2..252746b465bc6 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll @@ -5,14 +5,20 @@ define i1 @test(i64 %v1, ptr %v2, i32 %v3, i1 %v4) { ; CHECK-LABEL: define i1 @test( ; CHECK-SAME: i64 [[V1:%.*]], ptr [[V2:%.*]], i32 [[V3:%.*]], i1 [[V4:%.*]]) { ; CHECK-NEXT: [[NEWFUNCROOT:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i8> -; CHECK-NEXT: [[TMP4:%.*]] = and <2 x i8> [[TMP3]], -; CHECK-NEXT: [[TMP5:%.*]] = zext <2 x i8> [[TMP4]] to <2 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[V1]], 40 +; CHECK-NEXT: [[TT3:%.*]] = lshr i64 [[V1]], 32 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TT3]] to i32 +; CHECK-NEXT: [[TT2:%.*]] = and i32 [[TMP1]], 255 +; CHECK-NEXT: [[TT1:%.*]] = and i32 [[TMP2]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TT1]] to i8 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i8> poison, i8 [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP33:%.*]] = trunc i32 [[TT2]] to i8 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i8> [[TMP7]], i8 [[TMP33]], i32 1 ; CHECK-NEXT: [[TMP9:%.*]] = zext <2 x i8> [[TMP4]] to <2 x 
i32> ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <2 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <2 x i32> poison, i32 [[TT1]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP34]], i32 [[TT2]], i32 1 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> poison, i32 [[V3]], i32 0 ; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/trunced-buildvector-scalar-extended.ll b/llvm/test/Transforms/SLPVectorizer/X86/trunced-buildvector-scalar-extended.ll index ffeb8045dea7e..6623e9b8ecc84 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/trunced-buildvector-scalar-extended.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/trunced-buildvector-scalar-extended.ll @@ -7,8 +7,15 @@ define <4 x float> @test(i64 %0) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> , i64 [[TMP0]], i32 2 ; CHECK-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = sitofp <4 x i64> [[TMP3]] to <4 x float> +; CHECK-NEXT: [[TMP11:%.*]] = sitofp i64 0 to float +; CHECK-NEXT: [[TMP12:%.*]] = sitofp i64 0 to float +; CHECK-NEXT: [[TMP13:%.*]] = sitofp i64 [[TMP0]] to float +; CHECK-NEXT: [[TMP14:%.*]] = sitofp i64 0 to float ; CHECK-NEXT: [[TMP5:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x float> +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP13]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP14]], i32 3 ; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP4]], [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = fcmp ogt <4 x float> [[TMP6]], zeroinitializer ; CHECK-NEXT: ret <4 x float> 
[[TMP6]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll index 4d1f6a1aa074b..5129411196e03 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll @@ -446,12 +446,11 @@ define void @reuse_shuffle_indices_cost_crash_3(ptr %m, double %conv, double %co ; CHECK-SAME: ptr [[M:%.*]], double [[CONV:%.*]], double [[CONV2:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SUB19:%.*]] = fsub double 0.000000e+00, [[CONV2]] -; CHECK-NEXT: [[CONV20:%.*]] = fptrunc double [[SUB19]] to float -; CHECK-NEXT: store float [[CONV20]], ptr [[M]], align 4 ; CHECK-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 0.000000e+00 -; CHECK-NEXT: [[CONV239:%.*]] = fptrunc double [[ADD]] to float -; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr [4 x float], ptr [[M]], i64 0, i64 1 -; CHECK-NEXT: store float [[CONV239]], ptr [[ARRAYIDX25]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[SUB19]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[ADD]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = fptrunc <2 x double> [[TMP1]] to <2 x float> +; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[M]], align 4 ; CHECK-NEXT: [[ADD26:%.*]] = fsub double [[CONV]], [[CONV]] ; CHECK-NEXT: [[CONV27:%.*]] = fptrunc double [[ADD26]] to float ; CHECK-NEXT: [[ARRAYIDX29:%.*]] = getelementptr [4 x float], ptr [[M]], i64 0, i64 2 @@ -522,12 +521,11 @@ define void @common_mask(ptr %m, double %conv, double %conv2) { ; CHECK-SAME: ptr [[M:%.*]], double [[CONV:%.*]], double [[CONV2:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SUB19:%.*]] = fsub double [[CONV]], [[CONV]] -; CHECK-NEXT: [[CONV20:%.*]] = fptrunc double [[SUB19]] to float -; CHECK-NEXT: store float [[CONV20]], ptr [[M]], align 4 ; CHECK-NEXT: [[ADD:%.*]] = fadd double [[CONV2]], 0.000000e+00 -; CHECK-NEXT: [[CONV239:%.*]] = 
fptrunc double [[ADD]] to float -; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr [4 x float], ptr [[M]], i64 0, i64 1 -; CHECK-NEXT: store float [[CONV239]], ptr [[ARRAYIDX25]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[SUB19]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[ADD]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = fptrunc <2 x double> [[TMP1]] to <2 x float> +; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[M]], align 4 ; CHECK-NEXT: [[ADD26:%.*]] = fsub double 0.000000e+00, [[CONV]] ; CHECK-NEXT: [[CONV27:%.*]] = fptrunc double [[ADD26]] to float ; CHECK-NEXT: [[ARRAYIDX29:%.*]] = getelementptr [4 x float], ptr [[M]], i64 0, i64 2 diff --git a/llvm/test/Transforms/SLPVectorizer/gather_extract_from_vectorbuild.ll b/llvm/test/Transforms/SLPVectorizer/gather_extract_from_vectorbuild.ll index c1ec9b8eeadff..840767ac511a5 100644 --- a/llvm/test/Transforms/SLPVectorizer/gather_extract_from_vectorbuild.ll +++ b/llvm/test/Transforms/SLPVectorizer/gather_extract_from_vectorbuild.ll @@ -1,21 +1,34 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=x86_64 < %s | FileCheck %s %} -; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=aarch64 < %s | FileCheck %s %} +; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=x86_64 < %s | FileCheck %s --check-prefixes=CHECK,X86 %} +; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=aarch64 < %s | FileCheck %s --check-prefixes=CHECK,AARCH64 %} ; Vectorization tree roots at vector build sequence (insertelement), ; SLP crashed on generating vector code for pair {%i4, 0.0} trying to produce ; a shuffle with %ins1 as a source because it was marked deleted ; due to vectorization. 
define void @test() { -; CHECK-LABEL: define void @test() { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[TMP3:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x float> zeroinitializer, [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = select <2 x i1> zeroinitializer, <2 x float> [[TMP1]], <2 x float> zeroinitializer -; CHECK-NEXT: [[TMP3]] = shufflevector <2 x float> [[TMP2]], <2 x float> , <2 x i32> -; CHECK-NEXT: br label [[LOOP]] +; X86-LABEL: define void @test() { +; X86-NEXT: entry: +; X86-NEXT: br label [[LOOP:%.*]] +; X86: loop: +; X86-NEXT: [[PH0:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[I4:%.*]], [[LOOP]] ] +; X86-NEXT: [[PH1:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ 0.000000e+00, [[LOOP]] ] +; X86-NEXT: [[TMP0:%.*]] = insertelement <2 x float> poison, float [[PH0]], i32 0 +; X86-NEXT: [[TMP1:%.*]] = insertelement <2 x float> [[TMP0]], float [[PH1]], i32 1 +; X86-NEXT: [[TMP2:%.*]] = fadd <2 x float> zeroinitializer, [[TMP1]] +; X86-NEXT: [[TMP3:%.*]] = select <2 x i1> zeroinitializer, <2 x float> [[TMP2]], <2 x float> zeroinitializer +; X86-NEXT: [[I4]] = extractelement <2 x float> [[TMP3]], i64 0 +; X86-NEXT: br label [[LOOP]] +; +; AARCH64-LABEL: define void @test() { +; AARCH64-NEXT: entry: +; AARCH64-NEXT: br label [[LOOP:%.*]] +; AARCH64: loop: +; AARCH64-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[TMP3:%.*]], [[LOOP]] ] +; AARCH64-NEXT: [[TMP1:%.*]] = fadd <2 x float> zeroinitializer, [[TMP0]] +; AARCH64-NEXT: [[TMP2:%.*]] = select <2 x i1> zeroinitializer, <2 x float> [[TMP1]], <2 x float> zeroinitializer +; AARCH64-NEXT: [[TMP3]] = shufflevector <2 x float> [[TMP2]], <2 x float> , <2 x i32> +; AARCH64-NEXT: br label [[LOOP]] ; entry: br label %loop diff --git a/llvm/test/Transforms/SLPVectorizer/vectorize-reorder-alt-shuffle.ll 
b/llvm/test/Transforms/SLPVectorizer/vectorize-reorder-alt-shuffle.ll index f0f8377d637f9..13d47e5c11181 100644 --- a/llvm/test/Transforms/SLPVectorizer/vectorize-reorder-alt-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/vectorize-reorder-alt-shuffle.ll @@ -1,23 +1,50 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %} -; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} +; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=X86 %} +; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=AARCH64 %} define void @foo(ptr %c, ptr %d) { -; CHECK-LABEL: @foo( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[C:%.*]], i64 1 -; CHECK-NEXT: [[ADD_PTR53:%.*]] = getelementptr inbounds float, ptr [[D:%.*]], i64 -4 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[ARRAYIDX4]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw <4 x i32> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = and <4 x i32> [[TMP2]], -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> undef, [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = sitofp <4 x i32> [[TMP6]] to <4 x float> -; CHECK-NEXT: [[TMP8:%.*]] = fdiv <4 x float> [[TMP7]], undef -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: store <4 x float> [[TMP9]], ptr [[ADD_PTR53]], align 4 -; CHECK-NEXT: ret void +; X86-LABEL: @foo( +; X86-NEXT: entry: +; X86-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[C:%.*]], 
i64 4 +; X86-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 1 +; X86-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 2 +; X86-NEXT: [[ADD_PTR53:%.*]] = getelementptr inbounds float, ptr [[D:%.*]], i64 -4 +; X86-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1 +; X86-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; X86-NEXT: [[CONV5:%.*]] = zext i8 [[TMP0]] to i32 +; X86-NEXT: [[CONV2:%.*]] = zext i8 [[TMP1]] to i32 +; X86-NEXT: [[SHL6:%.*]] = shl nuw nsw i32 [[CONV5]], 2 +; X86-NEXT: [[AND:%.*]] = and i32 [[CONV2]], 3 +; X86-NEXT: [[TMP2:%.*]] = load <2 x i8>, ptr [[ARRAYIDX12]], align 1 +; X86-NEXT: [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i16> +; X86-NEXT: [[TMP4:%.*]] = shl <2 x i16> [[TMP3]], splat (i16 2) +; X86-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[SHL6]], i32 0 +; X86-NEXT: [[TMP6:%.*]] = zext <2 x i16> [[TMP4]] to <2 x i32> +; X86-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> +; X86-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP7]], <4 x i32> +; X86-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[AND]], i32 3 +; X86-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> undef, [[TMP9]] +; X86-NEXT: [[TMP11:%.*]] = sitofp <4 x i32> [[TMP10]] to <4 x float> +; X86-NEXT: [[TMP12:%.*]] = fdiv <4 x float> [[TMP11]], undef +; X86-NEXT: [[TMP13:%.*]] = shufflevector <4 x float> [[TMP12]], <4 x float> poison, <4 x i32> +; X86-NEXT: store <4 x float> [[TMP13]], ptr [[ADD_PTR53]], align 4 +; X86-NEXT: ret void +; +; AARCH64-LABEL: @foo( +; AARCH64-NEXT: entry: +; AARCH64-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[C:%.*]], i64 1 +; AARCH64-NEXT: [[ADD_PTR53:%.*]] = getelementptr inbounds float, ptr [[D:%.*]], i64 -4 +; AARCH64-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[ARRAYIDX4]], align 1 +; AARCH64-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i32> +; AARCH64-NEXT: [[TMP2:%.*]] = shl nuw nsw <4 x i32> 
[[TMP1]], +; AARCH64-NEXT: [[TMP3:%.*]] = and <4 x i32> [[TMP1]], +; AARCH64-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> +; AARCH64-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> undef, [[TMP4]] +; AARCH64-NEXT: [[TMP6:%.*]] = sitofp <4 x i32> [[TMP5]] to <4 x float> +; AARCH64-NEXT: [[TMP7:%.*]] = fdiv <4 x float> [[TMP6]], undef +; AARCH64-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> poison, <4 x i32> +; AARCH64-NEXT: store <4 x float> [[TMP8]], ptr [[ADD_PTR53]], align 4 +; AARCH64-NEXT: ret void ; entry: %arrayidx1 = getelementptr inbounds i8, ptr %c, i64 4 From 8e7ac4acc906e2ad9f5a73b4218f6d70a02431d7 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Sun, 5 Oct 2025 07:55:43 -0700 Subject: [PATCH 2/2] Fix formatting Created using spr 1.3.7 --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index b633dd4d9fdb0..95e4c7781800d 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -16129,10 +16129,11 @@ InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable( LLVM_DEBUG(dbgs() << "SLP: Recalculate costs after tree trimming.\n"); Cost = 0; - for (const auto &P : NodesCosts){ + for (const auto &P : NodesCosts) { Cost += P.second; LLVM_DEBUG(dbgs() << "SLP: Adding cost " << P.second << " for bundle " - << shortBundleName(P.first->Scalars, P.first->Idx) << ".\n" + << shortBundleName(P.first->Scalars, P.first->Idx) + << ".\n" << "SLP: Current total cost = " << Cost << "\n"); } return Cost; @@ -17897,7 +17898,7 @@ Value *BoUpSLP::gather( ArrayRef Entries = getTreeEntries(V); const auto *It = find_if(Entries, [&](const TreeEntry *E) { return !TransformedToGatherNodes.contains(E) && - !DeletedNodes.contains(E); + !DeletedNodes.contains(E); }); if (It != Entries.end()) { // 
Find which lane we need to extract.