From bfb909d754da8c02a4581a8b6f428efdfab8005e Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev@outlook.com>
Date: Sun, 5 Oct 2025 07:50:55 -0700
Subject: [PATCH 1/2] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20in?=
 =?UTF-8?q?itial=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.7
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 379 +++++++++++++++---
 .../PhaseOrdering/X86/vector-reductions.ll    |   9 +-
 .../Transforms/SLPVectorizer/AArch64/div.ll   |  38 +-
 ...ather-buildvector-with-minbitwidth-user.ll |  92 ++---
 .../AArch64/reused-scalar-repeated-in-node.ll |  18 +-
 .../SLPVectorizer/RISCV/complex-loads.ll      |  14 +-
 .../RISCV/partial-vec-invalid-cost.ll         |  16 +-
 .../RISCV/reordered-buildvector-scalars.ll    | 123 +++---
 .../Transforms/SLPVectorizer/X86/PR40310.ll   |  10 +-
 .../X86/alternate-int-inseltpoison.ll         |  17 +-
 .../SLPVectorizer/X86/alternate-int.ll        |  17 +-
 .../X86/arith-fp-inseltpoison.ll              |  51 ++-
 .../Transforms/SLPVectorizer/X86/arith-fp.ll  |  51 ++-
 .../SLPVectorizer/X86/buildvector-shuffle.ll  |  12 +-
 .../X86/buildvectors-parent-phi-nodes.ll      |  18 +-
 .../Transforms/SLPVectorizer/X86/c-ray.ll     | 326 +++++++++++----
 .../X86/delayed-gather-emission.ll            |  18 +-
 .../entry-no-bundle-but-extra-use-on-vec.ll   |  19 +-
 .../SLPVectorizer/X86/gather-with-cmp-user.ll |  13 +-
 .../original-inst-scheduled-after-copyable.ll |  26 +-
 .../SLPVectorizer/X86/phi-node-with-cycle.ll  |  23 +-
 .../Transforms/SLPVectorizer/X86/pr46983.ll   | 163 ++++++--
 .../SLPVectorizer/X86/reduction2.ll           |  18 +-
 .../X86/reschedule-only-scheduled.ll          |  32 +-
 ...same-last-instruction-different-parents.ll |  19 +-
 .../SLPVectorizer/X86/scalarize-ctlz.ll       |  77 ++--
 .../X86/split-node-reorder-node-with-ops.ll   |  30 +-
 .../subvector-minbitwidth-unsigned-value.ll   |  18 +-
 .../trunced-buildvector-scalar-extended.ll    |   9 +-
 .../X86/vec3-reorder-reshuffle.ll             |  18 +-
 .../gather_extract_from_vectorbuild.ll        |  35 +-
 .../vectorize-reorder-alt-shuffle.ll          |  61 ++-
 32 files changed, 1160 insertions(+), 610 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index fedca65d241e8..b633dd4d9fdb0 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2005,9 +2005,15 @@ class BoUpSLP {
   /// holding live values over call sites.
   InstructionCost getSpillCost();
 
+  /// Calculates the cost of the subtrees, trims non-profitable ones and returns
+  /// the final cost.
+  InstructionCost
+  calculateTreeCostAndTrimNonProfitable(ArrayRef<Value *> VectorizedVals = {});
+
   /// \returns the vectorization cost of the subtree that starts at \p VL.
   /// A negative number means that this is profitable.
-  InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = {},
+  InstructionCost getTreeCost(InstructionCost TreeCost,
+                              ArrayRef<Value *> VectorizedVals = {},
                               InstructionCost ReductionCost = TTI::TCC_Free);
 
   /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
@@ -2080,6 +2086,8 @@ class BoUpSLP {
   void deleteTree() {
     VectorizableTree.clear();
     ScalarToTreeEntries.clear();
+    DeletedNodes.clear();
+    TransformedToGatherNodes.clear();
     OperandsToTreeEntry.clear();
     ScalarsInSplitNodes.clear();
     MustGather.clear();
@@ -4511,6 +4519,13 @@ class BoUpSLP {
   /// Maps a specific scalar to its tree entry(ies).
   SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
 
+  /// Set of deleted non-profitable nodes.
+  SmallPtrSet<const TreeEntry *, 8> DeletedNodes;
+
+  /// Maps nodes transformed into gather nodes to their conservative
+  /// gather/buildvector cost estimate.
+  SmallDenseMap<const TreeEntry *, InstructionCost> TransformedToGatherNodes;
+
   /// Maps the operand index and entry to the corresponding tree entry.
   SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
       OperandsToTreeEntry;
@@ -8697,7 +8712,9 @@ void BoUpSLP::buildExternalUses(
     TreeEntry *Entry = TEPtr.get();
 
     // No need to handle users of gathered values.
-    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
+    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize ||
+        DeletedNodes.contains(Entry) ||
+        TransformedToGatherNodes.contains(Entry))
       continue;
 
     // For each lane:
@@ -8744,7 +8761,11 @@ void BoUpSLP::buildExternalUses(
 
         // Skip in-tree scalars that become vectors
         if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
-            !UseEntries.empty()) {
+            !UseEntries.empty() &&
+            any_of(UseEntries, [this](const TreeEntry *UseEntry) {
+              return !DeletedNodes.contains(UseEntry) &&
+                     !TransformedToGatherNodes.contains(UseEntry);
+            })) {
           // Some in-tree scalars will remain as scalar in vectorized
           // instructions. If that is the case, the one in FoundLane will
           // be used.
@@ -8752,6 +8773,9 @@ void BoUpSLP::buildExternalUses(
                  isa<LoadInst, StoreInst>(UserInst)) ||
                 isa<CallInst>(UserInst)) ||
               all_of(UseEntries, [&](TreeEntry *UseEntry) {
+                if (DeletedNodes.contains(UseEntry) ||
+                    TransformedToGatherNodes.contains(UseEntry))
+                  return true;
                 return UseEntry->State == TreeEntry::ScatterVectorize ||
                        !doesInTreeUserNeedToExtract(
                            Scalar, getRootEntryInstruction(*UseEntry), TLI,
@@ -14208,7 +14232,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   unsigned EntryVF = E->getVectorFactor();
   auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
 
-  if (E->isGather()) {
+  if (E->isGather() || TransformedToGatherNodes.contains(E)) {
     if (allConstant(VL))
       return 0;
     if (isa<InsertElementInst>(VL[0]))
@@ -15892,26 +15916,16 @@ static T *performExtractsShuffleAction(
   return Prev;
 }
 
-namespace {
-/// Data type for handling buildvector sequences with the reused scalars from
-/// other tree entries.
-template <typename T> struct ShuffledInsertData {
-  /// List of insertelements to be replaced by shuffles.
-  SmallVector<InsertElementInst *> InsertElements;
-  /// The parent vectors and shuffle mask for the given list of inserts.
-  MapVector<T, SmallVector<int>> ValueMasks;
-};
-} // namespace
-
-InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
-                                     InstructionCost ReductionCost) {
-  InstructionCost Cost = ReductionCost;
+InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable(
+    ArrayRef<Value *> VectorizedVals) {
+  SmallDenseMap<const TreeEntry *, InstructionCost> NodesCosts;
+  SmallPtrSet<Value *, 4> CheckedExtracts;
+  SmallPtrSet<const TreeEntry *, 4> GatheredLoadsNodes;
   LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                     << VectorizableTree.size() << ".\n");
-
-  SmallPtrSet<Value *, 4> CheckedExtracts;
-  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
-    TreeEntry &TE = *VectorizableTree[I];
+  InstructionCost Cost = 0;
+  for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
+    TreeEntry &TE = *Ptr;
     // No need to count the cost for combined entries, they are combined and
     // just skip their cost.
     if (TE.State == TreeEntry::CombinedVectorize) {
@@ -15919,6 +15933,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
           dbgs() << "SLP: Skipping cost for combined node that starts with "
                  << *TE.Scalars[0] << ".\n";
           TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
+      NodesCosts.try_emplace(&TE);
       continue;
     }
     if (TE.hasState() &&
@@ -15931,6 +15946,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
         LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
                           << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
                           << "SLP: Current total cost = " << Cost << "\n");
+        NodesCosts.try_emplace(&TE);
         continue;
       }
     }
@@ -15942,11 +15958,202 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
 
     InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
     Cost += C;
+    NodesCosts.try_emplace(&TE, C);
     LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
                       << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
                       << "SLP: Current total cost = " << Cost << "\n");
+    // Add gathered loads nodes to the set for later processing.
+    if (TE.Idx > 0 && !TE.UserTreeIndex && TE.hasState() &&
+        TE.getOpcode() == Instruction::Load)
+      GatheredLoadsNodes.insert(&TE);
+  }
+  // Bail out if the threshold is negative and the cost is below its negation.
+  if (SLPCostThreshold.getNumOccurrences() > 0 && SLPCostThreshold < 0 &&
+      Cost < -SLPCostThreshold)
+    return Cost;
+  // Bail out if gathered load nodes are found.
+  // TODO: add analysis for gathered loads to include their cost correctly into
+  // the related subtrees.
+  if (!GatheredLoadsNodes.empty())
+    return Cost;
+  SmallVector<std::pair<InstructionCost, SmallVector<unsigned>>> SubtreeCosts(
+      VectorizableTree.size());
+  for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
+    TreeEntry &TE = *Ptr;
+    InstructionCost C = NodesCosts.at(&TE);
+    SubtreeCosts[TE.Idx].first += C;
+    const TreeEntry *UserTE = TE.UserTreeIndex.UserTE;
+    while (UserTE) {
+      SubtreeCosts[UserTE->Idx].first += C;
+      SubtreeCosts[UserTE->Idx].second.push_back(TE.Idx);
+      UserTE = UserTE->UserTreeIndex.UserTE;
+    }
+  }
+  using CostIndicesTy =
+      std::pair<TreeEntry *, std::pair<InstructionCost, SmallVector<unsigned>>>;
+  struct FirstGreater {
+    bool operator()(const CostIndicesTy &LHS, const CostIndicesTy &RHS) const {
+      return LHS.second.first < RHS.second.first ||
+             (LHS.second.first == RHS.second.first &&
+              LHS.first->Idx < RHS.first->Idx);
+    }
+  };
+  PriorityQueue<CostIndicesTy, SmallVector<CostIndicesTy>, FirstGreater>
+      Worklist;
+  for (const auto [Idx, P] : enumerate(SubtreeCosts))
+    Worklist.emplace(VectorizableTree[Idx].get(), P);
+
+  // Narrow store trees with non-profitable immediate values - exit.
+  if (!UserIgnoreList && VectorizableTree.front()->getVectorFactor() < 4 &&
+      VectorizableTree.front()->hasState() &&
+      VectorizableTree.front()->getOpcode() == Instruction::Store &&
+      (Worklist.top().first->Idx == 0 || Worklist.top().first->Idx == 1))
+    return Cost;
+
+  bool Changed = false;
+  while (!Worklist.empty() && Worklist.top().second.first > 0) {
+    TreeEntry *TE = Worklist.top().first;
+    if (TE->isGather() || TE->Idx == 0 || DeletedNodes.contains(TE)) {
+      Worklist.pop();
+      continue;
+    }
+
+    // Skip the subtree if its cost is less than the number of root scalars.
+    InstructionCost SubtreeCost = Worklist.top().second.first;
+    if (SubtreeCost < TE->Scalars.size()) {
+      Worklist.pop();
+      continue;
+    }
+    if (!TransformedToGatherNodes.empty()) {
+      for (unsigned Idx : Worklist.top().second.second) {
+        auto It = TransformedToGatherNodes.find(VectorizableTree[Idx].get());
+        if (It != TransformedToGatherNodes.end()) {
+          SubtreeCost -= SubtreeCosts[Idx].first;
+          SubtreeCost += It->second;
+        }
+      }
+    }
+    if (SubtreeCost < 0 || SubtreeCost < TE->Scalars.size()) {
+      Worklist.pop();
+      continue;
+    }
+    const unsigned Sz = TE->Scalars.size();
+    APInt DemandedElts = APInt::getAllOnes(Sz);
+    for (auto [Idx, V] : enumerate(TE->Scalars)) {
+      if (isConstant(V))
+        DemandedElts.clearBit(Idx);
+    }
+    constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+
+    Type *ScalarTy = getValueType(TE->Scalars.front());
+    auto *VecTy = getWidenedType(ScalarTy, Sz);
+    const unsigned EntryVF = TE->getVectorFactor();
+    auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
+    InstructionCost GatherCost = ::getScalarizationOverhead(
+        *TTI, ScalarTy, VecTy, DemandedElts,
+        /*Insert=*/true, /*Extract=*/false, CostKind);
+    SmallVector<int> Mask;
+    if (!TE->ReorderIndices.empty() &&
+        TE->State != TreeEntry::CompressVectorize &&
+        (TE->State != TreeEntry::StridedVectorize ||
+         !isReverseOrder(TE->ReorderIndices))) {
+      SmallVector<int> NewMask;
+      if (TE->getOpcode() == Instruction::Store) {
+        // For stores the order is actually a mask.
+        NewMask.resize(TE->ReorderIndices.size());
+        copy(TE->ReorderIndices, NewMask.begin());
+      } else {
+        inversePermutation(TE->ReorderIndices, NewMask);
+      }
+      ::addMask(Mask, NewMask);
+    }
+    if (!TE->ReuseShuffleIndices.empty())
+      ::addMask(Mask, TE->ReuseShuffleIndices);
+    if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, EntryVF))
+      GatherCost +=
+          ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
+    // If all scalars are reused in gather node(s) or other vector nodes, there
+    // might be extra cost for inserting them.
+    if (all_of(TE->Scalars, [&](Value *V) {
+          return (TE->hasCopyableElements() && TE->isCopyableElement(V)) ||
+                 isConstant(V) || isGathered(V) || getTreeEntries(V).size() > 1;
+        }))
+      GatherCost *= 2;
+    // Erase subtree if it is non-profitable.
+    if (SubtreeCost > GatherCost) {
+      // If the remaining tree is just a buildvector - exit, it will cause
+      // endless attempts to vectorize.
+      if (VectorizableTree.front()->hasState() &&
+          VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
+          TE->Idx == 1)
+        return InstructionCost::getInvalid();
+
+      LLVM_DEBUG(dbgs() << "SLP: Trimming unprofitable subtree at node "
+                        << TE->Idx << " with cost "
+                        << Worklist.top().second.first << " and gather cost "
+                        << GatherCost << ".\n");
+      if (TE->UserTreeIndex) {
+        TransformedToGatherNodes.try_emplace(TE, GatherCost);
+        NodesCosts.erase(TE);
+      } else {
+        DeletedNodes.insert(TE);
+        TransformedToGatherNodes.erase(TE);
+        NodesCosts.erase(TE);
+      }
+      for (unsigned Idx : Worklist.top().second.second) {
+        TreeEntry &ChildTE = *VectorizableTree[Idx];
+        DeletedNodes.insert(&ChildTE);
+        TransformedToGatherNodes.erase(&ChildTE);
+        NodesCosts.erase(&ChildTE);
+      }
+      Changed = true;
+    }
+    Worklist.pop();
+  }
+  if (!Changed)
+    return SubtreeCosts.front().first;
+
+  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+    if (DeletedNodes.contains(TE.get()))
+      continue;
+    if (TransformedToGatherNodes.contains(TE.get()) && !TE->UserTreeIndex) {
+      assert(TE->getOpcode() == Instruction::Load && "Expected load only.");
+      continue;
+    }
+    if (!NodesCosts.contains(TE.get())) {
+      InstructionCost C =
+          getEntryCost(TE.get(), VectorizedVals, CheckedExtracts);
+      NodesCosts.try_emplace(TE.get(), C);
+    }
   }
 
+  LLVM_DEBUG(dbgs() << "SLP: Recalculate costs after tree trimming.\n");
+  Cost = 0;
+  for (const auto &P : NodesCosts){
+    Cost += P.second;
+    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << P.second << " for bundle "
+                      << shortBundleName(P.first->Scalars, P.first->Idx) << ".\n"
+                      << "SLP: Current total cost = " << Cost << "\n");
+  }
+  return Cost;
+}
+
+namespace {
+/// Data type for handling buildvector sequences with the reused scalars from
+/// other tree entries.
+template <typename T> struct ShuffledInsertData {
+  /// List of insertelements to be replaced by shuffles.
+  SmallVector<InsertElementInst *> InsertElements;
+  /// The parent vectors and shuffle mask for the given list of inserts.
+  MapVector<T, SmallVector<int>> ValueMasks;
+};
+} // namespace
+
+InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost,
+                                     ArrayRef<Value *> VectorizedVals,
+                                     InstructionCost ReductionCost) {
+  InstructionCost Cost = TreeCost + ReductionCost;
+
   if (Cost >= -SLPCostThreshold &&
       none_of(ExternalUses, [](const ExternalUser &EU) {
         return isa_and_nonnull<InsertElementInst>(EU.User);
@@ -16243,8 +16450,15 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
   for (Value *V : ScalarOpsFromCasts) {
     ExternalUsesAsOriginalScalar.insert(V);
     if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
-      ExternalUses.emplace_back(V, nullptr, *TEs.front(),
-                                TEs.front()->findLaneForValue(V));
+      const auto *It = find_if_not(TEs, [&](TreeEntry *TE) {
+        return TransformedToGatherNodes.contains(TE) ||
+               DeletedNodes.contains(TE);
+      });
+      if (It != TEs.end()) {
+        const TreeEntry *UserTE = *It;
+        ExternalUses.emplace_back(V, nullptr, *UserTE,
+                                  UserTE->findLaneForValue(V));
+      }
     }
   }
   // Add reduced value cost, if resized.
@@ -16710,8 +16924,22 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
       continue;
     // Build a list of tree entries where V is used.
     SmallPtrSet<const TreeEntry *, 4> VToTEs;
-    for (const TreeEntry *TEPtr : ValueToGatherNodes.lookup(V)) {
-      if (TEPtr == TE || TEPtr->Idx == 0)
+    SmallVector<const TreeEntry *> GatherNodes(
+        ValueToGatherNodes.lookup(V).takeVector());
+    if (TransformedToGatherNodes.contains(TE)) {
+      for (TreeEntry *E : getSplitTreeEntries(V)) {
+        if (TE == E || !TransformedToGatherNodes.contains(E))
+          continue;
+        GatherNodes.push_back(E);
+      }
+      for (TreeEntry *E : getTreeEntries(V)) {
+        if (TE == E || !TransformedToGatherNodes.contains(E))
+          continue;
+        GatherNodes.push_back(E);
+      }
+    }
+    for (const TreeEntry *TEPtr : GatherNodes) {
+      if (TEPtr == TE || TEPtr->Idx == 0 || DeletedNodes.contains(TEPtr))
         continue;
       assert(any_of(TEPtr->Scalars,
                     [&](Value *V) { return GatheredScalars.contains(V); }) &&
@@ -16787,8 +17015,10 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
       VToTEs.insert(TEPtr);
     }
     if (ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); !VTEs.empty()) {
-      const auto *It = find_if(
-          VTEs, [&](const TreeEntry *MTE) { return MTE != TEUseEI.UserTE; });
+      const auto *It = find_if(VTEs, [&](const TreeEntry *MTE) {
+        return MTE != TE && MTE != TEUseEI.UserTE &&
+               !DeletedNodes.contains(MTE);
+      });
       if (It != VTEs.end()) {
         const TreeEntry *VTE = *It;
         if (none_of(TE->CombinedEntriesWithIndices,
@@ -16804,28 +17034,34 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
       }
     }
     if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
-      const TreeEntry *VTE = VTEs.front();
-      if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
-          VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
-        VTEs = VTEs.drop_front();
-        // Iterate through all vectorized nodes.
-        const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
-          return MTE->State == TreeEntry::Vectorize;
-        });
-        if (MIt == VTEs.end())
-          continue;
-        VTE = *MIt;
-      }
-      if (none_of(TE->CombinedEntriesWithIndices,
-                  [&](const auto &P) { return P.first == VTE->Idx; })) {
-        Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
-        if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
-          continue;
+      const auto *It = find_if(VTEs, [&, MainTE = TE](const TreeEntry *TE) {
+        return TE != MainTE && !DeletedNodes.contains(TE) &&
+               !TransformedToGatherNodes.contains(TE);
+      });
+      if (It != VTEs.end()) {
+        const TreeEntry *VTE = *It;
+        if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0) &&
+            VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
+          VTEs = VTEs.drop_front();
+          // Iterate through all vectorized nodes.
+          const auto *MIt = find_if(VTEs, [](const TreeEntry *MTE) {
+            return MTE->State == TreeEntry::Vectorize;
+          });
+          if (MIt == VTEs.end())
+            continue;
+          VTE = *MIt;
+        }
+        if (none_of(TE->CombinedEntriesWithIndices,
+                    [&](const auto &P) { return P.first == VTE->Idx; })) {
+          Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
+          if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
+            continue;
+        }
+        // The node is reused - exit.
+        if (CheckAndUseSameNode(VTE))
+          break;
+        VToTEs.insert(VTE);
       }
-      // The node is reused - exit.
-      if (CheckAndUseSameNode(VTE))
-        break;
-      VToTEs.insert(VTE);
     }
     if (VToTEs.empty())
       continue;
@@ -17658,7 +17894,12 @@ Value *BoUpSLP::gather(
     CSEBlocks.insert(InsElt->getParent());
     // Add to our 'need-to-extract' list.
     if (isa<Instruction>(V)) {
-      if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V); !Entries.empty()) {
+      ArrayRef<TreeEntry *> Entries = getTreeEntries(V);
+      const auto *It = find_if(Entries, [&](const TreeEntry *E) {
+        return !TransformedToGatherNodes.contains(E) &&
+           !DeletedNodes.contains(E);
+      });
+      if (It != Entries.end()) {
         // Find which lane we need to extract.
         User *UserOp = nullptr;
         if (Scalar != V) {
@@ -17690,8 +17931,8 @@ Value *BoUpSLP::gather(
           UserOp = InsElt;
         }
         if (UserOp) {
-          unsigned FoundLane = Entries.front()->findLaneForValue(V);
-          ExternalUses.emplace_back(V, UserOp, *Entries.front(), FoundLane);
+          unsigned FoundLane = (*It)->findLaneForValue(V);
+          ExternalUses.emplace_back(V, UserOp, **It, FoundLane);
         }
       }
     }
@@ -18312,7 +18553,8 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
 template <typename BVTy, typename ResTy, typename... Args>
 ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
                                   Args &...Params) {
-  assert(E->isGather() && "Expected gather node.");
+  assert((E->isGather() || TransformedToGatherNodes.contains(E)) &&
+         "Expected gather node.");
   unsigned VF = E->getVectorFactor();
 
   bool NeedFreeze = false;
@@ -18897,7 +19139,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
   if (E->VectorizedValue)
     return E->VectorizedValue;
   auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
-  if (E->isGather()) {
+  if (E->isGather() || TransformedToGatherNodes.contains(E)) {
     // Set insert point for non-reduction initial nodes.
     if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
       setInsertPointAfterBundle(E);
@@ -19966,7 +20208,7 @@ Value *BoUpSLP::vectorizeTree(
   // Cache last instructions for the nodes to avoid side effects, which may
   // appear during vectorization, like extra uses, etc.
   for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
-    if (TE->isGather())
+    if (TE->isGather() || DeletedNodes.contains(TE.get()))
       continue;
     (void)getLastInstructionInBundle(TE.get());
   }
@@ -19980,6 +20222,8 @@ Value *BoUpSLP::vectorizeTree(
   // Vectorize gather operands of the nodes with the external uses only.
   SmallVector<std::pair<TreeEntry *, Instruction *>> GatherEntries;
   for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+    if (DeletedNodes.contains(TE.get()))
+      continue;
     if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
         TE->UserTreeIndex.UserTE->hasState() &&
         TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
@@ -20002,6 +20246,8 @@ Value *BoUpSLP::vectorizeTree(
   // Emit gathered loads first to emit better code for the users of those
   // gathered loads.
   for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+    if (DeletedNodes.contains(TE.get()))
+      continue;
     if (GatheredLoadsEntriesFirst.has_value() &&
         TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
         (!TE->isGather() || TE->UserTreeIndex)) {
@@ -20513,7 +20759,9 @@ Value *BoUpSLP::vectorizeTree(
     TreeEntry *Entry = TEPtr.get();
 
     // No need to handle users of gathered values.
-    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
+    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize ||
+        DeletedNodes.contains(Entry) ||
+        TransformedToGatherNodes.contains(Entry))
       continue;
 
     assert(Entry->VectorizedValue && "Can't find vectorizable value");
@@ -22718,14 +22966,15 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
     R.reorderBottomToTop();
   }
   R.transformNodes();
-  R.buildExternalUses();
-
   R.computeMinimumValueSizes();
 
+  InstructionCost TreeCost = R.calculateTreeCostAndTrimNonProfitable();
+  R.buildExternalUses();
+
   Size = R.getCanonicalGraphSize();
   if (S && S.getOpcode() == Instruction::Load)
     Size = 2; // cut off masked gather small trees
-  InstructionCost Cost = R.getTreeCost();
+  InstructionCost Cost = R.getTreeCost(TreeCost);
 
   LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
   if (Cost < -SLPCostThreshold) {
@@ -23373,10 +23622,11 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
         R.reorderBottomToTop(!isa<InsertElementInst>(Ops.front()));
       }
       R.transformNodes();
+      R.computeMinimumValueSizes();
+      InstructionCost TreeCost = R.calculateTreeCostAndTrimNonProfitable();
       R.buildExternalUses();
 
-      R.computeMinimumValueSizes();
-      InstructionCost Cost = R.getTreeCost();
+      InstructionCost Cost = R.getTreeCost(TreeCost);
       CandidateFound = true;
       MinCost = std::min(MinCost, Cost);
 
@@ -24311,6 +24561,9 @@ class HorizontalReduction {
           }
         }
         V.transformNodes();
+        V.computeMinimumValueSizes();
+        InstructionCost TreeCost = V.calculateTreeCostAndTrimNonProfitable(VL);
+
         SmallPtrSet<Value *, 4> VLScalars(llvm::from_range, VL);
         // Gather externally used values.
         SmallPtrSet<Value *, 4> Visited;
@@ -24342,12 +24595,10 @@ class HorizontalReduction {
             LocalExternallyUsedValues.insert(RdxVal);
         V.buildExternalUses(LocalExternallyUsedValues);
 
-        V.computeMinimumValueSizes();
-
         // Estimate cost.
         InstructionCost ReductionCost =
             getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI);
-        InstructionCost Cost = V.getTreeCost(VL, ReductionCost);
+        InstructionCost Cost = V.getTreeCost(TreeCost, VL, ReductionCost);
         LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                           << " for reduction\n");
         if (!Cost.isValid())
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
index 07125b43e0575..541f2cbe29702 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
@@ -272,14 +272,13 @@ define i1 @cmp_lt_gt(double %a, double %b, double %c) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[FNEG:%.*]] = fneg double [[B:%.*]]
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul double [[A:%.*]], 2.000000e+00
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i64 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[FNEG]], i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[B]], i64 0
+; CHECK-NEXT:    [[C:%.*]] = fsub double [[FNEG]], [[C1:%.*]]
+; CHECK-NEXT:    [[ADD:%.*]] = fsub double [[C1]], [[B]]
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[ADD]], i64 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[C]], i64 1
-; CHECK-NEXT:    [[TMP4:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> poison, double [[MUL]], i64 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = fdiv <2 x double> [[TMP3]], [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = fcmp olt <2 x double> [[TMP7]], splat (double 0x3EB0C6F7A0B5ED8D)
 ; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> poison, <2 x i32> <i32 1, i32 poison>
 ; CHECK-NEXT:    [[TMP9:%.*]] = and <2 x i1> [[TMP8]], [[SHIFT]]
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/div.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/div.ll
index e9cf1deac8eed..a71afc36a205e 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/div.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/div.ll
@@ -527,21 +527,14 @@ define <2 x i32> @sdiv_v2i32_unknown_divisor(<2 x i32> %a, <2 x i32> %x, <2 x i3
 ; NO-SVE-NEXT:    [[A1:%.*]] = extractelement <2 x i32> [[A]], i64 1
 ; NO-SVE-NEXT:    [[X0:%.*]] = extractelement <2 x i32> [[X]], i64 0
 ; NO-SVE-NEXT:    [[X1:%.*]] = extractelement <2 x i32> [[X]], i64 1
-; NO-SVE-NEXT:    [[TMP1:%.*]] = sdiv i32 [[A0]], [[X0]]
-; NO-SVE-NEXT:    [[TMP2:%.*]] = sdiv i32 [[A1]], [[X1]]
-; NO-SVE-NEXT:    [[TMP3:%.*]] = add i32 [[TMP1]], [[X0]]
-; NO-SVE-NEXT:    [[TMP4:%.*]] = add i32 [[TMP2]], [[X1]]
-; NO-SVE-NEXT:    [[Y0:%.*]] = extractelement <2 x i32> [[Y]], i64 0
-; NO-SVE-NEXT:    [[Y1:%.*]] = extractelement <2 x i32> [[Y]], i64 1
-; NO-SVE-NEXT:    [[TMP5:%.*]] = sub i32 [[TMP3]], [[Y0]]
-; NO-SVE-NEXT:    [[TMP6:%.*]] = sub i32 [[TMP4]], [[Y1]]
-; NO-SVE-NEXT:    [[Z0:%.*]] = extractelement <2 x i32> [[Z]], i64 0
-; NO-SVE-NEXT:    [[Z1:%.*]] = extractelement <2 x i32> [[Z]], i64 1
-; NO-SVE-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP5]], [[Z0]]
-; NO-SVE-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP6]], [[Z1]]
+; NO-SVE-NEXT:    [[TMP8:%.*]] = sdiv i32 [[A1]], [[X1]]
+; NO-SVE-NEXT:    [[TMP7:%.*]] = sdiv i32 [[A0]], [[X0]]
 ; NO-SVE-NEXT:    [[RES0:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
 ; NO-SVE-NEXT:    [[RES1:%.*]] = insertelement <2 x i32> [[RES0]], i32 [[TMP8]], i32 1
-; NO-SVE-NEXT:    ret <2 x i32> [[RES1]]
+; NO-SVE-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[RES1]], [[X]]
+; NO-SVE-NEXT:    [[TMP6:%.*]] = sub <2 x i32> [[TMP5]], [[Y]]
+; NO-SVE-NEXT:    [[TMP9:%.*]] = mul <2 x i32> [[TMP6]], [[Z]]
+; NO-SVE-NEXT:    ret <2 x i32> [[TMP9]]
 ;
 ; SVE-LABEL: define <2 x i32> @sdiv_v2i32_unknown_divisor(
 ; SVE-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[Z:%.*]]) #[[ATTR0]] {
@@ -610,22 +603,13 @@ define <2 x i32> @sdiv_v2i32_Op1_unknown_Op2_const(<2 x i32> %a, <2 x i32> %x, <
 ; NO-SVE-SAME: <2 x i32> [[A:%.*]], <2 x i32> [[X:%.*]], <2 x i32> [[Y:%.*]], <2 x i32> [[Z:%.*]]) #[[ATTR0]] {
 ; NO-SVE-NEXT:    [[A0:%.*]] = extractelement <2 x i32> [[A]], i64 0
 ; NO-SVE-NEXT:    [[A1:%.*]] = extractelement <2 x i32> [[A]], i64 1
-; NO-SVE-NEXT:    [[TMP1:%.*]] = sdiv i32 [[A0]], [[A0]]
 ; NO-SVE-NEXT:    [[TMP2:%.*]] = sdiv i32 [[A1]], 4
-; NO-SVE-NEXT:    [[X0:%.*]] = extractelement <2 x i32> [[X]], i64 0
-; NO-SVE-NEXT:    [[X1:%.*]] = extractelement <2 x i32> [[X]], i64 1
-; NO-SVE-NEXT:    [[TMP3:%.*]] = add i32 [[TMP1]], [[X0]]
-; NO-SVE-NEXT:    [[TMP4:%.*]] = add i32 [[TMP2]], [[X1]]
-; NO-SVE-NEXT:    [[Y0:%.*]] = extractelement <2 x i32> [[Y]], i64 0
-; NO-SVE-NEXT:    [[Y1:%.*]] = extractelement <2 x i32> [[Y]], i64 1
-; NO-SVE-NEXT:    [[TMP5:%.*]] = sub i32 [[TMP3]], [[Y0]]
-; NO-SVE-NEXT:    [[TMP6:%.*]] = sub i32 [[TMP4]], [[Y1]]
-; NO-SVE-NEXT:    [[Z0:%.*]] = extractelement <2 x i32> [[Z]], i64 0
-; NO-SVE-NEXT:    [[Z1:%.*]] = extractelement <2 x i32> [[Z]], i64 1
-; NO-SVE-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP5]], [[Z0]]
-; NO-SVE-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP6]], [[Z1]]
+; NO-SVE-NEXT:    [[TMP7:%.*]] = sdiv i32 [[A0]], [[A0]]
 ; NO-SVE-NEXT:    [[RES0:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
-; NO-SVE-NEXT:    [[RES1:%.*]] = insertelement <2 x i32> [[RES0]], i32 [[TMP8]], i32 1
+; NO-SVE-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> [[RES0]], i32 [[TMP2]], i32 1
+; NO-SVE-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[X]]
+; NO-SVE-NEXT:    [[TMP6:%.*]] = sub <2 x i32> [[TMP5]], [[Y]]
+; NO-SVE-NEXT:    [[RES1:%.*]] = mul <2 x i32> [[TMP6]], [[Z]]
 ; NO-SVE-NEXT:    ret <2 x i32> [[RES1]]
 ;
 ; SVE-LABEL: define <2 x i32> @sdiv_v2i32_Op1_unknown_Op2_const(
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll
index f397290299a4f..0ac3323e0a7b3 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll
@@ -8,74 +8,40 @@ define void @h(i16 %a, i16 %b, i16 %c, i16 %d, i16 %e, i16 %f, i16 %g, i16 %h, i
 ; CHECK-NEXT:    [[CONV9:%.*]] = zext i16 [[A]] to i32
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr i8, ptr null, i64 16
 ; CHECK-NEXT:    [[CONV310:%.*]] = zext i16 [[B]] to i32
-; CHECK-NEXT:    [[ADD4:%.*]] = or i32 [[CONV310]], [[CONV9]]
-; CHECK-NEXT:    [[SUB:%.*]] = or i32 [[CONV9]], [[CONV310]]
-; CHECK-NEXT:    [[CONV15:%.*]] = sext i16 [[C]] to i32
-; CHECK-NEXT:    [[SHR:%.*]] = ashr i32 0, 0
-; CHECK-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr i8, ptr null, i64 24
-; CHECK-NEXT:    [[CONV19:%.*]] = sext i16 [[D]] to i32
-; CHECK-NEXT:    [[SUB20:%.*]] = or i32 [[SHR]], [[CONV19]]
-; CHECK-NEXT:    [[SHR29:%.*]] = ashr i32 0, 0
-; CHECK-NEXT:    [[ADD30:%.*]] = or i32 [[SHR29]], [[CONV15]]
-; CHECK-NEXT:    [[SUB39:%.*]] = or i32 [[SUB]], [[SUB20]]
-; CHECK-NEXT:    [[CONV40:%.*]] = trunc i32 [[SUB39]] to i16
-; CHECK-NEXT:    store i16 [[CONV40]], ptr [[ARRAYIDX2]], align 2
-; CHECK-NEXT:    [[SUB44:%.*]] = or i32 [[ADD4]], [[ADD30]]
-; CHECK-NEXT:    [[CONV45:%.*]] = trunc i32 [[SUB44]] to i16
-; CHECK-NEXT:    store i16 [[CONV45]], ptr [[ARRAYIDX18]], align 2
-; CHECK-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr i8, ptr null, i64 18
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 [[D]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[G]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i16> [[TMP1]], i16 [[K]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[O]], i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i16> [[TMP3]], i16 [[C]], i32 4
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i16> [[TMP4]], i16 [[F]], i32 5
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i16> [[TMP5]], i16 [[J]], i32 6
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i16> [[TMP6]], i16 [[N]], i32 7
 ; CHECK-NEXT:    [[CONV3_112:%.*]] = zext i16 [[E]] to i32
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i16> poison, i16 [[H]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i16> [[TMP8]], i16 [[L]], i32 1
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x i16> poison, i16 [[I]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x i16> [[TMP10]], i16 [[M]], i32 1
 ; CHECK-NEXT:    [[ADD4_1:%.*]] = or i32 [[CONV3_112]], 0
-; CHECK-NEXT:    [[SUB_1:%.*]] = or i32 0, [[CONV3_112]]
-; CHECK-NEXT:    [[CONV15_1:%.*]] = sext i16 [[F]] to i32
-; CHECK-NEXT:    [[SHR_1:%.*]] = ashr i32 0, 0
-; CHECK-NEXT:    [[ARRAYIDX18_1:%.*]] = getelementptr i8, ptr null, i64 26
-; CHECK-NEXT:    [[CONV19_1:%.*]] = sext i16 [[G]] to i32
-; CHECK-NEXT:    [[SUB20_1:%.*]] = or i32 [[SHR_1]], [[CONV19_1]]
-; CHECK-NEXT:    [[SHR29_1:%.*]] = ashr i32 0, 0
-; CHECK-NEXT:    [[ADD30_1:%.*]] = or i32 [[SHR29_1]], [[CONV15_1]]
-; CHECK-NEXT:    [[SUB39_1:%.*]] = or i32 [[SUB_1]], [[SUB20_1]]
-; CHECK-NEXT:    [[CONV40_1:%.*]] = trunc i32 [[SUB39_1]] to i16
-; CHECK-NEXT:    store i16 [[CONV40_1]], ptr [[ARRAYIDX2_1]], align 2
-; CHECK-NEXT:    [[SUB44_1:%.*]] = or i32 [[ADD4_1]], [[ADD30_1]]
-; CHECK-NEXT:    [[CONV45_1:%.*]] = trunc i32 [[SUB44_1]] to i16
-; CHECK-NEXT:    store i16 [[CONV45_1]], ptr [[ARRAYIDX18_1]], align 2
-; CHECK-NEXT:    [[CONV_213:%.*]] = zext i16 [[H]] to i32
-; CHECK-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr i8, ptr null, i64 20
-; CHECK-NEXT:    [[CONV3_214:%.*]] = zext i16 [[I]] to i32
-; CHECK-NEXT:    [[ADD4_2:%.*]] = or i32 0, [[CONV_213]]
-; CHECK-NEXT:    [[SUB_2:%.*]] = or i32 0, [[CONV3_214]]
-; CHECK-NEXT:    [[CONV15_2:%.*]] = sext i16 [[J]] to i32
-; CHECK-NEXT:    [[SHR_2:%.*]] = ashr i32 0, 0
-; CHECK-NEXT:    [[ARRAYIDX18_2:%.*]] = getelementptr i8, ptr null, i64 28
-; CHECK-NEXT:    [[CONV19_2:%.*]] = sext i16 [[K]] to i32
-; CHECK-NEXT:    [[SUB20_2:%.*]] = or i32 [[SHR_2]], [[CONV19_2]]
-; CHECK-NEXT:    [[SHR29_2:%.*]] = ashr i32 0, 0
-; CHECK-NEXT:    [[ADD30_2:%.*]] = or i32 [[SHR29_2]], [[CONV15_2]]
-; CHECK-NEXT:    [[SUB39_2:%.*]] = or i32 [[SUB_2]], [[SUB20_2]]
+; CHECK-NEXT:    [[SUB39_3:%.*]] = or i32 [[CONV310]], [[CONV9]]
+; CHECK-NEXT:    [[SUB44_2:%.*]] = or i32 0, [[CONV3_112]]
+; CHECK-NEXT:    [[SUB39_2:%.*]] = or i32 [[CONV9]], [[CONV310]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <8 x i16> zeroinitializer, [[TMP7]]
 ; CHECK-NEXT:    [[CONV40_2:%.*]] = trunc i32 [[SUB39_2]] to i16
-; CHECK-NEXT:    store i16 [[CONV40_2]], ptr [[ARRAYIDX2_2]], align 2
-; CHECK-NEXT:    [[SUB44_2:%.*]] = or i32 [[ADD4_2]], [[ADD30_2]]
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <8 x i16> poison, i16 [[CONV40_2]], i32 0
 ; CHECK-NEXT:    [[CONV45_2:%.*]] = trunc i32 [[SUB44_2]] to i16
-; CHECK-NEXT:    store i16 [[CONV45_2]], ptr [[ARRAYIDX18_2]], align 2
-; CHECK-NEXT:    [[CONV_315:%.*]] = zext i16 [[L]] to i32
-; CHECK-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr i8, ptr null, i64 22
-; CHECK-NEXT:    [[CONV3_316:%.*]] = zext i16 [[M]] to i32
-; CHECK-NEXT:    [[ADD4_3:%.*]] = or i32 0, [[CONV_315]]
-; CHECK-NEXT:    [[SUB_3:%.*]] = or i32 0, [[CONV3_316]]
-; CHECK-NEXT:    [[CONV15_3:%.*]] = sext i16 [[N]] to i32
-; CHECK-NEXT:    [[SHR_3:%.*]] = ashr i32 0, 0
-; CHECK-NEXT:    [[ARRAYIDX18_3:%.*]] = getelementptr i8, ptr null, i64 30
-; CHECK-NEXT:    [[CONV19_3:%.*]] = sext i16 [[O]] to i32
-; CHECK-NEXT:    [[SUB20_3:%.*]] = or i32 [[SHR_3]], [[CONV19_3]]
-; CHECK-NEXT:    [[SHR29_3:%.*]] = ashr i32 0, 0
-; CHECK-NEXT:    [[ADD30_3:%.*]] = or i32 [[SHR29_3]], [[CONV15_3]]
-; CHECK-NEXT:    [[SUB39_3:%.*]] = or i32 [[SUB_3]], [[SUB20_3]]
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <8 x i16> [[TMP14]], i16 [[CONV45_2]], i32 1
+; CHECK-NEXT:    [[TMP17:%.*]] = or <2 x i16> zeroinitializer, [[TMP11]]
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <2 x i16> [[TMP17]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <8 x i16> [[TMP16]], <8 x i16> [[TMP18]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[CONV40_3:%.*]] = trunc i32 [[SUB39_3]] to i16
-; CHECK-NEXT:    store i16 [[CONV40_3]], ptr [[ARRAYIDX2_3]], align 2
-; CHECK-NEXT:    [[SUB44_3:%.*]] = or i32 [[ADD4_3]], [[ADD30_3]]
-; CHECK-NEXT:    [[CONV45_3:%.*]] = trunc i32 [[SUB44_3]] to i16
-; CHECK-NEXT:    store i16 [[CONV45_3]], ptr [[ARRAYIDX18_3]], align 2
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <8 x i16> [[TMP19]], i16 [[CONV40_3]], i32 4
+; CHECK-NEXT:    [[TMP22:%.*]] = trunc i32 [[ADD4_1]] to i16
+; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <8 x i16> [[TMP21]], i16 [[TMP22]], i32 5
+; CHECK-NEXT:    [[TMP24:%.*]] = or <2 x i16> zeroinitializer, [[TMP9]]
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <2 x i16> [[TMP24]], <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <8 x i16> [[TMP23]], <8 x i16> [[TMP25]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; CHECK-NEXT:    [[TMP27:%.*]] = or <8 x i16> [[TMP26]], [[TMP12]]
+; CHECK-NEXT:    store <8 x i16> [[TMP27]], ptr [[ARRAYIDX2]], align 2
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll
index 26ce0fc6e6a3b..ea2e27599161d 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll
@@ -40,26 +40,28 @@ define void @test() {
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <16 x float> [[TMP11]], <16 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 14, i32 15, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <8 x float> poison, float [[I70]], i32 0
 ; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> [[TMP13]], <8 x i32> <i32 8, i32 poison, i32 poison, i32 poison, i32 4, i32 5, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <8 x float> poison, float [[I70]], i32 1
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <8 x float> [[TMP15]], float [[I68]], i32 2
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <8 x float> [[TMP16]], float [[I66]], i32 3
-; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <8 x float> [[TMP17]], float [[I67]], i32 6
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <8 x float> [[TMP18]], float [[I69]], i32 7
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x float> poison, float [[I68]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x float> [[TMP15]], float [[I66]], i32 1
 ; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <16 x float> [[TMP0]], <16 x float> poison, <16 x i32> <i32 poison, i32 poison, i32 3, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <16 x float> [[TMP20]], <16 x float> [[TMP0]], <16 x i32> <i32 poison, i32 poison, i32 2, i32 3, i32 18, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 19, i32 poison, i32 poison>
 ; CHECK-NEXT:    br label %[[BB78:.*]]
 ; CHECK:       [[BB78]]:
 ; CHECK-NEXT:    [[TMP22:%.*]] = phi <8 x float> [ [[TMP14]], %[[BB77]] ], [ [[TMP31:%.*]], %[[BB78]] ]
-; CHECK-NEXT:    [[TMP23:%.*]] = phi <8 x float> [ [[TMP19]], %[[BB77]] ], [ [[TMP32:%.*]], %[[BB78]] ]
-; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <8 x float> [[TMP23]], <8 x float> poison, <16 x i32> <i32 0, i32 3, i32 1, i32 2, i32 3, i32 0, i32 2, i32 3, i32 2, i32 6, i32 2, i32 3, i32 0, i32 7, i32 6, i32 6>
+; CHECK-NEXT:    [[TMP32:%.*]] = phi <2 x float> [ [[TMP16]], %[[BB77]] ], [ [[TMP37:%.*]], %[[BB78]] ]
 ; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <8 x float> [[TMP22]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 0, i32 3, i32 1, i32 3, i32 5, i32 3, i32 1, i32 0, i32 4, i32 5, i32 5>
+; CHECK-NEXT:    [[TMP38:%.*]] = shufflevector <8 x float> [[TMP22]], <8 x float> poison, <8 x i32> <i32 2, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <2 x float> [[TMP32]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP39:%.*]] = shufflevector <8 x float> [[TMP38]], <8 x float> [[TMP23]], <8 x i32> <i32 0, i32 9, i32 2, i32 8, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP40:%.*]] = shufflevector <8 x float> [[TMP22]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 4, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP41:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP40]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <8 x float> [[TMP41]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 0, i32 3, i32 1, i32 3, i32 5, i32 3, i32 1, i32 0, i32 4, i32 5, i32 5>
 ; CHECK-NEXT:    [[TMP26:%.*]] = fmul fast <16 x float> [[TMP24]], [[TMP21]]
 ; CHECK-NEXT:    [[TMP27:%.*]] = fmul fast <16 x float> [[TMP25]], [[TMP0]]
 ; CHECK-NEXT:    [[TMP28:%.*]] = fadd fast <16 x float> [[TMP27]], [[TMP26]]
 ; CHECK-NEXT:    [[TMP29:%.*]] = fadd fast <16 x float> [[TMP28]], poison
 ; CHECK-NEXT:    [[TMP30:%.*]] = fadd fast <16 x float> [[TMP29]], poison
 ; CHECK-NEXT:    [[TMP31]] = shufflevector <16 x float> [[TMP30]], <16 x float> poison, <8 x i32> <i32 5, i32 11, i32 12, i32 10, i32 14, i32 15, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP32]] = shufflevector <16 x float> [[TMP30]], <16 x float> poison, <8 x i32> <i32 12, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP37]] = shufflevector <16 x float> [[TMP30]], <16 x float> poison, <2 x i32> <i32 6, i32 7>
 ; CHECK-NEXT:    br i1 poison, label %[[BB78]], label %[[BB167]]
 ; CHECK:       [[BB167]]:
 ; CHECK-NEXT:    [[TMP35:%.*]] = phi <16 x float> [ [[TMP11]], %[[BB64]] ], [ [[TMP30]], %[[BB78]] ]
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index 5ebe44206c702..a31cd4301524d 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -80,21 +80,23 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; CHECK-NEXT:    [[TMP59:%.*]] = add <4 x i32> [[TMP57]], [[TMP58]]
 ; CHECK-NEXT:    [[TMP60:%.*]] = sub <4 x i32> [[TMP57]], [[TMP58]]
 ; CHECK-NEXT:    [[TMP61:%.*]] = shufflevector <4 x i32> [[TMP59]], <4 x i32> [[TMP60]], <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-; CHECK-NEXT:    [[TMP62:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> splat (i1 true), i32 2)
 ; CHECK-NEXT:    [[TMP63:%.*]] = load <4 x i8>, ptr null, align 1
 ; CHECK-NEXT:    [[TMP64:%.*]] = zext <4 x i8> [[TMP63]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP65:%.*]] = load <4 x i8>, ptr null, align 1
 ; CHECK-NEXT:    [[TMP66:%.*]] = zext <4 x i8> [[TMP65]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP67:%.*]] = sub <4 x i32> [[TMP64]], [[TMP66]]
 ; CHECK-NEXT:    [[TMP68:%.*]] = shufflevector <4 x i32> [[TMP67]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP69:%.*]] = insertelement <4 x i8> poison, i8 [[TMP115]], i32 0
-; CHECK-NEXT:    [[TMP70:%.*]] = insertelement <4 x i8> [[TMP69]], i8 [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP117:%.*]] = shufflevector <2 x i8> [[TMP62]], <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP71:%.*]] = shufflevector <4 x i8> [[TMP70]], <4 x i8> [[TMP117]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    [[TMP72:%.*]] = zext <4 x i8> [[TMP71]] to <4 x i32>
+; CHECK-NEXT:    [[TMP71:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> splat (i1 true), i32 2)
+; CHECK-NEXT:    [[TMP69:%.*]] = insertelement <2 x i8> poison, i8 [[TMP115]], i32 0
+; CHECK-NEXT:    [[TMP70:%.*]] = insertelement <2 x i8> [[TMP69]], i8 [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP73:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1
 ; CHECK-NEXT:    [[TMP74:%.*]] = zext <4 x i8> [[TMP73]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP75:%.*]] = shufflevector <4 x i32> [[TMP74]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP117:%.*]] = zext <2 x i8> [[TMP70]] to <2 x i32>
+; CHECK-NEXT:    [[TMP119:%.*]] = shufflevector <2 x i32> [[TMP117]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP120:%.*]] = zext <2 x i8> [[TMP71]] to <2 x i32>
+; CHECK-NEXT:    [[TMP121:%.*]] = shufflevector <2 x i32> [[TMP120]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP72:%.*]] = shufflevector <4 x i32> [[TMP119]], <4 x i32> [[TMP121]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; CHECK-NEXT:    [[TMP76:%.*]] = sub <4 x i32> [[TMP72]], [[TMP75]]
 ; CHECK-NEXT:    [[TMP77:%.*]] = shl <4 x i32> [[TMP76]], splat (i32 16)
 ; CHECK-NEXT:    [[TMP78:%.*]] = add <4 x i32> [[TMP77]], [[TMP68]]
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/partial-vec-invalid-cost.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/partial-vec-invalid-cost.ll
index 085d7a64fc9ac..2b79ca9429fa3 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/partial-vec-invalid-cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/partial-vec-invalid-cost.ll
@@ -9,15 +9,13 @@ define void @partial_vec_invalid_cost() #0 {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[LSHR_1:%.*]] = lshr i96 0, 0
 ; CHECK-NEXT:    [[LSHR_2:%.*]] = lshr i96 0, 0
-; CHECK-NEXT:    [[TRUNC_I96_1:%.*]] = trunc i96 [[LSHR_1]] to i32
-; CHECK-NEXT:    [[TRUNC_I96_2:%.*]] = trunc i96 [[LSHR_2]] to i32
-; CHECK-NEXT:    [[TRUNC_I96_3:%.*]] = trunc i96 0 to i32
-; CHECK-NEXT:    [[TRUNC_I96_4:%.*]] = trunc i96 0 to i32
-; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[OP_RDX:%.*]] = or i32 [[TMP1]], [[TRUNC_I96_1]]
-; CHECK-NEXT:    [[OP_RDX1:%.*]] = or i32 [[TRUNC_I96_2]], [[TRUNC_I96_3]]
-; CHECK-NEXT:    [[OP_RDX2:%.*]] = or i32 [[OP_RDX]], [[OP_RDX1]]
-; CHECK-NEXT:    [[OP_RDX3:%.*]] = or i32 [[OP_RDX2]], [[TRUNC_I96_4]]
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i96> poison, i96 [[LSHR_1]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i96> [[TMP0]], i96 [[LSHR_2]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i96> [[TMP1]], i96 0, i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i96> [[TMP2]], i96 0, i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc <4 x i96> [[TMP3]] to <4 x i32>
+; CHECK-NEXT:    [[RDX_OP:%.*]] = or <4 x i32> zeroinitializer, [[TMP4]]
+; CHECK-NEXT:    [[OP_RDX3:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[RDX_OP]])
 ; CHECK-NEXT:    [[STORE_THIS:%.*]] = zext i32 [[OP_RDX3]] to i96
 ; CHECK-NEXT:    store i96 [[STORE_THIS]], ptr null, align 16
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll
index d4e323819402c..aaf290ba952f2 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll
@@ -101,81 +101,82 @@ define fastcc i32 @test(i32 %0, i32 %add111.i.i, <4 x i32> %PredPel.i.sroa.86.72
 ; THRESH-NEXT:    [[LOOPARRAY_SROA_24_0_I_I3:%.*]] = ashr i32 [[TMP0]], 1
 ; THRESH-NEXT:    [[SHR143_5_I_I9:%.*]] = ashr i32 [[TMP0]], 1
 ; THRESH-NEXT:    [[ADD1392_I:%.*]] = add i32 [[TMP0]], 1
+; THRESH-NEXT:    [[PREDPEL_I_SROA_86_80_VEC_EXTRACT59312:%.*]] = extractelement <4 x i32> [[PREDPEL_I_SROA_86_72_VEC_EXTRACT]], i64 0
 ; THRESH-NEXT:    [[MUL1445_I:%.*]] = shl i32 [[TMP0]], 1
-; THRESH-NEXT:    [[ADD2136_I:%.*]] = or i32 [[LOOPARRAY_SROA_24_0_I_I3]], [[TMP0]]
-; THRESH-NEXT:    [[SHR2137_I:%.*]] = lshr i32 [[ADD2136_I]], 1
-; THRESH-NEXT:    [[CONV2138_I:%.*]] = trunc i32 [[SHR2137_I]] to i16
 ; THRESH-NEXT:    [[ADD2174_I:%.*]] = add i32 [[MUL1445_I]], 2
 ; THRESH-NEXT:    [[SHR2175_I:%.*]] = lshr i32 [[ADD2174_I]], 2
 ; THRESH-NEXT:    [[CONV2176_I:%.*]] = trunc i32 [[SHR2175_I]] to i16
-; THRESH-NEXT:    [[ADD2190_I:%.*]] = or i32 [[ADD1392_I]], 1
+; THRESH-NEXT:    [[ADD2235_I17:%.*]] = or i32 [[TMP0]], 1
+; THRESH-NEXT:    [[ADD2323_I:%.*]] = add i32 [[TMP0]], 1
+; THRESH-NEXT:    [[ADD2190_I:%.*]] = or i32 [[SHR143_5_I_I9]], 1
 ; THRESH-NEXT:    [[ADD2191_I:%.*]] = add i32 [[ADD2190_I]], [[TMP0]]
 ; THRESH-NEXT:    [[CONV2193_I:%.*]] = trunc i32 [[ADD2191_I]] to i16
-; THRESH-NEXT:    [[ADD2203_I:%.*]] = or i32 [[TMP0]], 1
+; THRESH-NEXT:    store i16 [[CONV2193_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8232), align 8
+; THRESH-NEXT:    store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8226), align 2
+; THRESH-NEXT:    store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8206), align 2
+; THRESH-NEXT:    store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8186), align 2
+; THRESH-NEXT:    [[ADD2302_I1:%.*]] = add i32 [[TMP0]], 1
+; THRESH-NEXT:    [[ADD2203_I:%.*]] = or i32 [[LOOPARRAY_SROA_24_0_I_I3]], 1
+; THRESH-NEXT:    [[SHR2303_I1:%.*]] = lshr i32 [[ADD2302_I1]], 1
 ; THRESH-NEXT:    [[ADD2204_I:%.*]] = add i32 [[ADD2203_I]], [[TMP0]]
+; THRESH-NEXT:    [[CONV2304_I:%.*]] = trunc i32 [[SHR2303_I1]] to i16
+; THRESH-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32> <i32 poison, i32 0>
+; THRESH-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 0
+; THRESH-NEXT:    [[TMP4:%.*]] = add <2 x i32> [[TMP3]], splat (i32 1)
+; THRESH-NEXT:    [[TMP5:%.*]] = lshr <2 x i32> [[TMP4]], splat (i32 1)
+; THRESH-NEXT:    [[TMP6:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16>
 ; THRESH-NEXT:    [[CONV2206_I:%.*]] = trunc i32 [[ADD2204_I]] to i16
+; THRESH-NEXT:    store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8224), align 8
+; THRESH-NEXT:    store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8204), align 4
+; THRESH-NEXT:    [[TMP9:%.*]] = insertelement <4 x i16> poison, i16 [[CONV2206_I]], i32 0
+; THRESH-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i16> [[TMP6]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; THRESH-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i32> <i32 0, i32 4, i32 5, i32 poison>
+; THRESH-NEXT:    [[TMP12:%.*]] = insertelement <4 x i16> [[TMP11]], i16 [[CONV2304_I]], i32 3
+; THRESH-NEXT:    store <4 x i16> [[TMP12]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8178), align 2
+; THRESH-NEXT:    [[ADD2190_I1:%.*]] = or i32 [[ADD1392_I]], 1
+; THRESH-NEXT:    [[ADD2236_I:%.*]] = or i32 [[ADD2323_I]], [[TMP0]]
 ; THRESH-NEXT:    [[ADD2235_I16:%.*]] = or i32 [[TMP0]], 1
-; THRESH-NEXT:    [[ADD2236_I:%.*]] = add i32 [[ADD2235_I16]], 1
+; THRESH-NEXT:    [[ADD2258_I:%.*]] = add i32 [[SHR143_5_I_I9]], 1
+; THRESH-NEXT:    [[ADD2302_I:%.*]] = add i32 [[ADD111_I_I]], 1
+; THRESH-NEXT:    [[SHR2325_I:%.*]] = add i32 [[ADD2190_I1]], [[TMP0]]
 ; THRESH-NEXT:    [[SHR2237_I:%.*]] = lshr i32 [[ADD2236_I]], 1
-; THRESH-NEXT:    [[CONV2238_I:%.*]] = trunc i32 [[SHR2237_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2238_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8196), align 4
-; THRESH-NEXT:    store i16 [[CONV2238_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8176), align 8
-; THRESH-NEXT:    [[ADD2258_I:%.*]] = or i32 [[ADD111_I_I]], [[TMP0]]
+; THRESH-NEXT:    [[SHR2343_I:%.*]] = add i32 [[ADD2235_I16]], [[TMP0]]
 ; THRESH-NEXT:    [[SHR2259_I:%.*]] = lshr i32 [[ADD2258_I]], 1
-; THRESH-NEXT:    [[CONV2260_I:%.*]] = trunc i32 [[SHR2259_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8212), align 4
-; THRESH-NEXT:    store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8192), align 8
-; THRESH-NEXT:    store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8172), align 4
-; THRESH-NEXT:    [[ADD2302_I:%.*]] = add i32 [[TMP0]], 1
 ; THRESH-NEXT:    [[SHR2303_I:%.*]] = lshr i32 [[ADD2302_I]], 1
-; THRESH-NEXT:    [[CONV2304_I:%.*]] = trunc i32 [[SHR2303_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8224), align 8
-; THRESH-NEXT:    store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8204), align 4
-; THRESH-NEXT:    store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8184), align 8
-; THRESH-NEXT:    [[ADD2323_I:%.*]] = add i32 [[TMP0]], 1
-; THRESH-NEXT:    [[ADD2324_I:%.*]] = or i32 [[ADD2323_I]], [[TMP0]]
-; THRESH-NEXT:    [[SHR2325_I:%.*]] = lshr i32 [[ADD2324_I]], 1
 ; THRESH-NEXT:    [[CONV2326_I:%.*]] = trunc i32 [[SHR2325_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2326_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8220), align 4
-; THRESH-NEXT:    store i16 [[CONV2326_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8200), align 8
-; THRESH-NEXT:    [[ADD2342_I:%.*]] = add i32 [[SHR143_5_I_I9]], 1
-; THRESH-NEXT:    [[SHR2343_I:%.*]] = lshr i32 [[ADD2342_I]], 1
+; THRESH-NEXT:    [[CONV2326_I1:%.*]] = trunc i32 [[SHR2237_I]] to i16
 ; THRESH-NEXT:    [[CONV2344_I:%.*]] = trunc i32 [[SHR2343_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2344_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8216), align 8
-; THRESH-NEXT:    [[ADD2355_I:%.*]] = or i32 [[SHR143_5_I_I9]], 1
-; THRESH-NEXT:    [[ADD2356_I:%.*]] = add i32 [[ADD2355_I]], [[TMP0]]
-; THRESH-NEXT:    [[CONV2358_I:%.*]] = trunc i32 [[ADD2356_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2358_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8232), align 8
-; THRESH-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32> <i32 poison, i32 0>
-; THRESH-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 0
-; THRESH-NEXT:    [[TMP4:%.*]] = add <2 x i32> [[TMP3]], splat (i32 1)
-; THRESH-NEXT:    [[TMP5:%.*]] = lshr <2 x i32> [[TMP4]], splat (i32 1)
-; THRESH-NEXT:    [[TMP6:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16>
-; THRESH-NEXT:    store <2 x i16> [[TMP6]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8180), align 4
-; THRESH-NEXT:    [[ADD2393_I:%.*]] = or i32 [[LOOPARRAY_SROA_24_0_I_I3]], 1
-; THRESH-NEXT:    [[ADD2394_I:%.*]] = add i32 [[ADD2393_I]], [[TMP0]]
-; THRESH-NEXT:    [[CONV2396_I:%.*]] = trunc i32 [[ADD2394_I]] to i16
-; THRESH-NEXT:    store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8198), align 2
-; THRESH-NEXT:    store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8178), align 2
-; THRESH-NEXT:    store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8214), align 2
-; THRESH-NEXT:    store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8194), align 2
-; THRESH-NEXT:    store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8174), align 2
-; THRESH-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[PREDPEL_I_SROA_86_72_VEC_EXTRACT]], <4 x i32> poison, <2 x i32> <i32 poison, i32 0>
-; THRESH-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[ADD111_I_I]], i32 0
-; THRESH-NEXT:    [[TMP9:%.*]] = add <2 x i32> [[TMP8]], splat (i32 1)
-; THRESH-NEXT:    [[TMP10:%.*]] = lshr <2 x i32> [[TMP9]], splat (i32 1)
-; THRESH-NEXT:    [[TMP11:%.*]] = trunc <2 x i32> [[TMP10]] to <2 x i16>
-; THRESH-NEXT:    [[TMP12:%.*]] = extractelement <2 x i16> [[TMP11]], i32 1
-; THRESH-NEXT:    store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8228), align 4
-; THRESH-NEXT:    store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8208), align 8
-; THRESH-NEXT:    store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8188), align 4
-; THRESH-NEXT:    store i16 [[TMP12]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8170), align 2
-; THRESH-NEXT:    store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8226), align 2
-; THRESH-NEXT:    store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8206), align 2
-; THRESH-NEXT:    store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8186), align 2
-; THRESH-NEXT:    store i16 [[CONV2193_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8222), align 2
-; THRESH-NEXT:    store i16 [[CONV2193_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8202), align 2
-; THRESH-NEXT:    store i16 [[CONV2206_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8218), align 2
+; THRESH-NEXT:    [[CONV2344_I1:%.*]] = trunc i32 [[SHR2259_I]] to i16
+; THRESH-NEXT:    [[CONV2282_I:%.*]] = trunc i32 [[SHR2303_I]] to i16
+; THRESH-NEXT:    store i16 [[CONV2282_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8228), align 4
+; THRESH-NEXT:    [[ADD2236_I1:%.*]] = add i32 [[ADD2235_I17]], 1
+; THRESH-NEXT:    [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[ADD111_I_I]], i32 0
+; THRESH-NEXT:    [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 1
+; THRESH-NEXT:    [[TMP15:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0
+; THRESH-NEXT:    [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <2 x i32> zeroinitializer
+; THRESH-NEXT:    [[TMP17:%.*]] = or <2 x i32> [[TMP14]], [[TMP16]]
+; THRESH-NEXT:    [[ADD2157_I:%.*]] = add i32 [[PREDPEL_I_SROA_86_80_VEC_EXTRACT59312]], 1
+; THRESH-NEXT:    [[TMP18:%.*]] = insertelement <4 x i32> poison, i32 [[ADD2157_I]], i32 0
+; THRESH-NEXT:    [[TMP19:%.*]] = shufflevector <2 x i32> [[TMP17]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; THRESH-NEXT:    [[TMP20:%.*]] = shufflevector <4 x i32> [[TMP18]], <4 x i32> [[TMP19]], <4 x i32> <i32 0, i32 4, i32 5, i32 poison>
+; THRESH-NEXT:    [[TMP21:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[ADD2236_I1]], i32 3
+; THRESH-NEXT:    [[TMP22:%.*]] = lshr <4 x i32> [[TMP21]], splat (i32 1)
+; THRESH-NEXT:    [[TMP23:%.*]] = trunc <4 x i32> [[TMP22]] to <4 x i16>
+; THRESH-NEXT:    [[TMP24:%.*]] = extractelement <4 x i16> [[TMP23]], i32 0
+; THRESH-NEXT:    store i16 [[TMP24]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8230), align 2
+; THRESH-NEXT:    store <4 x i16> [[TMP23]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8170), align 2
+; THRESH-NEXT:    [[TMP25:%.*]] = insertelement <8 x i16> poison, i16 [[CONV2282_I]], i32 0
+; THRESH-NEXT:    [[TMP26:%.*]] = shufflevector <4 x i16> [[TMP23]], <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; THRESH-NEXT:    [[TMP27:%.*]] = shufflevector <8 x i16> [[TMP25]], <8 x i16> [[TMP26]], <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison>
+; THRESH-NEXT:    [[TMP28:%.*]] = insertelement <8 x i16> [[TMP27]], i16 [[CONV2206_I]], i32 5
+; THRESH-NEXT:    [[TMP29:%.*]] = insertelement <8 x i16> [[TMP28]], i16 [[CONV2326_I1]], i32 6
+; THRESH-NEXT:    [[TMP30:%.*]] = insertelement <8 x i16> [[TMP29]], i16 [[CONV2326_I]], i32 7
+; THRESH-NEXT:    store <8 x i16> [[TMP30]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8188), align 4
+; THRESH-NEXT:    [[TMP31:%.*]] = insertelement <8 x i16> [[TMP27]], i16 [[CONV2344_I1]], i32 4
+; THRESH-NEXT:    [[TMP32:%.*]] = insertelement <8 x i16> [[TMP31]], i16 [[CONV2344_I]], i32 5
+; THRESH-NEXT:    [[TMP33:%.*]] = insertelement <8 x i16> [[TMP32]], i16 [[CONV2326_I1]], i32 6
+; THRESH-NEXT:    [[TMP34:%.*]] = insertelement <8 x i16> [[TMP33]], i16 [[CONV2326_I]], i32 7
+; THRESH-NEXT:    store <8 x i16> [[TMP34]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8208), align 8
 ; THRESH-NEXT:    ret i32 0
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll
index 194c7021f60f5..c155702c62830 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll
@@ -4,20 +4,18 @@
 define void @mainTest(i32 %param, ptr %vals, i32 %len) {
 ; CHECK-LABEL: @mainTest(
 ; CHECK-NEXT:  bci_15.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 31>, i32 [[PARAM:%.*]], i32 0
 ; CHECK-NEXT:    br label [[BCI_15:%.*]]
 ; CHECK:       bci_15:
-; CHECK-NEXT:    [[LOCAL_0_:%.*]] = phi i32 [ [[OP_RDX:%.*]], [[BCI_15]] ], [ [[PARAM]], [[BCI_15_PREHEADER:%.*]] ]
+; CHECK-NEXT:    [[LOCAL_0_:%.*]] = phi i32 [ [[OP_RDX:%.*]], [[BCI_15]] ], [ [[PARAM:%.*]], [[BCI_15_PREHEADER:%.*]] ]
 ; CHECK-NEXT:    [[LOCAL_4_:%.*]] = phi i32 [ [[V44:%.*]], [[BCI_15]] ], [ 31, [[BCI_15_PREHEADER]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP6:%.*]], [[BCI_15]] ], [ [[TMP0]], [[BCI_15_PREHEADER]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <16 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <16 x i32> poison, i32 [[LOCAL_0_]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[LOCAL_4_]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> poison, <16 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP3:%.*]] = add <16 x i32> [[TMP2]], <i32 -1, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    store atomic i32 [[LOCAL_0_]], ptr [[VALS:%.*]] unordered, align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP3]])
 ; CHECK-NEXT:    [[OP_RDX]] = and i32 [[TMP4]], [[LOCAL_4_]]
 ; CHECK-NEXT:    [[V44]] = add i32 [[LOCAL_4_]], 16
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX]], i32 0
-; CHECK-NEXT:    [[TMP6]] = insertelement <2 x i32> [[TMP5]], i32 [[V44]], i32 1
 ; CHECK-NEXT:    br i1 true, label [[BCI_15]], label [[LOOPEXIT:%.*]]
 ; CHECK:       loopexit:
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
index d02df1ac92b4d..9b45fe6a2804b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll
@@ -282,23 +282,26 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) {
 
 define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: @ashr_lshr_shl_v8i32(
-; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 6
+; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 4
+; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
+; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
 ; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
-; SSE-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6
+; SSE-NEXT:    [[B4:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 4
+; SSE-NEXT:    [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5
+; SSE-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6
 ; SSE-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SSE-NEXT:    [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]]
 ; SSE-NEXT:    [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]]
 ; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> <i32 4, i32 5>
-; SSE-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <2 x i32> <i32 4, i32 5>
-; SSE-NEXT:    [[TMP7:%.*]] = lshr <2 x i32> [[TMP6]], [[TMP10]]
+; SSE-NEXT:    [[AB4:%.*]] = lshr i32 [[A4]], [[B4]]
+; SSE-NEXT:    [[AB5:%.*]] = lshr i32 [[A5]], [[B5]]
 ; SSE-NEXT:    [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
 ; SSE-NEXT:    [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
 ; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[R51:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[AB4]], i32 4
+; SSE-NEXT:    [[R51:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
 ; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R51]], i32 [[AB6]], i32 6
 ; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
 ; SSE-NEXT:    ret <8 x i32> [[R7]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
index d9a7586ecd23d..d812cc813c20f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
@@ -282,23 +282,26 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) {
 
 define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: @ashr_lshr_shl_v8i32(
-; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 6
+; SSE-NEXT:    [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 4
+; SSE-NEXT:    [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5
+; SSE-NEXT:    [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6
 ; SSE-NEXT:    [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7
-; SSE-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6
+; SSE-NEXT:    [[B4:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 4
+; SSE-NEXT:    [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5
+; SSE-NEXT:    [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6
 ; SSE-NEXT:    [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7
 ; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; SSE-NEXT:    [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]]
 ; SSE-NEXT:    [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]]
 ; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; SSE-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> poison, <2 x i32> <i32 4, i32 5>
-; SSE-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <2 x i32> <i32 4, i32 5>
-; SSE-NEXT:    [[TMP7:%.*]] = lshr <2 x i32> [[TMP6]], [[TMP10]]
+; SSE-NEXT:    [[AB4:%.*]] = lshr i32 [[A4]], [[B4]]
+; SSE-NEXT:    [[AB5:%.*]] = lshr i32 [[A5]], [[B5]]
 ; SSE-NEXT:    [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
 ; SSE-NEXT:    [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
 ; SSE-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE-NEXT:    [[R51:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[AB4]], i32 4
+; SSE-NEXT:    [[R51:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5
 ; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i32> [[R51]], i32 [[AB6]], i32 6
 ; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7
 ; SSE-NEXT:    ret <8 x i32> [[R7]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll
index 7f7e77eadc987..57deca1d62516 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp-inseltpoison.ll
@@ -607,25 +607,38 @@ define <8 x double> @buildvector_div_8f64(<8 x double> %a, <8 x double> %b) {
 ; SSE-NEXT:    ret <8 x double> [[TMP1]]
 ;
 ; SLM-LABEL: @buildvector_div_8f64(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[A:%.*]], <8 x double> poison, <2 x i32> <i32 0, i32 1>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <2 x i32> <i32 0, i32 1>
-; SLM-NEXT:    [[TMP3:%.*]] = fdiv <2 x double> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> <i32 2, i32 3>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> <i32 2, i32 3>
-; SLM-NEXT:    [[TMP6:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> <i32 4, i32 5>
-; SLM-NEXT:    [[TMP8:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> <i32 4, i32 5>
-; SLM-NEXT:    [[TMP9:%.*]] = fdiv <2 x double> [[TMP7]], [[TMP8]]
-; SLM-NEXT:    [[TMP10:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> <i32 6, i32 7>
-; SLM-NEXT:    [[TMP11:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> <i32 6, i32 7>
-; SLM-NEXT:    [[TMP12:%.*]] = fdiv <2 x double> [[TMP10]], [[TMP11]]
-; SLM-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[TMP14:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[R31:%.*]] = shufflevector <8 x double> [[TMP13]], <8 x double> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    [[TMP15:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[R52:%.*]] = shufflevector <8 x double> [[R31]], <8 x double> [[TMP15]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
-; SLM-NEXT:    [[TMP16:%.*]] = shufflevector <2 x double> [[TMP12]], <2 x double> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[R73:%.*]] = shufflevector <8 x double> [[R52]], <8 x double> [[TMP16]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x double> [[A:%.*]], i32 0
+; SLM-NEXT:    [[A1:%.*]] = extractelement <8 x double> [[A]], i32 1
+; SLM-NEXT:    [[A2:%.*]] = extractelement <8 x double> [[A]], i32 2
+; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x double> [[A]], i32 3
+; SLM-NEXT:    [[A4:%.*]] = extractelement <8 x double> [[A]], i32 4
+; SLM-NEXT:    [[A5:%.*]] = extractelement <8 x double> [[A]], i32 5
+; SLM-NEXT:    [[A6:%.*]] = extractelement <8 x double> [[A]], i32 6
+; SLM-NEXT:    [[A7:%.*]] = extractelement <8 x double> [[A]], i32 7
+; SLM-NEXT:    [[B0:%.*]] = extractelement <8 x double> [[B:%.*]], i32 0
+; SLM-NEXT:    [[B1:%.*]] = extractelement <8 x double> [[B]], i32 1
+; SLM-NEXT:    [[B2:%.*]] = extractelement <8 x double> [[B]], i32 2
+; SLM-NEXT:    [[B3:%.*]] = extractelement <8 x double> [[B]], i32 3
+; SLM-NEXT:    [[B4:%.*]] = extractelement <8 x double> [[B]], i32 4
+; SLM-NEXT:    [[B5:%.*]] = extractelement <8 x double> [[B]], i32 5
+; SLM-NEXT:    [[B6:%.*]] = extractelement <8 x double> [[B]], i32 6
+; SLM-NEXT:    [[B7:%.*]] = extractelement <8 x double> [[B]], i32 7
+; SLM-NEXT:    [[C0:%.*]] = fdiv double [[A0]], [[B0]]
+; SLM-NEXT:    [[C1:%.*]] = fdiv double [[A1]], [[B1]]
+; SLM-NEXT:    [[C2:%.*]] = fdiv double [[A2]], [[B2]]
+; SLM-NEXT:    [[C3:%.*]] = fdiv double [[A3]], [[B3]]
+; SLM-NEXT:    [[C4:%.*]] = fdiv double [[A4]], [[B4]]
+; SLM-NEXT:    [[C5:%.*]] = fdiv double [[A5]], [[B5]]
+; SLM-NEXT:    [[C6:%.*]] = fdiv double [[A6]], [[B6]]
+; SLM-NEXT:    [[C7:%.*]] = fdiv double [[A7]], [[B7]]
+; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x double> poison, double [[C0]], i32 0
+; SLM-NEXT:    [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[C1]], i32 1
+; SLM-NEXT:    [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[C2]], i32 2
+; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[C3]], i32 3
+; SLM-NEXT:    [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[C4]], i32 4
+; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[C5]], i32 5
+; SLM-NEXT:    [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[C6]], i32 6
+; SLM-NEXT:    [[R73:%.*]] = insertelement <8 x double> [[R6]], double [[C7]], i32 7
 ; SLM-NEXT:    ret <8 x double> [[R73]]
 ;
 ; AVX-LABEL: @buildvector_div_8f64(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll
index 8b8bc71c2ceda..d1a5c3bb032e0 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fp.ll
@@ -607,25 +607,38 @@ define <8 x double> @buildvector_div_8f64(<8 x double> %a, <8 x double> %b) {
 ; SSE-NEXT:    ret <8 x double> [[TMP1]]
 ;
 ; SLM-LABEL: @buildvector_div_8f64(
-; SLM-NEXT:    [[TMP1:%.*]] = shufflevector <8 x double> [[A:%.*]], <8 x double> poison, <2 x i32> <i32 0, i32 1>
-; SLM-NEXT:    [[TMP2:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> poison, <2 x i32> <i32 0, i32 1>
-; SLM-NEXT:    [[TMP3:%.*]] = fdiv <2 x double> [[TMP1]], [[TMP2]]
-; SLM-NEXT:    [[TMP4:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> <i32 2, i32 3>
-; SLM-NEXT:    [[TMP5:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> <i32 2, i32 3>
-; SLM-NEXT:    [[TMP6:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP5]]
-; SLM-NEXT:    [[TMP7:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> <i32 4, i32 5>
-; SLM-NEXT:    [[TMP8:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> <i32 4, i32 5>
-; SLM-NEXT:    [[TMP9:%.*]] = fdiv <2 x double> [[TMP7]], [[TMP8]]
-; SLM-NEXT:    [[TMP10:%.*]] = shufflevector <8 x double> [[A]], <8 x double> poison, <2 x i32> <i32 6, i32 7>
-; SLM-NEXT:    [[TMP11:%.*]] = shufflevector <8 x double> [[B]], <8 x double> poison, <2 x i32> <i32 6, i32 7>
-; SLM-NEXT:    [[TMP12:%.*]] = fdiv <2 x double> [[TMP10]], [[TMP11]]
-; SLM-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[TMP14:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[R31:%.*]] = shufflevector <8 x double> [[TMP13]], <8 x double> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
-; SLM-NEXT:    [[TMP15:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[R52:%.*]] = shufflevector <8 x double> [[R31]], <8 x double> [[TMP15]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
-; SLM-NEXT:    [[TMP16:%.*]] = shufflevector <2 x double> [[TMP12]], <2 x double> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SLM-NEXT:    [[R73:%.*]] = shufflevector <8 x double> [[R52]], <8 x double> [[TMP16]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; SLM-NEXT:    [[A0:%.*]] = extractelement <8 x double> [[A:%.*]], i32 0
+; SLM-NEXT:    [[A1:%.*]] = extractelement <8 x double> [[A]], i32 1
+; SLM-NEXT:    [[A2:%.*]] = extractelement <8 x double> [[A]], i32 2
+; SLM-NEXT:    [[A3:%.*]] = extractelement <8 x double> [[A]], i32 3
+; SLM-NEXT:    [[A4:%.*]] = extractelement <8 x double> [[A]], i32 4
+; SLM-NEXT:    [[A5:%.*]] = extractelement <8 x double> [[A]], i32 5
+; SLM-NEXT:    [[A6:%.*]] = extractelement <8 x double> [[A]], i32 6
+; SLM-NEXT:    [[A7:%.*]] = extractelement <8 x double> [[A]], i32 7
+; SLM-NEXT:    [[B0:%.*]] = extractelement <8 x double> [[B:%.*]], i32 0
+; SLM-NEXT:    [[B1:%.*]] = extractelement <8 x double> [[B]], i32 1
+; SLM-NEXT:    [[B2:%.*]] = extractelement <8 x double> [[B]], i32 2
+; SLM-NEXT:    [[B3:%.*]] = extractelement <8 x double> [[B]], i32 3
+; SLM-NEXT:    [[B4:%.*]] = extractelement <8 x double> [[B]], i32 4
+; SLM-NEXT:    [[B5:%.*]] = extractelement <8 x double> [[B]], i32 5
+; SLM-NEXT:    [[B6:%.*]] = extractelement <8 x double> [[B]], i32 6
+; SLM-NEXT:    [[B7:%.*]] = extractelement <8 x double> [[B]], i32 7
+; SLM-NEXT:    [[C0:%.*]] = fdiv double [[A0]], [[B0]]
+; SLM-NEXT:    [[C1:%.*]] = fdiv double [[A1]], [[B1]]
+; SLM-NEXT:    [[C2:%.*]] = fdiv double [[A2]], [[B2]]
+; SLM-NEXT:    [[C3:%.*]] = fdiv double [[A3]], [[B3]]
+; SLM-NEXT:    [[C4:%.*]] = fdiv double [[A4]], [[B4]]
+; SLM-NEXT:    [[C5:%.*]] = fdiv double [[A5]], [[B5]]
+; SLM-NEXT:    [[C6:%.*]] = fdiv double [[A6]], [[B6]]
+; SLM-NEXT:    [[C7:%.*]] = fdiv double [[A7]], [[B7]]
+; SLM-NEXT:    [[R0:%.*]] = insertelement <8 x double> undef, double [[C0]], i32 0
+; SLM-NEXT:    [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[C1]], i32 1
+; SLM-NEXT:    [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[C2]], i32 2
+; SLM-NEXT:    [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[C3]], i32 3
+; SLM-NEXT:    [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[C4]], i32 4
+; SLM-NEXT:    [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[C5]], i32 5
+; SLM-NEXT:    [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[C6]], i32 6
+; SLM-NEXT:    [[R73:%.*]] = insertelement <8 x double> [[R6]], double [[C7]], i32 7
 ; SLM-NEXT:    ret <8 x double> [[R73]]
 ;
 ; AVX-LABEL: @buildvector_div_8f64(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-shuffle.ll
index f8522bc546e6b..3c2472c2ab58d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-shuffle.ll
@@ -4,9 +4,15 @@
 define void @b() {
 ; CHECK-LABEL: @b(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x float> poison, float 0x7FF8000000000000, i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> <float 0xFFF8000000000000, float 0xFFF8000000000000, float undef, float undef>, <4 x i32> <i32 0, i32 4, i32 5, i32 poison>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float 0x7FF8000000000000, i32 3
+; CHECK-NEXT:    [[MUL:%.*]] = fmul float undef, 2.000000e+00
+; CHECK-NEXT:    [[ADD:%.*]] = fadd float undef, 1.000000e+00
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x float> poison, float [[ADD]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> [[TMP0]], float [[MUL]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = fneg <2 x float> [[TMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> poison, float [[MUL]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x i32> <i32 0, i32 4, i32 5, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> [[TMP9]], float [[ADD]], i32 3
 ; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP2]], <4 x float> zeroinitializer, <4 x float> zeroinitializer)
 ; CHECK-NEXT:    [[TMP4:%.*]] = fmul <4 x float> [[TMP3]], <float undef, float undef, float undef, float 2.000000e+00>
 ; CHECK-NEXT:    [[TMP5:%.*]] = fdiv <4 x float> [[TMP4]], zeroinitializer
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvectors-parent-phi-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvectors-parent-phi-nodes.ll
index e3c134b068e04..16f31e3655de1 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/buildvectors-parent-phi-nodes.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvectors-parent-phi-nodes.ll
@@ -5,21 +5,21 @@ define void @test(ptr %0, float %1) {
 ; CHECK-LABEL: define void @test(
 ; CHECK-SAME: ptr [[TMP0:%.*]], float [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> <float 0.000000e+00, float poison>, float [[TMP3]], i32 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> <float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    br label %[[BB8:.*]]
-; CHECK:       [[BB8]]:
-; CHECK-NEXT:    [[TMP9:%.*]] = phi <4 x float> [ [[TMP15:%.*]], %[[BB8]] ], [ [[TMP5]], [[TMP2:%.*]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = phi <2 x float> [ [[TMP7]], %[[BB8]] ], [ [[TMP4]], [[TMP2]] ]
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 0>
+; CHECK-NEXT:    br label %[[BB5:.*]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[TMP6:%.*]] = phi float [ [[TMP1]], %[[BB5]] ], [ [[TMP3]], [[TMP2:%.*]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = phi float [ [[TMP1]], %[[BB5]] ], [ 0.000000e+00, [[TMP2]] ]
+; CHECK-NEXT:    [[TMP9:%.*]] = phi <4 x float> [ [[TMP15:%.*]], %[[BB5]] ], [ [[TMP5]], [[TMP2]] ]
 ; CHECK-NEXT:    [[TMP12:%.*]] = fmul <4 x float> [[TMP9]], zeroinitializer
 ; CHECK-NEXT:    [[TMP13:%.*]] = fadd <4 x float> [[TMP12]], zeroinitializer
 ; CHECK-NEXT:    store <4 x float> [[TMP13]], ptr [[TMP0]], align 16
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP6]], i32 1
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float> [[TMP17]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 0>
 ; CHECK-NEXT:    [[TMP14:%.*]] = fmul <4 x float> [[TMP11]], zeroinitializer
 ; CHECK-NEXT:    [[TMP15]] = fadd <4 x float> [[TMP14]], zeroinitializer
-; CHECK-NEXT:    br label %[[BB8]]
+; CHECK-NEXT:    br label %[[BB5]]
 ;
   %3 = load float, ptr %0, align 4
   br label %4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll b/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll
index f1b094e9bbed4..d23e54f3495bd 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefix=SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefix=AVX2
 
 %struct.ray = type { %struct.vec3, %struct.vec3 }
 %struct.vec3 = type { double, double, double }
@@ -9,86 +9,246 @@
 %struct.material = type { %struct.vec3, double, double }
 
 define i32 @ray_sphere(ptr nocapture noundef readonly %sph, ptr nocapture noundef readonly byval(%struct.ray) align 8 %ray, ptr nocapture noundef readnone %sp) {
-; CHECK-LABEL: @ray_sphere(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[DIR:%.*]] = getelementptr inbounds [[STRUCT_RAY:%.*]], ptr [[RAY:%.*]], i64 0, i32 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load double, ptr [[DIR]], align 8
-; CHECK-NEXT:    [[Y:%.*]] = getelementptr inbounds [[STRUCT_RAY]], ptr [[RAY]], i64 0, i32 1, i32 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load double, ptr [[Y]], align 8
-; CHECK-NEXT:    [[MUL6:%.*]] = fmul double [[TMP1]], [[TMP1]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP0]], double [[TMP0]], double [[MUL6]])
-; CHECK-NEXT:    [[Z:%.*]] = getelementptr inbounds [[STRUCT_RAY]], ptr [[RAY]], i64 0, i32 1, i32 2
-; CHECK-NEXT:    [[TMP3:%.*]] = load double, ptr [[Z]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP3]], double [[TMP3]], double [[TMP2]])
-; CHECK-NEXT:    [[MUL:%.*]] = fmul double [[TMP0]], 2.000000e+00
-; CHECK-NEXT:    [[TMP5:%.*]] = load double, ptr [[RAY]], align 8
-; CHECK-NEXT:    [[TMP6:%.*]] = load double, ptr [[SPH:%.*]], align 8
-; CHECK-NEXT:    [[SUB:%.*]] = fsub double [[TMP5]], [[TMP6]]
-; CHECK-NEXT:    [[MUL17:%.*]] = fmul double [[TMP1]], 2.000000e+00
-; CHECK-NEXT:    [[Y19:%.*]] = getelementptr inbounds [[STRUCT_VEC3:%.*]], ptr [[RAY]], i64 0, i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = load double, ptr [[Y19]], align 8
-; CHECK-NEXT:    [[Y21:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[SPH]], i64 0, i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = load double, ptr [[Y21]], align 8
-; CHECK-NEXT:    [[SUB22:%.*]] = fsub double [[TMP7]], [[TMP8]]
-; CHECK-NEXT:    [[MUL23:%.*]] = fmul double [[MUL17]], [[SUB22]]
-; CHECK-NEXT:    [[TMP9:%.*]] = tail call double @llvm.fmuladd.f64(double [[MUL]], double [[SUB]], double [[MUL23]])
-; CHECK-NEXT:    [[MUL26:%.*]] = fmul double [[TMP3]], 2.000000e+00
-; CHECK-NEXT:    [[Z28:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[RAY]], i64 0, i32 2
-; CHECK-NEXT:    [[TMP10:%.*]] = load double, ptr [[Z28]], align 8
-; CHECK-NEXT:    [[Z30:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[SPH]], i64 0, i32 2
-; CHECK-NEXT:    [[TMP11:%.*]] = load double, ptr [[Z30]], align 8
-; CHECK-NEXT:    [[SUB31:%.*]] = fsub double [[TMP10]], [[TMP11]]
-; CHECK-NEXT:    [[TMP12:%.*]] = tail call double @llvm.fmuladd.f64(double [[MUL26]], double [[SUB31]], double [[TMP9]])
-; CHECK-NEXT:    [[MUL42:%.*]] = fmul double [[TMP8]], [[TMP8]]
-; CHECK-NEXT:    [[TMP13:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP6]], double [[TMP6]], double [[MUL42]])
-; CHECK-NEXT:    [[TMP14:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP11]], double [[TMP11]], double [[TMP13]])
-; CHECK-NEXT:    [[TMP15:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP5]], double [[TMP5]], double [[TMP14]])
-; CHECK-NEXT:    [[TMP16:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP7]], double [[TMP7]], double [[TMP15]])
-; CHECK-NEXT:    [[TMP17:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP10]], double [[TMP10]], double [[TMP16]])
-; CHECK-NEXT:    [[FNEG:%.*]] = fneg double [[TMP6]]
-; CHECK-NEXT:    [[TMP18:%.*]] = fneg double [[TMP8]]
-; CHECK-NEXT:    [[NEG:%.*]] = fmul double [[TMP7]], [[TMP18]]
-; CHECK-NEXT:    [[TMP19:%.*]] = tail call double @llvm.fmuladd.f64(double [[FNEG]], double [[TMP5]], double [[NEG]])
-; CHECK-NEXT:    [[NEG78:%.*]] = fneg double [[TMP11]]
-; CHECK-NEXT:    [[TMP20:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG78]], double [[TMP10]], double [[TMP19]])
-; CHECK-NEXT:    [[TMP21:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP20]], double 2.000000e+00, double [[TMP17]])
-; CHECK-NEXT:    [[RAD:%.*]] = getelementptr inbounds [[STRUCT_SPHERE:%.*]], ptr [[SPH]], i64 0, i32 1
-; CHECK-NEXT:    [[TMP22:%.*]] = load double, ptr [[RAD]], align 8
-; CHECK-NEXT:    [[NEG82:%.*]] = fneg double [[TMP22]]
-; CHECK-NEXT:    [[TMP23:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG82]], double [[TMP22]], double [[TMP21]])
-; CHECK-NEXT:    [[TMP24:%.*]] = fmul double [[TMP4]], -4.000000e+00
-; CHECK-NEXT:    [[NEG86:%.*]] = fmul double [[TMP24]], [[TMP23]]
-; CHECK-NEXT:    [[TMP25:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP12]], double [[TMP12]], double [[NEG86]])
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp olt double [[TMP25]], 0.000000e+00
-; CHECK-NEXT:    br i1 [[CMP]], label [[CLEANUP:%.*]], label [[IF_END:%.*]]
-; CHECK:       if.end:
-; CHECK-NEXT:    [[CALL:%.*]] = tail call double @sqrt(double noundef [[TMP25]])
-; CHECK-NEXT:    [[FNEG87:%.*]] = fneg double [[TMP12]]
-; CHECK-NEXT:    [[MUL88:%.*]] = fmul double [[TMP4]], 2.000000e+00
-; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <2 x double> poison, double [[FNEG87]], i32 0
-; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <2 x double> [[TMP26]], double [[CALL]], i32 1
-; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <2 x double> [[TMP27]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <2 x double> [[TMP28]], double [[TMP12]], i32 1
-; CHECK-NEXT:    [[TMP30:%.*]] = fsub <2 x double> [[TMP27]], [[TMP29]]
-; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <2 x double> poison, double [[MUL88]], i32 0
-; CHECK-NEXT:    [[TMP32:%.*]] = shufflevector <2 x double> [[TMP31]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP33:%.*]] = fdiv <2 x double> [[TMP30]], [[TMP32]]
-; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <2 x double> [[TMP33]], i32 1
-; CHECK-NEXT:    [[CMP93:%.*]] = fcmp olt double [[TMP34]], 0x3EB0C6F7A0B5ED8D
-; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <2 x double> [[TMP33]], i32 0
-; CHECK-NEXT:    [[CMP94:%.*]] = fcmp olt double [[TMP35]], 0x3EB0C6F7A0B5ED8D
-; CHECK-NEXT:    [[OR_COND:%.*]] = select i1 [[CMP93]], i1 [[CMP94]], i1 false
-; CHECK-NEXT:    br i1 [[OR_COND]], label [[CLEANUP]], label [[LOR_LHS_FALSE:%.*]]
-; CHECK:       lor.lhs.false:
-; CHECK-NEXT:    [[TMP36:%.*]] = fcmp ule <2 x double> [[TMP33]], splat (double 1.000000e+00)
-; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <2 x i1> [[TMP36]], i32 0
-; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <2 x i1> [[TMP36]], i32 1
-; CHECK-NEXT:    [[OR_COND106:%.*]] = select i1 [[TMP38]], i1 true, i1 [[TMP37]]
-; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = zext i1 [[OR_COND106]] to i32
-; CHECK-NEXT:    br label [[CLEANUP]]
-; CHECK:       cleanup:
-; CHECK-NEXT:    [[RETVAL_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ 0, [[IF_END]] ], [ [[SPEC_SELECT]], [[LOR_LHS_FALSE]] ]
-; CHECK-NEXT:    ret i32 [[RETVAL_0]]
+; SSE2-LABEL: @ray_sphere(
+; SSE2-NEXT:  entry:
+; SSE2-NEXT:    [[DIR:%.*]] = getelementptr inbounds [[STRUCT_RAY:%.*]], ptr [[RAY:%.*]], i64 0, i32 1
+; SSE2-NEXT:    [[TMP0:%.*]] = load double, ptr [[DIR]], align 8
+; SSE2-NEXT:    [[Y:%.*]] = getelementptr inbounds [[STRUCT_RAY]], ptr [[RAY]], i64 0, i32 1, i32 1
+; SSE2-NEXT:    [[TMP1:%.*]] = load double, ptr [[Y]], align 8
+; SSE2-NEXT:    [[MUL6:%.*]] = fmul double [[TMP1]], [[TMP1]]
+; SSE2-NEXT:    [[TMP2:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP0]], double [[TMP0]], double [[MUL6]])
+; SSE2-NEXT:    [[Z:%.*]] = getelementptr inbounds [[STRUCT_RAY]], ptr [[RAY]], i64 0, i32 1, i32 2
+; SSE2-NEXT:    [[TMP3:%.*]] = load double, ptr [[Z]], align 8
+; SSE2-NEXT:    [[TMP4:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP3]], double [[TMP3]], double [[TMP2]])
+; SSE2-NEXT:    [[MUL:%.*]] = fmul double [[TMP0]], 2.000000e+00
+; SSE2-NEXT:    [[TMP5:%.*]] = load double, ptr [[RAY]], align 8
+; SSE2-NEXT:    [[TMP6:%.*]] = load double, ptr [[SPH:%.*]], align 8
+; SSE2-NEXT:    [[SUB:%.*]] = fsub double [[TMP5]], [[TMP6]]
+; SSE2-NEXT:    [[MUL17:%.*]] = fmul double [[TMP1]], 2.000000e+00
+; SSE2-NEXT:    [[Y19:%.*]] = getelementptr inbounds [[STRUCT_VEC3:%.*]], ptr [[RAY]], i64 0, i32 1
+; SSE2-NEXT:    [[TMP7:%.*]] = load double, ptr [[Y19]], align 8
+; SSE2-NEXT:    [[Y21:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[SPH]], i64 0, i32 1
+; SSE2-NEXT:    [[TMP8:%.*]] = load double, ptr [[Y21]], align 8
+; SSE2-NEXT:    [[SUB22:%.*]] = fsub double [[TMP7]], [[TMP8]]
+; SSE2-NEXT:    [[MUL23:%.*]] = fmul double [[MUL17]], [[SUB22]]
+; SSE2-NEXT:    [[TMP9:%.*]] = tail call double @llvm.fmuladd.f64(double [[MUL]], double [[SUB]], double [[MUL23]])
+; SSE2-NEXT:    [[MUL26:%.*]] = fmul double [[TMP3]], 2.000000e+00
+; SSE2-NEXT:    [[Z28:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[RAY]], i64 0, i32 2
+; SSE2-NEXT:    [[TMP10:%.*]] = load double, ptr [[Z28]], align 8
+; SSE2-NEXT:    [[Z30:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[SPH]], i64 0, i32 2
+; SSE2-NEXT:    [[TMP11:%.*]] = load double, ptr [[Z30]], align 8
+; SSE2-NEXT:    [[SUB31:%.*]] = fsub double [[TMP10]], [[TMP11]]
+; SSE2-NEXT:    [[TMP12:%.*]] = tail call double @llvm.fmuladd.f64(double [[MUL26]], double [[SUB31]], double [[TMP9]])
+; SSE2-NEXT:    [[MUL42:%.*]] = fmul double [[TMP8]], [[TMP8]]
+; SSE2-NEXT:    [[TMP13:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP6]], double [[TMP6]], double [[MUL42]])
+; SSE2-NEXT:    [[TMP14:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP11]], double [[TMP11]], double [[TMP13]])
+; SSE2-NEXT:    [[TMP15:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP5]], double [[TMP5]], double [[TMP14]])
+; SSE2-NEXT:    [[TMP16:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP7]], double [[TMP7]], double [[TMP15]])
+; SSE2-NEXT:    [[TMP17:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP10]], double [[TMP10]], double [[TMP16]])
+; SSE2-NEXT:    [[FNEG:%.*]] = fneg double [[TMP6]]
+; SSE2-NEXT:    [[TMP18:%.*]] = fneg double [[TMP8]]
+; SSE2-NEXT:    [[NEG:%.*]] = fmul double [[TMP7]], [[TMP18]]
+; SSE2-NEXT:    [[TMP19:%.*]] = tail call double @llvm.fmuladd.f64(double [[FNEG]], double [[TMP5]], double [[NEG]])
+; SSE2-NEXT:    [[NEG78:%.*]] = fneg double [[TMP11]]
+; SSE2-NEXT:    [[TMP20:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG78]], double [[TMP10]], double [[TMP19]])
+; SSE2-NEXT:    [[TMP21:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP20]], double 2.000000e+00, double [[TMP17]])
+; SSE2-NEXT:    [[RAD:%.*]] = getelementptr inbounds [[STRUCT_SPHERE:%.*]], ptr [[SPH]], i64 0, i32 1
+; SSE2-NEXT:    [[TMP22:%.*]] = load double, ptr [[RAD]], align 8
+; SSE2-NEXT:    [[NEG82:%.*]] = fneg double [[TMP22]]
+; SSE2-NEXT:    [[TMP23:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG82]], double [[TMP22]], double [[TMP21]])
+; SSE2-NEXT:    [[TMP24:%.*]] = fmul double [[TMP4]], -4.000000e+00
+; SSE2-NEXT:    [[NEG86:%.*]] = fmul double [[TMP24]], [[TMP23]]
+; SSE2-NEXT:    [[TMP25:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP12]], double [[TMP12]], double [[NEG86]])
+; SSE2-NEXT:    [[CMP:%.*]] = fcmp olt double [[TMP25]], 0.000000e+00
+; SSE2-NEXT:    br i1 [[CMP]], label [[CLEANUP:%.*]], label [[IF_END:%.*]]
+; SSE2:       if.end:
+; SSE2-NEXT:    [[CALL:%.*]] = tail call double @sqrt(double noundef [[TMP25]])
+; SSE2-NEXT:    [[FNEG87:%.*]] = fneg double [[TMP12]]
+; SSE2-NEXT:    [[MUL88:%.*]] = fmul double [[TMP4]], 2.000000e+00
+; SSE2-NEXT:    [[TMP26:%.*]] = insertelement <2 x double> poison, double [[FNEG87]], i32 0
+; SSE2-NEXT:    [[TMP27:%.*]] = insertelement <2 x double> [[TMP26]], double [[CALL]], i32 1
+; SSE2-NEXT:    [[TMP28:%.*]] = shufflevector <2 x double> [[TMP27]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; SSE2-NEXT:    [[TMP29:%.*]] = insertelement <2 x double> [[TMP28]], double [[TMP12]], i32 1
+; SSE2-NEXT:    [[TMP30:%.*]] = fsub <2 x double> [[TMP27]], [[TMP29]]
+; SSE2-NEXT:    [[TMP31:%.*]] = insertelement <2 x double> poison, double [[MUL88]], i32 0
+; SSE2-NEXT:    [[TMP32:%.*]] = shufflevector <2 x double> [[TMP31]], <2 x double> poison, <2 x i32> zeroinitializer
+; SSE2-NEXT:    [[TMP33:%.*]] = fdiv <2 x double> [[TMP30]], [[TMP32]]
+; SSE2-NEXT:    [[TMP34:%.*]] = extractelement <2 x double> [[TMP33]], i32 1
+; SSE2-NEXT:    [[CMP93:%.*]] = fcmp olt double [[TMP34]], 0x3EB0C6F7A0B5ED8D
+; SSE2-NEXT:    [[TMP35:%.*]] = extractelement <2 x double> [[TMP33]], i32 0
+; SSE2-NEXT:    [[CMP94:%.*]] = fcmp olt double [[TMP35]], 0x3EB0C6F7A0B5ED8D
+; SSE2-NEXT:    [[OR_COND:%.*]] = select i1 [[CMP93]], i1 [[CMP94]], i1 false
+; SSE2-NEXT:    br i1 [[OR_COND]], label [[CLEANUP]], label [[LOR_LHS_FALSE:%.*]]
+; SSE2:       lor.lhs.false:
+; SSE2-NEXT:    [[TMP36:%.*]] = fcmp ule <2 x double> [[TMP33]], splat (double 1.000000e+00)
+; SSE2-NEXT:    [[TMP37:%.*]] = extractelement <2 x i1> [[TMP36]], i32 0
+; SSE2-NEXT:    [[TMP38:%.*]] = extractelement <2 x i1> [[TMP36]], i32 1
+; SSE2-NEXT:    [[OR_COND106:%.*]] = select i1 [[TMP38]], i1 true, i1 [[TMP37]]
+; SSE2-NEXT:    [[SPEC_SELECT:%.*]] = zext i1 [[OR_COND106]] to i32
+; SSE2-NEXT:    br label [[CLEANUP]]
+; SSE2:       cleanup:
+; SSE2-NEXT:    [[RETVAL_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ 0, [[IF_END]] ], [ [[SPEC_SELECT]], [[LOR_LHS_FALSE]] ]
+; SSE2-NEXT:    ret i32 [[RETVAL_0]]
+;
+; AVX-LABEL: @ray_sphere(
+; AVX-NEXT:  entry:
+; AVX-NEXT:    [[DIR:%.*]] = getelementptr inbounds [[STRUCT_RAY:%.*]], ptr [[RAY:%.*]], i64 0, i32 1
+; AVX-NEXT:    [[TMP0:%.*]] = load double, ptr [[DIR]], align 8
+; AVX-NEXT:    [[Y:%.*]] = getelementptr inbounds [[STRUCT_RAY]], ptr [[RAY]], i64 0, i32 1, i32 1
+; AVX-NEXT:    [[TMP1:%.*]] = load double, ptr [[Y]], align 8
+; AVX-NEXT:    [[MUL6:%.*]] = fmul double [[TMP1]], [[TMP1]]
+; AVX-NEXT:    [[TMP2:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP0]], double [[TMP0]], double [[MUL6]])
+; AVX-NEXT:    [[Z:%.*]] = getelementptr inbounds [[STRUCT_RAY]], ptr [[RAY]], i64 0, i32 1, i32 2
+; AVX-NEXT:    [[TMP3:%.*]] = load double, ptr [[Z]], align 8
+; AVX-NEXT:    [[TMP4:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP3]], double [[TMP3]], double [[TMP2]])
+; AVX-NEXT:    [[MUL:%.*]] = fmul double [[TMP0]], 2.000000e+00
+; AVX-NEXT:    [[TMP5:%.*]] = load double, ptr [[RAY]], align 8
+; AVX-NEXT:    [[TMP6:%.*]] = load double, ptr [[SPH:%.*]], align 8
+; AVX-NEXT:    [[SUB:%.*]] = fsub double [[TMP5]], [[TMP6]]
+; AVX-NEXT:    [[MUL17:%.*]] = fmul double [[TMP1]], 2.000000e+00
+; AVX-NEXT:    [[Y19:%.*]] = getelementptr inbounds [[STRUCT_VEC3:%.*]], ptr [[RAY]], i64 0, i32 1
+; AVX-NEXT:    [[TMP7:%.*]] = load double, ptr [[Y19]], align 8
+; AVX-NEXT:    [[Y21:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[SPH]], i64 0, i32 1
+; AVX-NEXT:    [[TMP8:%.*]] = load double, ptr [[Y21]], align 8
+; AVX-NEXT:    [[SUB22:%.*]] = fsub double [[TMP7]], [[TMP8]]
+; AVX-NEXT:    [[MUL23:%.*]] = fmul double [[MUL17]], [[SUB22]]
+; AVX-NEXT:    [[TMP9:%.*]] = tail call double @llvm.fmuladd.f64(double [[MUL]], double [[SUB]], double [[MUL23]])
+; AVX-NEXT:    [[MUL26:%.*]] = fmul double [[TMP3]], 2.000000e+00
+; AVX-NEXT:    [[Z28:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[RAY]], i64 0, i32 2
+; AVX-NEXT:    [[TMP10:%.*]] = load double, ptr [[Z28]], align 8
+; AVX-NEXT:    [[Z30:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[SPH]], i64 0, i32 2
+; AVX-NEXT:    [[TMP11:%.*]] = load double, ptr [[Z30]], align 8
+; AVX-NEXT:    [[SUB31:%.*]] = fsub double [[TMP10]], [[TMP11]]
+; AVX-NEXT:    [[TMP12:%.*]] = tail call double @llvm.fmuladd.f64(double [[MUL26]], double [[SUB31]], double [[TMP9]])
+; AVX-NEXT:    [[MUL42:%.*]] = fmul double [[TMP8]], [[TMP8]]
+; AVX-NEXT:    [[TMP13:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP6]], double [[TMP6]], double [[MUL42]])
+; AVX-NEXT:    [[TMP14:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP11]], double [[TMP11]], double [[TMP13]])
+; AVX-NEXT:    [[TMP15:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP5]], double [[TMP5]], double [[TMP14]])
+; AVX-NEXT:    [[TMP16:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP7]], double [[TMP7]], double [[TMP15]])
+; AVX-NEXT:    [[TMP17:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP10]], double [[TMP10]], double [[TMP16]])
+; AVX-NEXT:    [[FNEG:%.*]] = fneg double [[TMP6]]
+; AVX-NEXT:    [[TMP18:%.*]] = fneg double [[TMP8]]
+; AVX-NEXT:    [[NEG:%.*]] = fmul double [[TMP7]], [[TMP18]]
+; AVX-NEXT:    [[TMP19:%.*]] = tail call double @llvm.fmuladd.f64(double [[FNEG]], double [[TMP5]], double [[NEG]])
+; AVX-NEXT:    [[NEG78:%.*]] = fneg double [[TMP11]]
+; AVX-NEXT:    [[TMP20:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG78]], double [[TMP10]], double [[TMP19]])
+; AVX-NEXT:    [[TMP21:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP20]], double 2.000000e+00, double [[TMP17]])
+; AVX-NEXT:    [[RAD:%.*]] = getelementptr inbounds [[STRUCT_SPHERE:%.*]], ptr [[SPH]], i64 0, i32 1
+; AVX-NEXT:    [[TMP22:%.*]] = load double, ptr [[RAD]], align 8
+; AVX-NEXT:    [[NEG82:%.*]] = fneg double [[TMP22]]
+; AVX-NEXT:    [[TMP23:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG82]], double [[TMP22]], double [[TMP21]])
+; AVX-NEXT:    [[TMP24:%.*]] = fmul double [[TMP4]], -4.000000e+00
+; AVX-NEXT:    [[NEG86:%.*]] = fmul double [[TMP24]], [[TMP23]]
+; AVX-NEXT:    [[TMP25:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP12]], double [[TMP12]], double [[NEG86]])
+; AVX-NEXT:    [[CMP:%.*]] = fcmp olt double [[TMP25]], 0.000000e+00
+; AVX-NEXT:    br i1 [[CMP]], label [[CLEANUP:%.*]], label [[IF_END:%.*]]
+; AVX:       if.end:
+; AVX-NEXT:    [[CALL:%.*]] = tail call double @sqrt(double noundef [[TMP25]])
+; AVX-NEXT:    [[FNEG87:%.*]] = fneg double [[TMP12]]
+; AVX-NEXT:    [[MUL88:%.*]] = fmul double [[TMP4]], 2.000000e+00
+; AVX-NEXT:    [[ADD:%.*]] = fsub double [[CALL]], [[TMP12]]
+; AVX-NEXT:    [[SUB90:%.*]] = fsub double [[FNEG87]], [[CALL]]
+; AVX-NEXT:    [[TMP26:%.*]] = insertelement <2 x double> poison, double [[SUB90]], i32 0
+; AVX-NEXT:    [[TMP27:%.*]] = insertelement <2 x double> [[TMP26]], double [[ADD]], i32 1
+; AVX-NEXT:    [[TMP28:%.*]] = insertelement <2 x double> poison, double [[MUL88]], i32 0
+; AVX-NEXT:    [[TMP29:%.*]] = shufflevector <2 x double> [[TMP28]], <2 x double> poison, <2 x i32> zeroinitializer
+; AVX-NEXT:    [[TMP30:%.*]] = fdiv <2 x double> [[TMP27]], [[TMP29]]
+; AVX-NEXT:    [[TMP31:%.*]] = extractelement <2 x double> [[TMP30]], i32 1
+; AVX-NEXT:    [[CMP93:%.*]] = fcmp olt double [[TMP31]], 0x3EB0C6F7A0B5ED8D
+; AVX-NEXT:    [[TMP32:%.*]] = extractelement <2 x double> [[TMP30]], i32 0
+; AVX-NEXT:    [[CMP94:%.*]] = fcmp olt double [[TMP32]], 0x3EB0C6F7A0B5ED8D
+; AVX-NEXT:    [[OR_COND:%.*]] = select i1 [[CMP93]], i1 [[CMP94]], i1 false
+; AVX-NEXT:    br i1 [[OR_COND]], label [[CLEANUP]], label [[LOR_LHS_FALSE:%.*]]
+; AVX:       lor.lhs.false:
+; AVX-NEXT:    [[TMP33:%.*]] = fcmp ule <2 x double> [[TMP30]], splat (double 1.000000e+00)
+; AVX-NEXT:    [[TMP34:%.*]] = extractelement <2 x i1> [[TMP33]], i32 0
+; AVX-NEXT:    [[TMP35:%.*]] = extractelement <2 x i1> [[TMP33]], i32 1
+; AVX-NEXT:    [[OR_COND106:%.*]] = select i1 [[TMP35]], i1 true, i1 [[TMP34]]
+; AVX-NEXT:    [[SPEC_SELECT:%.*]] = zext i1 [[OR_COND106]] to i32
+; AVX-NEXT:    br label [[CLEANUP]]
+; AVX:       cleanup:
+; AVX-NEXT:    [[RETVAL_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ 0, [[IF_END]] ], [ [[SPEC_SELECT]], [[LOR_LHS_FALSE]] ]
+; AVX-NEXT:    ret i32 [[RETVAL_0]]
+;
+; AVX2-LABEL: @ray_sphere(
+; AVX2-NEXT:  entry:
+; AVX2-NEXT:    [[DIR:%.*]] = getelementptr inbounds [[STRUCT_RAY:%.*]], ptr [[RAY:%.*]], i64 0, i32 1
+; AVX2-NEXT:    [[TMP0:%.*]] = load double, ptr [[DIR]], align 8
+; AVX2-NEXT:    [[Y:%.*]] = getelementptr inbounds [[STRUCT_RAY]], ptr [[RAY]], i64 0, i32 1, i32 1
+; AVX2-NEXT:    [[TMP1:%.*]] = load double, ptr [[Y]], align 8
+; AVX2-NEXT:    [[MUL6:%.*]] = fmul double [[TMP1]], [[TMP1]]
+; AVX2-NEXT:    [[TMP2:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP0]], double [[TMP0]], double [[MUL6]])
+; AVX2-NEXT:    [[Z:%.*]] = getelementptr inbounds [[STRUCT_RAY]], ptr [[RAY]], i64 0, i32 1, i32 2
+; AVX2-NEXT:    [[TMP3:%.*]] = load double, ptr [[Z]], align 8
+; AVX2-NEXT:    [[TMP4:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP3]], double [[TMP3]], double [[TMP2]])
+; AVX2-NEXT:    [[MUL:%.*]] = fmul double [[TMP0]], 2.000000e+00
+; AVX2-NEXT:    [[TMP5:%.*]] = load double, ptr [[RAY]], align 8
+; AVX2-NEXT:    [[TMP6:%.*]] = load double, ptr [[SPH:%.*]], align 8
+; AVX2-NEXT:    [[SUB:%.*]] = fsub double [[TMP5]], [[TMP6]]
+; AVX2-NEXT:    [[MUL17:%.*]] = fmul double [[TMP1]], 2.000000e+00
+; AVX2-NEXT:    [[Y19:%.*]] = getelementptr inbounds [[STRUCT_VEC3:%.*]], ptr [[RAY]], i64 0, i32 1
+; AVX2-NEXT:    [[TMP7:%.*]] = load double, ptr [[Y19]], align 8
+; AVX2-NEXT:    [[Y21:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[SPH]], i64 0, i32 1
+; AVX2-NEXT:    [[TMP8:%.*]] = load double, ptr [[Y21]], align 8
+; AVX2-NEXT:    [[SUB22:%.*]] = fsub double [[TMP7]], [[TMP8]]
+; AVX2-NEXT:    [[MUL23:%.*]] = fmul double [[MUL17]], [[SUB22]]
+; AVX2-NEXT:    [[TMP9:%.*]] = tail call double @llvm.fmuladd.f64(double [[MUL]], double [[SUB]], double [[MUL23]])
+; AVX2-NEXT:    [[MUL26:%.*]] = fmul double [[TMP3]], 2.000000e+00
+; AVX2-NEXT:    [[Z28:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[RAY]], i64 0, i32 2
+; AVX2-NEXT:    [[TMP10:%.*]] = load double, ptr [[Z28]], align 8
+; AVX2-NEXT:    [[Z30:%.*]] = getelementptr inbounds [[STRUCT_VEC3]], ptr [[SPH]], i64 0, i32 2
+; AVX2-NEXT:    [[TMP11:%.*]] = load double, ptr [[Z30]], align 8
+; AVX2-NEXT:    [[SUB31:%.*]] = fsub double [[TMP10]], [[TMP11]]
+; AVX2-NEXT:    [[TMP12:%.*]] = tail call double @llvm.fmuladd.f64(double [[MUL26]], double [[SUB31]], double [[TMP9]])
+; AVX2-NEXT:    [[MUL42:%.*]] = fmul double [[TMP8]], [[TMP8]]
+; AVX2-NEXT:    [[TMP13:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP6]], double [[TMP6]], double [[MUL42]])
+; AVX2-NEXT:    [[TMP14:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP11]], double [[TMP11]], double [[TMP13]])
+; AVX2-NEXT:    [[TMP15:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP5]], double [[TMP5]], double [[TMP14]])
+; AVX2-NEXT:    [[TMP16:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP7]], double [[TMP7]], double [[TMP15]])
+; AVX2-NEXT:    [[TMP17:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP10]], double [[TMP10]], double [[TMP16]])
+; AVX2-NEXT:    [[FNEG:%.*]] = fneg double [[TMP6]]
+; AVX2-NEXT:    [[TMP18:%.*]] = fneg double [[TMP8]]
+; AVX2-NEXT:    [[NEG:%.*]] = fmul double [[TMP7]], [[TMP18]]
+; AVX2-NEXT:    [[TMP19:%.*]] = tail call double @llvm.fmuladd.f64(double [[FNEG]], double [[TMP5]], double [[NEG]])
+; AVX2-NEXT:    [[NEG78:%.*]] = fneg double [[TMP11]]
+; AVX2-NEXT:    [[TMP20:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG78]], double [[TMP10]], double [[TMP19]])
+; AVX2-NEXT:    [[TMP21:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP20]], double 2.000000e+00, double [[TMP17]])
+; AVX2-NEXT:    [[RAD:%.*]] = getelementptr inbounds [[STRUCT_SPHERE:%.*]], ptr [[SPH]], i64 0, i32 1
+; AVX2-NEXT:    [[TMP22:%.*]] = load double, ptr [[RAD]], align 8
+; AVX2-NEXT:    [[NEG82:%.*]] = fneg double [[TMP22]]
+; AVX2-NEXT:    [[TMP23:%.*]] = tail call double @llvm.fmuladd.f64(double [[NEG82]], double [[TMP22]], double [[TMP21]])
+; AVX2-NEXT:    [[TMP24:%.*]] = fmul double [[TMP4]], -4.000000e+00
+; AVX2-NEXT:    [[NEG86:%.*]] = fmul double [[TMP24]], [[TMP23]]
+; AVX2-NEXT:    [[TMP25:%.*]] = tail call double @llvm.fmuladd.f64(double [[TMP12]], double [[TMP12]], double [[NEG86]])
+; AVX2-NEXT:    [[CMP:%.*]] = fcmp olt double [[TMP25]], 0.000000e+00
+; AVX2-NEXT:    br i1 [[CMP]], label [[CLEANUP:%.*]], label [[IF_END:%.*]]
+; AVX2:       if.end:
+; AVX2-NEXT:    [[CALL:%.*]] = tail call double @sqrt(double noundef [[TMP25]])
+; AVX2-NEXT:    [[FNEG87:%.*]] = fneg double [[TMP12]]
+; AVX2-NEXT:    [[MUL88:%.*]] = fmul double [[TMP4]], 2.000000e+00
+; AVX2-NEXT:    [[ADD:%.*]] = fsub double [[CALL]], [[TMP12]]
+; AVX2-NEXT:    [[SUB90:%.*]] = fsub double [[FNEG87]], [[CALL]]
+; AVX2-NEXT:    [[TMP26:%.*]] = insertelement <2 x double> poison, double [[SUB90]], i32 0
+; AVX2-NEXT:    [[TMP27:%.*]] = insertelement <2 x double> [[TMP26]], double [[ADD]], i32 1
+; AVX2-NEXT:    [[TMP28:%.*]] = insertelement <2 x double> poison, double [[MUL88]], i32 0
+; AVX2-NEXT:    [[TMP29:%.*]] = shufflevector <2 x double> [[TMP28]], <2 x double> poison, <2 x i32> zeroinitializer
+; AVX2-NEXT:    [[TMP30:%.*]] = fdiv <2 x double> [[TMP27]], [[TMP29]]
+; AVX2-NEXT:    [[TMP31:%.*]] = extractelement <2 x double> [[TMP30]], i32 1
+; AVX2-NEXT:    [[CMP93:%.*]] = fcmp olt double [[TMP31]], 0x3EB0C6F7A0B5ED8D
+; AVX2-NEXT:    [[TMP32:%.*]] = extractelement <2 x double> [[TMP30]], i32 0
+; AVX2-NEXT:    [[CMP94:%.*]] = fcmp olt double [[TMP32]], 0x3EB0C6F7A0B5ED8D
+; AVX2-NEXT:    [[OR_COND:%.*]] = select i1 [[CMP93]], i1 [[CMP94]], i1 false
+; AVX2-NEXT:    br i1 [[OR_COND]], label [[CLEANUP]], label [[LOR_LHS_FALSE:%.*]]
+; AVX2:       lor.lhs.false:
+; AVX2-NEXT:    [[TMP33:%.*]] = fcmp ule <2 x double> [[TMP30]], splat (double 1.000000e+00)
+; AVX2-NEXT:    [[TMP34:%.*]] = extractelement <2 x i1> [[TMP33]], i32 0
+; AVX2-NEXT:    [[TMP35:%.*]] = extractelement <2 x i1> [[TMP33]], i32 1
+; AVX2-NEXT:    [[OR_COND106:%.*]] = select i1 [[TMP35]], i1 true, i1 [[TMP34]]
+; AVX2-NEXT:    [[SPEC_SELECT:%.*]] = zext i1 [[OR_COND106]] to i32
+; AVX2-NEXT:    br label [[CLEANUP]]
+; AVX2:       cleanup:
+; AVX2-NEXT:    [[RETVAL_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ 0, [[IF_END]] ], [ [[SPEC_SELECT]], [[LOR_LHS_FALSE]] ]
+; AVX2-NEXT:    ret i32 [[RETVAL_0]]
 ;
 entry:
   %dir = getelementptr inbounds %struct.ray, ptr %ray, i64 0, i32 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/delayed-gather-emission.ll b/llvm/test/Transforms/SLPVectorizer/X86/delayed-gather-emission.ll
index 249b3f9329319..9fea27e4faeff 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/delayed-gather-emission.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/delayed-gather-emission.ll
@@ -12,26 +12,26 @@ define void @test() {
 ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[DOTPRE_PRE:%.*]] = load float, ptr poison, align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x float> <float poison, float undef>, float [[DOTPRE_PRE]], i32 0
 ; CHECK-NEXT:    br label [[BB1:%.*]]
 ; CHECK:       bb1:
-; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x float> [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP10:%.*]], [[BB2:%.*]] ]
+; CHECK-NEXT:    [[DOTPRE:%.*]] = phi float [ [[DOTPRE_PRE]], [[ENTRY:%.*]] ], [ [[I2:%.*]], [[BB2:%.*]] ]
+; CHECK-NEXT:    [[FOXTROT_0:%.*]] = phi float [ undef, [[ENTRY]] ], [ [[GULF_0:%.*]], [[BB2]] ]
 ; CHECK-NEXT:    br label [[BB2]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x float> [ [[TMP1]], [[BB1]] ], [ [[TMP9:%.*]], [[BB2]] ]
+; CHECK-NEXT:    [[I:%.*]] = phi float [ [[DOTPRE]], [[BB1]] ], [ [[I2]], [[BB2]] ]
+; CHECK-NEXT:    [[GULF_0]] = phi float [ [[FOXTROT_0]], [[BB1]] ], [ [[TMP6:%.*]], [[BB2]] ]
 ; CHECK-NEXT:    [[I1:%.*]] = load float, ptr poison, align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x float> poison, float [[I]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP0]], float [[GULF_0]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[GULF_0]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[I1]], i32 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = fdiv <2 x float> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP6]] = extractelement <2 x float> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x float> [[TMP5]], i32 1
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[TMP6]], [[TMP7]]
 ; CHECK-NEXT:    tail call void @foo(float [[MUL]])
-; CHECK-NEXT:    [[I2:%.*]] = load float, ptr poison, align 4
+; CHECK-NEXT:    [[I2]] = load float, ptr poison, align 4
 ; CHECK-NEXT:    [[TOBOOL:%.*]] = fcmp une float [[I2]], 0.000000e+00
-; CHECK-NEXT:    [[TMP10]] = insertelement <2 x float> [[TMP2]], float [[I2]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <2 x i32> <i32 poison, i32 0>
-; CHECK-NEXT:    [[TMP9]] = insertelement <2 x float> [[TMP8]], float [[I2]], i32 0
 ; CHECK-NEXT:    br i1 [[TOBOOL]], label [[BB1]], label [[BB2]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll b/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll
index bfb623ac5a9b9..ddcc29a8739d0 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/entry-no-bundle-but-extra-use-on-vec.ll
@@ -5,27 +5,28 @@ define void @test(ptr %nExp, float %0, i1 %cmp, float %1) {
 ; CHECK-LABEL: define void @test(
 ; CHECK-SAME: ptr [[NEXP:%.*]], float [[TMP0:%.*]], i1 [[CMP:%.*]], float [[TMP1:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x float> <float 0.000000e+00, float 0x7FF8000000000000, float poison, float poison>, float [[TMP1]], i32 2
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP0]], i32 3
 ; CHECK-NEXT:    br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
 ; CHECK:       [[IF_THEN]]:
 ; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[NEXP]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <2 x i32> <i32 3, i32 3>
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = fmul <2 x float> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[DIV_1_I_I:%.*]] = fmul float [[TMP4]], 0.000000e+00
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x float> [[TMP5]], zeroinitializer
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> <float poison, float 0.000000e+00, float 0.000000e+00, float poison>, float [[TMP1]], i32 3
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[DIV_1_I_I]], i32 0
 ; CHECK-NEXT:    br label %[[IF_END]]
 ; CHECK:       [[IF_END]]:
-; CHECK-NEXT:    [[TMP12:%.*]] = phi <4 x float> [ [[TMP11]], %[[IF_THEN]] ], [ [[TMP3]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = phi float [ [[TMP1]], %[[IF_THEN]] ], [ [[TMP0]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[TMP9:%.*]] = phi float [ 0.000000e+00, %[[IF_THEN]] ], [ [[TMP1]], %[[ENTRY]] ]
 ; CHECK-NEXT:    [[TMP13:%.*]] = phi <2 x float> [ [[TMP8]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ]
 ; CHECK-NEXT:    [[TMP14:%.*]] = phi <2 x float> [ zeroinitializer, %[[IF_THEN]] ], [ <float 0x7FF8000000000000, float 1.000000e+00>, %[[ENTRY]] ]
 ; CHECK-NEXT:    [[TMP15:%.*]] = phi <2 x float> [ [[TMP7]], %[[IF_THEN]] ], [ zeroinitializer, %[[ENTRY]] ]
 ; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x float> [[TMP14]], <2 x float> <float poison, float 0.000000e+00>, <2 x i32> <i32 1, i32 3>
 ; CHECK-NEXT:    [[TMP17:%.*]] = fmul <2 x float> [[TMP15]], [[TMP16]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = fmul <2 x float> [[TMP13]], [[TMP14]]
+; CHECK-NEXT:    [[TMP29:%.*]] = shufflevector <2 x float> [[TMP13]], <2 x float> [[TMP14]], <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[TMP9]], i32 2
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x float> [[TMP30]], float [[TMP10]], i32 3
 ; CHECK-NEXT:    [[TMP19:%.*]] = fmul <4 x float> [[TMP12]], zeroinitializer
 ; CHECK-NEXT:    [[CALL25:%.*]] = load volatile ptr, ptr null, align 8
 ; CHECK-NEXT:    [[TMP20:%.*]] = fadd <2 x float> [[TMP18]], [[TMP17]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-with-cmp-user.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-with-cmp-user.ll
index 3ac0d01cf9a2c..aca0d7e1c7d8c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/gather-with-cmp-user.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-with-cmp-user.ll
@@ -7,14 +7,17 @@ define i1 @test(i32 %g, i16 %d) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = and i16 [[D]], 1
 ; CHECK-NEXT:    [[XOR_I_I:%.*]] = xor i32 [[G]], 1
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[G]], i32 0
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[XOR_I_I]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = trunc <2 x i32> [[TMP9]] to <2 x i8>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i8> [[TMP4]], <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[CONV1_I_I:%.*]] = trunc i32 [[XOR_I_I]] to i8
+; CHECK-NEXT:    [[CONV1_1_I_I:%.*]] = trunc i32 [[G]] to i8
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i8> poison, i8 [[CONV1_1_I_I]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> [[TMP1]], i8 [[CONV1_I_I]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = add <4 x i8> [[TMP5]], <i8 -9, i8 -9, i8 -1, i8 -1>
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp sgt <4 x i8> [[TMP6]], splat (i8 -3)
 ; CHECK-NEXT:    [[TMP8:%.*]] = zext <4 x i1> [[TMP7]] to <4 x i8>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> poison, i32 [[G]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[XOR_I_I]], i32 1
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP11:%.*]] = zext <4 x i8> [[TMP8]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp sgt <4 x i32> [[TMP10]], [[TMP11]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP12]])
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/original-inst-scheduled-after-copyable.ll b/llvm/test/Transforms/SLPVectorizer/X86/original-inst-scheduled-after-copyable.ll
index 19eb7bf4dfc94..6656f34e415a3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/original-inst-scheduled-after-copyable.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/original-inst-scheduled-after-copyable.ll
@@ -12,32 +12,34 @@ define void @test(ptr %0, i32 %1, i32 %2) {
 ; CHECK-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP7]], 1
 ; CHECK-NEXT:    [[TMP18:%.*]] = zext i32 [[ADD_NARROWED_I_I]] to i64
 ; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[TMP18]], -1
-; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP19]] to i32
-; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <2 x i32> poison, i32 [[TMP21]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP28]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP12:%.*]] = and <2 x i32> [[TMP11]], splat (i32 -2)
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x i32> <i32 poison, i32 -2>, i32 [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP14:%.*]] = or <2 x i32> [[TMP13]], [[TMP12]]
-; CHECK-NEXT:    [[TMP15:%.*]] = xor <2 x i32> [[TMP13]], [[TMP12]]
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP17:%.*]] = load <2 x i32>, ptr [[TMP5]], align 8
+; CHECK-NEXT:    call void @llvm.stackrestore.p0(ptr null)
 ; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <2 x i32> <i32 1, i32 poison>, i32 [[TMP1]], i32 1
 ; CHECK-NEXT:    [[TMP33:%.*]] = and <2 x i32> [[TMP17]], [[TMP32]]
-; CHECK-NEXT:    call void @llvm.stackrestore.p0(ptr null)
+; CHECK-NEXT:    [[TMP23:%.*]] = zext <2 x i32> [[TMP33]] to <2 x i64>
 ; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x i32> [[TMP33]], <2 x i32> poison, <2 x i32> <i32 poison, i32 0>
 ; CHECK-NEXT:    [[TMP34:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP10]], i32 0
 ; CHECK-NEXT:    [[TMP22:%.*]] = zext <2 x i32> [[TMP34]] to <2 x i64>
-; CHECK-NEXT:    [[TMP23:%.*]] = zext <2 x i32> [[TMP33]] to <2 x i64>
 ; CHECK-NEXT:    [[TMP35:%.*]] = shl <2 x i64> [[TMP23]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP25:%.*]] = or <2 x i64> [[TMP35]], [[TMP22]]
 ; CHECK-NEXT:    [[TMP26:%.*]] = trunc <2 x i64> [[TMP25]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP27:%.*]] = trunc <2 x i64> [[TMP25]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP24:%.*]] = tail call i32 asm sideeffect "", "=r,0,~{dirflag},~{fpsr},~{flags}"(i32 0)
-; CHECK-NEXT:    store <2 x i32> [[TMP16]], ptr [[TMP3]], align 16
 ; CHECK-NEXT:    [[TMP29:%.*]] = shufflevector <2 x i32> [[TMP32]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP30:%.*]] = and <2 x i32> [[TMP29]], [[TMP26]]
 ; CHECK-NEXT:    [[TMP31:%.*]] = or <2 x i32> [[TMP30]], [[TMP27]]
-; CHECK-NEXT:    store <2 x i32> [[TMP31]], ptr [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP36:%.*]] = trunc i64 [[TMP19]] to i32
+; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <2 x i32> poison, i32 [[TMP36]], i32 0
+; CHECK-NEXT:    [[TMP38:%.*]] = shufflevector <2 x i32> [[TMP37]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP39:%.*]] = and <2 x i32> [[TMP38]], splat (i32 -2)
+; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <2 x i32> <i32 poison, i32 -2>, i32 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP40:%.*]] = or <2 x i32> [[TMP28]], [[TMP39]]
+; CHECK-NEXT:    [[TMP41:%.*]] = xor <2 x i32> [[TMP28]], [[TMP39]]
+; CHECK-NEXT:    [[TMP42:%.*]] = shufflevector <2 x i32> [[TMP40]], <2 x i32> [[TMP41]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[TMP43:%.*]] = shufflevector <2 x i32> [[TMP42]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP44:%.*]] = shufflevector <2 x i32> [[TMP31]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP45:%.*]] = shufflevector <4 x i32> [[TMP43]], <4 x i32> [[TMP44]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT:    store <4 x i32> [[TMP45]], ptr [[TMP3]], align 16
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi-node-with-cycle.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi-node-with-cycle.ll
index af165de293005..a9e9ff14a5202 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/phi-node-with-cycle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/phi-node-with-cycle.ll
@@ -4,21 +4,22 @@
 define void @test(float %0) {
 ; CHECK-LABEL: define void @test(
 ; CHECK-SAME: float [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = fdiv float 0.000000e+00, 0.000000e+00
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> <float 0.000000e+00, float poison>, <2 x i32> <i32 2, i32 0>
 ; CHECK-NEXT:    [[TMP5:%.*]] = fdiv <2 x float> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = fdiv <2 x float> [[TMP6]], zeroinitializer
-; CHECK-NEXT:    br label %[[BB6:.*]]
-; CHECK:       [[BB6]]:
+; CHECK-NEXT:    br label %[[BB5:.*]]
+; CHECK:       [[BB5]]:
 ; CHECK-NEXT:    [[TMP7:%.*]] = fmul <2 x float> [[TMP5]], zeroinitializer
 ; CHECK-NEXT:    [[TMP8:%.*]] = fsub <2 x float> zeroinitializer, [[TMP7]]
-; CHECK-NEXT:    br label %[[BB10:.*]]
-; CHECK:       [[BB9:.*]]:
-; CHECK-NEXT:    br label %[[BB10]]
-; CHECK:       [[BB10]]:
-; CHECK-NEXT:    [[TMP11:%.*]] = phi <2 x float> [ [[TMP8]], %[[BB6]] ], [ poison, %[[BB9]] ]
-; CHECK-NEXT:    br label %[[BB12:.*]]
-; CHECK:       [[BB12]]:
+; CHECK-NEXT:    br label %[[BB9:.*]]
+; CHECK:       [[BB8:.*]]:
+; CHECK-NEXT:    br label %[[BB9]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[TMP11:%.*]] = phi <2 x float> [ [[TMP8]], %[[BB5]] ], [ poison, %[[BB8]] ]
+; CHECK-NEXT:    br label %[[BB11:.*]]
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = fmul <2 x float> [[TMP3]], zeroinitializer
 ; CHECK-NEXT:    [[TMP14:%.*]] = fsub <2 x float> [[TMP11]], [[TMP13]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x float> [[TMP14]], i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
index 9c8ba07734b87..d3919f6883950 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2   -slp-threshold=-1 | FileCheck %s
-; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s
-; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx    | FileCheck %s
-; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2   | FileCheck %s
-; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw,+avx512vl | FileCheck %s
+; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2   -slp-threshold=-1 | FileCheck %s --check-prefixes=CHECK,CHECK-SSE2
+; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s --check-prefixes=CHECK,CHECK-SSE42
+; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx    | FileCheck %s --check-prefixes=CHECK,CHECK-AVX
+; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2   | FileCheck %s --check-prefixes=CHECK,CHECK-AVX2
+; RUN: opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512
 
 define void @store_i32(ptr nocapture %0, i32 %1, i32 %2) {
 ; CHECK-LABEL: define void @store_i32(
@@ -102,20 +102,99 @@ define void @store_i8(ptr nocapture %0, i32 %1, i32 %2) {
 }
 
 define void @store_i64(ptr nocapture %0, i32 %1, i32 %2) {
-; CHECK-LABEL: define void @store_i64(
-; CHECK-SAME: ptr captures(none) [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP1]] to i64
-; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0]], align 8, !tbaa [[LONG_TBAA5:![0-9]+]]
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP8:%.*]] = mul <4 x i64> [[TMP5]], [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], splat (i64 15)
-; CHECK-NEXT:    [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32>
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], splat (i32 255)
-; CHECK-NEXT:    [[TMP12:%.*]] = and <4 x i64> [[TMP9]], splat (i64 4294967295)
-; CHECK-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i64> [[TMP12]], <4 x i64> splat (i64 255)
-; CHECK-NEXT:    store <4 x i64> [[TMP13]], ptr [[TMP0]], align 8, !tbaa [[LONG_TBAA5]]
-; CHECK-NEXT:    ret void
+; CHECK-SSE2-LABEL: define void @store_i64(
+; CHECK-SSE2-SAME: ptr captures(none) [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]]) #[[ATTR0]] {
+; CHECK-SSE2-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-SSE2-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0]], align 8, !tbaa [[LONG_TBAA5:![0-9]+]]
+; CHECK-SSE2-NEXT:    [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0
+; CHECK-SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-SSE2-NEXT:    [[TMP8:%.*]] = mul <4 x i64> [[TMP5]], [[TMP7]]
+; CHECK-SSE2-NEXT:    [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], splat (i64 15)
+; CHECK-SSE2-NEXT:    [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32>
+; CHECK-SSE2-NEXT:    [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], splat (i32 255)
+; CHECK-SSE2-NEXT:    [[TMP12:%.*]] = and <4 x i64> [[TMP9]], splat (i64 4294967295)
+; CHECK-SSE2-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i64> [[TMP12]], <4 x i64> splat (i64 255)
+; CHECK-SSE2-NEXT:    store <4 x i64> [[TMP13]], ptr [[TMP0]], align 8, !tbaa [[LONG_TBAA5]]
+; CHECK-SSE2-NEXT:    ret void
+;
+; CHECK-SSE42-LABEL: define void @store_i64(
+; CHECK-SSE42-SAME: ptr captures(none) [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]]) #[[ATTR0]] {
+; CHECK-SSE42-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-SSE42-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0]], align 8, !tbaa [[LONG_TBAA5:![0-9]+]]
+; CHECK-SSE42-NEXT:    [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0
+; CHECK-SSE42-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-SSE42-NEXT:    [[TMP8:%.*]] = mul <4 x i64> [[TMP5]], [[TMP7]]
+; CHECK-SSE42-NEXT:    [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], splat (i64 15)
+; CHECK-SSE42-NEXT:    [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32>
+; CHECK-SSE42-NEXT:    [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], splat (i32 255)
+; CHECK-SSE42-NEXT:    [[TMP12:%.*]] = and <4 x i64> [[TMP9]], splat (i64 4294967295)
+; CHECK-SSE42-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i64> [[TMP12]], <4 x i64> splat (i64 255)
+; CHECK-SSE42-NEXT:    store <4 x i64> [[TMP13]], ptr [[TMP0]], align 8, !tbaa [[LONG_TBAA5]]
+; CHECK-SSE42-NEXT:    ret void
+;
+; CHECK-AVX-LABEL: define void @store_i64(
+; CHECK-AVX-SAME: ptr captures(none) [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]]) #[[ATTR0]] {
+; CHECK-AVX-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-AVX-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 1
+; CHECK-AVX-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 2
+; CHECK-AVX-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 3
+; CHECK-AVX-NEXT:    [[TMP8:%.*]] = load i64, ptr [[TMP7]], align 8, !tbaa [[LONG_TBAA5:![0-9]+]]
+; CHECK-AVX-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP6]], align 8, !tbaa [[LONG_TBAA5]]
+; CHECK-AVX-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP8]], [[TMP4]]
+; CHECK-AVX-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP4]]
+; CHECK-AVX-NEXT:    [[TMP12:%.*]] = lshr i64 [[TMP10]], 15
+; CHECK-AVX-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 15
+; CHECK-AVX-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP12]] to i32
+; CHECK-AVX-NEXT:    [[TMP15:%.*]] = trunc i64 [[TMP13]] to i32
+; CHECK-AVX-NEXT:    [[TMP16:%.*]] = load i64, ptr [[TMP5]], align 8, !tbaa [[LONG_TBAA5]]
+; CHECK-AVX-NEXT:    [[TMP17:%.*]] = load i64, ptr [[TMP0]], align 8, !tbaa [[LONG_TBAA5]]
+; CHECK-AVX-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP4]]
+; CHECK-AVX-NEXT:    [[TMP19:%.*]] = mul i64 [[TMP17]], [[TMP4]]
+; CHECK-AVX-NEXT:    [[TMP20:%.*]] = insertelement <2 x i64> poison, i64 [[TMP19]], i32 0
+; CHECK-AVX-NEXT:    [[TMP21:%.*]] = insertelement <2 x i64> [[TMP20]], i64 [[TMP18]], i32 1
+; CHECK-AVX-NEXT:    [[TMP22:%.*]] = lshr <2 x i64> [[TMP21]], splat (i64 15)
+; CHECK-AVX-NEXT:    [[TMP23:%.*]] = trunc <2 x i64> [[TMP22]] to <2 x i32>
+; CHECK-AVX-NEXT:    [[TMP24:%.*]] = shufflevector <2 x i32> [[TMP23]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-AVX-NEXT:    [[TMP25:%.*]] = insertelement <4 x i32> [[TMP24]], i32 [[TMP15]], i32 2
+; CHECK-AVX-NEXT:    [[TMP26:%.*]] = insertelement <4 x i32> [[TMP25]], i32 [[TMP14]], i32 3
+; CHECK-AVX-NEXT:    [[TMP27:%.*]] = icmp ult <4 x i32> [[TMP26]], splat (i32 255)
+; CHECK-AVX-NEXT:    [[TMP28:%.*]] = shufflevector <2 x i64> [[TMP22]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-AVX-NEXT:    [[TMP29:%.*]] = insertelement <4 x i64> [[TMP28]], i64 [[TMP13]], i32 2
+; CHECK-AVX-NEXT:    [[TMP30:%.*]] = insertelement <4 x i64> [[TMP29]], i64 [[TMP12]], i32 3
+; CHECK-AVX-NEXT:    [[TMP31:%.*]] = and <4 x i64> [[TMP30]], splat (i64 4294967295)
+; CHECK-AVX-NEXT:    [[TMP32:%.*]] = select <4 x i1> [[TMP27]], <4 x i64> [[TMP31]], <4 x i64> splat (i64 255)
+; CHECK-AVX-NEXT:    store <4 x i64> [[TMP32]], ptr [[TMP0]], align 8, !tbaa [[LONG_TBAA5]]
+; CHECK-AVX-NEXT:    ret void
+;
+; CHECK-AVX2-LABEL: define void @store_i64(
+; CHECK-AVX2-SAME: ptr captures(none) [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]]) #[[ATTR0]] {
+; CHECK-AVX2-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-AVX2-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0]], align 8, !tbaa [[LONG_TBAA5:![0-9]+]]
+; CHECK-AVX2-NEXT:    [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0
+; CHECK-AVX2-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-AVX2-NEXT:    [[TMP8:%.*]] = mul <4 x i64> [[TMP5]], [[TMP7]]
+; CHECK-AVX2-NEXT:    [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], splat (i64 15)
+; CHECK-AVX2-NEXT:    [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32>
+; CHECK-AVX2-NEXT:    [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], splat (i32 255)
+; CHECK-AVX2-NEXT:    [[TMP12:%.*]] = and <4 x i64> [[TMP9]], splat (i64 4294967295)
+; CHECK-AVX2-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i64> [[TMP12]], <4 x i64> splat (i64 255)
+; CHECK-AVX2-NEXT:    store <4 x i64> [[TMP13]], ptr [[TMP0]], align 8, !tbaa [[LONG_TBAA5]]
+; CHECK-AVX2-NEXT:    ret void
+;
+; CHECK-AVX512-LABEL: define void @store_i64(
+; CHECK-AVX512-SAME: ptr captures(none) [[TMP0:%.*]], i32 [[TMP1:%.*]], i32 [[TMP2:%.*]]) #[[ATTR0]] {
+; CHECK-AVX512-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-AVX512-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0]], align 8, !tbaa [[LONG_TBAA5:![0-9]+]]
+; CHECK-AVX512-NEXT:    [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0
+; CHECK-AVX512-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-AVX512-NEXT:    [[TMP8:%.*]] = mul <4 x i64> [[TMP5]], [[TMP7]]
+; CHECK-AVX512-NEXT:    [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], splat (i64 15)
+; CHECK-AVX512-NEXT:    [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32>
+; CHECK-AVX512-NEXT:    [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], splat (i32 255)
+; CHECK-AVX512-NEXT:    [[TMP12:%.*]] = and <4 x i64> [[TMP9]], splat (i64 4294967295)
+; CHECK-AVX512-NEXT:    [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i64> [[TMP12]], <4 x i64> splat (i64 255)
+; CHECK-AVX512-NEXT:    store <4 x i64> [[TMP13]], ptr [[TMP0]], align 8, !tbaa [[LONG_TBAA5]]
+; CHECK-AVX512-NEXT:    ret void
 ;
   %4 = zext i32 %1 to i64
   %5 = load i64, ptr %0, align 8, !tbaa !7
@@ -164,11 +243,43 @@ define void @store_i64(ptr nocapture %0, i32 %1, i32 %2) {
 !7 = !{!8, !8, i64 0}
 !8 = !{!"long", !4, i64 0}
 ;.
-; CHECK: [[INT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0}
-; CHECK: [[META1]] = !{!"int", [[META2:![0-9]+]], i64 0}
-; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0}
-; CHECK: [[META3]] = !{!"Simple C++ TBAA"}
-; CHECK: [[CHAR_TBAA4]] = !{[[META2]], [[META2]], i64 0}
-; CHECK: [[LONG_TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0}
-; CHECK: [[META6]] = !{!"long", [[META2]], i64 0}
+; CHECK-SSE2: [[INT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0}
+; CHECK-SSE2: [[META1]] = !{!"int", [[META2:![0-9]+]], i64 0}
+; CHECK-SSE2: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0}
+; CHECK-SSE2: [[META3]] = !{!"Simple C++ TBAA"}
+; CHECK-SSE2: [[CHAR_TBAA4]] = !{[[META2]], [[META2]], i64 0}
+; CHECK-SSE2: [[LONG_TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0}
+; CHECK-SSE2: [[META6]] = !{!"long", [[META2]], i64 0}
+;.
+; CHECK-SSE42: [[INT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0}
+; CHECK-SSE42: [[META1]] = !{!"int", [[META2:![0-9]+]], i64 0}
+; CHECK-SSE42: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0}
+; CHECK-SSE42: [[META3]] = !{!"Simple C++ TBAA"}
+; CHECK-SSE42: [[CHAR_TBAA4]] = !{[[META2]], [[META2]], i64 0}
+; CHECK-SSE42: [[LONG_TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0}
+; CHECK-SSE42: [[META6]] = !{!"long", [[META2]], i64 0}
+;.
+; CHECK-AVX: [[INT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0}
+; CHECK-AVX: [[META1]] = !{!"int", [[META2:![0-9]+]], i64 0}
+; CHECK-AVX: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0}
+; CHECK-AVX: [[META3]] = !{!"Simple C++ TBAA"}
+; CHECK-AVX: [[CHAR_TBAA4]] = !{[[META2]], [[META2]], i64 0}
+; CHECK-AVX: [[LONG_TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0}
+; CHECK-AVX: [[META6]] = !{!"long", [[META2]], i64 0}
+;.
+; CHECK-AVX2: [[INT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0}
+; CHECK-AVX2: [[META1]] = !{!"int", [[META2:![0-9]+]], i64 0}
+; CHECK-AVX2: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0}
+; CHECK-AVX2: [[META3]] = !{!"Simple C++ TBAA"}
+; CHECK-AVX2: [[CHAR_TBAA4]] = !{[[META2]], [[META2]], i64 0}
+; CHECK-AVX2: [[LONG_TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0}
+; CHECK-AVX2: [[META6]] = !{!"long", [[META2]], i64 0}
+;.
+; CHECK-AVX512: [[INT_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0}
+; CHECK-AVX512: [[META1]] = !{!"int", [[META2:![0-9]+]], i64 0}
+; CHECK-AVX512: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0}
+; CHECK-AVX512: [[META3]] = !{!"Simple C++ TBAA"}
+; CHECK-AVX512: [[CHAR_TBAA4]] = !{[[META2]], [[META2]], i64 0}
+; CHECK-AVX512: [[LONG_TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0}
+; CHECK-AVX512: [[META6]] = !{!"long", [[META2]], i64 0}
 ;.
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll
index 29a8a229980e9..bd804c46ebabc 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll
@@ -87,11 +87,10 @@ define i1 @fcmp_lt_gt(double %a, double %b, double %c) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[FNEG:%.*]] = fneg double [[B:%.*]]
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul double [[A:%.*]], 2.000000e+00
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i32 1
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[FNEG]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[ADD:%.*]] = fsub double [[C:%.*]], [[B]]
+; CHECK-NEXT:    [[SUB:%.*]] = fsub double [[FNEG]], [[C]]
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[SUB]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP0]], double [[ADD]], i32 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> poison, double [[MUL]], i32 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP7:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP6]]
@@ -136,11 +135,10 @@ define i1 @fcmp_lt(double %a, double %b, double %c) {
 ; CHECK-LABEL: @fcmp_lt(
 ; CHECK-NEXT:    [[FNEG:%.*]] = fneg double [[B:%.*]]
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul double [[A:%.*]], 2.000000e+00
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[FNEG]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[B]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[ADD:%.*]] = fsub double [[C:%.*]], [[B]]
+; CHECK-NEXT:    [[SUB:%.*]] = fsub double [[FNEG]], [[C]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> poison, double [[SUB]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP1]], double [[ADD]], i32 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> poison, double [[MUL]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP8:%.*]] = fdiv <2 x double> [[TMP5]], [[TMP7]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll b/llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll
index 4a5dd2a63723e..a63b34cb3079c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll
@@ -4,36 +4,32 @@
 define i16 @test() {
 ; CHECK-LABEL: define i16 @test() {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[TMP0:%.*]] = lshr i32 0, 0
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 0
 ; CHECK-NEXT:    [[CALL99_I:%.*]] = call i32 @llvm.bswap.i32(i32 0)
+; CHECK-NEXT:    [[TMP0:%.*]] = lshr i32 0, 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = lshr i32 [[CALL99_I]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = and <2 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = shl i32 0, 0
-; CHECK-NEXT:    [[UNSCLEAR186_I:%.*]] = and i32 [[TMP6]], 0
-; CHECK-NEXT:    [[TMP7:%.*]] = shl i32 0, 0
+; CHECK-NEXT:    [[TMP10:%.*]] = shl i32 [[TMP0]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef>, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; CHECK-NEXT:    [[TMP7:%.*]] = and <8 x i32> [[TMP6]], zeroinitializer
 ; CHECK-NEXT:    [[CALL7_I45:%.*]] = tail call i32 null(i32 0)
 ; CHECK-NEXT:    [[TMP8:%.*]] = lshr i32 [[CALL7_I45]], 0
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP8]], i32 1
-; CHECK-NEXT:    [[TMP11:%.*]] = and <2 x i32> [[TMP10]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = and i32 [[TMP8]], 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = lshr i32 0, 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[TMP12]], i32 0
 ; CHECK-NEXT:    [[TMP14:%.*]] = shl <2 x i32> [[TMP13]], zeroinitializer
 ; CHECK-NEXT:    [[TMP15:%.*]] = and <2 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <24 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <24 x i32> <i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0>, <24 x i32> [[TMP16]], <24 x i32> <i32 0, i32 1, i32 24, i32 25, i32 poison, i32 5, i32 poison, i32 7, i32 poison, i32 poison, i32 poison, i32 11, i32 poison, i32 poison, i32 poison, i32 15, i32 poison, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 22, i32 23>
-; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <24 x i32> [[TMP17]], <24 x i32> <i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 24, i32 5, i32 26, i32 7, i32 28, i32 29, i32 poison, i32 11, i32 poison, i32 poison, i32 poison, i32 15, i32 poison, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 22, i32 23>
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <24 x i32> [[TMP18]], i32 [[UNSCLEAR186_I]], i32 10
-; CHECK-NEXT:    [[TMP20:%.*]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> poison, <24 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <24 x i32> [[TMP19]], <24 x i32> [[TMP20]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> poison, <24 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 3, i32 poison, i32 4, i32 5, i32 6, i32 poison, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP17:%.*]] = shufflevector <24 x i32> <i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0>, <24 x i32> [[TMP16]], <24 x i32> <i32 0, i32 1, i32 24, i32 25, i32 26, i32 5, i32 28, i32 7, i32 30, i32 31, i32 32, i32 11, i32 34, i32 poison, i32 poison, i32 15, i32 poison, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 22, i32 23>
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <24 x i32> [[TMP17]], i32 [[TMP9]], i32 13
 ; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <24 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <24 x i32> [[TMP21]], <24 x i32> [[TMP22]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 24, i32 15, i32 25, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 22, i32 23>
 ; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <24 x i32> [[TMP23]], <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 24, i32 25, i32 26, i32 27, i32 22, i32 23>
 ; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <24 x i32> [[TMP24]], <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 0, i32 poison, i32 0, i32 0, i32 0, i32 poison, i32 0, i32 0, i32 0, i32 poison, i32 0, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <24 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 4, i32 30, i32 6, i32 32, i32 33, i32 34, i32 poison, i32 36, i32 37, i32 38, i32 poison, i32 40, i32 poison, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
-; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <24 x i32> [[TMP25]], i32 [[UNSCLEAR186_I]], i32 11
+; CHECK-NEXT:    [[TMP35:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> poison, <24 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP36:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> poison, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <24 x i32> [[TMP25]], <24 x i32> [[TMP36]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 30, i32 12, i32 13, i32 14, i32 poison, i32 16, i32 poison, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <24 x i32> <i32 0, i32 poison, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <24 x i32> [[TMP26]], <24 x i32> [[TMP27]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 24, i32 16, i32 26, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; CHECK-NEXT:    [[TMP29:%.*]] = icmp ne <24 x i32> [[TMP24]], [[TMP28]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/same-last-instruction-different-parents.ll b/llvm/test/Transforms/SLPVectorizer/X86/same-last-instruction-different-parents.ll
index ef75a8dd99169..210f59688d59e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/same-last-instruction-different-parents.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/same-last-instruction-different-parents.ll
@@ -4,26 +4,27 @@
 define i32 @test(i32 %0, i1 %1) {
 ; CHECK-LABEL: define i32 @test(
 ; CHECK-SAME: i32 [[TMP0:%.*]], i1 [[TMP1:%.*]]) {
+; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP0]] to double
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = sitofp <2 x i32> [[TMP4]] to <2 x double>
 ; CHECK-NEXT:    [[TMP6:%.*]] = sitofp <2 x i32> [[TMP4]] to <2 x double>
 ; CHECK-NEXT:    br i1 [[TMP1]], label %[[BB7:.*]], label %[[BB9:.*]]
 ; CHECK:       [[BB7]]:
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP6]], <2 x double> zeroinitializer, <2 x double> zeroinitializer)
-; CHECK-NEXT:    br label %[[BB16:.*]]
+; CHECK-NEXT:    br label %[[BB17:.*]]
 ; CHECK:       [[BB9]]:
-; CHECK-NEXT:    br i1 false, label %[[BB14:.*]], label %[[BB10:.*]]
+; CHECK-NEXT:    br i1 false, label %[[BB15:.*]], label %[[BB10:.*]]
 ; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP6]], double [[TMP7]], i32 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> zeroinitializer, <2 x double> [[TMP5]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> <double 0.000000e+00, double poison>, <2 x i32> <i32 2, i32 1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x double> zeroinitializer)
-; CHECK-NEXT:    br label %[[BB14]]
-; CHECK:       [[BB14]]:
-; CHECK-NEXT:    [[TMP15:%.*]] = phi <2 x double> [ [[TMP13]], %[[BB10]] ], [ zeroinitializer, %[[BB9]] ]
-; CHECK-NEXT:    br label %[[BB16]]
-; CHECK:       [[BB16]]:
-; CHECK-NEXT:    [[TMP17:%.*]] = phi <2 x double> [ [[TMP15]], %[[BB14]] ], [ [[TMP8]], %[[BB7]] ]
+; CHECK-NEXT:    br label %[[BB15]]
+; CHECK:       [[BB15]]:
+; CHECK-NEXT:    [[TMP16:%.*]] = phi <2 x double> [ [[TMP13]], %[[BB10]] ], [ zeroinitializer, %[[BB9]] ]
+; CHECK-NEXT:    br label %[[BB17]]
+; CHECK:       [[BB17]]:
+; CHECK-NEXT:    [[TMP17:%.*]] = phi <2 x double> [ [[TMP16]], %[[BB15]] ], [ [[TMP8]], %[[BB7]] ]
 ; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <2 x double> [[TMP17]], i32 0
 ; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x double> [[TMP17]], i32 1
 ; CHECK-NEXT:    [[TMP20:%.*]] = fmul double [[TMP19]], [[TMP18]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scalarize-ctlz.ll b/llvm/test/Transforms/SLPVectorizer/X86/scalarize-ctlz.ll
index 0f9b2e9ba86fd..e67589426bcc6 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/scalarize-ctlz.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/scalarize-ctlz.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-- -mcpu=x86-64 %s | FileCheck %s --check-prefixes=SSE,SSE2
-; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-- -mcpu=x86-64-v2 %s | FileCheck %s --check-prefixes=SSE,SSE4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-- -mcpu=x86-64 %s | FileCheck %s --check-prefixes=SSE
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-- -mcpu=x86-64-v2 %s | FileCheck %s --check-prefixes=SSE
 ; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-- -mcpu=x86-64-v3 %s | FileCheck %s --check-prefixes=AVX2
 ; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-- -mcpu=x86-64-v4 %s | FileCheck %s --check-prefixes=AVX512
 
@@ -95,52 +95,33 @@ define <4 x i64> @scalarize_ctlz_v4i64(<4 x i64> %v)  {
 }
 
 define <8 x i64> @scalarize_ctlz_v8i64(<8 x i64> %v)  {
-; SSE2-LABEL: define <8 x i64> @scalarize_ctlz_v8i64(
-; SSE2-SAME: <8 x i64> [[V:%.*]]) #[[ATTR0]] {
-; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i64> [[V]], <8 x i64> poison, <2 x i32> <i32 0, i32 1>
-; SSE2-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> [[TMP1]], i1 false)
-; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i64> [[V]], <8 x i64> poison, <2 x i32> <i32 2, i32 3>
-; SSE2-NEXT:    [[TMP4:%.*]] = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> [[TMP3]], i1 false)
-; SSE2-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i64> [[V]], <8 x i64> poison, <2 x i32> <i32 4, i32 5>
-; SSE2-NEXT:    [[TMP6:%.*]] = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> [[TMP5]], i1 false)
-; SSE2-NEXT:    [[TMP7:%.*]] = shufflevector <8 x i64> [[V]], <8 x i64> poison, <2 x i32> <i32 6, i32 7>
-; SSE2-NEXT:    [[TMP8:%.*]] = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> [[TMP7]], i1 false)
-; SSE2-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[R31:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[R52:%.*]] = shufflevector <8 x i64> [[R31]], <8 x i64> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
-; SSE2-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT:    [[R73:%.*]] = shufflevector <8 x i64> [[R52]], <8 x i64> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; SSE2-NEXT:    ret <8 x i64> [[R73]]
-;
-; SSE4-LABEL: define <8 x i64> @scalarize_ctlz_v8i64(
-; SSE4-SAME: <8 x i64> [[V:%.*]]) #[[ATTR0]] {
-; SSE4-NEXT:    [[V0:%.*]] = extractelement <8 x i64> [[V]], i64 0
-; SSE4-NEXT:    [[V1:%.*]] = extractelement <8 x i64> [[V]], i64 1
-; SSE4-NEXT:    [[V2:%.*]] = extractelement <8 x i64> [[V]], i64 2
-; SSE4-NEXT:    [[V3:%.*]] = extractelement <8 x i64> [[V]], i64 3
-; SSE4-NEXT:    [[V4:%.*]] = extractelement <8 x i64> [[V]], i64 4
-; SSE4-NEXT:    [[V5:%.*]] = extractelement <8 x i64> [[V]], i64 5
-; SSE4-NEXT:    [[V6:%.*]] = extractelement <8 x i64> [[V]], i64 6
-; SSE4-NEXT:    [[V7:%.*]] = extractelement <8 x i64> [[V]], i64 7
-; SSE4-NEXT:    [[C0:%.*]] = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[V0]], i1 false)
-; SSE4-NEXT:    [[C1:%.*]] = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[V1]], i1 false)
-; SSE4-NEXT:    [[C2:%.*]] = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[V2]], i1 false)
-; SSE4-NEXT:    [[C3:%.*]] = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[V3]], i1 false)
-; SSE4-NEXT:    [[C4:%.*]] = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[V4]], i1 false)
-; SSE4-NEXT:    [[C5:%.*]] = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[V5]], i1 false)
-; SSE4-NEXT:    [[C6:%.*]] = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[V6]], i1 false)
-; SSE4-NEXT:    [[C7:%.*]] = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[V7]], i1 false)
-; SSE4-NEXT:    [[R0:%.*]] = insertelement <8 x i64> poison, i64 [[C0]], i64 0
-; SSE4-NEXT:    [[R1:%.*]] = insertelement <8 x i64> [[R0]], i64 [[C1]], i64 1
-; SSE4-NEXT:    [[R2:%.*]] = insertelement <8 x i64> [[R1]], i64 [[C2]], i64 2
-; SSE4-NEXT:    [[R3:%.*]] = insertelement <8 x i64> [[R2]], i64 [[C3]], i64 3
-; SSE4-NEXT:    [[R4:%.*]] = insertelement <8 x i64> [[R3]], i64 [[C4]], i64 4
-; SSE4-NEXT:    [[R5:%.*]] = insertelement <8 x i64> [[R4]], i64 [[C5]], i64 5
-; SSE4-NEXT:    [[R6:%.*]] = insertelement <8 x i64> [[R5]], i64 [[C6]], i64 6
-; SSE4-NEXT:    [[R7:%.*]] = insertelement <8 x i64> [[R6]], i64 [[C7]], i64 7
-; SSE4-NEXT:    ret <8 x i64> [[R7]]
+; SSE-LABEL: define <8 x i64> @scalarize_ctlz_v8i64(
+; SSE-SAME: <8 x i64> [[V:%.*]]) #[[ATTR0]] {
+; SSE-NEXT:    [[V0:%.*]] = extractelement <8 x i64> [[V]], i64 0
+; SSE-NEXT:    [[V1:%.*]] = extractelement <8 x i64> [[V]], i64 1
+; SSE-NEXT:    [[V2:%.*]] = extractelement <8 x i64> [[V]], i64 2
+; SSE-NEXT:    [[V3:%.*]] = extractelement <8 x i64> [[V]], i64 3
+; SSE-NEXT:    [[V4:%.*]] = extractelement <8 x i64> [[V]], i64 4
+; SSE-NEXT:    [[V5:%.*]] = extractelement <8 x i64> [[V]], i64 5
+; SSE-NEXT:    [[V6:%.*]] = extractelement <8 x i64> [[V]], i64 6
+; SSE-NEXT:    [[V7:%.*]] = extractelement <8 x i64> [[V]], i64 7
+; SSE-NEXT:    [[C0:%.*]] = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[V0]], i1 false)
+; SSE-NEXT:    [[C1:%.*]] = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[V1]], i1 false)
+; SSE-NEXT:    [[C2:%.*]] = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[V2]], i1 false)
+; SSE-NEXT:    [[C3:%.*]] = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[V3]], i1 false)
+; SSE-NEXT:    [[C4:%.*]] = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[V4]], i1 false)
+; SSE-NEXT:    [[C5:%.*]] = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[V5]], i1 false)
+; SSE-NEXT:    [[C6:%.*]] = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[V6]], i1 false)
+; SSE-NEXT:    [[C7:%.*]] = tail call range(i64 0, 65) i64 @llvm.ctlz.i64(i64 [[V7]], i1 false)
+; SSE-NEXT:    [[R0:%.*]] = insertelement <8 x i64> poison, i64 [[C0]], i64 0
+; SSE-NEXT:    [[R1:%.*]] = insertelement <8 x i64> [[R0]], i64 [[C1]], i64 1
+; SSE-NEXT:    [[R2:%.*]] = insertelement <8 x i64> [[R1]], i64 [[C2]], i64 2
+; SSE-NEXT:    [[R3:%.*]] = insertelement <8 x i64> [[R2]], i64 [[C3]], i64 3
+; SSE-NEXT:    [[R4:%.*]] = insertelement <8 x i64> [[R3]], i64 [[C4]], i64 4
+; SSE-NEXT:    [[R5:%.*]] = insertelement <8 x i64> [[R4]], i64 [[C5]], i64 5
+; SSE-NEXT:    [[R6:%.*]] = insertelement <8 x i64> [[R5]], i64 [[C6]], i64 6
+; SSE-NEXT:    [[R7:%.*]] = insertelement <8 x i64> [[R6]], i64 [[C7]], i64 7
+; SSE-NEXT:    ret <8 x i64> [[R7]]
 ;
 ; AVX2-LABEL: define <8 x i64> @scalarize_ctlz_v8i64(
 ; AVX2-SAME: <8 x i64> [[V:%.*]]) #[[ATTR0]] {
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-node-reorder-node-with-ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-node-reorder-node-with-ops.ll
index cfff11758a37a..2b3f00dc21769 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/split-node-reorder-node-with-ops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/split-node-reorder-node-with-ops.ll
@@ -6,11 +6,17 @@ define void @test(i32 %0, i8 %1, i64 %2, float %3) {
 ; CHECK-SAME: i32 [[TMP0:%.*]], i8 [[TMP1:%.*]], i64 [[TMP2:%.*]], float [[TMP3:%.*]]) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = and <2 x i64> [[TMP6]], <i64 255, i64 -65536>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> <i64 1, i64 poison>, <2 x i32> <i32 2, i32 0>
-; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i64> [[TMP7]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = lshr <2 x i64> [[TMP9]], <i64 1, i64 16>
-; CHECK-NEXT:    [[TMP11:%.*]] = trunc <2 x i64> [[TMP10]] to <2 x i8>
+; CHECK-NEXT:    [[TMP7:%.*]] = and <2 x i64> [[TMP6]], <i64 -65536, i64 255>
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[TMP2]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[TMP7]], i32 1
+; CHECK-NEXT:    [[TMP52:%.*]] = add i64 [[TMP10]], 1
+; CHECK-NEXT:    [[TMP53:%.*]] = lshr i64 [[TMP9]], 16
+; CHECK-NEXT:    [[TMP58:%.*]] = lshr i64 [[TMP52]], 1
+; CHECK-NEXT:    [[TMP90:%.*]] = trunc i64 [[TMP53]] to i8
+; CHECK-NEXT:    [[TMP91:%.*]] = trunc i64 [[TMP58]] to i8
+; CHECK-NEXT:    [[TMP92:%.*]] = insertelement <2 x i8> poison, i8 [[TMP91]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x i8> [[TMP92]], i8 [[TMP90]], i32 1
 ; CHECK-NEXT:    [[TMP12:%.*]] = call <2 x i8> @llvm.smax.v2i8(<2 x i8> [[TMP11]], <2 x i8> zeroinitializer)
 ; CHECK-NEXT:    [[TMP13:%.*]] = uitofp <2 x i8> [[TMP12]] to <2 x float>
 ; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i32 0
@@ -25,7 +31,7 @@ define void @test(i32 %0, i8 %1, i64 %2, float %3) {
 ; CHECK-NEXT:    [[TMP23:%.*]] = sub <2 x i32> zeroinitializer, [[TMP22]]
 ; CHECK-NEXT:    [[TMP24:%.*]] = ashr <2 x i32> [[TMP23]], splat (i32 1)
 ; CHECK-NEXT:    [[TMP25:%.*]] = sitofp <2 x i32> [[TMP24]] to <2 x float>
-; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <4 x float> [[TMP26]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP51:%.*]] = shufflevector <4 x float> [[TMP16]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <8 x float> [[TMP27]], <8 x float> [[TMP51]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
@@ -51,12 +57,12 @@ define void @test(i32 %0, i8 %1, i64 %2, float %3) {
 ; CHECK-NEXT:    [[TMP48:%.*]] = select i1 [[TMP47]], i64 0, i64 8388608
 ; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <8 x i1> [[TMP32]], i32 1
 ; CHECK-NEXT:    [[TMP50:%.*]] = select i1 [[TMP49]], i64 0, i64 32768
-; CHECK-NEXT:    br label %[[BB53:.*]]
-; CHECK:       [[BB52:.*]]:
+; CHECK-NEXT:    br label %[[BB59:.*]]
+; CHECK:       [[BB58:.*]]:
 ; CHECK-NEXT:    unreachable
-; CHECK:       [[BB53]]:
-; CHECK-NEXT:    br label %[[BB54:.*]]
-; CHECK:       [[BB54]]:
+; CHECK:       [[BB59]]:
+; CHECK-NEXT:    br label %[[BB60:.*]]
+; CHECK:       [[BB60]]:
 ; CHECK-NEXT:    [[TMP54:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[TMP17]])
 ; CHECK-NEXT:    [[TMP55:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[TMP21]])
 ; CHECK-NEXT:    [[TMP56:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0
@@ -96,7 +102,7 @@ define void @test(i32 %0, i8 %1, i64 %2, float %3) {
 ; CHECK-NEXT:    [[TMP85:%.*]] = or i64 [[TMP84]], [[TMP48]]
 ; CHECK-NEXT:    [[TMP86:%.*]] = or i64 [[TMP85]], [[TMP81]]
 ; CHECK-NEXT:    store i64 [[TMP86]], ptr null, align 1
-; CHECK-NEXT:    br label %[[BB52]]
+; CHECK-NEXT:    br label %[[BB58]]
 ;
   %5 = and i64 %2, 255
   %6 = and i64 %2, -65536
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll b/llvm/test/Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll
index 3bafc3c6552f2..252746b465bc6 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll
@@ -5,14 +5,20 @@ define i1 @test(i64 %v1, ptr %v2, i32 %v3, i1 %v4) {
 ; CHECK-LABEL: define i1 @test(
 ; CHECK-SAME: i64 [[V1:%.*]], ptr [[V2:%.*]], i32 [[V3:%.*]], i1 [[V4:%.*]]) {
 ; CHECK-NEXT:  [[NEWFUNCROOT:.*:]]
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i64> poison, i64 [[V1]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], <i64 32, i64 40>
-; CHECK-NEXT:    [[TMP3:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i8>
-; CHECK-NEXT:    [[TMP4:%.*]] = and <2 x i8> [[TMP3]], <i8 1, i8 -1>
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <2 x i8> [[TMP4]] to <2 x i32>
+; CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[V1]], 40
+; CHECK-NEXT:    [[TT3:%.*]] = lshr i64 [[V1]], 32
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[TT3]] to i32
+; CHECK-NEXT:    [[TT2:%.*]] = and i32 [[TMP1]], 255
+; CHECK-NEXT:    [[TT1:%.*]] = and i32 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i32 [[TT1]] to i8
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i8> poison, i8 [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP33:%.*]] = trunc i32 [[TT2]] to i8
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i8> [[TMP7]], i8 [[TMP33]], i32 1
 ; CHECK-NEXT:    [[TMP9:%.*]] = zext <2 x i8> [[TMP4]] to <2 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq <2 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT:    [[TMP34:%.*]] = insertelement <2 x i32> poison, i32 [[TT1]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> [[TMP34]], i32 [[TT2]], i32 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> poison, i32 [[V3]], i32 0
 ; CHECK-NEXT:    [[TMP30:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 0>
 ; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/trunced-buildvector-scalar-extended.ll b/llvm/test/Transforms/SLPVectorizer/X86/trunced-buildvector-scalar-extended.ll
index ffeb8045dea7e..6623e9b8ecc84 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/trunced-buildvector-scalar-extended.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/trunced-buildvector-scalar-extended.ll
@@ -7,8 +7,15 @@ define <4 x float> @test(i64 %0) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i64> <i64 0, i64 0, i64 poison, i64 0>, i64 [[TMP0]], i32 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32>
-; CHECK-NEXT:    [[TMP4:%.*]] = sitofp <4 x i64> [[TMP3]] to <4 x float>
+; CHECK-NEXT:    [[TMP11:%.*]] = sitofp i64 0 to float
+; CHECK-NEXT:    [[TMP12:%.*]] = sitofp i64 0 to float
+; CHECK-NEXT:    [[TMP13:%.*]] = sitofp i64 [[TMP0]] to float
+; CHECK-NEXT:    [[TMP14:%.*]] = sitofp i64 0 to float
 ; CHECK-NEXT:    [[TMP5:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x float>
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP12]], i32 1
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP13]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP14]], i32 3
 ; CHECK-NEXT:    [[TMP6:%.*]] = fadd <4 x float> [[TMP4]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = fcmp ogt <4 x float> [[TMP6]], zeroinitializer
 ; CHECK-NEXT:    ret <4 x float> [[TMP6]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll
index 4d1f6a1aa074b..5129411196e03 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll
@@ -446,12 +446,11 @@ define void @reuse_shuffle_indices_cost_crash_3(ptr %m, double %conv, double %co
 ; CHECK-SAME: ptr [[M:%.*]], double [[CONV:%.*]], double [[CONV2:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[SUB19:%.*]] = fsub double 0.000000e+00, [[CONV2]]
-; CHECK-NEXT:    [[CONV20:%.*]] = fptrunc double [[SUB19]] to float
-; CHECK-NEXT:    store float [[CONV20]], ptr [[M]], align 4
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd double [[CONV]], 0.000000e+00
-; CHECK-NEXT:    [[CONV239:%.*]] = fptrunc double [[ADD]] to float
-; CHECK-NEXT:    [[ARRAYIDX25:%.*]] = getelementptr [4 x float], ptr [[M]], i64 0, i64 1
-; CHECK-NEXT:    store float [[CONV239]], ptr [[ARRAYIDX25]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[SUB19]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[ADD]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = fptrunc <2 x double> [[TMP1]] to <2 x float>
+; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr [[M]], align 4
 ; CHECK-NEXT:    [[ADD26:%.*]] = fsub double [[CONV]], [[CONV]]
 ; CHECK-NEXT:    [[CONV27:%.*]] = fptrunc double [[ADD26]] to float
 ; CHECK-NEXT:    [[ARRAYIDX29:%.*]] = getelementptr [4 x float], ptr [[M]], i64 0, i64 2
@@ -522,12 +521,11 @@ define void @common_mask(ptr %m, double %conv, double %conv2) {
 ; CHECK-SAME: ptr [[M:%.*]], double [[CONV:%.*]], double [[CONV2:%.*]]) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[SUB19:%.*]] = fsub double [[CONV]], [[CONV]]
-; CHECK-NEXT:    [[CONV20:%.*]] = fptrunc double [[SUB19]] to float
-; CHECK-NEXT:    store float [[CONV20]], ptr [[M]], align 4
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd double [[CONV2]], 0.000000e+00
-; CHECK-NEXT:    [[CONV239:%.*]] = fptrunc double [[ADD]] to float
-; CHECK-NEXT:    [[ARRAYIDX25:%.*]] = getelementptr [4 x float], ptr [[M]], i64 0, i64 1
-; CHECK-NEXT:    store float [[CONV239]], ptr [[ARRAYIDX25]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> poison, double [[SUB19]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[ADD]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = fptrunc <2 x double> [[TMP1]] to <2 x float>
+; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr [[M]], align 4
 ; CHECK-NEXT:    [[ADD26:%.*]] = fsub double 0.000000e+00, [[CONV]]
 ; CHECK-NEXT:    [[CONV27:%.*]] = fptrunc double [[ADD26]] to float
 ; CHECK-NEXT:    [[ARRAYIDX29:%.*]] = getelementptr [4 x float], ptr [[M]], i64 0, i64 2
diff --git a/llvm/test/Transforms/SLPVectorizer/gather_extract_from_vectorbuild.ll b/llvm/test/Transforms/SLPVectorizer/gather_extract_from_vectorbuild.ll
index c1ec9b8eeadff..840767ac511a5 100644
--- a/llvm/test/Transforms/SLPVectorizer/gather_extract_from_vectorbuild.ll
+++ b/llvm/test/Transforms/SLPVectorizer/gather_extract_from_vectorbuild.ll
@@ -1,21 +1,34 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=x86_64 < %s | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=aarch64 < %s | FileCheck %s %}
+; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=x86_64 < %s | FileCheck %s --check-prefixes=CHECK,X86 %}
+; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=aarch64 < %s | FileCheck %s --check-prefixes=CHECK,AARCH64 %}
 ; Vectorization tree roots at vector build sequence (insertelement),
 ; SLP crashed on generating vector code for pair {%i4, 0.0} trying to produce
 ; a shuffle with %ins1 as a source because it was marked deleted
 ; due to vectorization.
 
 define void @test() {
-; CHECK-LABEL: define void @test() {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[LOOP:%.*]]
-; CHECK:       loop:
-; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[TMP3:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <2 x float> zeroinitializer, [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = select <2 x i1> zeroinitializer, <2 x float> [[TMP1]], <2 x float> zeroinitializer
-; CHECK-NEXT:    [[TMP3]] = shufflevector <2 x float> [[TMP2]], <2 x float> <float poison, float 0.000000e+00>, <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    br label [[LOOP]]
+; X86-LABEL: define void @test() {
+; X86-NEXT:  entry:
+; X86-NEXT:    br label [[LOOP:%.*]]
+; X86:       loop:
+; X86-NEXT:    [[PH0:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[I4:%.*]], [[LOOP]] ]
+; X86-NEXT:    [[PH1:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ 0.000000e+00, [[LOOP]] ]
+; X86-NEXT:    [[TMP0:%.*]] = insertelement <2 x float> poison, float [[PH0]], i32 0
+; X86-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> [[TMP0]], float [[PH1]], i32 1
+; X86-NEXT:    [[TMP2:%.*]] = fadd <2 x float> zeroinitializer, [[TMP1]]
+; X86-NEXT:    [[TMP3:%.*]] = select <2 x i1> zeroinitializer, <2 x float> [[TMP2]], <2 x float> zeroinitializer
+; X86-NEXT:    [[I4]] = extractelement <2 x float> [[TMP3]], i64 0
+; X86-NEXT:    br label [[LOOP]]
+;
+; AARCH64-LABEL: define void @test() {
+; AARCH64-NEXT:  entry:
+; AARCH64-NEXT:    br label [[LOOP:%.*]]
+; AARCH64:       loop:
+; AARCH64-NEXT:    [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[TMP3:%.*]], [[LOOP]] ]
+; AARCH64-NEXT:    [[TMP1:%.*]] = fadd <2 x float> zeroinitializer, [[TMP0]]
+; AARCH64-NEXT:    [[TMP2:%.*]] = select <2 x i1> zeroinitializer, <2 x float> [[TMP1]], <2 x float> zeroinitializer
+; AARCH64-NEXT:    [[TMP3]] = shufflevector <2 x float> [[TMP2]], <2 x float> <float poison, float 0.000000e+00>, <2 x i32> <i32 0, i32 3>
+; AARCH64-NEXT:    br label [[LOOP]]
 ;
 entry:
   br label %loop
diff --git a/llvm/test/Transforms/SLPVectorizer/vectorize-reorder-alt-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/vectorize-reorder-alt-shuffle.ll
index f0f8377d637f9..13d47e5c11181 100644
--- a/llvm/test/Transforms/SLPVectorizer/vectorize-reorder-alt-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/vectorize-reorder-alt-shuffle.ll
@@ -1,23 +1,50 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %}
+; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=X86 %}
+; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=AARCH64 %}
 
 define void @foo(ptr %c, ptr %d) {
-; CHECK-LABEL: @foo(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[C:%.*]], i64 1
-; CHECK-NEXT:    [[ADD_PTR53:%.*]] = getelementptr inbounds float, ptr [[D:%.*]], i64 -4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[ARRAYIDX4]], align 1
-; CHECK-NEXT:    [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw <4 x i32> [[TMP2]], <i32 2, i32 2, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP4:%.*]] = and <4 x i32> [[TMP2]], <i32 2, i32 2, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <4 x i32> undef, [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = sitofp <4 x i32> [[TMP6]] to <4 x float>
-; CHECK-NEXT:    [[TMP8:%.*]] = fdiv <4 x float> [[TMP7]], undef
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
-; CHECK-NEXT:    store <4 x float> [[TMP9]], ptr [[ADD_PTR53]], align 4
-; CHECK-NEXT:    ret void
+; X86-LABEL: @foo(
+; X86-NEXT:  entry:
+; X86-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[C:%.*]], i64 4
+; X86-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 1
+; X86-NEXT:    [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 2
+; X86-NEXT:    [[ADD_PTR53:%.*]] = getelementptr inbounds float, ptr [[D:%.*]], i64 -4
+; X86-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; X86-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; X86-NEXT:    [[CONV5:%.*]] = zext i8 [[TMP0]] to i32
+; X86-NEXT:    [[CONV2:%.*]] = zext i8 [[TMP1]] to i32
+; X86-NEXT:    [[SHL6:%.*]] = shl nuw nsw i32 [[CONV5]], 2
+; X86-NEXT:    [[AND:%.*]] = and i32 [[CONV2]], 3
+; X86-NEXT:    [[TMP2:%.*]] = load <2 x i8>, ptr [[ARRAYIDX12]], align 1
+; X86-NEXT:    [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i16>
+; X86-NEXT:    [[TMP4:%.*]] = shl <2 x i16> [[TMP3]], splat (i16 2)
+; X86-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[SHL6]], i32 0
+; X86-NEXT:    [[TMP6:%.*]] = zext <2 x i16> [[TMP4]] to <2 x i32>
+; X86-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; X86-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP7]], <4 x i32> <i32 0, i32 4, i32 5, i32 poison>
+; X86-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[AND]], i32 3
+; X86-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> undef, [[TMP9]]
+; X86-NEXT:    [[TMP11:%.*]] = sitofp <4 x i32> [[TMP10]] to <4 x float>
+; X86-NEXT:    [[TMP12:%.*]] = fdiv <4 x float> [[TMP11]], undef
+; X86-NEXT:    [[TMP13:%.*]] = shufflevector <4 x float> [[TMP12]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+; X86-NEXT:    store <4 x float> [[TMP13]], ptr [[ADD_PTR53]], align 4
+; X86-NEXT:    ret void
+;
+; AARCH64-LABEL: @foo(
+; AARCH64-NEXT:  entry:
+; AARCH64-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[C:%.*]], i64 1
+; AARCH64-NEXT:    [[ADD_PTR53:%.*]] = getelementptr inbounds float, ptr [[D:%.*]], i64 -4
+; AARCH64-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[ARRAYIDX4]], align 1
+; AARCH64-NEXT:    [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i32>
+; AARCH64-NEXT:    [[TMP2:%.*]] = shl nuw nsw <4 x i32> [[TMP1]], <i32 2, i32 2, i32 2, i32 3>
+; AARCH64-NEXT:    [[TMP3:%.*]] = and <4 x i32> [[TMP1]], <i32 2, i32 2, i32 2, i32 3>
+; AARCH64-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; AARCH64-NEXT:    [[TMP5:%.*]] = add nsw <4 x i32> undef, [[TMP4]]
+; AARCH64-NEXT:    [[TMP6:%.*]] = sitofp <4 x i32> [[TMP5]] to <4 x float>
+; AARCH64-NEXT:    [[TMP7:%.*]] = fdiv <4 x float> [[TMP6]], undef
+; AARCH64-NEXT:    [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
+; AARCH64-NEXT:    store <4 x float> [[TMP8]], ptr [[ADD_PTR53]], align 4
+; AARCH64-NEXT:    ret void
 ;
 entry:
   %arrayidx1 = getelementptr inbounds i8, ptr %c, i64 4

From 8e7ac4acc906e2ad9f5a73b4218f6d70a02431d7 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev@outlook.com>
Date: Sun, 5 Oct 2025 07:55:43 -0700
Subject: [PATCH 2/2] Fix formatting

Created using spr 1.3.7
---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index b633dd4d9fdb0..95e4c7781800d 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -16129,10 +16129,11 @@ InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable(
 
   LLVM_DEBUG(dbgs() << "SLP: Recalculate costs after tree trimming.\n");
   Cost = 0;
-  for (const auto &P : NodesCosts){
+  for (const auto &P : NodesCosts) {
     Cost += P.second;
     LLVM_DEBUG(dbgs() << "SLP: Adding cost " << P.second << " for bundle "
-                      << shortBundleName(P.first->Scalars, P.first->Idx) << ".\n"
+                      << shortBundleName(P.first->Scalars, P.first->Idx)
+                      << ".\n"
                       << "SLP: Current total cost = " << Cost << "\n");
   }
   return Cost;
@@ -17897,7 +17898,7 @@ Value *BoUpSLP::gather(
       ArrayRef<TreeEntry *> Entries = getTreeEntries(V);
       const auto *It = find_if(Entries, [&](const TreeEntry *E) {
         return !TransformedToGatherNodes.contains(E) &&
-           !DeletedNodes.contains(E);
+               !DeletedNodes.contains(E);
       });
       if (It != Entries.end()) {
         // Find which lane we need to extract.