diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index b2124c6106198..feb0af61b15c2 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -800,6 +800,12 @@ class TargetTransformInfo { /// Return true if the target supports strided load. bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const; + /// Return true if the target supports interleaved access for the given vector + /// type \p VTy, interleave factor \p Factor, alignment \p Alignment and + /// address space \p AddrSpace. + bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, + Align Alignment, unsigned AddrSpace) const; + // Return true if the target supports masked vector histograms. bool isLegalMaskedVectorHistogram(Type *AddrType, Type *DataType) const; @@ -1906,6 +1912,10 @@ class TargetTransformInfo::Concept { virtual bool isLegalMaskedCompressStore(Type *DataType, Align Alignment) = 0; virtual bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment) = 0; virtual bool isLegalStridedLoadStore(Type *DataType, Align Alignment) = 0; + virtual bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, + Align Alignment, + unsigned AddrSpace) = 0; + virtual bool isLegalMaskedVectorHistogram(Type *AddrType, Type *DataType) = 0; virtual bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, @@ -2417,6 +2427,12 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { bool isLegalStridedLoadStore(Type *DataType, Align Alignment) override { return Impl.isLegalStridedLoadStore(DataType, Alignment); } + bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, + Align Alignment, + unsigned AddrSpace) override { + return Impl.isLegalInterleavedAccessType(VTy, Factor, Alignment, AddrSpace); + } + bool isLegalMaskedVectorHistogram(Type *AddrType, Type *DataType) override { return 
Impl.isLegalMaskedVectorHistogram(AddrType, DataType); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 11b07ac0b7fc4..2ed9a1d583d89 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -321,6 +321,11 @@ class TargetTransformInfoImplBase { return false; } + bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, + Align Alignment, unsigned AddrSpace) { + return false; + } + bool isLegalMaskedVectorHistogram(Type *AddrType, Type *DataType) const { return false; } diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 2c26493bd3f1c..be8bf762e12e1 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -517,6 +517,13 @@ bool TargetTransformInfo::isLegalStridedLoadStore(Type *DataType, return TTIImpl->isLegalStridedLoadStore(DataType, Alignment); } +bool TargetTransformInfo::isLegalInterleavedAccessType( + VectorType *VTy, unsigned Factor, Align Alignment, + unsigned AddrSpace) const { + return TTIImpl->isLegalInterleavedAccessType(VTy, Factor, Alignment, + AddrSpace); +} + bool TargetTransformInfo::isLegalMaskedVectorHistogram(Type *AddrType, Type *DataType) const { return TTIImpl->isLegalMaskedVectorHistogram(AddrType, DataType); diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index cc69e1d118b5a..1ce80fe8ed7e1 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -281,6 +281,12 @@ class RISCVTTIImpl : public BasicTTIImplBase { return TLI->isLegalStridedLoadStore(DataTypeVT, Alignment); } + bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, + Align Alignment, unsigned AddrSpace) { + return TLI->isLegalInterleavedAccessType(VTy, Factor, Alignment, AddrSpace, + 
DL); + } + bool isLegalMaskedCompressStore(Type *DataTy, Align Alignment); bool isVScaleKnownToBeAPowerOfTwo() const { diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 949579772b94d..c07a39fe0dbc6 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1336,6 +1336,8 @@ class BoUpSLP { MustGather.clear(); NonScheduledFirst.clear(); EntryToLastInstruction.clear(); + LoadEntriesToVectorize.clear(); + GatheredLoadsEntriesFirst = NoGatheredLoads; ExternalUses.clear(); ExternalUsesAsOriginalScalar.clear(); for (auto &Iter : BlocksSchedules) { @@ -1352,7 +1354,11 @@ class BoUpSLP { ValueToGatherNodes.clear(); } - unsigned getTreeSize() const { return VectorizableTree.size(); } + unsigned getTreeSize() const { + return GatheredLoadsEntriesFirst == NoGatheredLoads + ? VectorizableTree.size() + : GatheredLoadsEntriesFirst; + } /// Perform LICM and CSE on the newly generated gather sequences. void optimizeGatherSequence(); @@ -1460,11 +1466,14 @@ class BoUpSLP { /// \param VL0 main load value. /// \param Order returned order of load instructions. /// \param PointerOps returned list of pointer operands. + /// \param BestVF return best vector factor, if recursive check found better + /// vectorization sequences rather than masked gather. /// \param TryRecursiveCheck used to check if long masked gather can be /// represented as a serie of loads/insert subvector, if profitable. LoadsState canVectorizeLoads(ArrayRef VL, const Value *VL0, SmallVectorImpl &Order, SmallVectorImpl &PointerOps, + unsigned *BestVF = nullptr, bool TryRecursiveCheck = true) const; OptimizationRemarkEmitter *getORE() { return ORE; } @@ -2827,7 +2836,7 @@ class BoUpSLP { /// This is the recursive part of buildTree. 
void buildTree_rec(ArrayRef Roots, unsigned Depth, - const EdgeInfo &EI); + const EdgeInfo &EI, unsigned InterleaveFactor = 0); /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can /// be vectorized to use the original vector (or aggregate "bitcast" to a @@ -2939,6 +2948,12 @@ class BoUpSLP { /// be beneficial even the tree height is tiny. bool isFullyVectorizableTinyTree(bool ForReduction) const; + /// Run through the list of all gathered loads in the graph and try to find + /// vector loads/masked gathers instead of regular gathers. Later these loads + /// are reshuffled to build final gathered nodes. + void tryToVectorizeGatheredLoads( + ArrayRef>> GatheredLoads); + /// Reorder commutative or alt operands to get better probability of /// generating vectorized code. static void reorderInputsAccordingToOpcode(ArrayRef VL, @@ -3011,7 +3026,8 @@ class BoUpSLP { } bool isOperandGatherNode(const EdgeInfo &UserEI) const { - return isGather() && UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx && + return isGather() && !UserTreeIndices.empty() && + UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx && UserTreeIndices.front().UserTE == UserEI.UserTE; } @@ -3115,7 +3131,19 @@ class BoUpSLP { Instruction *MainOp = nullptr; Instruction *AltOp = nullptr; + /// Interleaving factor for interleaved loads Vectorize nodes. + unsigned InterleaveFactor = 0; + public: + /// Returns interleave factor for interleave nodes. + std::optional getInterleaveFactor() const { + if (InterleaveFactor > 0) + return InterleaveFactor; + return std::nullopt; + } + /// Sets interleaving factor for the interleaving nodes. + void setInterleave(unsigned Factor) { InterleaveFactor = Factor; } + /// Set this bundle's \p OpIdx'th operand to \p OpVL. 
void setOperand(unsigned OpIdx, ArrayRef OpVL) { if (Operands.size() < OpIdx + 1) @@ -3260,7 +3288,12 @@ class BoUpSLP { dbgs() << "State: "; switch (State) { case Vectorize: - dbgs() << "Vectorize\n"; + if (InterleaveFactor > 0) { + dbgs() << "Vectorize with interleave factor " << InterleaveFactor + << "\n"; + } else { + dbgs() << "Vectorize\n"; + } break; case ScatterVectorize: dbgs() << "ScatterVectorize\n"; @@ -3330,11 +3363,15 @@ class BoUpSLP { const InstructionsState &S, const EdgeInfo &UserTreeIdx, ArrayRef ReuseShuffleIndices = std::nullopt, - ArrayRef ReorderIndices = std::nullopt) { + ArrayRef ReorderIndices = std::nullopt, + unsigned InterleaveFactor = 0) { TreeEntry::EntryState EntryState = Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather; - return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx, - ReuseShuffleIndices, ReorderIndices); + TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx, + ReuseShuffleIndices, ReorderIndices); + if (E && InterleaveFactor) + E->setInterleave(InterleaveFactor); + return E; } TreeEntry *newTreeEntry(ArrayRef VL, @@ -3347,6 +3384,12 @@ class BoUpSLP { assert(((!Bundle && EntryState == TreeEntry::NeedToGather) || (Bundle && EntryState != TreeEntry::NeedToGather)) && "Need to vectorize gather entry?"); + // Gathered loads still gathered? Do not create entry, use the original one. + if (GatheredLoadsEntriesFirst != NoGatheredLoads && + EntryState == TreeEntry::NeedToGather && + S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX && + !UserTreeIdx.UserTE) + return nullptr; VectorizableTree.push_back(std::make_unique(VectorizableTree)); TreeEntry *Last = VectorizableTree.back().get(); Last->Idx = VectorizableTree.size() - 1; @@ -3456,7 +3499,7 @@ class BoUpSLP { /// and fills required data before actual scheduling of the instructions. 
TreeEntry::EntryState getScalarsVectorizationState( InstructionsState &S, ArrayRef VL, bool IsScatterVectorizeUserTE, - OrdersType &CurrentOrder, SmallVectorImpl &PointerOps) const; + OrdersType &CurrentOrder, SmallVectorImpl &PointerOps); /// Maps a specific scalar to its tree entry. SmallDenseMap ScalarToTreeEntry; @@ -3491,6 +3534,14 @@ class BoUpSLP { DenseMap>; ValueToGatherNodesMap ValueToGatherNodes; + /// A list of the loads, which can be vectorized using strided or masked + /// gather approach, but attempted to be represented as interleaved loads. + SetVector LoadEntriesToVectorize; + + /// The index of the first gathered load entry in the VectorizeTree. + constexpr static int NoGatheredLoads = -1; + int GatheredLoadsEntriesFirst = NoGatheredLoads; + /// This POD struct describes one external user in the vectorized tree. struct ExternalUser { ExternalUser(Value *S, llvm::User *U, int L) @@ -4662,15 +4713,19 @@ getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args); } -BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( - ArrayRef VL, const Value *VL0, SmallVectorImpl &Order, - SmallVectorImpl &PointerOps, bool TryRecursiveCheck) const { +BoUpSLP::LoadsState +BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, + SmallVectorImpl &Order, + SmallVectorImpl &PointerOps, + unsigned *BestVF, bool TryRecursiveCheck) const { // Check that a vectorized load would load the same memory as a scalar // load. For example, we don't want to vectorize loads that are smaller // than 8-bit. Even though we have a packed struct {} LLVM // treats loading/storing it as an i8 struct. If we vectorize loads/stores // from such a struct, we read/write packed bits disagreeing with the // unvectorized version. 
+ if (BestVF) + *BestVF = 0; Type *ScalarTy = VL0->getType(); if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) @@ -4780,25 +4835,91 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( } } } - auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment) { + // Correctly identify compare the cost of loads + shuffles rather than + // strided/masked gather loads. Returns true if vectorized + shuffles + // representation is better than just gather. + auto CheckForShuffledLoads = [&, &TTI = *TTI]( + Align CommonAlignment, unsigned *BestVF, + bool ProfitableGatherPointers) { + // Compare masked gather cost and loads + insert subvector costs. + if (BestVF) + *BestVF = 0; + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + auto [ScalarGEPCost, VectorGEPCost] = + getGEPCosts(TTI, PointerOps, PointerOps.front(), + Instruction::GetElementPtr, CostKind, ScalarTy, VecTy); + // Estimate the cost of masked gather GEP. If not a splat, roughly + // estimate as a buildvector, otherwise estimate as splat. + if (static_cast( + count_if(PointerOps, IsaPred)) < + PointerOps.size() - 1 || + any_of(PointerOps, [&](Value *V) { + return getUnderlyingObject(V) != + getUnderlyingObject(PointerOps.front()); + })) + VectorGEPCost += TTI.getScalarizationOverhead( + VecTy, + APInt::getAllOnes(VecTy->getElementCount().getKnownMinValue()), + /*Insert=*/true, /*Extract=*/false, CostKind); + else + VectorGEPCost += + TTI.getScalarizationOverhead( + VecTy, + APInt::getOneBitSet(VecTy->getElementCount().getKnownMinValue(), + 0), + /*Insert=*/true, /*Extract=*/false, CostKind) + + ::getShuffleCost(TTI, TTI::SK_Broadcast, VecTy, std::nullopt, + CostKind); + // The cost of scalar loads. + InstructionCost ScalarLoadsCost = + std::accumulate(VL.begin(), VL.end(), InstructionCost(), + [&](InstructionCost C, Value *V) { + return C + TTI.getInstructionCost( + cast(V), CostKind); + }) + + ScalarGEPCost; + // The cost of masked gather. 
+ InstructionCost MaskedGatherCost = + TTI.getGatherScatterOpCost(Instruction::Load, VecTy, + cast(VL0)->getPointerOperand(), + /*VariableMask=*/false, CommonAlignment, + CostKind) + + (ProfitableGatherPointers ? 0 : VectorGEPCost); + APInt DemandedElts = APInt::getAllOnes(VecTy->getNumElements()); + InstructionCost GatherCost = + TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true, + /*Extract=*/false, CostKind) + + ScalarLoadsCost; + // The list of loads is small or perform partial check already - directly + // compare masked gather cost and gather cost. + constexpr unsigned ListLimit = 4; + if (!TryRecursiveCheck || VL.size() < ListLimit) + return MaskedGatherCost - GatherCost >= -SLPCostThreshold; unsigned Sz = DL->getTypeSizeInBits(ScalarTy); - unsigned MinVF = getMinVF(Sz); - unsigned MaxVF = std::max(bit_floor(VL.size() / 2), MinVF); + unsigned MinVF = 2; + unsigned MaxVF = bit_floor(VL.size() / 2); MaxVF = std::min(getMaximumVF(Sz, Instruction::Load), MaxVF); + DemandedElts.clearAllBits(); + // Iterate through possible vectorization factors and check if vectorized + // + shuffles is better than just gather. for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2) { - unsigned VectorizedCnt = 0; SmallVector States; - for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; - Cnt += VF, ++VectorizedCnt) { + for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) { ArrayRef Slice = VL.slice(Cnt, VF); SmallVector Order; SmallVector PointerOps; LoadsState LS = - canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, + canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, BestVF, /*TryRecursiveCheck=*/false); // Check that the sorted loads are consecutive. - if (LS == LoadsState::Gather) - break; + if (LS == LoadsState::Gather) { + if (BestVF) { + DemandedElts.setAllBits(); + break; + } + DemandedElts.setBits(Cnt, Cnt + VF); + continue; + } // If need the reorder - consider as high-cost masked gather for now. 
if ((LS == LoadsState::Vectorize || LS == LoadsState::StridedVectorize) && @@ -4806,79 +4927,97 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( LS = LoadsState::ScatterVectorize; States.push_back(LS); } + if (DemandedElts.isAllOnes()) + // All loads gathered - try smaller VF. + continue; + InstructionCost ScalarVFGEPCost = 0; // Can be vectorized later as a serie of loads/insertelements. - if (VectorizedCnt == VL.size() / VF) { - // Compare masked gather cost and loads + insersubvector costs. - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts( - TTI, PointerOps, PointerOps.front(), Instruction::GetElementPtr, - CostKind, ScalarTy, VecTy); - InstructionCost MaskedGatherCost = - TTI.getGatherScatterOpCost( - Instruction::Load, VecTy, - cast(VL0)->getPointerOperand(), - /*VariableMask=*/false, CommonAlignment, CostKind) + - VectorGEPCost - ScalarGEPCost; - InstructionCost VecLdCost = 0; - auto *SubVecTy = getWidenedType(ScalarTy, VF); - for (auto [I, LS] : enumerate(States)) { - auto *LI0 = cast(VL[I * VF]); - switch (LS) { - case LoadsState::Vectorize: { - auto [ScalarGEPCost, VectorGEPCost] = - getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF), - LI0->getPointerOperand(), Instruction::Load, - CostKind, ScalarTy, SubVecTy); - VecLdCost += TTI.getMemoryOpCost( - Instruction::Load, SubVecTy, LI0->getAlign(), - LI0->getPointerAddressSpace(), CostKind, - TTI::OperandValueInfo()) + - VectorGEPCost - ScalarGEPCost; - break; - } - case LoadsState::StridedVectorize: { - auto [ScalarGEPCost, VectorGEPCost] = - getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF), - LI0->getPointerOperand(), Instruction::Load, - CostKind, ScalarTy, SubVecTy); + InstructionCost VecLdCost = 0; + if (!DemandedElts.isZero()) { + VecLdCost = + TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true, + /*Extract=*/false, CostKind) + + ScalarGEPCost; + for (unsigned Idx : seq(VL.size())) + if (DemandedElts[Idx]) 
VecLdCost += - TTI.getStridedMemoryOpCost( - Instruction::Load, SubVecTy, LI0->getPointerOperand(), - /*VariableMask=*/false, CommonAlignment, CostKind) + - VectorGEPCost - ScalarGEPCost; - break; - } - case LoadsState::ScatterVectorize: { - auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts( - TTI, ArrayRef(PointerOps).slice(I * VF, VF), - LI0->getPointerOperand(), Instruction::GetElementPtr, - CostKind, ScalarTy, SubVecTy); - VecLdCost += - TTI.getGatherScatterOpCost( - Instruction::Load, SubVecTy, LI0->getPointerOperand(), - /*VariableMask=*/false, CommonAlignment, CostKind) + - VectorGEPCost - ScalarGEPCost; - break; - } - case LoadsState::Gather: - llvm_unreachable( - "Expected only consecutive, strided or masked gather loads."); - } - SmallVector ShuffleMask(VL.size()); - for (int Idx : seq(0, VL.size())) - ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx; + TTI.getInstructionCost(cast(VL[Idx]), CostKind); + } + auto *SubVecTy = getWidenedType(ScalarTy, VF); + for (auto [I, LS] : enumerate(States)) { + auto *LI0 = cast(VL[I * VF]); + InstructionCost VectorGEPCost = + (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers) + ? 
0 + : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF), + LI0->getPointerOperand(), + Instruction::GetElementPtr, CostKind, ScalarTy, + SubVecTy) + .second; + if (LS == LoadsState::ScatterVectorize) { + if (static_cast( + count_if(PointerOps, IsaPred)) < + PointerOps.size() - 1 || + any_of(PointerOps, [&](Value *V) { + return getUnderlyingObject(V) != + getUnderlyingObject(PointerOps.front()); + })) + VectorGEPCost += TTI.getScalarizationOverhead( + SubVecTy, APInt::getAllOnes(VF), + /*Insert=*/true, /*Extract=*/false, CostKind); + else + VectorGEPCost += + TTI.getScalarizationOverhead( + SubVecTy, APInt::getOneBitSet(VF, 0), + /*Insert=*/true, /*Extract=*/false, CostKind) + + ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, + std::nullopt, CostKind); + } + switch (LS) { + case LoadsState::Vectorize: + VecLdCost += TTI.getMemoryOpCost( + Instruction::Load, SubVecTy, LI0->getAlign(), + LI0->getPointerAddressSpace(), CostKind, + TTI::OperandValueInfo()) + + VectorGEPCost; + break; + case LoadsState::StridedVectorize: + VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy, + LI0->getPointerOperand(), + /*VariableMask=*/false, + CommonAlignment, CostKind) + + VectorGEPCost; + break; + case LoadsState::ScatterVectorize: + VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy, + LI0->getPointerOperand(), + /*VariableMask=*/false, + CommonAlignment, CostKind) + + VectorGEPCost; + break; + case LoadsState::Gather: + // Gathers are already calculated - ignore. + continue; + } + SmallVector ShuffleMask(VL.size()); + for (int Idx : seq(0, VL.size())) + ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx; + if (I > 0) VecLdCost += ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask, CostKind, I * VF, SubVecTy); - } - // If masked gather cost is higher - better to vectorize, so - // consider it as a gather node. It will be better estimated - // later. 
- if (MaskedGatherCost >= VecLdCost) - return true; + } + // If masked gather cost is higher - better to vectorize, so + // consider it as a gather node. It will be better estimated + // later. + if (MaskedGatherCost >= VecLdCost && + VecLdCost - GatherCost < -SLPCostThreshold) { + if (BestVF) + *BestVF = VF; + return true; } } - return false; + return MaskedGatherCost - GatherCost >= -SLPCostThreshold; }; // TODO: need to improve analysis of the pointers, if not all of them are // GEPs or have > 2 operands, we end up with a gather node, which just @@ -4900,7 +5039,8 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( !TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) { // Check if potential masked gather can be represented as series // of loads + insertsubvectors. - if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) { + if (CheckForShuffledLoads(CommonAlignment, BestVF, + ProfitableGatherPointers)) { // If masked gather cost is higher - better to vectorize, so // consider it as a gather node. It will be better estimated // later. @@ -5327,6 +5467,16 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { if (TE.Scalars.size() >= 4) if (std::optional Order = findPartiallyOrderedLoads(TE)) return Order; + // Check if can include the order of vectorized loads. For masked gathers do + // extra analysis later, so include such nodes into a special list. 
+ if (TE.isGather() && TE.getOpcode() == Instruction::Load) { + SmallVector PointerOps; + OrdersType CurrentOrder; + LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(), + CurrentOrder, PointerOps); + if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize) + return std::move(CurrentOrder); + } if (std::optional CurrentOrder = findReusedOrderedScalars(TE)) return CurrentOrder; } @@ -6245,6 +6395,513 @@ void BoUpSLP::buildTree(ArrayRef Roots) { buildTree_rec(Roots, 0, EdgeInfo()); } +/// Tries to find subvector of loads and builds new vector of only loads if can +/// be profitable. +static void gatherPossiblyVectorizableLoads( + const BoUpSLP &R, ArrayRef VL, const DataLayout &DL, + ScalarEvolution &SE, const TargetTransformInfo &TTI, + SmallVectorImpl>> &GatheredLoads, + bool AddNew = true) { + if (VL.empty()) + return; + if (!isValidElementType(VL.front()->getType())) + return; + Type *ScalarTy = VL.front()->getType(); + int NumScalars = VL.size(); + auto *VecTy = getWidenedType(ScalarTy, NumScalars); + int NumParts = TTI.getNumberOfParts(VecTy); + if (NumParts == 0 || NumParts >= NumScalars) + NumParts = 1; + unsigned VF = PowerOf2Ceil(NumScalars / NumParts); + SmallVector>> ClusteredLoads; + for (int I : seq(0, NumParts)) { + for (Value *V : + VL.slice(I * VF, std::min(VF, VL.size() - I * VF))) { + auto *LI = dyn_cast(V); + if (!LI) + continue; + if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple()) + continue; + bool IsFound = false; + for (auto &Data : ClusteredLoads) { + if (LI->getParent() != Data.front().first->getParent()) + continue; + std::optional Dist = + getPointersDiff(LI->getType(), LI->getPointerOperand(), + Data.front().first->getType(), + Data.front().first->getPointerOperand(), DL, SE, + /*StrictCheck=*/true); + if (Dist && all_of(Data, [&](const std::pair &Pair) { + IsFound |= Pair.first == LI; + return IsFound || Pair.second != *Dist; + })) { + if (!IsFound) + Data.emplace_back(LI, *Dist); + IsFound = 
true; + break; + } + } + if (!IsFound) + ClusteredLoads.emplace_back().emplace_back(LI, 0); + } + } + auto FindMatchingLoads = + [&](ArrayRef> Loads, + SmallVectorImpl>> + &GatheredLoads, + SetVector &ToAdd, SetVector &Repeated, + int &Offset, unsigned &Start) { + SmallVector> Res; + if (Loads.empty()) + return GatheredLoads.end(); + LoadInst *LI = Loads.front().first; + for (auto [Idx, Data] : enumerate(GatheredLoads)) { + if (Idx < Start) + continue; + ToAdd.clear(); + if (LI->getParent() != Data.front().first->getParent()) + continue; + std::optional Dist = + getPointersDiff(LI->getType(), LI->getPointerOperand(), + Data.front().first->getType(), + Data.front().first->getPointerOperand(), DL, SE, + /*StrictCheck=*/true); + if (Dist) { + // Found matching gathered loads - check if all loads are unique or + // can be effectively vectorized. + unsigned NumUniques = 0; + for (auto [Cnt, Pair] : enumerate(Loads)) { + bool Used = any_of( + Data, [&, &P = Pair](const std::pair &PD) { + return PD.first == P.first; + }); + if (none_of(Data, + [&, &P = Pair](const std::pair &PD) { + return *Dist + P.second == PD.second; + }) && + !Used) { + ++NumUniques; + ToAdd.insert(Cnt); + } + if (Used) + Repeated.insert(Cnt); + } + if (NumUniques > 0 && + (Loads.size() == NumUniques || + (Loads.size() - NumUniques >= 2 && + Loads.size() - NumUniques >= Loads.size() / 2 && + (isPowerOf2_64(Data.size() + NumUniques) || + PowerOf2Ceil(Data.size()) < + PowerOf2Ceil(Data.size() + NumUniques))))) { + Offset = *Dist; + Start = Idx + 1; + return std::next(GatheredLoads.begin(), Idx); + } + } + } + ToAdd.clear(); + return GatheredLoads.end(); + }; + for (ArrayRef> Data : ClusteredLoads) { + unsigned Start = 0; + SetVector ToAdd, LocalToAdd, Repeated; + int Offset = 0; + auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, + Offset, Start); + while (It != GatheredLoads.end()) { + assert(!LocalToAdd.empty() && "Expected some elements to add."); + for (unsigned Idx : 
LocalToAdd) + It->emplace_back(Data[Idx].first, Data[Idx].second + Offset); + ToAdd.insert(LocalToAdd.begin(), LocalToAdd.end()); + It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset, + Start); + } + if (any_of(seq(Data.size()), [&](unsigned Idx) { + return !ToAdd.contains(Idx) && !Repeated.contains(Idx); + })) { + auto AddNewLoads = + [&](SmallVectorImpl> &Loads) { + for (unsigned Idx : seq(Data.size())) { + if (ToAdd.contains(Idx) || Repeated.contains(Idx)) + continue; + Loads.push_back(Data[Idx]); + } + }; + if (!AddNew) { + LoadInst *LI = Data.front().first; + It = find_if( + GatheredLoads, [&](ArrayRef> PD) { + return PD.front().first->getParent() == LI->getParent() && + PD.front().first->getType() == LI->getType(); + }); + while (It != GatheredLoads.end()) { + AddNewLoads(*It); + It = std::find_if( + std::next(It), GatheredLoads.end(), + [&](ArrayRef> PD) { + return PD.front().first->getParent() == LI->getParent() && + PD.front().first->getType() == LI->getType(); + }); + } + } + GatheredLoads.emplace_back().append(Data.begin(), Data.end()); + AddNewLoads(GatheredLoads.emplace_back()); + } + } +} + +void BoUpSLP::tryToVectorizeGatheredLoads( + ArrayRef>> GatheredLoads) { + GatheredLoadsEntriesFirst = VectorizableTree.size(); + + SmallVector> LoadSetsToVectorize( + LoadEntriesToVectorize.size()); + for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize)) + Set.insert(VectorizableTree[Idx]->Scalars.begin(), + VectorizableTree[Idx]->Scalars.end()); + + // Sort loads by distance. 
+ auto LoadSorter = [](const std::pair &L1, + const std::pair &L2) { + return L1.second > L2.second; + }; + + auto GetVectorizedRanges = [this]( + ArrayRef Loads, + BoUpSLP::ValueSet &VectorizedLoads, + SmallVectorImpl &NonVectorized) { + SmallVector, LoadsState>> Results; + unsigned StartIdx = 0; + SmallVector CandidateVFs; + if (VectorizeNonPowerOf2 && isPowerOf2_32(Loads.size() + 1)) + CandidateVFs.push_back(Loads.size()); + for (int NumElts = bit_floor(Loads.size()); NumElts > 1; NumElts /= 2) { + CandidateVFs.push_back(NumElts); + if (VectorizeNonPowerOf2 && NumElts > 2) + CandidateVFs.push_back(NumElts - 1); + } + + for (int NumElts : CandidateVFs) { + SmallVector MaskedGatherVectorized; + for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt + NumElts <= E; + ++Cnt) { + ArrayRef Slice = ArrayRef(Loads).slice(Cnt, NumElts); + if (VectorizedLoads.count(Slice.front()) || + VectorizedLoads.count(Slice.back())) + continue; + // Check if it is profitable to try vectorizing gathered loads. It is + // profitable if we have more than 3 consecutive loads or if we have + // less but all users are vectorized or deleted. + bool AllowToVectorize = + NumElts >= 3 || + any_of(VectorizableTree, [=](const std::unique_ptr &TE) { + return TE->isGather() && TE->Scalars.size() == 2 && + (equal(TE->Scalars, Slice) || + equal(TE->Scalars, reverse(Slice))); + }); + // Check if it is profitable to vectorize 2-elements loads. + if (NumElts == 2) { + bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad( + Slice.front()->getType(), ElementCount::getFixed(NumElts)); + auto CheckIfAllowed = [=](ArrayRef Slice) { + for (LoadInst *LI : Slice) { + // If single use/user - allow to vectorize. + if (LI->hasOneUse()) + continue; + // 1. Check if number of uses equal number of users. + // 2. All users are deleted. + // 3. The load broadcasts are not allowed or the load is not + // broadcasted. 
+ if (std::distance(LI->user_begin(), LI->user_end()) != + LI->getNumUses()) + return false; + for (User *U : LI->users()) { + if (auto *UI = dyn_cast(U); UI && isDeleted(UI)) + continue; + if (const TreeEntry *UTE = getTreeEntry(U)) { + if (!IsLegalBroadcastLoad) + // The broadcast is illegal - vectorize loads. + continue; + for (int I = 0, End = UTE->getNumOperands(); I < End; ++I) { + if (all_of(UTE->getOperand(I), + [LI](Value *V) { return V == LI; })) + // Found legal broadcast - do not vectorize. + return false; + } + } + } + } + return true; + }; + AllowToVectorize = CheckIfAllowed(Slice); + } + if (AllowToVectorize) { + SmallVector PointerOps; + OrdersType CurrentOrder; + // Try to build vector load. + ArrayRef Values( + reinterpret_cast(Slice.begin()), Slice.size()); + unsigned BestVF = 0; + LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder, + PointerOps, &BestVF); + if (LS != LoadsState::Gather || + (BestVF > 1 && static_cast(NumElts) == 2 * BestVF)) { + if (LS == LoadsState::ScatterVectorize) { + if (MaskedGatherVectorized.empty() || + Cnt >= MaskedGatherVectorized.back() + NumElts) + MaskedGatherVectorized.push_back(Cnt); + continue; + } + if (LS != LoadsState::Gather) { + Results.emplace_back(Values, LS); + VectorizedLoads.insert(Slice.begin(), Slice.end()); + // If we vectorized initial block, no need to try to vectorize it + // again. + if (Cnt == StartIdx) + StartIdx += NumElts; + } + // Erase last masked gather candidate, if another candidate within + // the range is found to be better. + if (!MaskedGatherVectorized.empty() && + Cnt < MaskedGatherVectorized.back() + NumElts) + MaskedGatherVectorized.pop_back(); + Cnt += NumElts - 1; + continue; + } + } + // Check if the whole array was vectorized already - exit. + if (StartIdx >= Loads.size()) + break; + } + // Mark masked gathers candidates as vectorized, if any. 
+ for (unsigned Cnt : MaskedGatherVectorized) { + ArrayRef Slice = ArrayRef(Loads).slice(Cnt, NumElts); + ArrayRef Values( + reinterpret_cast(Slice.begin()), Slice.size()); + Results.emplace_back(Values, LoadsState::ScatterVectorize); + VectorizedLoads.insert(Slice.begin(), Slice.end()); + // If we vectorized initial block, no need to try to vectorize it again. + if (Cnt == StartIdx) + StartIdx += NumElts; + } + } + for (LoadInst *LI : Loads) { + if (!VectorizedLoads.contains(LI)) + NonVectorized.push_back(LI); + } + return Results; + }; + auto ProcessGatheredLoads = [&](ArrayRef< + SmallVector>> + GatheredLoads) { + SmallVector NonVectorized; + for (ArrayRef> LoadsDists : GatheredLoads) { + SmallVector> LocalLoadsDists(LoadsDists); + SmallVector OriginalLoads(LocalLoadsDists.size()); + transform(LoadsDists, OriginalLoads.begin(), + [](const std::pair &L) { return L.first; }); + stable_sort(LocalLoadsDists, LoadSorter); + SmallVector Loads; + for (const std::pair &L : LocalLoadsDists) { + if (!getTreeEntry(L.first)) + Loads.push_back(L.first); + } + if (Loads.empty()) + continue; + BoUpSLP::ValueSet VectorizedLoads; + SmallVector SortedNonVectorized; + SmallVector, LoadsState>> Results = + GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized); + if (!Results.empty() && !SortedNonVectorized.empty() && + all_of(Results, + [](const std::pair, LoadsState> &P) { + return P.second == LoadsState::ScatterVectorize; + })) { + VectorizedLoads.clear(); + SmallVector UnsortedNonVectorized; + SmallVector, LoadsState>> UnsortedResults = + GetVectorizedRanges(OriginalLoads, VectorizedLoads, + UnsortedNonVectorized); + if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) { + SortedNonVectorized.swap(UnsortedNonVectorized); + Results.swap(UnsortedResults); + } + } + for (auto [Slice, _] : Results) { + LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads (" + << Slice.size() << ")\n"); + if (any_of(Slice, [&](Value *V) { return getTreeEntry(V); })) { + 
for (Value *L : Slice) + if (!getTreeEntry(L)) + SortedNonVectorized.push_back(cast(L)); + continue; + } + + // Select maximum VF as a maximum of user gathered nodes and + // distance between scalar loads in these nodes. + unsigned MaxVF = Slice.size(); + unsigned UserMaxVF = 0; + std::optional SegmentedLoadsDistance = 0; + std::optional CommonVF = 0; + unsigned Order = 0; + DenseMap EntryToPosition; + SmallPtrSet DeinterleavedNodes; + for (auto [Idx, V] : enumerate(Slice)) { + for (const TreeEntry *E : ValueToGatherNodes.at(V)) { + UserMaxVF = std::max(UserMaxVF, E->Scalars.size()); + unsigned Pos = EntryToPosition.try_emplace(E, Idx).first->second; + UserMaxVF = std::max(UserMaxVF, Idx - Pos + 1); + if (CommonVF) { + if (*CommonVF == 0) { + CommonVF = E->Scalars.size(); + continue; + } + if (*CommonVF != E->Scalars.size()) + CommonVF.reset(); + } + if (Pos != Idx && SegmentedLoadsDistance) { + if (!DeinterleavedNodes.contains(E) && + any_of(E->Scalars, [&, Slice = Slice](Value *V) { + if (isa(V)) + return false; + if (getTreeEntry(V)) + return true; + const auto &Nodes = ValueToGatherNodes.at(V); + return (Nodes.size() != 1 || !Nodes.contains(E)) && + !is_contained(Slice, V); + })) { + SegmentedLoadsDistance.reset(); + continue; + } + DeinterleavedNodes.insert(E); + if (*SegmentedLoadsDistance == 0) { + SegmentedLoadsDistance = Idx - Pos; + continue; + } + if ((Idx - Pos) % *SegmentedLoadsDistance != 0 || + (Idx - Pos) / *SegmentedLoadsDistance < Order) + SegmentedLoadsDistance.reset(); + Order = (Idx - Pos) / SegmentedLoadsDistance.value_or(1); + } + } + } + DeinterleavedNodes.clear(); + unsigned InterleaveFactor = 0; + // Check if the large load represents interleaved load operation. + if (SegmentedLoadsDistance.value_or(0) > 1 && + CommonVF.value_or(0) != 0) { + InterleaveFactor = PowerOf2Ceil(*SegmentedLoadsDistance); + unsigned VF = *CommonVF; + OrdersType Order; + SmallVector PointerOps; + // Segmented load detected - vectorize at maximum vector factor. 
+ if (TTI->isLegalInterleavedAccessType( + getWidenedType(Slice.front()->getType(), VF), + InterleaveFactor, cast(Slice.front())->getAlign(), + cast(Slice.front())->getPointerAddressSpace()) && + canVectorizeLoads(Slice, Slice.front(), Order, PointerOps) == + LoadsState::Vectorize) { + UserMaxVF = InterleaveFactor * VF; + } else { + UserMaxVF = VF; + InterleaveFactor = 0; + } + } + // Try to build long masked gather loads. + UserMaxVF = PowerOf2Ceil(UserMaxVF); + if (InterleaveFactor == 0 && + any_of(seq(Slice.size() / UserMaxVF), + [&, Slice = Slice](unsigned Idx) { + OrdersType Order; + SmallVector PointerOps; + return canVectorizeLoads( + Slice.slice(Idx * UserMaxVF, UserMaxVF), + Slice[Idx * UserMaxVF], Order, + PointerOps) == LoadsState::ScatterVectorize; + })) + UserMaxVF = MaxVF; + // Cannot represent the loads as consecutive vectorizable nodes - + // just exit. + unsigned ConsecutiveNodesSize = 0; + if (!LoadEntriesToVectorize.empty() && + (SegmentedLoadsDistance.value_or(0) == 0 || + CommonVF.value_or(UserMaxVF) == UserMaxVF) && + any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize), + [&, Slice = Slice](const auto &P) { + const auto *It = find_if(Slice, [&](Value *V) { + return std::get<1>(P).contains(V); + }); + if (It == Slice.end()) + return false; + ArrayRef VL = + VectorizableTree[std::get<0>(P)]->Scalars; + ConsecutiveNodesSize += VL.size(); + unsigned Start = std::distance(Slice.begin(), It); + unsigned Sz = Slice.size() - Start; + return Sz < VL.size() || + Slice.slice(std::distance(Slice.begin(), It), + VL.size()) != VL; + })) + continue; + if (Slice.size() != ConsecutiveNodesSize) + MaxVF = std::min(MaxVF, UserMaxVF); + for (unsigned VF = MaxVF; VF >= 2; VF /= 2) { + bool IsVectorized = true; + for (unsigned I = 0, E = Slice.size(); I < E; I += VF) { + ArrayRef SubSlice = Slice.slice(I, std::min(VF, E - I)); + if (getTreeEntry(SubSlice.front())) + continue; + // Check if the subslice is to be-vectorized entry, which is not + // equal to 
entry. + if (any_of( + zip(LoadEntriesToVectorize, LoadSetsToVectorize), + [&](const auto &P) { + return !SubSlice.equals( + VectorizableTree[std::get<0>(P)]->Scalars) && + set_is_subset(SubSlice, std::get<1>(P)); + })) + continue; + unsigned Sz = VectorizableTree.size(); + buildTree_rec(SubSlice, 0, EdgeInfo(), InterleaveFactor); + if (Sz == VectorizableTree.size()) { + IsVectorized = false; + // Try non-interleaved vectorization with smaller vector factor. + if (InterleaveFactor > 0) { + VF = 2 * (MaxVF / InterleaveFactor); + InterleaveFactor = 0; + } + continue; + } + } + if (IsVectorized) + break; + } + } + NonVectorized.append(SortedNonVectorized); + } + return NonVectorized; + }; + SmallVector NonVectorized = ProcessGatheredLoads(GatheredLoads); + SmallVector>> FinalGatheredLoads; + for (LoadInst *LI : NonVectorized) { + // Reinsert non-vectorized loads to other list of loads with the same + // base pointers. + gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI, + FinalGatheredLoads, + /*AddNew=*/false); + } + // Final attempt to vectorize non-vectorized loads. + (void)ProcessGatheredLoads(FinalGatheredLoads); + // Try to vectorize postponed load entries, previously marked as gathered. + for (unsigned Idx : LoadEntriesToVectorize) + buildTree_rec(VectorizableTree[Idx]->Scalars, 0, EdgeInfo()); + // If no new entries created, consider it as no gathered loads entries must be + // handled. + if (static_cast(GatheredLoadsEntriesFirst) == + VectorizableTree.size()) + GatheredLoadsEntriesFirst = NoGatheredLoads; +} + /// \return true if the specified list of values has only one instruction that /// requires scheduling, false otherwise. 
#ifndef NDEBUG @@ -6466,7 +7123,7 @@ bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S, BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( InstructionsState &S, ArrayRef VL, bool IsScatterVectorizeUserTE, - OrdersType &CurrentOrder, SmallVectorImpl &PointerOps) const { + OrdersType &CurrentOrder, SmallVectorImpl &PointerOps) { assert(S.MainOp && "Expected instructions with same/alternate opcodes only."); unsigned ShuffleOrOp = @@ -6543,8 +7200,20 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( case LoadsState::Vectorize: return TreeEntry::Vectorize; case LoadsState::ScatterVectorize: + if (GatheredLoadsEntriesFirst == NoGatheredLoads && + !VectorizableTree.empty()) { + // Delay slow vectorized nodes for better vectorization attempts. + LoadEntriesToVectorize.insert(VectorizableTree.size()); + return TreeEntry::NeedToGather; + } return TreeEntry::ScatterVectorize; case LoadsState::StridedVectorize: + if (GatheredLoadsEntriesFirst == NoGatheredLoads && + !VectorizableTree.empty()) { + // Delay slow vectorized nodes for better vectorization attempts. + LoadEntriesToVectorize.insert(VectorizableTree.size()); + return TreeEntry::NeedToGather; + } return TreeEntry::StridedVectorize; case LoadsState::Gather: #ifndef NDEBUG @@ -6877,7 +7546,8 @@ class PHIHandler { } // namespace void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, - const EdgeInfo &UserTreeIdx) { + const EdgeInfo &UserTreeIdx, + unsigned InterleaveFactor) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); SmallVector ReuseShuffleIndices; @@ -7100,7 +7770,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // Check if this is a duplicate of another entry. 
if (TreeEntry *E = getTreeEntry(S.OpValue)) { LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); - if (!E->isSame(VL)) { + if (GatheredLoadsEntriesFirst != NoGatheredLoads || !E->isSame(VL)) { auto It = MultiNodeScalars.find(S.OpValue); if (It != MultiNodeScalars.end()) { auto *TEIt = find_if(It->getSecond(), @@ -7352,7 +8022,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, switch (State) { case TreeEntry::Vectorize: TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndices, CurrentOrder); + ReuseShuffleIndices, CurrentOrder, InterleaveFactor); if (CurrentOrder.empty()) LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); else @@ -8380,7 +9050,8 @@ void BoUpSLP::transformNodes() { unsigned MinVF = getMinVF(2 * Sz); if (VL.size() <= 2 || (E.getOpcode() && - (E.isAltShuffle() || E.getOpcode() != Instruction::Load))) + (E.isAltShuffle() || E.getOpcode() != Instruction::Load || + LoadEntriesToVectorize.contains(Idx)))) continue; // Try to find vectorizable sequences and transform them into a series of // insertvector instructions. @@ -8403,6 +9074,8 @@ void BoUpSLP::transformNodes() { if (PrevSize + 1 == VectorizableTree.size() && VectorizableTree[PrevSize]->isGather()) { VectorizableTree.pop_back(); + LoadEntriesToVectorize.remove_if( + [&](unsigned Idx) { return Idx == PrevSize; }); continue; } E.CombinedEntriesWithIndices.emplace_back(PrevSize, Cnt); @@ -8492,6 +9165,19 @@ void BoUpSLP::transformNodes() { break; } } + // A list of loads to be gathered during the vectorization process. We can + // try to vectorize them at the end, if profitable. + SmallVector>> GatheredLoads; + + for (std::unique_ptr &TE : VectorizableTree) { + TreeEntry &E = *TE; + if (E.isGather() && !isSplat(E.Scalars)) + gatherPossiblyVectorizableLoads(*this, E.Scalars, *DL, *SE, *TTI, + GatheredLoads); + } + // Try to vectorize gathered loads if this is not just a gather of loads. 
+ if (!GatheredLoads.empty()) + tryToVectorizeGatheredLoads(GatheredLoads); } /// Merges shuffle masks and emits final shuffle instruction, if required. It @@ -8898,6 +9584,12 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { Idx = EMask[Idx]; } CommonVF = E->Scalars.size(); + } else if (std::optional Factor = E->getInterleaveFactor(); + Factor && E->Scalars.size() != Mask.size() && + ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask, + *Factor)) { + // Deinterleaved nodes are free. + std::iota(CommonMask.begin(), CommonMask.end(), 0); } ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF); V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF)); @@ -9543,7 +10235,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, InstructionCost VecCost = VectorCost(CommonCost); // Check if the current node must be resized, if the parent node is not // resized. - if (!UnaryInstruction::isCast(E->getOpcode()) && E->Idx != 0) { + if (!UnaryInstruction::isCast(E->getOpcode()) && E->Idx != 0 && + (E->getOpcode() != Instruction::Load || + !E->UserTreeIndices.empty())) { const EdgeInfo &EI = E->UserTreeIndices.front(); if ((EI.UserTE->getOpcode() != Instruction::Select || EI.EdgeIdx != 0) && @@ -9982,7 +10676,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, auto *LI0 = cast(VL0); auto GetVectorCost = [&](InstructionCost CommonCost) { InstructionCost VecLdCost; - if (E->State == TreeEntry::Vectorize) { + if (E->State == TreeEntry::Vectorize && !E->getInterleaveFactor()) { VecLdCost = TTI->getMemoryOpCost( Instruction::Load, VecTy, LI0->getAlign(), LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo()); @@ -9992,6 +10686,11 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, VecLdCost = TTI->getStridedMemoryOpCost( Instruction::Load, VecTy, LI0->getPointerOperand(), /*VariableMask=*/false, CommonAlignment, CostKind); + } else if (std::optional Factor = E->getInterleaveFactor(); + 
E->State == TreeEntry::Vectorize && Factor.value_or(0) > 0) { + VecLdCost = TTI->getInterleavedMemoryOpCost( + Instruction::Load, VecTy, *Factor, std::nullopt, LI0->getAlign(), + LI0->getPointerAddressSpace(), CostKind); } else { assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState"); Align CommonAlignment = @@ -10223,8 +10922,8 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const { ((TE->getOpcode() == Instruction::ExtractElement || all_of(TE->Scalars, IsaPred)) && isFixedVectorShuffle(TE->Scalars, Mask)) || - (TE->isGather() && TE->getOpcode() == Instruction::Load && - !TE->isAltShuffle())); + (TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()) || + any_of(TE->Scalars, IsaPred)); }; // We only handle trees of heights 1 and 2. @@ -10689,6 +11388,11 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { } } + // Exclude cost of gather loads nodes which are not used. These nodes were + // built as part of the final attempt to vectorize gathered loads. + assert((!TE.isGather() || TE.Idx == 0 || !TE.UserTreeIndices.empty()) && + "Expected gather nodes with users only."); + InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts); Cost += C; LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle " @@ -10896,7 +11600,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { if (IsProfitablePHIUser) { KeepScalar = true; } else if (KeepScalar && ScalarCost != TTI::TCC_Free && - ExtraCost - ScalarCost <= TTI::TCC_Basic) { + ExtraCost - ScalarCost <= TTI::TCC_Basic && + (GatheredLoadsEntriesFirst == NoGatheredLoads || + Entry->Idx < GatheredLoadsEntriesFirst)) { unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) { return ValueToExtUses->contains(V); }); @@ -11220,7 +11926,9 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( Entries.clear(); // TODO: currently checking only for Scalars in the tree entry, need to count // reused elements too for better cost estimation. 
- const EdgeInfo &TEUseEI = TE->UserTreeIndices.front(); + const EdgeInfo &TEUseEI = TE == VectorizableTree.front().get() + ? EdgeInfo(const_cast(TE), 0) + : TE->UserTreeIndices.front(); const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE); const BasicBlock *TEInsertBlock = nullptr; // Main node of PHI entries keeps the correct order of operands/incoming @@ -11315,7 +12023,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( VToTEs.insert(TEPtr); } if (const TreeEntry *VTE = getTreeEntry(V)) { - if (ForOrder) { + if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst) { if (VTE->State != TreeEntry::Vectorize) { auto It = MultiNodeScalars.find(V); if (It == MultiNodeScalars.end()) @@ -11593,13 +12301,19 @@ BoUpSLP::isGatherShuffledEntry( "Expected positive number of registers."); Entries.clear(); // No need to check for the topmost gather node. - if (TE == VectorizableTree.front().get()) + if (TE == VectorizableTree.front().get() && + (GatheredLoadsEntriesFirst == NoGatheredLoads || + none_of(ArrayRef(VectorizableTree).drop_front(), + [](const std::unique_ptr &TE) { + return !TE->isGather(); + }))) return {}; // FIXME: Gathering for non-power-of-2 nodes not implemented yet. if (TE->isNonPowOf2Vec()) return {}; Mask.assign(VL.size(), PoisonMaskElem); - assert(TE->UserTreeIndices.size() == 1 && + assert((TE->UserTreeIndices.size() == 1 || + TE == VectorizableTree.front().get()) && "Expected only single user of the gather node."); assert(VL.size() % NumParts == 0 && "Number of scalars must be divisible by NumParts."); @@ -11718,17 +12432,23 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { return *Res.second; // Get the basic block this bundle is in. All instructions in the bundle // should be in this block (except for extractelement-like instructions with - // constant indeces). + // constant indecies or gathered loads). 
auto *Front = E->getMainOp(); auto *BB = Front->getParent(); - assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool { - if (E->getOpcode() == Instruction::GetElementPtr && - !isa(V)) - return true; - auto *I = cast(V); - return !E->isOpcodeOrAlt(I) || I->getParent() == BB || - isVectorLikeInstWithConstOps(I); - })); + assert(((GatheredLoadsEntriesFirst != NoGatheredLoads && + E->getOpcode() == Instruction::Load && E->isGather() && + E->Idx < GatheredLoadsEntriesFirst) || + all_of(E->Scalars, + [=](Value *V) -> bool { + if (E->getOpcode() == Instruction::GetElementPtr && + !isa(V)) + return true; + auto *I = cast(V); + return !E->isOpcodeOrAlt(I) || I->getParent() == BB || + isVectorLikeInstWithConstOps(I); + })) && + "Expected gathered loads or GEPs or instructions from same basic " + "block."); auto FindLastInst = [&]() { Instruction *LastInst = Front; @@ -11744,7 +12464,10 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { assert(((E->getOpcode() == Instruction::GetElementPtr && !isa(I)) || (isVectorLikeInstWithConstOps(LastInst) && - isVectorLikeInstWithConstOps(I))) && + isVectorLikeInstWithConstOps(I)) || + (GatheredLoadsEntriesFirst != NoGatheredLoads && + E->getOpcode() == Instruction::Load && E->isGather() && + E->Idx < GatheredLoadsEntriesFirst)) && "Expected vector-like or non-GEP in GEP node insts only."); if (!DT->isReachableFromEntry(LastInst->getParent())) { LastInst = I; @@ -11801,6 +12524,13 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { return FirstInst; }; + // Set insertpoint for gathered loads to the very first load. + if (GatheredLoadsEntriesFirst != NoGatheredLoads && + E->Idx >= GatheredLoadsEntriesFirst && !E->isGather() && + E->getOpcode() == Instruction::Load) { + Res.second = FindFirstInst(); + return *Res.second; + } // Set the insert point to the beginning of the basic block if the entry // should not be scheduled. 
if (doesNotNeedToSchedule(E->Scalars) || @@ -12777,6 +13507,12 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, } // Gather extracts after we check for full matched gathers only. if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load || + ((E->getOpcode() == Instruction::Load || + any_of(E->Scalars, IsaPred)) && + any_of(E->Scalars, + [this](Value *V) { + return isa(V) && getTreeEntry(V); + })) || E->isAltShuffle() || all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || isSplat(E->Scalars) || @@ -14135,6 +14871,18 @@ Value *BoUpSLP::vectorizeTree( else Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin()); + // Emit gathered loads first to emit better code for the users of those + // gathered loads. + for (const std::unique_ptr &TE : VectorizableTree) { + if (GatheredLoadsEntriesFirst != NoGatheredLoads && + TE->Idx >= GatheredLoadsEntriesFirst && + (!TE->isGather() || !TE->UserTreeIndices.empty())) { + assert((!TE->UserTreeIndices.empty() || + (TE->getOpcode() == Instruction::Load && !TE->isGather())) && + "Expected gathered load node."); + (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false); + } + } // Postpone emission of PHIs operands to avoid cyclic dependencies issues. 
(void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true); for (const std::unique_ptr &TE : VectorizableTree) @@ -14711,10 +15459,15 @@ Value *BoUpSLP::vectorizeTree( if (IE->Idx != 0 && !(VectorizableTree.front()->isGather() && isa(I) && !IE->UserTreeIndices.empty() && - any_of(IE->UserTreeIndices, [&](const EdgeInfo &EI) { - return EI.UserTE == VectorizableTree.front().get() && - EI.EdgeIdx == UINT_MAX; - }))) + any_of(IE->UserTreeIndices, + [&](const EdgeInfo &EI) { + return EI.UserTE == VectorizableTree.front().get() && + EI.EdgeIdx == UINT_MAX; + })) && + !(GatheredLoadsEntriesFirst != NoGatheredLoads && + IE->Idx >= GatheredLoadsEntriesFirst && + VectorizableTree.front()->isGather() && + is_contained(VectorizableTree.front()->Scalars, I))) continue; SmallVector LogicalOpSelects; I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) { diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll index ff1d6253bec92..fffa626cae0dd 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll @@ -23,14 +23,12 @@ define void @s116_modified(ptr %a) { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP1]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP3]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[LD0]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP2]], i64 2) -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = 
shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP7]], <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x float> [[TMP6]], [[TMP9]] -; CHECK-NEXT: store <4 x float> [[TMP10]], ptr [[A]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <4 x float> [[TMP6]], [[TMP7]] +; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[A]], align 4 ; CHECK-NEXT: ret void ; %gep1 = getelementptr inbounds float, ptr %a, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll index 67746f2cbf5d2..d4dbb8bbfaf0d 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll @@ -1,8 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK %s -; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK %s +; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=NON-POWER-OF-2 %s +; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=arm64-apple-ios -S %s | FileCheck %s define void @vec3_vectorize_call(ptr %Colour, float %0) { +; NON-POWER-OF-2-LABEL: @vec3_vectorize_call( +; NON-POWER-OF-2-NEXT: entry: +; NON-POWER-OF-2-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[COLOUR:%.*]], align 4 +; NON-POWER-OF-2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP0:%.*]], i32 2 +; NON-POWER-OF-2-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <3 x i32> +; NON-POWER-OF-2-NEXT: [[TMP4:%.*]] = 
shufflevector <3 x float> [[TMP2]], <3 x float> [[TMP3]], <3 x i32> +; NON-POWER-OF-2-NEXT: [[TMP5:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP4]], <3 x float> zeroinitializer, <3 x float> zeroinitializer) +; NON-POWER-OF-2-NEXT: store <3 x float> [[TMP5]], ptr [[COLOUR]], align 4 +; NON-POWER-OF-2-NEXT: ret void +; ; CHECK-LABEL: @vec3_vectorize_call( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[COLOUR:%.*]], align 4 @@ -28,6 +38,19 @@ entry: } define void @vec3_fmuladd_64(ptr %Colour, double %0) { +; NON-POWER-OF-2-LABEL: @vec3_fmuladd_64( +; NON-POWER-OF-2-NEXT: entry: +; NON-POWER-OF-2-NEXT: [[ARRAYIDX80:%.*]] = getelementptr float, ptr [[COLOUR:%.*]], i64 2 +; NON-POWER-OF-2-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[TMP0:%.*]], i32 0 +; NON-POWER-OF-2-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer +; NON-POWER-OF-2-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP2]], <2 x double> zeroinitializer, <2 x double> zeroinitializer) +; NON-POWER-OF-2-NEXT: [[TMP4:%.*]] = fptrunc <2 x double> [[TMP3]] to <2 x float> +; NON-POWER-OF-2-NEXT: store <2 x float> [[TMP4]], ptr [[COLOUR]], align 4 +; NON-POWER-OF-2-NEXT: [[TMP5:%.*]] = call double @llvm.fmuladd.f64(double [[TMP0]], double 0.000000e+00, double 0.000000e+00) +; NON-POWER-OF-2-NEXT: [[CONV82:%.*]] = fptrunc double [[TMP5]] to float +; NON-POWER-OF-2-NEXT: store float [[CONV82]], ptr [[ARRAYIDX80]], align 4 +; NON-POWER-OF-2-NEXT: ret void +; ; CHECK-LABEL: @vec3_fmuladd_64( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX80:%.*]] = getelementptr float, ptr [[COLOUR:%.*]], i64 2 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll index f04c359b432b5..9c086abe216c0 100644 --- 
a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll @@ -245,34 +245,24 @@ define void @select_uniform_ugt_16xi8(ptr %ptr, i8 %x) { ; CHECK-NEXT: [[L_8:%.*]] = load i8, ptr [[GEP_8]], align 1 ; CHECK-NEXT: [[CMP_8:%.*]] = icmp ugt i8 [[L_8]], -1 ; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 9 -; CHECK-NEXT: [[L_9:%.*]] = load i8, ptr [[GEP_9]], align 1 -; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 10 -; CHECK-NEXT: [[L_10:%.*]] = load i8, ptr [[GEP_10]], align 1 ; CHECK-NEXT: [[GEP_11:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 11 ; CHECK-NEXT: [[L_11:%.*]] = load i8, ptr [[GEP_11]], align 1 ; CHECK-NEXT: [[GEP_12:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 12 ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[PTR]], align 1 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[TMP0]], i32 0 ; CHECK-NEXT: [[S_8:%.*]] = select i1 [[CMP_8]], i8 [[TMP1]], i8 [[X:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_12]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[L_9]], i32 9 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[L_10]], i32 10 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i8>, ptr [[GEP_9]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[GEP_12]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> [[TMP4]], <16 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[L_11]], i32 11 ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> [[TMP6]], <8 x i8> [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> 
[[TMP7]], <4 x i8> [[TMP2]], i64 12) +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP7]], <4 x i8> [[TMP3]], i64 12) ; CHECK-NEXT: [[TMP9:%.*]] = icmp ugt <16 x i8> [[TMP8]], -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <8 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> [[TMP11]], <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i8> [[TMP12]], i8 [[L_9]], i32 9 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i8> [[TMP13]], i8 [[L_10]], i32 10 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i8> [[TMP14]], i8 [[L_11]], i32 11 -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i8> poison, i8 [[X]], i32 0 -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP17]], <16 x i8> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP9]], <16 x i8> [[TMP16]], <16 x i8> [[TMP18]] -; CHECK-NEXT: store <16 x i8> [[TMP19]], ptr [[PTR]], align 2 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i8> poison, i8 [[X]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i8> [[TMP10]], <16 x i8> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP9]], <16 x i8> [[TMP8]], <16 x i8> [[TMP11]] +; CHECK-NEXT: store <16 x i8> [[TMP12]], ptr [[PTR]], align 2 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll index 36681ecea4f50..d222d87e01b7a 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll @@ -11,12 +11,9 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: 
[[TMP1:%.*]] = getelementptr i8, ptr [[PIX1]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i8, ptr [[PIX2]], i64 4 ; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr i8, ptr [[PIX1]], i64 1 -; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i8, ptr [[PIX2]], i64 1 -; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr i8, ptr [[PIX1]], i64 5 -; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr i8, ptr [[PIX2]], i64 5 ; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr i8, ptr [[PIX1]], i64 3 -; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX32]], align 1 -; CHECK-NEXT: [[CONV33:%.*]] = zext i8 [[TMP10]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX32]], align 1 +; CHECK-NEXT: [[CONV33:%.*]] = zext i8 [[TMP2]] to i32 ; CHECK-NEXT: [[ADD_PTR3:%.*]] = getelementptr i8, ptr [[PIX1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR644:%.*]] = getelementptr i8, ptr [[PIX2]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[ADD_PTR3]], align 1 @@ -24,145 +21,154 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 4 ; CHECK-NEXT: [[ARRAYIDX8_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 1 -; CHECK-NEXT: [[ARRAYIDX22_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 1 -; CHECK-NEXT: [[ARRAYIDX25_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 5 -; CHECK-NEXT: [[ARRAYIDX27_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 5 ; CHECK-NEXT: [[ARRAYIDX32_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 3 -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX32_1]], align 1 -; CHECK-NEXT: [[CONV33_1:%.*]] = zext i8 [[TMP14]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX32_1]], align 1 +; CHECK-NEXT: [[CONV33_1:%.*]] = zext i8 [[TMP3]] to i32 ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]] ; CHECK-NEXT: 
[[ADD_PTR64_1:%.*]] = getelementptr i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4 -; CHECK-NEXT: [[ARRAYIDX8_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 1 -; CHECK-NEXT: [[ARRAYIDX10_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 1 -; CHECK-NEXT: [[ARRAYIDX13_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 5 -; CHECK-NEXT: [[ARRAYIDX15_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 5 -; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR_1]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP16:%.*]] = zext <2 x i8> [[TMP4]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR64_1]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP7:%.*]] = zext <2 x i8> [[TMP6]] to <2 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = sub <2 x i32> [[TMP16]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_2]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP13:%.*]] = zext <2 x i8> [[TMP9]] to <2 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_2]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP12:%.*]] = zext <2 x i8> [[TMP28]] to <2 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = sub <2 x i32> [[TMP13]], [[TMP12]] +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = zext <2 x i8> [[TMP5]] to <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = zext <2 x i8> [[TMP8]] to <2 x i32> 
+; CHECK-NEXT: [[TMP10:%.*]] = sub <2 x i32> [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[TMP18:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP18]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = zext <2 x i8> [[TMP12]] to <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = zext <2 x i8> [[TMP15]] to <2 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = sub <2 x i32> [[TMP13]], [[TMP16]] ; CHECK-NEXT: [[TMP25:%.*]] = shl <2 x i32> [[TMP24]], -; CHECK-NEXT: [[TMP15:%.*]] = add <2 x i32> [[TMP25]], [[TMP8]] -; CHECK-NEXT: [[TMP29:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_2]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP17:%.*]] = zext <2 x i8> [[TMP29]] to <2 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_2]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP19:%.*]] = zext <2 x i8> [[TMP18]] to <2 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = sub <2 x i32> [[TMP17]], [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX13_2]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP22:%.*]] = zext <2 x i8> [[TMP21]] to <2 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_2]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP30:%.*]] = zext <2 x i8> [[TMP23]] to <2 x i32> -; CHECK-NEXT: [[TMP36:%.*]] = sub <2 x i32> [[TMP22]], [[TMP30]] +; CHECK-NEXT: [[TMP19:%.*]] = add <2 x i32> [[TMP25]], [[TMP10]] +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = zext <2 x i8> [[TMP20]] to <2 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP7]], 
<4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = zext <2 x i8> [[TMP22]] to <2 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = sub <2 x i32> [[TMP17]], [[TMP23]] +; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP18]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = zext <2 x i8> [[TMP30]] to <2 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = zext <2 x i8> [[TMP27]] to <2 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = sub <2 x i32> [[TMP26]], [[TMP28]] ; CHECK-NEXT: [[TMP37:%.*]] = shl <2 x i32> [[TMP36]], -; CHECK-NEXT: [[TMP27:%.*]] = add <2 x i32> [[TMP37]], [[TMP20]] -; CHECK-NEXT: [[TMP26:%.*]] = add <2 x i32> [[TMP27]], [[TMP15]] -; CHECK-NEXT: [[TMP38:%.*]] = sub <2 x i32> [[TMP15]], [[TMP27]] -; CHECK-NEXT: [[ADD44_2:%.*]] = extractelement <2 x i32> [[TMP26]], i32 0 -; CHECK-NEXT: [[CONV:%.*]] = extractelement <2 x i32> [[TMP26]], i32 1 -; CHECK-NEXT: [[ADD48_2:%.*]] = add i32 [[CONV]], [[ADD44_2]] -; CHECK-NEXT: [[SUB51_2:%.*]] = sub i32 [[ADD44_2]], [[CONV]] -; CHECK-NEXT: [[SUB45_2:%.*]] = extractelement <2 x i32> [[TMP38]], i32 0 -; CHECK-NEXT: [[SUB47_2:%.*]] = extractelement <2 x i32> [[TMP38]], i32 1 -; CHECK-NEXT: [[ADD55_2:%.*]] = add i32 [[SUB47_2]], [[SUB45_2]] -; CHECK-NEXT: [[SUB59_2:%.*]] = sub i32 [[SUB45_2]], [[SUB47_2]] -; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr i8, ptr null, i64 4 -; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4 -; CHECK-NEXT: [[ARRAYIDX8_3:%.*]] = getelementptr i8, ptr null, i64 1 -; CHECK-NEXT: [[ARRAYIDX10_3:%.*]] = getelementptr i8, ptr null, i64 1 +; CHECK-NEXT: [[TMP31:%.*]] = add <2 x i32> [[TMP37]], [[TMP29]] +; CHECK-NEXT: [[TMP33:%.*]] = add <2 x i32> [[TMP31]], [[TMP19]] +; CHECK-NEXT: [[TMP35:%.*]] = sub <2 x i32> [[TMP19]], [[TMP31]] +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x i32> [[TMP33]], i32 0 +; CHECK-NEXT: [[TMP59:%.*]] = extractelement <2 x i32> [[TMP33]], i32 1 +; CHECK-NEXT: 
[[ADD48_2:%.*]] = add i32 [[TMP59]], [[TMP34]] +; CHECK-NEXT: [[ARRAYIDX5_4:%.*]] = getelementptr i8, ptr null, i64 4 +; CHECK-NEXT: [[TMP41:%.*]] = load <2 x i8>, ptr null, align 1 +; CHECK-NEXT: [[TMP42:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32> +; CHECK-NEXT: [[TMP38:%.*]] = load <2 x i8>, ptr null, align 1 +; CHECK-NEXT: [[TMP39:%.*]] = zext <2 x i8> [[TMP38]] to <2 x i32> +; CHECK-NEXT: [[TMP40:%.*]] = sub <2 x i32> [[TMP42]], [[TMP39]] +; CHECK-NEXT: [[TMP46:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP49:%.*]] = zext <2 x i8> [[TMP46]] to <2 x i32> +; CHECK-NEXT: [[TMP50:%.*]] = shufflevector <2 x i32> [[TMP49]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP45:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_4]], align 1 +; CHECK-NEXT: [[TMP48:%.*]] = zext <2 x i8> [[TMP45]] to <2 x i32> +; CHECK-NEXT: [[TMP51:%.*]] = sub <2 x i32> [[TMP50]], [[TMP48]] +; CHECK-NEXT: [[TMP52:%.*]] = shl <2 x i32> [[TMP51]], +; CHECK-NEXT: [[TMP72:%.*]] = add <2 x i32> [[TMP52]], [[TMP40]] +; CHECK-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2 +; CHECK-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2 ; CHECK-NEXT: [[TMP44:%.*]] = load i8, ptr null, align 1 -; CHECK-NEXT: [[ARRAYIDX15_3:%.*]] = getelementptr i8, ptr null, i64 5 +; CHECK-NEXT: [[ARRAYIDX27_3:%.*]] = getelementptr i8, ptr null, i64 6 ; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr null, align 1 -; CHECK-NEXT: [[TMP53:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP33:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32> -; CHECK-NEXT: [[TMP54:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP39:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32> -; CHECK-NEXT: [[TMP40:%.*]] = sub <2 x i32> [[TMP33]], [[TMP39]] -; CHECK-NEXT: [[TMP41:%.*]] = call <2 x i8> 
@llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_3]], i64 -4, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP42:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32> -; CHECK-NEXT: [[TMP58:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_3]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP59:%.*]] = zext <2 x i8> [[TMP58]] to <2 x i32> -; CHECK-NEXT: [[TMP45:%.*]] = sub <2 x i32> [[TMP42]], [[TMP59]] -; CHECK-NEXT: [[TMP46:%.*]] = shl <2 x i32> [[TMP45]], -; CHECK-NEXT: [[TMP62:%.*]] = add <2 x i32> [[TMP46]], [[TMP40]] -; CHECK-NEXT: [[TMP48:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_3]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP49:%.*]] = zext <2 x i8> [[TMP48]] to <2 x i32> -; CHECK-NEXT: [[TMP50:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_3]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP51:%.*]] = zext <2 x i8> [[TMP50]] to <2 x i32> -; CHECK-NEXT: [[TMP52:%.*]] = sub <2 x i32> [[TMP49]], [[TMP51]] +; CHECK-NEXT: [[TMP57:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_3]], align 1 +; CHECK-NEXT: [[TMP53:%.*]] = zext <2 x i8> [[TMP57]] to <2 x i32> +; CHECK-NEXT: [[TMP54:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_3]], align 1 +; CHECK-NEXT: [[TMP56:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32> +; CHECK-NEXT: [[TMP55:%.*]] = sub <2 x i32> [[TMP53]], [[TMP56]] ; CHECK-NEXT: [[TMP64:%.*]] = insertelement <2 x i8> poison, i8 [[TMP44]], i32 0 ; CHECK-NEXT: [[TMP65:%.*]] = insertelement <2 x i8> [[TMP64]], i8 [[TMP43]], i32 1 -; CHECK-NEXT: [[TMP55:%.*]] = zext <2 x i8> [[TMP65]] to <2 x i32> -; CHECK-NEXT: [[TMP56:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_3]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP57:%.*]] = zext <2 x i8> [[TMP56]] to <2 x i32> -; CHECK-NEXT: [[TMP69:%.*]] = sub <2 x i32> [[TMP55]], [[TMP57]] +; CHECK-NEXT: [[TMP58:%.*]] = zext <2 x i8> [[TMP65]] to <2 x 
i32> +; CHECK-NEXT: [[TMP60:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_3]], align 1 +; CHECK-NEXT: [[TMP61:%.*]] = zext <2 x i8> [[TMP60]] to <2 x i32> +; CHECK-NEXT: [[TMP69:%.*]] = sub <2 x i32> [[TMP58]], [[TMP61]] ; CHECK-NEXT: [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], -; CHECK-NEXT: [[TMP60:%.*]] = add <2 x i32> [[TMP70]], [[TMP52]] -; CHECK-NEXT: [[TMP72:%.*]] = add <2 x i32> [[TMP60]], [[TMP62]] -; CHECK-NEXT: [[TMP47:%.*]] = sub <2 x i32> [[TMP62]], [[TMP60]] -; CHECK-NEXT: [[TMP74:%.*]] = extractelement <2 x i32> [[TMP72]], i32 0 +; CHECK-NEXT: [[TMP47:%.*]] = add <2 x i32> [[TMP70]], [[TMP55]] +; CHECK-NEXT: [[TMP67:%.*]] = extractelement <2 x i32> [[TMP72]], i32 0 ; CHECK-NEXT: [[TMP75:%.*]] = extractelement <2 x i32> [[TMP72]], i32 1 -; CHECK-NEXT: [[ADD48_3:%.*]] = add i32 [[TMP75]], [[TMP74]] -; CHECK-NEXT: [[SUB51_3:%.*]] = sub i32 [[TMP74]], [[TMP75]] -; CHECK-NEXT: [[TMP61:%.*]] = extractelement <2 x i32> [[TMP47]], i32 0 +; CHECK-NEXT: [[ADD44_3:%.*]] = add i32 [[TMP75]], [[TMP67]] +; CHECK-NEXT: [[SUB45_3:%.*]] = sub i32 [[TMP67]], [[TMP75]] +; CHECK-NEXT: [[TMP68:%.*]] = extractelement <2 x i32> [[TMP47]], i32 0 ; CHECK-NEXT: [[TMP79:%.*]] = extractelement <2 x i32> [[TMP47]], i32 1 -; CHECK-NEXT: [[ADD55_3:%.*]] = add i32 [[TMP79]], [[TMP61]] -; CHECK-NEXT: [[SUB59_3:%.*]] = sub i32 [[TMP61]], [[TMP79]] +; CHECK-NEXT: [[ADD46_3:%.*]] = add i32 [[TMP79]], [[TMP68]] +; CHECK-NEXT: [[SUB47_3:%.*]] = sub i32 [[TMP68]], [[TMP79]] +; CHECK-NEXT: [[ADD48_3:%.*]] = add i32 [[ADD46_3]], [[ADD44_3]] +; CHECK-NEXT: [[TMP81:%.*]] = shufflevector <2 x i32> [[TMP33]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP89:%.*]] = insertelement <2 x i32> [[TMP81]], i32 [[ADD44_3]], i32 0 +; CHECK-NEXT: [[TMP71:%.*]] = insertelement <2 x i32> [[TMP33]], i32 [[ADD46_3]], i32 0 +; CHECK-NEXT: [[TMP94:%.*]] = sub <2 x i32> [[TMP89]], [[TMP71]] +; CHECK-NEXT: [[TMP73:%.*]] = insertelement <2 x i32> [[TMP35]], i32 [[SUB47_3]], i32 0 +; CHECK-NEXT: [[TMP74:%.*]] = 
shufflevector <2 x i32> [[TMP35]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP99:%.*]] = insertelement <2 x i32> [[TMP74]], i32 [[SUB45_3]], i32 0 +; CHECK-NEXT: [[TMP76:%.*]] = add <2 x i32> [[TMP73]], [[TMP99]] +; CHECK-NEXT: [[TMP77:%.*]] = sub <2 x i32> [[TMP99]], [[TMP73]] ; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[ADD48_3]], [[ADD48_2]] ; CHECK-NEXT: [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_3]] -; CHECK-NEXT: [[TMP63:%.*]] = extractelement <2 x i32> [[TMP33]], i32 0 +; CHECK-NEXT: [[TMP63:%.*]] = extractelement <2 x i32> [[TMP42]], i32 0 ; CHECK-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[TMP63]], 15 ; CHECK-NEXT: [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537 ; CHECK-NEXT: [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535 -; CHECK-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[CONV]], 15 +; CHECK-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[TMP59]], 15 ; CHECK-NEXT: [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537 ; CHECK-NEXT: [[MUL_I_1:%.*]] = mul i32 [[AND_I_1]], 65535 -; CHECK-NEXT: [[ADD94_1:%.*]] = add i32 [[ADD55_3]], [[ADD55_2]] -; CHECK-NEXT: [[SUB102_1:%.*]] = sub i32 [[ADD55_2]], [[ADD55_3]] -; CHECK-NEXT: [[TMP107:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0 -; CHECK-NEXT: [[SHR_I49_5:%.*]] = lshr i32 [[TMP107]], 15 +; CHECK-NEXT: [[TMP78:%.*]] = extractelement <2 x i32> [[TMP76]], i32 0 +; CHECK-NEXT: [[TMP103:%.*]] = extractelement <2 x i32> [[TMP76]], i32 1 +; CHECK-NEXT: [[ADD94_1:%.*]] = add i32 [[TMP78]], [[TMP103]] +; CHECK-NEXT: [[SUB102_1:%.*]] = sub i32 [[TMP103]], [[TMP78]] +; CHECK-NEXT: [[TMP80:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0 +; CHECK-NEXT: [[SHR_I49_5:%.*]] = lshr i32 [[TMP80]], 15 ; CHECK-NEXT: [[AND_I50_5:%.*]] = and i32 [[SHR_I49_5]], 65537 ; CHECK-NEXT: [[MUL_I51_5:%.*]] = mul i32 [[AND_I50_5]], 65535 -; CHECK-NEXT: [[ADD94_4:%.*]] = add i32 [[SUB51_3]], [[SUB51_2]] -; CHECK-NEXT: [[SUB102_2:%.*]] = sub i32 [[SUB51_2]], [[SUB51_3]] +; CHECK-NEXT: [[TMP82:%.*]] = extractelement <2 x i32> [[TMP94]], i32 0 +; CHECK-NEXT: [[TMP83:%.*]] 
= extractelement <2 x i32> [[TMP94]], i32 1 +; CHECK-NEXT: [[ADD94_4:%.*]] = add i32 [[TMP82]], [[TMP83]] +; CHECK-NEXT: [[SUB102_2:%.*]] = sub i32 [[TMP83]], [[TMP82]] ; CHECK-NEXT: [[SHR_I49_4:%.*]] = lshr i32 [[CONV_1]], 15 ; CHECK-NEXT: [[AND_I50_4:%.*]] = and i32 [[SHR_I49_4]], 65537 ; CHECK-NEXT: [[MUL_I51_4:%.*]] = mul i32 [[AND_I50_4]], 65535 -; CHECK-NEXT: [[ADD94_5:%.*]] = add i32 [[SUB59_3]], [[SUB59_2]] -; CHECK-NEXT: [[SUB102_3:%.*]] = sub i32 [[SUB59_2]], [[SUB59_3]] +; CHECK-NEXT: [[TMP84:%.*]] = extractelement <2 x i32> [[TMP77]], i32 0 +; CHECK-NEXT: [[TMP85:%.*]] = extractelement <2 x i32> [[TMP77]], i32 1 +; CHECK-NEXT: [[ADD94_5:%.*]] = add i32 [[TMP84]], [[TMP85]] +; CHECK-NEXT: [[SUB102_3:%.*]] = sub i32 [[TMP85]], [[TMP84]] ; CHECK-NEXT: [[SHR_I49_6:%.*]] = lshr i32 [[CONV1]], 15 ; CHECK-NEXT: [[AND_I50_6:%.*]] = and i32 [[SHR_I49_6]], 65537 ; CHECK-NEXT: [[MUL_I51_6:%.*]] = mul i32 [[AND_I50_6]], 65535 ; CHECK-NEXT: [[TMP66:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1 ; CHECK-NEXT: [[TMP102:%.*]] = zext <2 x i8> [[TMP66]] to <2 x i32> -; CHECK-NEXT: [[TMP67:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[PIX2]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP77:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32> -; CHECK-NEXT: [[TMP73:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[TMP1]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP78:%.*]] = zext <2 x i8> [[TMP73]] to <2 x i32> -; CHECK-NEXT: [[TMP85:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP76:%.*]] = zext <2 x i8> [[TMP85]] to <2 x i32> -; CHECK-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP78]], [[TMP76]] +; CHECK-NEXT: [[TMP112:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1 +; CHECK-NEXT: [[TMP113:%.*]] = shufflevector <4 x i8> [[TMP112]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP109:%.*]] = zext <2 x i8> [[TMP113]] to <2 x 
i32> +; CHECK-NEXT: [[TMP90:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 +; CHECK-NEXT: [[TMP114:%.*]] = shufflevector <4 x i8> [[TMP90]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP111:%.*]] = zext <2 x i8> [[TMP114]] to <2 x i32> +; CHECK-NEXT: [[TMP116:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[TMP118:%.*]] = shufflevector <4 x i8> [[TMP116]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP115:%.*]] = zext <2 x i8> [[TMP118]] to <2 x i32> +; CHECK-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP111]], [[TMP115]] ; CHECK-NEXT: [[TMP88:%.*]] = shl <2 x i32> [[TMP87]], -; CHECK-NEXT: [[TMP89:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP80:%.*]] = zext <2 x i8> [[TMP89]] to <2 x i32> -; CHECK-NEXT: [[TMP81:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP82:%.*]] = zext <2 x i8> [[TMP81]] to <2 x i32> -; CHECK-NEXT: [[TMP83:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP84:%.*]] = zext <2 x i8> [[TMP83]] to <2 x i32> -; CHECK-NEXT: [[TMP95:%.*]] = sub <2 x i32> [[TMP82]], [[TMP84]] +; CHECK-NEXT: [[TMP98:%.*]] = shufflevector <4 x i8> [[TMP112]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP119:%.*]] = zext <2 x i8> [[TMP98]] to <2 x i32> +; CHECK-NEXT: [[TMP100:%.*]] = shufflevector <4 x i8> [[TMP90]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP101:%.*]] = zext <2 x i8> [[TMP100]] to <2 x i32> +; CHECK-NEXT: [[TMP120:%.*]] = shufflevector <4 x i8> [[TMP116]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP155:%.*]] = zext <2 x i8> [[TMP120]] to <2 x i32> +; CHECK-NEXT: [[TMP95:%.*]] = sub <2 x i32> [[TMP101]], [[TMP155]] ; CHECK-NEXT: [[TMP96:%.*]] = shl <2 x i32> [[TMP95]], ; CHECK-NEXT: [[TMP97:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV33]], i32 
1 -; CHECK-NEXT: [[TMP90:%.*]] = sub <2 x i32> [[TMP97]], [[TMP80]] -; CHECK-NEXT: [[TMP105:%.*]] = add <2 x i32> [[TMP96]], [[TMP90]] +; CHECK-NEXT: [[TMP107:%.*]] = sub <2 x i32> [[TMP97]], [[TMP119]] +; CHECK-NEXT: [[TMP105:%.*]] = add <2 x i32> [[TMP96]], [[TMP107]] ; CHECK-NEXT: [[TMP86:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV1]], i32 0 -; CHECK-NEXT: [[TMP98:%.*]] = sub <2 x i32> [[TMP86]], [[TMP77]] -; CHECK-NEXT: [[TMP92:%.*]] = add <2 x i32> [[TMP88]], [[TMP98]] +; CHECK-NEXT: [[TMP156:%.*]] = sub <2 x i32> [[TMP86]], [[TMP109]] +; CHECK-NEXT: [[TMP92:%.*]] = add <2 x i32> [[TMP88]], [[TMP156]] ; CHECK-NEXT: [[TMP93:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> [[TMP92]], <2 x i32> ; CHECK-NEXT: [[TMP106:%.*]] = add <2 x i32> [[TMP105]], [[TMP92]] ; CHECK-NEXT: [[TMP91:%.*]] = sub <2 x i32> [[TMP92]], [[TMP105]] @@ -170,10 +176,10 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP108:%.*]] = extractelement <2 x i32> [[TMP106]], i32 1 ; CHECK-NEXT: [[ADD48:%.*]] = add i32 [[TMP108]], [[TMP238]] ; CHECK-NEXT: [[SUB51:%.*]] = sub i32 [[TMP238]], [[TMP108]] -; CHECK-NEXT: [[TMP94:%.*]] = extractelement <2 x i32> [[TMP91]], i32 0 +; CHECK-NEXT: [[TMP160:%.*]] = extractelement <2 x i32> [[TMP91]], i32 0 ; CHECK-NEXT: [[SUB47:%.*]] = extractelement <2 x i32> [[TMP91]], i32 1 -; CHECK-NEXT: [[ADD55:%.*]] = add i32 [[SUB47]], [[TMP94]] -; CHECK-NEXT: [[SUB59:%.*]] = sub i32 [[TMP94]], [[SUB47]] +; CHECK-NEXT: [[ADD55:%.*]] = add i32 [[SUB47]], [[TMP160]] +; CHECK-NEXT: [[SUB59:%.*]] = sub i32 [[TMP160]], [[SUB47]] ; CHECK-NEXT: [[SHR_I59_1:%.*]] = lshr i32 [[TMP108]], 15 ; CHECK-NEXT: [[AND_I60_1:%.*]] = and i32 [[SHR_I59_1]], 65537 ; CHECK-NEXT: [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535 @@ -182,38 +188,41 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[MUL_I61_4:%.*]] = mul i32 [[AND_I60_4]], 65535 ; CHECK-NEXT: [[TMP104:%.*]] = 
load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1 ; CHECK-NEXT: [[TMP110:%.*]] = zext <2 x i8> [[TMP104]] to <2 x i32> -; CHECK-NEXT: [[TMP109:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR644]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP103:%.*]] = zext <2 x i8> [[TMP109]] to <2 x i32> -; CHECK-NEXT: [[TMP116:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_1]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP118:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32> -; CHECK-NEXT: [[TMP128:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_1]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP155:%.*]] = zext <2 x i8> [[TMP128]] to <2 x i32> -; CHECK-NEXT: [[TMP124:%.*]] = sub <2 x i32> [[TMP118]], [[TMP155]] +; CHECK-NEXT: [[TMP157:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1 +; CHECK-NEXT: [[TMP122:%.*]] = shufflevector <4 x i8> [[TMP157]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP123:%.*]] = zext <2 x i8> [[TMP122]] to <2 x i32> +; CHECK-NEXT: [[TMP158:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 +; CHECK-NEXT: [[TMP159:%.*]] = shufflevector <4 x i8> [[TMP158]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP126:%.*]] = zext <2 x i8> [[TMP159]] to <2 x i32> +; CHECK-NEXT: [[TMP161:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 +; CHECK-NEXT: [[TMP128:%.*]] = shufflevector <4 x i8> [[TMP161]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP162:%.*]] = zext <2 x i8> [[TMP128]] to <2 x i32> +; CHECK-NEXT: [[TMP124:%.*]] = sub <2 x i32> [[TMP126]], [[TMP162]] ; CHECK-NEXT: [[TMP125:%.*]] = shl <2 x i32> [[TMP124]], -; CHECK-NEXT: [[TMP156:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22_1]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP111:%.*]] = zext <2 x i8> [[TMP156]] to <2 x i32> -; CHECK-NEXT: [[TMP112:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 
[[ARRAYIDX25_1]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP113:%.*]] = zext <2 x i8> [[TMP112]] to <2 x i32> -; CHECK-NEXT: [[TMP114:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27_1]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP115:%.*]] = zext <2 x i8> [[TMP114]] to <2 x i32> -; CHECK-NEXT: [[TMP135:%.*]] = sub <2 x i32> [[TMP113]], [[TMP115]] +; CHECK-NEXT: [[TMP163:%.*]] = shufflevector <4 x i8> [[TMP157]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP164:%.*]] = zext <2 x i8> [[TMP163]] to <2 x i32> +; CHECK-NEXT: [[TMP165:%.*]] = shufflevector <4 x i8> [[TMP158]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP166:%.*]] = zext <2 x i8> [[TMP165]] to <2 x i32> +; CHECK-NEXT: [[TMP167:%.*]] = shufflevector <4 x i8> [[TMP161]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP169:%.*]] = zext <2 x i8> [[TMP167]] to <2 x i32> +; CHECK-NEXT: [[TMP135:%.*]] = sub <2 x i32> [[TMP166]], [[TMP169]] ; CHECK-NEXT: [[TMP136:%.*]] = shl <2 x i32> [[TMP135]], ; CHECK-NEXT: [[TMP137:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV33_1]], i32 1 -; CHECK-NEXT: [[TMP119:%.*]] = sub <2 x i32> [[TMP137]], [[TMP111]] -; CHECK-NEXT: [[TMP120:%.*]] = add <2 x i32> [[TMP136]], [[TMP119]] +; CHECK-NEXT: [[TMP170:%.*]] = sub <2 x i32> [[TMP137]], [[TMP164]] +; CHECK-NEXT: [[TMP142:%.*]] = add <2 x i32> [[TMP136]], [[TMP170]] ; CHECK-NEXT: [[TMP117:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV_1]], i32 0 -; CHECK-NEXT: [[TMP122:%.*]] = sub <2 x i32> [[TMP117]], [[TMP103]] -; CHECK-NEXT: [[TMP123:%.*]] = add <2 x i32> [[TMP125]], [[TMP122]] -; CHECK-NEXT: [[TMP143:%.*]] = add <2 x i32> [[TMP120]], [[TMP123]] -; CHECK-NEXT: [[TMP121:%.*]] = sub <2 x i32> [[TMP123]], [[TMP120]] -; CHECK-NEXT: [[TMP145:%.*]] = extractelement <2 x i32> [[TMP143]], i32 0 +; CHECK-NEXT: [[TMP171:%.*]] = sub <2 x i32> [[TMP117]], [[TMP123]] +; CHECK-NEXT: [[TMP145:%.*]] = add <2 x i32> [[TMP125]], [[TMP171]] +; CHECK-NEXT: [[TMP143:%.*]] = add 
<2 x i32> [[TMP142]], [[TMP145]] +; CHECK-NEXT: [[TMP121:%.*]] = sub <2 x i32> [[TMP145]], [[TMP142]] +; CHECK-NEXT: [[TMP172:%.*]] = extractelement <2 x i32> [[TMP143]], i32 0 ; CHECK-NEXT: [[TMP146:%.*]] = extractelement <2 x i32> [[TMP143]], i32 1 -; CHECK-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP146]], [[TMP145]] -; CHECK-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP145]], [[TMP146]] -; CHECK-NEXT: [[TMP126:%.*]] = extractelement <2 x i32> [[TMP121]], i32 0 +; CHECK-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP146]], [[TMP172]] +; CHECK-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP172]], [[TMP146]] +; CHECK-NEXT: [[TMP173:%.*]] = extractelement <2 x i32> [[TMP121]], i32 0 ; CHECK-NEXT: [[TMP127:%.*]] = extractelement <2 x i32> [[TMP121]], i32 1 -; CHECK-NEXT: [[ADD55_1:%.*]] = add i32 [[TMP127]], [[TMP126]] -; CHECK-NEXT: [[SUB59_1:%.*]] = sub i32 [[TMP126]], [[TMP127]] +; CHECK-NEXT: [[ADD55_1:%.*]] = add i32 [[TMP127]], [[TMP173]] +; CHECK-NEXT: [[SUB59_1:%.*]] = sub i32 [[TMP173]], [[TMP127]] ; CHECK-NEXT: [[SHR_I54_1:%.*]] = lshr i32 [[TMP146]], 15 ; CHECK-NEXT: [[AND_I55_1:%.*]] = and i32 [[SHR_I54_1]], 65537 ; CHECK-NEXT: [[MUL_I56_1:%.*]] = mul i32 [[AND_I55_1]], 65535 @@ -229,7 +238,7 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I51_3]], [[ADD103]] ; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP63]] ; CHECK-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I_1]], [[ADD105]] -; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[CONV]] +; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP59]] ; CHECK-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56_1]], [[SUB104]] ; CHECK-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP146]] ; CHECK-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I61_1]], [[SUB106]] @@ -242,7 +251,7 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB86_1]] ; CHECK-NEXT: [[SUB106_1:%.*]] = sub i32 
[[SUB86_1]], [[SUB102_1]] ; CHECK-NEXT: [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_5]], [[ADD105_1]] -; CHECK-NEXT: [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP107]] +; CHECK-NEXT: [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP80]] ; CHECK-NEXT: [[TMP129:%.*]] = shufflevector <2 x i32> [[TMP17]], <2 x i32> [[TMP121]], <2 x i32> ; CHECK-NEXT: [[TMP130:%.*]] = lshr <2 x i32> [[TMP129]], ; CHECK-NEXT: [[TMP131:%.*]] = and <2 x i32> [[TMP130]], @@ -259,8 +268,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_4]], [[SUB106_1]] ; CHECK-NEXT: [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[SUB47]] ; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD113]] -; CHECK-NEXT: [[TMP142:%.*]] = extractelement <2 x i32> [[TMP141]], i32 0 -; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[TMP142]] +; CHECK-NEXT: [[TMP168:%.*]] = extractelement <2 x i32> [[TMP141]], i32 0 +; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[TMP168]] ; CHECK-NEXT: [[TMP154:%.*]] = extractelement <2 x i32> [[TMP141]], i32 1 ; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[TMP154]] ; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]] @@ -285,8 +294,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]] ; CHECK-NEXT: [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP238]] ; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_1]] -; CHECK-NEXT: [[TMP237:%.*]] = extractelement <2 x i32> [[TMP213]], i32 0 -; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP237]] +; CHECK-NEXT: [[TMP179:%.*]] = extractelement <2 x i32> [[TMP213]], i32 0 +; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP179]] ; CHECK-NEXT: [[TMP218:%.*]] = extractelement <2 x i32> [[TMP213]], i32 1 ; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP218]] ; CHECK-NEXT: 
[[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]] @@ -314,8 +323,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]] ; CHECK-NEXT: [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]] ; CHECK-NEXT: [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_2]] -; CHECK-NEXT: [[TMP235:%.*]] = extractelement <2 x i32> [[TMP234]], i32 0 -; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP235]] +; CHECK-NEXT: [[TMP193:%.*]] = extractelement <2 x i32> [[TMP234]], i32 0 +; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP193]] ; CHECK-NEXT: [[TMP236:%.*]] = extractelement <2 x i32> [[TMP234]], i32 1 ; CHECK-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP236]] ; CHECK-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I63_3]] @@ -329,9 +338,6 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[ARRAYIDX3:%.*]] = getelementptr i8, ptr [[PIX1]], i64 4 ; THR15-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i8, ptr [[PIX2]], i64 4 ; THR15-NEXT: [[ARRAYIDX8:%.*]] = getelementptr i8, ptr [[PIX1]], i64 1 -; THR15-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i8, ptr [[PIX2]], i64 1 -; THR15-NEXT: [[ARRAYIDX25:%.*]] = getelementptr i8, ptr [[PIX1]], i64 5 -; THR15-NEXT: [[ARRAYIDX27:%.*]] = getelementptr i8, ptr [[PIX2]], i64 5 ; THR15-NEXT: [[ARRAYIDX32:%.*]] = getelementptr i8, ptr [[PIX1]], i64 3 ; THR15-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX32]], align 1 ; THR15-NEXT: [[CONV33:%.*]] = zext i8 [[TMP1]] to i32 @@ -342,9 +348,6 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 4 ; THR15-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 4 ; THR15-NEXT: [[ARRAYIDX8_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 1 -; THR15-NEXT: [[ARRAYIDX22_1:%.*]] = getelementptr i8, ptr 
[[ADD_PTR644]], i64 1 -; THR15-NEXT: [[ARRAYIDX13_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 5 -; THR15-NEXT: [[ARRAYIDX27_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 5 ; THR15-NEXT: [[ARRAYIDX32_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 3 ; THR15-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX32_1]], align 1 ; THR15-NEXT: [[CONV33_1:%.*]] = zext i8 [[TMP3]] to i32 @@ -352,139 +355,147 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]] ; THR15-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4 ; THR15-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4 -; THR15-NEXT: [[TMP4:%.*]] = load <2 x i8>, ptr [[ADD_PTR_1]], align 1 -; THR15-NEXT: [[TMP66:%.*]] = zext <2 x i8> [[TMP4]] to <2 x i32> -; THR15-NEXT: [[TMP6:%.*]] = load <2 x i8>, ptr [[ADD_PTR64_1]], align 1 -; THR15-NEXT: [[TMP7:%.*]] = zext <2 x i8> [[TMP6]] to <2 x i32> -; THR15-NEXT: [[TMP8:%.*]] = sub <2 x i32> [[TMP66]], [[TMP7]] -; THR15-NEXT: [[TMP9:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_2]], align 1 -; THR15-NEXT: [[TMP10:%.*]] = zext <2 x i8> [[TMP9]] to <2 x i32> -; THR15-NEXT: [[TMP11:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_2]], align 1 -; THR15-NEXT: [[TMP12:%.*]] = zext <2 x i8> [[TMP11]] to <2 x i32> -; THR15-NEXT: [[TMP13:%.*]] = sub <2 x i32> [[TMP10]], [[TMP12]] +; THR15-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 +; THR15-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP6:%.*]] = zext <2 x i8> [[TMP17]] to <2 x i32> +; THR15-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 +; THR15-NEXT: [[TMP8:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP9:%.*]] = zext <2 x i8> [[TMP8]] to <2 x i32> +; THR15-NEXT: [[TMP10:%.*]] = sub <2 x i32> [[TMP6]], [[TMP9]] +; THR15-NEXT: [[TMP11:%.*]] = load <4 
x i8>, ptr [[ARRAYIDX3_2]], align 1 +; THR15-NEXT: [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP18:%.*]] = zext <2 x i8> [[TMP12]] to <2 x i32> +; THR15-NEXT: [[TMP29:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 +; THR15-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP29]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP16:%.*]] = zext <2 x i8> [[TMP15]] to <2 x i32> +; THR15-NEXT: [[TMP13:%.*]] = sub <2 x i32> [[TMP18]], [[TMP16]] ; THR15-NEXT: [[TMP14:%.*]] = shl <2 x i32> [[TMP13]], -; THR15-NEXT: [[TMP15:%.*]] = add <2 x i32> [[TMP14]], [[TMP8]] -; THR15-NEXT: [[ARRAYIDX20_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 2 -; THR15-NEXT: [[ARRAYIDX22_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 2 -; THR15-NEXT: [[ARRAYIDX25_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 6 -; THR15-NEXT: [[ARRAYIDX27_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 6 -; THR15-NEXT: [[TMP16:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_2]], align 1 -; THR15-NEXT: [[TMP17:%.*]] = zext <2 x i8> [[TMP16]] to <2 x i32> -; THR15-NEXT: [[TMP18:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_2]], align 1 -; THR15-NEXT: [[TMP19:%.*]] = zext <2 x i8> [[TMP18]] to <2 x i32> -; THR15-NEXT: [[TMP20:%.*]] = sub <2 x i32> [[TMP17]], [[TMP19]] -; THR15-NEXT: [[TMP21:%.*]] = load <2 x i8>, ptr [[ARRAYIDX25_2]], align 1 -; THR15-NEXT: [[TMP22:%.*]] = zext <2 x i8> [[TMP21]] to <2 x i32> -; THR15-NEXT: [[TMP23:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_2]], align 1 -; THR15-NEXT: [[TMP24:%.*]] = zext <2 x i8> [[TMP23]] to <2 x i32> -; THR15-NEXT: [[TMP25:%.*]] = sub <2 x i32> [[TMP22]], [[TMP24]] +; THR15-NEXT: [[TMP19:%.*]] = add <2 x i32> [[TMP14]], [[TMP10]] +; THR15-NEXT: [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP21:%.*]] = zext <2 x i8> [[TMP20]] to <2 x i32> +; THR15-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP23:%.*]] = 
zext <2 x i8> [[TMP22]] to <2 x i32> +; THR15-NEXT: [[TMP24:%.*]] = sub <2 x i32> [[TMP21]], [[TMP23]] +; THR15-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP30:%.*]] = zext <2 x i8> [[TMP35]] to <2 x i32> +; THR15-NEXT: [[TMP37:%.*]] = shufflevector <4 x i8> [[TMP29]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP28:%.*]] = zext <2 x i8> [[TMP37]] to <2 x i32> +; THR15-NEXT: [[TMP25:%.*]] = sub <2 x i32> [[TMP30]], [[TMP28]] ; THR15-NEXT: [[TMP26:%.*]] = shl <2 x i32> [[TMP25]], -; THR15-NEXT: [[TMP27:%.*]] = add <2 x i32> [[TMP26]], [[TMP20]] -; THR15-NEXT: [[TMP28:%.*]] = extractelement <2 x i32> [[TMP15]], i32 0 -; THR15-NEXT: [[TMP29:%.*]] = extractelement <2 x i32> [[TMP15]], i32 1 -; THR15-NEXT: [[ADD44_2:%.*]] = add i32 [[TMP29]], [[TMP28]] -; THR15-NEXT: [[SUB45_2:%.*]] = sub i32 [[TMP28]], [[TMP29]] -; THR15-NEXT: [[TMP30:%.*]] = extractelement <2 x i32> [[TMP27]], i32 0 +; THR15-NEXT: [[TMP33:%.*]] = add <2 x i32> [[TMP26]], [[TMP24]] +; THR15-NEXT: [[TMP27:%.*]] = add <2 x i32> [[TMP33]], [[TMP19]] +; THR15-NEXT: [[TMP47:%.*]] = sub <2 x i32> [[TMP19]], [[TMP33]] +; THR15-NEXT: [[TMP36:%.*]] = extractelement <2 x i32> [[TMP27]], i32 0 ; THR15-NEXT: [[TMP31:%.*]] = extractelement <2 x i32> [[TMP27]], i32 1 -; THR15-NEXT: [[ADD46_2:%.*]] = add i32 [[TMP31]], [[TMP30]] -; THR15-NEXT: [[SUB47_2:%.*]] = sub i32 [[TMP30]], [[TMP31]] -; THR15-NEXT: [[ADD48_2:%.*]] = add i32 [[ADD46_2]], [[ADD44_2]] -; THR15-NEXT: [[SUB51_2:%.*]] = sub i32 [[ADD44_2]], [[ADD46_2]] -; THR15-NEXT: [[ADD55_2:%.*]] = add i32 [[SUB47_2]], [[SUB45_2]] -; THR15-NEXT: [[SUB59_2:%.*]] = sub i32 [[SUB45_2]], [[SUB47_2]] -; THR15-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr i8, ptr null, i64 4 -; THR15-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4 +; THR15-NEXT: [[ADD48_2:%.*]] = add i32 [[TMP31]], [[TMP36]] +; THR15-NEXT: [[ARRAYIDX5_4:%.*]] = getelementptr i8, ptr null, i64 4 ; THR15-NEXT: [[TMP32:%.*]] = load <2 x 
i8>, ptr null, align 1 -; THR15-NEXT: [[TMP33:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32> +; THR15-NEXT: [[TMP38:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32> ; THR15-NEXT: [[TMP34:%.*]] = load <2 x i8>, ptr null, align 1 -; THR15-NEXT: [[TMP35:%.*]] = zext <2 x i8> [[TMP34]] to <2 x i32> -; THR15-NEXT: [[TMP36:%.*]] = sub <2 x i32> [[TMP33]], [[TMP35]] -; THR15-NEXT: [[TMP37:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_3]], i64 -4, <2 x i1> , i32 2) -; THR15-NEXT: [[TMP38:%.*]] = zext <2 x i8> [[TMP37]] to <2 x i32> -; THR15-NEXT: [[TMP39:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_3]], align 1 -; THR15-NEXT: [[TMP40:%.*]] = zext <2 x i8> [[TMP39]] to <2 x i32> -; THR15-NEXT: [[TMP41:%.*]] = sub <2 x i32> [[TMP38]], [[TMP40]] +; THR15-NEXT: [[TMP39:%.*]] = zext <2 x i8> [[TMP34]] to <2 x i32> +; THR15-NEXT: [[TMP40:%.*]] = sub <2 x i32> [[TMP38]], [[TMP39]] +; THR15-NEXT: [[TMP49:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> , i32 2) +; THR15-NEXT: [[TMP64:%.*]] = zext <2 x i8> [[TMP49]] to <2 x i32> +; THR15-NEXT: [[TMP66:%.*]] = shufflevector <2 x i32> [[TMP64]], <2 x i32> poison, <2 x i32> +; THR15-NEXT: [[TMP55:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_4]], align 1 +; THR15-NEXT: [[TMP50:%.*]] = zext <2 x i8> [[TMP55]] to <2 x i32> +; THR15-NEXT: [[TMP41:%.*]] = sub <2 x i32> [[TMP66]], [[TMP50]] ; THR15-NEXT: [[TMP42:%.*]] = shl <2 x i32> [[TMP41]], -; THR15-NEXT: [[TMP43:%.*]] = add <2 x i32> [[TMP42]], [[TMP36]] +; THR15-NEXT: [[TMP43:%.*]] = add <2 x i32> [[TMP42]], [[TMP40]] ; THR15-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2 ; THR15-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2 ; THR15-NEXT: [[TMP44:%.*]] = load i8, ptr null, align 1 ; THR15-NEXT: [[ARRAYIDX27_3:%.*]] = getelementptr i8, ptr null, i64 6 ; THR15-NEXT: [[TMP45:%.*]] = load i8, ptr null, align 1 ; THR15-NEXT: [[TMP46:%.*]] = load <2 x i8>, ptr 
[[ARRAYIDX20_3]], align 1 -; THR15-NEXT: [[TMP47:%.*]] = zext <2 x i8> [[TMP46]] to <2 x i32> +; THR15-NEXT: [[TMP53:%.*]] = zext <2 x i8> [[TMP46]] to <2 x i32> ; THR15-NEXT: [[TMP48:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_3]], align 1 -; THR15-NEXT: [[TMP49:%.*]] = zext <2 x i8> [[TMP48]] to <2 x i32> -; THR15-NEXT: [[TMP50:%.*]] = sub <2 x i32> [[TMP47]], [[TMP49]] +; THR15-NEXT: [[TMP59:%.*]] = zext <2 x i8> [[TMP48]] to <2 x i32> +; THR15-NEXT: [[TMP61:%.*]] = sub <2 x i32> [[TMP53]], [[TMP59]] ; THR15-NEXT: [[TMP51:%.*]] = insertelement <2 x i8> poison, i8 [[TMP44]], i32 0 ; THR15-NEXT: [[TMP52:%.*]] = insertelement <2 x i8> [[TMP51]], i8 [[TMP45]], i32 1 -; THR15-NEXT: [[TMP53:%.*]] = zext <2 x i8> [[TMP52]] to <2 x i32> +; THR15-NEXT: [[TMP67:%.*]] = zext <2 x i8> [[TMP52]] to <2 x i32> ; THR15-NEXT: [[TMP54:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_3]], align 1 -; THR15-NEXT: [[TMP55:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32> -; THR15-NEXT: [[TMP56:%.*]] = sub <2 x i32> [[TMP53]], [[TMP55]] +; THR15-NEXT: [[TMP94:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32> +; THR15-NEXT: [[TMP56:%.*]] = sub <2 x i32> [[TMP67]], [[TMP94]] ; THR15-NEXT: [[TMP57:%.*]] = shl <2 x i32> [[TMP56]], -; THR15-NEXT: [[TMP58:%.*]] = add <2 x i32> [[TMP57]], [[TMP50]] -; THR15-NEXT: [[TMP59:%.*]] = extractelement <2 x i32> [[TMP43]], i32 0 +; THR15-NEXT: [[TMP58:%.*]] = add <2 x i32> [[TMP57]], [[TMP61]] +; THR15-NEXT: [[TMP108:%.*]] = extractelement <2 x i32> [[TMP43]], i32 0 ; THR15-NEXT: [[TMP60:%.*]] = extractelement <2 x i32> [[TMP43]], i32 1 -; THR15-NEXT: [[ADD44_3:%.*]] = add i32 [[TMP60]], [[TMP59]] -; THR15-NEXT: [[SUB45_3:%.*]] = sub i32 [[TMP59]], [[TMP60]] -; THR15-NEXT: [[TMP61:%.*]] = extractelement <2 x i32> [[TMP58]], i32 0 +; THR15-NEXT: [[ADD44_3:%.*]] = add i32 [[TMP60]], [[TMP108]] +; THR15-NEXT: [[SUB45_3:%.*]] = sub i32 [[TMP108]], [[TMP60]] +; THR15-NEXT: [[TMP109:%.*]] = extractelement <2 x i32> [[TMP58]], i32 0 ; THR15-NEXT: [[TMP62:%.*]] = extractelement <2 
x i32> [[TMP58]], i32 1 -; THR15-NEXT: [[ADD46_3:%.*]] = add i32 [[TMP62]], [[TMP61]] -; THR15-NEXT: [[SUB47_3:%.*]] = sub i32 [[TMP61]], [[TMP62]] +; THR15-NEXT: [[ADD46_3:%.*]] = add i32 [[TMP62]], [[TMP109]] +; THR15-NEXT: [[SUB47_3:%.*]] = sub i32 [[TMP109]], [[TMP62]] ; THR15-NEXT: [[ADD48_3:%.*]] = add i32 [[ADD46_3]], [[ADD44_3]] -; THR15-NEXT: [[SUB51_3:%.*]] = sub i32 [[ADD44_3]], [[ADD46_3]] -; THR15-NEXT: [[ADD55_3:%.*]] = add i32 [[SUB47_3]], [[SUB45_3]] -; THR15-NEXT: [[SUB59_3:%.*]] = sub i32 [[SUB45_3]], [[SUB47_3]] +; THR15-NEXT: [[TMP68:%.*]] = shufflevector <2 x i32> [[TMP27]], <2 x i32> poison, <2 x i32> +; THR15-NEXT: [[TMP69:%.*]] = insertelement <2 x i32> [[TMP68]], i32 [[ADD44_3]], i32 0 +; THR15-NEXT: [[TMP70:%.*]] = insertelement <2 x i32> [[TMP27]], i32 [[ADD46_3]], i32 0 +; THR15-NEXT: [[TMP71:%.*]] = sub <2 x i32> [[TMP69]], [[TMP70]] +; THR15-NEXT: [[TMP104:%.*]] = insertelement <2 x i32> [[TMP47]], i32 [[SUB47_3]], i32 0 +; THR15-NEXT: [[TMP105:%.*]] = shufflevector <2 x i32> [[TMP47]], <2 x i32> poison, <2 x i32> +; THR15-NEXT: [[TMP116:%.*]] = insertelement <2 x i32> [[TMP105]], i32 [[SUB45_3]], i32 0 +; THR15-NEXT: [[TMP75:%.*]] = add <2 x i32> [[TMP104]], [[TMP116]] +; THR15-NEXT: [[TMP76:%.*]] = sub <2 x i32> [[TMP116]], [[TMP104]] ; THR15-NEXT: [[ADD94:%.*]] = add i32 [[ADD48_3]], [[ADD48_2]] ; THR15-NEXT: [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_3]] -; THR15-NEXT: [[TMP63:%.*]] = extractelement <2 x i32> [[TMP33]], i32 0 +; THR15-NEXT: [[TMP63:%.*]] = extractelement <2 x i32> [[TMP38]], i32 0 ; THR15-NEXT: [[SHR_I:%.*]] = lshr i32 [[TMP63]], 15 ; THR15-NEXT: [[AND_I:%.*]] = and i32 [[SHR_I]], 65537 ; THR15-NEXT: [[MUL_I:%.*]] = mul i32 [[AND_I]], 65535 -; THR15-NEXT: [[SHR_I49:%.*]] = lshr i32 [[ADD46_2]], 15 +; THR15-NEXT: [[SHR_I49:%.*]] = lshr i32 [[TMP31]], 15 ; THR15-NEXT: [[AND_I50:%.*]] = and i32 [[SHR_I49]], 65537 ; THR15-NEXT: [[MUL_I51:%.*]] = mul i32 [[AND_I50]], 65535 -; THR15-NEXT: [[ADD55_1:%.*]] = add i32 
[[ADD55_3]], [[ADD55_2]] -; THR15-NEXT: [[SUB102_1:%.*]] = sub i32 [[ADD55_2]], [[ADD55_3]] -; THR15-NEXT: [[TMP64:%.*]] = extractelement <2 x i32> [[TMP66]], i32 0 -; THR15-NEXT: [[SHR_I49_2:%.*]] = lshr i32 [[TMP64]], 15 +; THR15-NEXT: [[TMP78:%.*]] = extractelement <2 x i32> [[TMP75]], i32 0 +; THR15-NEXT: [[TMP79:%.*]] = extractelement <2 x i32> [[TMP75]], i32 1 +; THR15-NEXT: [[ADD55_1:%.*]] = add i32 [[TMP78]], [[TMP79]] +; THR15-NEXT: [[SUB102_1:%.*]] = sub i32 [[TMP79]], [[TMP78]] +; THR15-NEXT: [[TMP80:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0 +; THR15-NEXT: [[SHR_I49_2:%.*]] = lshr i32 [[TMP80]], 15 ; THR15-NEXT: [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537 ; THR15-NEXT: [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535 -; THR15-NEXT: [[ADD94_2:%.*]] = add i32 [[SUB51_3]], [[SUB51_2]] -; THR15-NEXT: [[SUB102_2:%.*]] = sub i32 [[SUB51_2]], [[SUB51_3]] +; THR15-NEXT: [[TMP81:%.*]] = extractelement <2 x i32> [[TMP71]], i32 0 +; THR15-NEXT: [[TMP82:%.*]] = extractelement <2 x i32> [[TMP71]], i32 1 +; THR15-NEXT: [[ADD94_2:%.*]] = add i32 [[TMP81]], [[TMP82]] +; THR15-NEXT: [[SUB102_2:%.*]] = sub i32 [[TMP82]], [[TMP81]] ; THR15-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[CONV_1]], 15 ; THR15-NEXT: [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537 ; THR15-NEXT: [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535 -; THR15-NEXT: [[ADD94_3:%.*]] = add i32 [[SUB59_3]], [[SUB59_2]] -; THR15-NEXT: [[SUB102_3:%.*]] = sub i32 [[SUB59_2]], [[SUB59_3]] +; THR15-NEXT: [[TMP83:%.*]] = extractelement <2 x i32> [[TMP76]], i32 0 +; THR15-NEXT: [[TMP112:%.*]] = extractelement <2 x i32> [[TMP76]], i32 1 +; THR15-NEXT: [[ADD94_3:%.*]] = add i32 [[TMP83]], [[TMP112]] +; THR15-NEXT: [[SUB102_3:%.*]] = sub i32 [[TMP112]], [[TMP83]] ; THR15-NEXT: [[SHR_I49_4:%.*]] = lshr i32 [[CONV]], 15 ; THR15-NEXT: [[AND_I50_4:%.*]] = and i32 [[SHR_I49_4]], 65537 ; THR15-NEXT: [[MUL_I51_4:%.*]] = mul i32 [[AND_I50_4]], 65535 ; THR15-NEXT: [[TMP65:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1 
; THR15-NEXT: [[TMP74:%.*]] = zext <2 x i8> [[TMP65]] to <2 x i32> -; THR15-NEXT: [[TMP67:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[PIX2]], i64 2, <2 x i1> , i32 2) -; THR15-NEXT: [[TMP68:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32> -; THR15-NEXT: [[TMP69:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3]], i64 2, <2 x i1> , i32 2) -; THR15-NEXT: [[TMP70:%.*]] = zext <2 x i8> [[TMP69]] to <2 x i32> -; THR15-NEXT: [[TMP71:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> , i32 2) -; THR15-NEXT: [[TMP81:%.*]] = zext <2 x i8> [[TMP71]] to <2 x i32> -; THR15-NEXT: [[TMP72:%.*]] = sub <2 x i32> [[TMP70]], [[TMP81]] +; THR15-NEXT: [[TMP87:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1 +; THR15-NEXT: [[TMP117:%.*]] = shufflevector <4 x i8> [[TMP87]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP111:%.*]] = zext <2 x i8> [[TMP117]] to <2 x i32> +; THR15-NEXT: [[TMP130:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 +; THR15-NEXT: [[TMP131:%.*]] = shufflevector <4 x i8> [[TMP130]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP118:%.*]] = zext <2 x i8> [[TMP131]] to <2 x i32> +; THR15-NEXT: [[TMP93:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 +; THR15-NEXT: [[TMP146:%.*]] = shufflevector <4 x i8> [[TMP93]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP120:%.*]] = zext <2 x i8> [[TMP146]] to <2 x i32> +; THR15-NEXT: [[TMP72:%.*]] = sub <2 x i32> [[TMP118]], [[TMP120]] ; THR15-NEXT: [[TMP73:%.*]] = shl <2 x i32> [[TMP72]], -; THR15-NEXT: [[TMP75:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> , i32 2) -; THR15-NEXT: [[TMP76:%.*]] = zext <2 x i8> [[TMP75]] to <2 x i32> -; THR15-NEXT: [[TMP82:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> , i32 2) -; THR15-NEXT: [[TMP78:%.*]] = zext <2 x i8> 
[[TMP82]] to <2 x i32> -; THR15-NEXT: [[TMP79:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> , i32 2) -; THR15-NEXT: [[TMP80:%.*]] = zext <2 x i8> [[TMP79]] to <2 x i32> -; THR15-NEXT: [[TMP84:%.*]] = sub <2 x i32> [[TMP78]], [[TMP80]] +; THR15-NEXT: [[TMP98:%.*]] = shufflevector <4 x i8> [[TMP87]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP99:%.*]] = zext <2 x i8> [[TMP98]] to <2 x i32> +; THR15-NEXT: [[TMP100:%.*]] = shufflevector <4 x i8> [[TMP130]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP143:%.*]] = zext <2 x i8> [[TMP100]] to <2 x i32> +; THR15-NEXT: [[TMP178:%.*]] = shufflevector <4 x i8> [[TMP93]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP147:%.*]] = zext <2 x i8> [[TMP178]] to <2 x i32> +; THR15-NEXT: [[TMP84:%.*]] = sub <2 x i32> [[TMP143]], [[TMP147]] ; THR15-NEXT: [[TMP85:%.*]] = shl <2 x i32> [[TMP84]], ; THR15-NEXT: [[TMP86:%.*]] = insertelement <2 x i32> [[TMP74]], i32 [[CONV33]], i32 1 -; THR15-NEXT: [[TMP93:%.*]] = sub <2 x i32> [[TMP86]], [[TMP76]] -; THR15-NEXT: [[TMP88:%.*]] = add <2 x i32> [[TMP85]], [[TMP93]] +; THR15-NEXT: [[TMP107:%.*]] = sub <2 x i32> [[TMP86]], [[TMP99]] +; THR15-NEXT: [[TMP88:%.*]] = add <2 x i32> [[TMP85]], [[TMP107]] ; THR15-NEXT: [[TMP92:%.*]] = insertelement <2 x i32> [[TMP74]], i32 [[CONV]], i32 0 -; THR15-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP92]], [[TMP68]] -; THR15-NEXT: [[TMP95:%.*]] = add <2 x i32> [[TMP73]], [[TMP87]] +; THR15-NEXT: [[TMP110:%.*]] = sub <2 x i32> [[TMP92]], [[TMP111]] +; THR15-NEXT: [[TMP95:%.*]] = add <2 x i32> [[TMP73]], [[TMP110]] ; THR15-NEXT: [[TMP97:%.*]] = shufflevector <2 x i32> [[TMP88]], <2 x i32> [[TMP95]], <2 x i32> ; THR15-NEXT: [[TMP77:%.*]] = add <2 x i32> [[TMP88]], [[TMP95]] ; THR15-NEXT: [[TMP91:%.*]] = sub <2 x i32> [[TMP95]], [[TMP88]] @@ -492,10 +503,10 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[TMP90:%.*]] = extractelement <2 x 
i32> [[TMP77]], i32 1 ; THR15-NEXT: [[ADD48:%.*]] = add i32 [[TMP90]], [[TMP89]] ; THR15-NEXT: [[SUB51:%.*]] = sub i32 [[TMP89]], [[TMP90]] -; THR15-NEXT: [[TMP94:%.*]] = extractelement <2 x i32> [[TMP91]], i32 0 +; THR15-NEXT: [[TMP161:%.*]] = extractelement <2 x i32> [[TMP91]], i32 0 ; THR15-NEXT: [[SUB47:%.*]] = extractelement <2 x i32> [[TMP91]], i32 1 -; THR15-NEXT: [[ADD56:%.*]] = add i32 [[SUB47]], [[TMP94]] -; THR15-NEXT: [[SUB59:%.*]] = sub i32 [[TMP94]], [[SUB47]] +; THR15-NEXT: [[ADD56:%.*]] = add i32 [[SUB47]], [[TMP161]] +; THR15-NEXT: [[SUB59:%.*]] = sub i32 [[TMP161]], [[SUB47]] ; THR15-NEXT: [[SHR_I59:%.*]] = lshr i32 [[TMP90]], 15 ; THR15-NEXT: [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537 ; THR15-NEXT: [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535 @@ -504,38 +515,41 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535 ; THR15-NEXT: [[TMP96:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1 ; THR15-NEXT: [[TMP103:%.*]] = zext <2 x i8> [[TMP96]] to <2 x i32> -; THR15-NEXT: [[TMP98:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR644]], i64 2, <2 x i1> , i32 2) -; THR15-NEXT: [[TMP99:%.*]] = zext <2 x i8> [[TMP98]] to <2 x i32> -; THR15-NEXT: [[TMP100:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_1]], i64 2, <2 x i1> , i32 2) -; THR15-NEXT: [[TMP104:%.*]] = zext <2 x i8> [[TMP100]] to <2 x i32> -; THR15-NEXT: [[TMP105:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_1]], i64 2, <2 x i1> , i32 2) -; THR15-NEXT: [[TMP112:%.*]] = zext <2 x i8> [[TMP105]] to <2 x i32> -; THR15-NEXT: [[TMP101:%.*]] = sub <2 x i32> [[TMP104]], [[TMP112]] +; THR15-NEXT: [[TMP180:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1 +; THR15-NEXT: [[TMP181:%.*]] = shufflevector <4 x i8> [[TMP180]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP175:%.*]] = 
zext <2 x i8> [[TMP181]] to <2 x i32> +; THR15-NEXT: [[TMP183:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 +; THR15-NEXT: [[TMP184:%.*]] = shufflevector <4 x i8> [[TMP183]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP177:%.*]] = zext <2 x i8> [[TMP184]] to <2 x i32> +; THR15-NEXT: [[TMP127:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 +; THR15-NEXT: [[TMP128:%.*]] = shufflevector <4 x i8> [[TMP127]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP129:%.*]] = zext <2 x i8> [[TMP128]] to <2 x i32> +; THR15-NEXT: [[TMP101:%.*]] = sub <2 x i32> [[TMP177]], [[TMP129]] ; THR15-NEXT: [[TMP102:%.*]] = shl <2 x i32> [[TMP101]], -; THR15-NEXT: [[TMP120:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22_1]], i64 2, <2 x i1> , i32 2) -; THR15-NEXT: [[TMP107:%.*]] = zext <2 x i8> [[TMP120]] to <2 x i32> -; THR15-NEXT: [[TMP108:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX13_1]], i64 2, <2 x i1> , i32 2) -; THR15-NEXT: [[TMP109:%.*]] = zext <2 x i8> [[TMP108]] to <2 x i32> -; THR15-NEXT: [[TMP110:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27_1]], i64 2, <2 x i1> , i32 2) -; THR15-NEXT: [[TMP111:%.*]] = zext <2 x i8> [[TMP110]] to <2 x i32> -; THR15-NEXT: [[TMP113:%.*]] = sub <2 x i32> [[TMP109]], [[TMP111]] +; THR15-NEXT: [[TMP187:%.*]] = shufflevector <4 x i8> [[TMP180]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP182:%.*]] = zext <2 x i8> [[TMP187]] to <2 x i32> +; THR15-NEXT: [[TMP189:%.*]] = shufflevector <4 x i8> [[TMP183]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP185:%.*]] = zext <2 x i8> [[TMP189]] to <2 x i32> +; THR15-NEXT: [[TMP191:%.*]] = shufflevector <4 x i8> [[TMP127]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP186:%.*]] = zext <2 x i8> [[TMP191]] to <2 x i32> +; THR15-NEXT: [[TMP113:%.*]] = sub <2 x i32> [[TMP185]], [[TMP186]] ; THR15-NEXT: [[TMP114:%.*]] = shl <2 x i32> [[TMP113]], ; THR15-NEXT: 
[[TMP115:%.*]] = insertelement <2 x i32> [[TMP103]], i32 [[CONV33_1]], i32 1 -; THR15-NEXT: [[TMP117:%.*]] = sub <2 x i32> [[TMP115]], [[TMP107]] -; THR15-NEXT: [[TMP116:%.*]] = add <2 x i32> [[TMP114]], [[TMP117]] +; THR15-NEXT: [[TMP141:%.*]] = sub <2 x i32> [[TMP115]], [[TMP182]] +; THR15-NEXT: [[TMP142:%.*]] = add <2 x i32> [[TMP114]], [[TMP141]] ; THR15-NEXT: [[TMP126:%.*]] = insertelement <2 x i32> [[TMP103]], i32 [[CONV_1]], i32 0 -; THR15-NEXT: [[TMP127:%.*]] = sub <2 x i32> [[TMP126]], [[TMP99]] -; THR15-NEXT: [[TMP128:%.*]] = add <2 x i32> [[TMP102]], [[TMP127]] -; THR15-NEXT: [[TMP106:%.*]] = add <2 x i32> [[TMP116]], [[TMP128]] -; THR15-NEXT: [[TMP121:%.*]] = sub <2 x i32> [[TMP128]], [[TMP116]] -; THR15-NEXT: [[TMP118:%.*]] = extractelement <2 x i32> [[TMP106]], i32 0 +; THR15-NEXT: [[TMP188:%.*]] = sub <2 x i32> [[TMP126]], [[TMP175]] +; THR15-NEXT: [[TMP145:%.*]] = add <2 x i32> [[TMP102]], [[TMP188]] +; THR15-NEXT: [[TMP106:%.*]] = add <2 x i32> [[TMP142]], [[TMP145]] +; THR15-NEXT: [[TMP121:%.*]] = sub <2 x i32> [[TMP145]], [[TMP142]] +; THR15-NEXT: [[TMP190:%.*]] = extractelement <2 x i32> [[TMP106]], i32 0 ; THR15-NEXT: [[TMP119:%.*]] = extractelement <2 x i32> [[TMP106]], i32 1 -; THR15-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP119]], [[TMP118]] -; THR15-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP118]], [[TMP119]] -; THR15-NEXT: [[TMP129:%.*]] = extractelement <2 x i32> [[TMP121]], i32 0 +; THR15-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP119]], [[TMP190]] +; THR15-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP190]], [[TMP119]] +; THR15-NEXT: [[TMP150:%.*]] = extractelement <2 x i32> [[TMP121]], i32 0 ; THR15-NEXT: [[TMP125:%.*]] = extractelement <2 x i32> [[TMP121]], i32 1 -; THR15-NEXT: [[ADD55_4:%.*]] = add i32 [[TMP125]], [[TMP129]] -; THR15-NEXT: [[SUB59_1:%.*]] = sub i32 [[TMP129]], [[TMP125]] +; THR15-NEXT: [[ADD55_2:%.*]] = add i32 [[TMP125]], [[TMP150]] +; THR15-NEXT: [[SUB59_1:%.*]] = sub i32 [[TMP150]], [[TMP125]] ; THR15-NEXT: [[SHR_I54_1:%.*]] = lshr i32 
[[TMP119]], 15 ; THR15-NEXT: [[AND_I55_1:%.*]] = and i32 [[SHR_I54_1]], 65537 ; THR15-NEXT: [[MUL_I56_1:%.*]] = mul i32 [[AND_I55_1]], 65535 @@ -551,7 +565,7 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]] ; THR15-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP63]] ; THR15-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I51]], [[ADD105]] -; THR15-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[ADD46_2]] +; THR15-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP31]] ; THR15-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56_1]], [[SUB104]] ; THR15-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP119]] ; THR15-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]] @@ -559,13 +573,13 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]] ; THR15-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]] ; THR15-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I63]] -; THR15-NEXT: [[ADD55:%.*]] = add i32 [[ADD55_4]], [[ADD56]] -; THR15-NEXT: [[SUB86_1:%.*]] = sub i32 [[ADD56]], [[ADD55_4]] +; THR15-NEXT: [[ADD55:%.*]] = add i32 [[ADD55_2]], [[ADD56]] +; THR15-NEXT: [[SUB86_1:%.*]] = sub i32 [[ADD56]], [[ADD55_2]] ; THR15-NEXT: [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB86_1]] ; THR15-NEXT: [[SUB106_1:%.*]] = sub i32 [[SUB86_1]], [[SUB102_1]] ; THR15-NEXT: [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_2]], [[ADD105_1]] -; THR15-NEXT: [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP64]] -; THR15-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP66]], <2 x i32> [[TMP121]], <2 x i32> +; THR15-NEXT: [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP80]] +; THR15-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP21]], <2 x i32> [[TMP121]], <2 x i32> ; THR15-NEXT: [[TMP132:%.*]] = lshr <2 x i32> [[TMP5]], ; THR15-NEXT: [[TMP133:%.*]] = and <2 x i32> [[TMP132]], ; THR15-NEXT: [[TMP134:%.*]] = mul <2 x i32> [[TMP133]], @@ 
-581,8 +595,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]] ; THR15-NEXT: [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[SUB47]] ; THR15-NEXT: [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD113]] -; THR15-NEXT: [[TMP150:%.*]] = extractelement <2 x i32> [[TMP149]], i32 0 -; THR15-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[TMP150]] +; THR15-NEXT: [[TMP192:%.*]] = extractelement <2 x i32> [[TMP149]], i32 0 +; THR15-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[TMP192]] ; THR15-NEXT: [[TMP151:%.*]] = extractelement <2 x i32> [[TMP149]], i32 1 ; THR15-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[TMP151]] ; THR15-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]] @@ -607,8 +621,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]] ; THR15-NEXT: [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP89]] ; THR15-NEXT: [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_1]] -; THR15-NEXT: [[TMP161:%.*]] = extractelement <2 x i32> [[TMP160]], i32 0 -; THR15-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP161]] +; THR15-NEXT: [[TMP179:%.*]] = extractelement <2 x i32> [[TMP160]], i32 0 +; THR15-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP179]] ; THR15-NEXT: [[TMP162:%.*]] = extractelement <2 x i32> [[TMP160]], i32 1 ; THR15-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP162]] ; THR15-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]] @@ -636,8 +650,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]] ; THR15-NEXT: [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]] ; THR15-NEXT: [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_2]] -; THR15-NEXT: [[TMP175:%.*]] = extractelement <2 x i32> [[TMP174]], i32 0 -; THR15-NEXT: 
[[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP175]] +; THR15-NEXT: [[TMP193:%.*]] = extractelement <2 x i32> [[TMP174]], i32 0 +; THR15-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP193]] ; THR15-NEXT: [[TMP176:%.*]] = extractelement <2 x i32> [[TMP174]], i32 1 ; THR15-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP176]] ; THR15-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I63_3]] diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll index 3fa42047162e4..9c1da08c64b7b 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll @@ -10,10 +10,10 @@ define void @test(ptr %a, i64 %0) { ; CHECK-NEXT: br label %[[BB:.*]] ; CHECK: [[BB]]: ; CHECK-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1 -; CHECK-NEXT: [[ARRAYIDX17_I28_1:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 0, i32 1 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr double, <2 x ptr> [[TMP2]], <2 x i64> [[TMP5]] +; CHECK-NEXT: [[ARRAYIDX17_I28_1:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> [[TMP6]], i32 8, <2 x i1> , <2 x double> poison) ; CHECK-NEXT: [[TMP8:%.*]] = load <2 x double>, ptr [[A]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = load <2 x double>, ptr [[A]], align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll index 2daa3b58e5c3a..98333c7b420cf 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll @@ -5,12 +5,12 @@ define <4 x i32> @test(<2 x i64> %v, ptr %p) { ; CHECK-LABEL: define <4 x i32> @test( ; CHECK-SAME: <2 x i64> [[V:%.*]], ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[V]], <2 x i64> poison, <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x ptr> poison, ptr [[P]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x ptr> [[TMP0]], <2 x ptr> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i16, <2 x ptr> [[TMP1]], <2 x i64> [[V]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i16, <2 x ptr> [[TMP1]], <2 x i64> [[TMP4]] ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> [[TMP2]], i32 2, <2 x i1> , <2 x i16> poison) -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = zext <2 x i16> [[TMP4]] to <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = zext <2 x i16> [[TMP3]] to <2 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> zeroinitializer, <4 x i32> [[TMP6]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP5]] diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll index 54eb564768318..6876ca7fc351e 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll @@ -6,10 +6,11 @@ define void @test() { ; CHECK-LABEL: @test( -; CHECK-NEXT: [[TMP1:%.*]] = call <4 x double> @llvm.experimental.vp.strided.load.v4f64.p0.i64(ptr align 8 @src, i64 16, <4 x i1> , i32 4) -; CHECK-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.experimental.vp.strided.load.v4f64.p0.i64(ptr align 8 getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), i64 16, <4 x i1> 
, i32 4) -; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <4 x double> [[TMP1]], [[TMP2]] -; CHECK-NEXT: store <4 x double> [[TMP3]], ptr @dst, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x double>, ptr @src, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <4 x double> [[TMP2]], [[TMP3]] +; CHECK-NEXT: store <4 x double> [[TMP4]], ptr @dst, align 8 ; CHECK-NEXT: ret void ; %a0 = load double, ptr @src, align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll index af354bb06ad46..4de16a5d57793 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll @@ -10,10 +10,8 @@ define void @_ZSt6uniqueISt15_Deque_iteratorIdRdPdEET_S4_S4_(ptr %__first, ptr n ; CHECK-LABEL: @_ZSt6uniqueISt15_Deque_iteratorIdRdPdEET_S4_S4_( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__FIRST:%.*]], align 8 -; CHECK-NEXT: [[_M_FIRST3_I_I:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", ptr [[__FIRST]], i64 0, i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__LAST:%.*]], align 8 -; CHECK-NEXT: [[_M_FIRST3_I_I83:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", ptr [[__LAST]], i64 0, i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_M_FIRST3_I_I83]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x ptr>, ptr [[__LAST:%.*]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x ptr> [[TMP1]], ptr [[TMP0]], i32 0 ; CHECK-NEXT: br i1 undef, label [[_ZST13ADJACENT_FINDIST15_DEQUE_ITERATORIDRDPDEET_S4_S4__EXIT:%.*]], label [[WHILE_COND_I_PREHEADER:%.*]] ; CHECK: while.cond.i.preheader: ; CHECK-NEXT: br label [[WHILE_COND_I:%.*]] @@ -22,10 +20,8 @@ define 
void @_ZSt6uniqueISt15_Deque_iteratorIdRdPdEET_S4_S4_(ptr %__first, ptr n ; CHECK: while.body.i: ; CHECK-NEXT: br i1 undef, label [[_ZST13ADJACENT_FINDIST15_DEQUE_ITERATORIDRDPDEET_S4_S4__EXIT]], label [[WHILE_COND_I]] ; CHECK: _ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit: -; CHECK-NEXT: [[TMP3:%.*]] = phi ptr [ [[TMP2]], [[ENTRY:%.*]] ], [ [[TMP2]], [[WHILE_COND_I]] ], [ undef, [[WHILE_BODY_I]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi ptr [ [[TMP0]], [[ENTRY]] ], [ [[TMP1]], [[WHILE_COND_I]] ], [ undef, [[WHILE_BODY_I]] ] -; CHECK-NEXT: store ptr [[TMP4]], ptr [[__FIRST]], align 8 -; CHECK-NEXT: store ptr [[TMP3]], ptr [[_M_FIRST3_I_I]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x ptr> [ [[TMP2]], [[ENTRY:%.*]] ], [ [[TMP1]], [[WHILE_COND_I]] ], [ undef, [[WHILE_BODY_I]] ] +; CHECK-NEXT: store <2 x ptr> [[TMP3]], ptr [[__FIRST]], align 8 ; CHECK-NEXT: br i1 undef, label [[IF_THEN_I55:%.*]], label [[WHILE_COND:%.*]] ; CHECK: if.then.i55: ; CHECK-NEXT: br label [[WHILE_COND]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll index b0d9fea43a0e6..d1f93eccc2a91 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll @@ -837,21 +837,18 @@ define i32 @maxi8_mutiple_uses(i32) { ; THRESH-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]] ; THRESH-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP3]], i32 [[TMP4]] ; THRESH-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8 -; THRESH-NEXT: [[TMP8:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8 -; THRESH-NEXT: [[TMP9:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4 -; THRESH-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP7]]) -; THRESH-NEXT: [[TMP11:%.*]] = insertelement <2 
x i32> poison, i32 [[TMP10]], i32 0 -; THRESH-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP9]], i32 1 -; THRESH-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP8]], i32 0 -; THRESH-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP6]], i32 1 -; THRESH-NEXT: [[TMP15:%.*]] = icmp sgt <2 x i32> [[TMP12]], [[TMP14]] -; THRESH-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> [[TMP12]], <2 x i32> [[TMP14]] -; THRESH-NEXT: [[TMP17:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0 -; THRESH-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP16]], i32 1 -; THRESH-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] -; THRESH-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP17]], i32 [[TMP18]] -; THRESH-NEXT: [[TMP19:%.*]] = select i1 [[TMP5]], i32 3, i32 4 -; THRESH-NEXT: store i32 [[TMP19]], ptr @var, align 8 +; THRESH-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP7]]) +; THRESH-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8 +; THRESH-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP8]], i32 0 +; THRESH-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP6]], i32 1 +; THRESH-NEXT: [[TMP12:%.*]] = icmp sgt <2 x i32> [[TMP10]], [[TMP11]] +; THRESH-NEXT: [[TMP13:%.*]] = select <2 x i1> [[TMP12]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]] +; THRESH-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP13]], i32 0 +; THRESH-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1 +; THRESH-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] +; THRESH-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP14]], i32 [[TMP15]] +; THRESH-NEXT: [[TMP16:%.*]] = select i1 [[TMP5]], i32 3, i32 4 +; THRESH-NEXT: store i32 [[TMP16]], ptr @var, align 8 ; THRESH-NEXT: ret i32 [[OP_RDX5]] ; %2 = load i32, ptr @arr, align 16 diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll index 9a41c1dc5de22..4f94784a24dd4 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll @@ -100,21 +100,14 @@ define <4 x float> @PR16739_byref_alt(ptr nocapture readonly dereferenceable(16) define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) %x) { ; CHECK-LABEL: @PR16739_byval( -; CHECK-NEXT: [[T1:%.*]] = load i64, ptr [[X:%.*]], align 16 -; CHECK-NEXT: [[T2:%.*]] = getelementptr inbounds <4 x float>, ptr [[X]], i64 0, i64 2 -; CHECK-NEXT: [[T4:%.*]] = load i64, ptr [[T2]], align 8 -; CHECK-NEXT: [[T5:%.*]] = trunc i64 [[T1]] to i32 -; CHECK-NEXT: [[T6:%.*]] = bitcast i32 [[T5]] to float -; CHECK-NEXT: [[T7:%.*]] = insertelement <4 x float> poison, float [[T6]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[X:%.*]], align 16 +; CHECK-NEXT: [[T1:%.*]] = load i64, ptr [[X]], align 16 ; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32 -; CHECK-NEXT: [[T9:%.*]] = trunc i64 [[T8]] to i32 -; CHECK-NEXT: [[T10:%.*]] = bitcast i32 [[T9]] to float -; CHECK-NEXT: [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[T10]], i32 1 -; CHECK-NEXT: [[T12:%.*]] = trunc i64 [[T4]] to i32 -; CHECK-NEXT: [[T13:%.*]] = bitcast i32 [[T12]] to float -; CHECK-NEXT: [[T14:%.*]] = insertelement <4 x float> [[T11]], float [[T13]], i32 2 -; CHECK-NEXT: [[T15:%.*]] = insertelement <4 x float> [[T14]], float [[T13]], i32 3 -; CHECK-NEXT: ret <4 x float> [[T15]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[T8]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float> +; CHECK-NEXT: ret <4 x float> [[TMP5]] ; %t1 = load i64, 
ptr %x, align 16 %t2 = getelementptr inbounds <4 x float>, ptr %x, i64 0, i64 2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll index bc8e6626e5508..700e3ed9effc4 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll @@ -100,21 +100,14 @@ define <4 x float> @PR16739_byref_alt(ptr nocapture readonly dereferenceable(16) define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) %x) { ; CHECK-LABEL: @PR16739_byval( -; CHECK-NEXT: [[T1:%.*]] = load i64, ptr [[X:%.*]], align 16 -; CHECK-NEXT: [[T2:%.*]] = getelementptr inbounds <4 x float>, ptr [[X]], i64 0, i64 2 -; CHECK-NEXT: [[T4:%.*]] = load i64, ptr [[T2]], align 8 -; CHECK-NEXT: [[T5:%.*]] = trunc i64 [[T1]] to i32 -; CHECK-NEXT: [[T6:%.*]] = bitcast i32 [[T5]] to float -; CHECK-NEXT: [[T7:%.*]] = insertelement <4 x float> undef, float [[T6]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[X:%.*]], align 16 +; CHECK-NEXT: [[T1:%.*]] = load i64, ptr [[X]], align 16 ; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32 -; CHECK-NEXT: [[T9:%.*]] = trunc i64 [[T8]] to i32 -; CHECK-NEXT: [[T10:%.*]] = bitcast i32 [[T9]] to float -; CHECK-NEXT: [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[T10]], i32 1 -; CHECK-NEXT: [[T12:%.*]] = trunc i64 [[T4]] to i32 -; CHECK-NEXT: [[T13:%.*]] = bitcast i32 [[T12]] to float -; CHECK-NEXT: [[T14:%.*]] = insertelement <4 x float> [[T11]], float [[T13]], i32 2 -; CHECK-NEXT: [[T15:%.*]] = insertelement <4 x float> [[T14]], float [[T13]], i32 3 -; CHECK-NEXT: ret <4 x float> [[T15]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[T8]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float> +; CHECK-NEXT: ret <4 
x float> [[TMP5]] ; %t1 = load i64, ptr %x, align 16 %t2 = getelementptr inbounds <4 x float>, ptr %x, i64 0, i64 2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll index 5a28581913b8c..c3122d991da20 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll @@ -204,25 +204,21 @@ define void @lookahead_external_uses(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S, ptr ; CHECK-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 1 ; CHECK-NEXT: [[IDXB2:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i64 2 ; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, ptr [[A]], i64 2 -; CHECK-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, ptr [[B]], i64 1 -; CHECK-NEXT: [[B0:%.*]] = load double, ptr [[B]], align 8 ; CHECK-NEXT: [[C0:%.*]] = load double, ptr [[C:%.*]], align 8 ; CHECK-NEXT: [[D0:%.*]] = load double, ptr [[D:%.*]], align 8 ; CHECK-NEXT: [[B2:%.*]] = load double, ptr [[IDXB2]], align 8 ; CHECK-NEXT: [[A2:%.*]] = load double, ptr [[IDXA2]], align 8 -; CHECK-NEXT: [[B1:%.*]] = load double, ptr [[IDXB1]], align 8 ; CHECK-NEXT: [[A1:%.*]] = load double, ptr [[IDXA1]], align 8 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B2]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[D0]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[B1]], i32 1 -; CHECK-NEXT: 
[[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP8]] -; CHECK-NEXT: store <2 x double> [[TMP9]], ptr [[S:%.*]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP1]], double [[D0]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP7]] +; CHECK-NEXT: store <2 x double> [[TMP8]], ptr [[S:%.*]], align 8 ; CHECK-NEXT: store double [[A1]], ptr [[EXT1:%.*]], align 8 ; CHECK-NEXT: ret void ; @@ -284,24 +280,22 @@ define void @lookahead_limit_users_budget(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S ; CHECK-NEXT: [[IDXB2:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i64 2 ; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, ptr [[A]], i64 2 ; CHECK-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, ptr [[B]], i64 1 -; CHECK-NEXT: [[B0:%.*]] = load double, ptr [[B]], align 8 ; CHECK-NEXT: [[C0:%.*]] = load double, ptr [[C:%.*]], align 8 ; CHECK-NEXT: [[D0:%.*]] = load double, ptr [[D:%.*]], align 8 ; CHECK-NEXT: [[B2:%.*]] = load double, ptr [[IDXB2]], align 8 ; CHECK-NEXT: [[A2:%.*]] = load double, ptr [[IDXA2]], align 8 -; CHECK-NEXT: [[B1:%.*]] = load double, ptr [[IDXB1]], align 8 ; CHECK-NEXT: [[A1:%.*]] = load double, ptr [[IDXA1]], align 8 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0 +; CHECK-NEXT: [[B1:%.*]] = load double, ptr [[IDXB1]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B2]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A2]], i32 1 -; CHECK-NEXT: 
[[TMP6:%.*]] = insertelement <2 x double> poison, double [[D0]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[B1]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP8]] -; CHECK-NEXT: store <2 x double> [[TMP9]], ptr [[S:%.*]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP1]], double [[D0]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP7]] +; CHECK-NEXT: store <2 x double> [[TMP8]], ptr [[S:%.*]], align 8 ; CHECK-NEXT: store double [[A1]], ptr [[EXT1:%.*]], align 8 ; CHECK-NEXT: store double [[A1]], ptr [[EXT2:%.*]], align 8 ; CHECK-NEXT: store double [[A1]], ptr [[EXT3:%.*]], align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll index 5b33c6e889363..19cbce0767c92 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll @@ -180,12 +180,20 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_2( -; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP1:%.*]], i64 0 -; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <4 x ptr> [[TMP3]], <4 x ptr> poison, <4 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr i32, <4 x ptr> [[TMP4]], <4 x i64> -; AVX512VL-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP5]], i32 4, <4 x i1> , <4 x i32> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], -; AVX512VL-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], 
i64 4 +; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 +; AVX512VL-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 +; AVX512VL-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 +; AVX512VL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; AVX512VL-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 +; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 +; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 +; AVX512VL-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], +; AVX512VL-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = getelementptr inbounds i32, ptr %1, i64 1 @@ -533,161 +541,149 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) { ; SSE-LABEL: @gather_load_div( ; SSE-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 -; SSE-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 -; SSE-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 -; SSE-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr 
[[TMP1]], i64 12 -; SSE-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44 -; SSE-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 56 -; SSE-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 -; SSE-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP0:%.*]], i64 16 -; SSE-NEXT: [[TMP19:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0 -; SSE-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP7]], i64 1 -; SSE-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP11]], i64 2 -; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP15]], i64 3 -; SSE-NEXT: [[TMP23:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 -; SSE-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP9]], i64 1 -; SSE-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP13]], i64 2 -; SSE-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[TMP17]], i64 3 -; SSE-NEXT: [[TMP27:%.*]] = fdiv <4 x float> [[TMP22]], [[TMP26]] -; SSE-NEXT: store <4 x float> [[TMP27]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 +; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 +; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 +; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 +; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 +; SSE-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP9:%.*]] = 
getelementptr inbounds i8, ptr [[TMP0:%.*]], i64 16 +; SSE-NEXT: [[TMP10:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP11:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP12:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0 +; SSE-NEXT: [[TMP14:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP15:%.*]] = shufflevector <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x i32> +; SSE-NEXT: [[TMP16:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP17:%.*]] = shufflevector <4 x float> [[TMP15]], <4 x float> [[TMP16]], <4 x i32> +; SSE-NEXT: [[TMP18:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP19:%.*]] = shufflevector <4 x float> [[TMP17]], <4 x float> [[TMP18]], <4 x i32> +; SSE-NEXT: [[TMP20:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> [[TMP12]], <4 x i32> +; SSE-NEXT: [[TMP21:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> [[TMP14]], <4 x i32> +; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP8]], i64 3 +; SSE-NEXT: [[TMP23:%.*]] = fdiv <4 x float> [[TMP19]], [[TMP22]] +; SSE-NEXT: store <4 x float> [[TMP23]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 +; SSE-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 +; SSE-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 ; SSE-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 +; SSE-NEXT: [[TMP30:%.*]] = 
getelementptr inbounds i8, ptr [[TMP1]], i64 120 ; SSE-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 +; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 ; SSE-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 +; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 ; SSE-NEXT: [[TMP35:%.*]] = load float, ptr [[TMP34]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 +; SSE-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 ; SSE-NEXT: [[TMP37:%.*]] = load float, ptr [[TMP36]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 +; SSE-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 ; SSE-NEXT: [[TMP39:%.*]] = load float, ptr [[TMP38]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 -; SSE-NEXT: [[TMP41:%.*]] = load float, ptr [[TMP40]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 -; SSE-NEXT: [[TMP43:%.*]] = load float, ptr [[TMP42]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP44:%.*]] = insertelement <4 x float> poison, float [[TMP29]], i64 0 -; SSE-NEXT: [[TMP45:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP33]], i64 1 -; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP37]], i64 2 -; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP41]], i64 3 -; SSE-NEXT: [[TMP48:%.*]] = insertelement <4 x float> poison, float [[TMP31]], i64 0 -; SSE-NEXT: [[TMP49:%.*]] = insertelement <4 x float> [[TMP48]], float [[TMP35]], i64 1 -; SSE-NEXT: [[TMP50:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP39]], i64 
2 -; SSE-NEXT: [[TMP51:%.*]] = insertelement <4 x float> [[TMP50]], float [[TMP43]], i64 3 -; SSE-NEXT: [[TMP52:%.*]] = fdiv <4 x float> [[TMP47]], [[TMP51]] -; SSE-NEXT: store <4 x float> [[TMP52]], ptr [[TMP18]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP40:%.*]] = insertelement <4 x float> poison, float [[TMP25]], i64 0 +; SSE-NEXT: [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP29]], i64 1 +; SSE-NEXT: [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP33]], i64 2 +; SSE-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[TMP42]], float [[TMP37]], i64 3 +; SSE-NEXT: [[TMP44:%.*]] = insertelement <4 x float> poison, float [[TMP27]], i64 0 +; SSE-NEXT: [[TMP45:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP31]], i64 1 +; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP35]], i64 2 +; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP39]], i64 3 +; SSE-NEXT: [[TMP48:%.*]] = fdiv <4 x float> [[TMP43]], [[TMP47]] +; SSE-NEXT: store <4 x float> [[TMP48]], ptr [[TMP9]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_div( ; AVX-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 -; AVX-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 -; AVX-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 -; AVX-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 -; AVX-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44 -; AVX-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, 
!tbaa [[TBAA0]] -; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 56 -; AVX-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 -; AVX-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 -; AVX-NEXT: [[TMP19:%.*]] = load float, ptr [[TMP18]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 -; AVX-NEXT: [[TMP21:%.*]] = load float, ptr [[TMP20]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 -; AVX-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 -; AVX-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 -; AVX-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 -; AVX-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 -; AVX-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 -; AVX-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 -; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1 -; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i64 2 -; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i64 3 -; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x 
float> [[TMP37]], float [[TMP19]], i64 4 -; AVX-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i64 5 -; AVX-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i64 6 -; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i64 7 -; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i64 0 -; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i64 1 -; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i64 2 -; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i64 3 -; AVX-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i64 4 -; AVX-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i64 5 -; AVX-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i64 6 -; AVX-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7 -; AVX-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]] -; AVX-NEXT: store <8 x float> [[TMP50]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 +; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 +; AVX-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 +; AVX-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 +; AVX-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 +; AVX-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP13]], 
align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 +; AVX-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 +; AVX-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 +; AVX-NEXT: [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 +; AVX-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 +; AVX-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 +; AVX-NEXT: [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> +; AVX-NEXT: [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> +; AVX-NEXT: [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> +; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i64 4 +; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i64 5 +; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float 
[[TMP18]], i64 6 +; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i64 7 +; AVX-NEXT: [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> +; AVX-NEXT: [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> +; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i64 3 +; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i64 4 +; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i64 5 +; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i64 6 +; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i64 7 +; AVX-NEXT: [[TMP46:%.*]] = fdiv <8 x float> [[TMP38]], [[TMP45]] +; AVX-NEXT: store <8 x float> [[TMP46]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_div( ; AVX2-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 -; AVX2-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 -; AVX2-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 -; AVX2-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 -; AVX2-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44 -; AVX2-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 56 -; AVX2-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] -; 
AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 -; AVX2-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 -; AVX2-NEXT: [[TMP19:%.*]] = load float, ptr [[TMP18]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 -; AVX2-NEXT: [[TMP21:%.*]] = load float, ptr [[TMP20]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 -; AVX2-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 -; AVX2-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 -; AVX2-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 -; AVX2-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 -; AVX2-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 -; AVX2-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 -; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1 -; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i64 2 -; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i64 3 -; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP19]], i64 4 -; AVX2-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i64 5 -; AVX2-NEXT: 
[[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i64 6 -; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i64 7 -; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i64 0 -; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i64 1 -; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i64 2 -; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i64 3 -; AVX2-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i64 4 -; AVX2-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i64 5 -; AVX2-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i64 6 -; AVX2-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7 -; AVX2-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]] -; AVX2-NEXT: store <8 x float> [[TMP50]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 +; AVX2-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 +; AVX2-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 +; AVX2-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 +; AVX2-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 +; AVX2-NEXT: 
[[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 +; AVX2-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 +; AVX2-NEXT: [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 +; AVX2-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 +; AVX2-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 +; AVX2-NEXT: [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> +; AVX2-NEXT: [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> +; AVX2-NEXT: [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> +; AVX2-NEXT: [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> +; AVX2-NEXT: [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> +; AVX2-NEXT: [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> +; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i64 4 +; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i64 5 +; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i64 6 +; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float 
[[TMP22]], i64 7 +; AVX2-NEXT: [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> +; AVX2-NEXT: [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> +; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i64 3 +; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i64 4 +; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i64 5 +; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i64 6 +; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i64 7 +; AVX2-NEXT: [[TMP46:%.*]] = fdiv <8 x float> [[TMP38]], [[TMP45]] +; AVX2-NEXT: store <8 x float> [[TMP46]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_div( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll index 09d6c77557efa..9ac4208c63285 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -180,12 +180,20 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_2( -; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP1:%.*]], i64 0 -; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <4 x ptr> [[TMP3]], <4 x ptr> poison, <4 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr i32, <4 x ptr> [[TMP4]], <4 x i64> -; AVX512VL-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP5]], i32 4, <4 x i1> , <4 x i32> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], -; AVX512VL-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 4 +; AVX512VL-NEXT: 
[[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 +; AVX512VL-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 +; AVX512VL-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 +; AVX512VL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; AVX512VL-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 +; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 +; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 +; AVX512VL-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], +; AVX512VL-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = getelementptr inbounds i32, ptr %1, i64 1 @@ -533,161 +541,149 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) { ; SSE-LABEL: @gather_load_div( ; SSE-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 -; SSE-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 -; SSE-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 -; SSE-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 -; SSE-NEXT: 
[[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44 -; SSE-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 56 -; SSE-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 -; SSE-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP0:%.*]], i64 16 -; SSE-NEXT: [[TMP19:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0 -; SSE-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP7]], i64 1 -; SSE-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP11]], i64 2 -; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP15]], i64 3 -; SSE-NEXT: [[TMP23:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 -; SSE-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP9]], i64 1 -; SSE-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP13]], i64 2 -; SSE-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[TMP17]], i64 3 -; SSE-NEXT: [[TMP27:%.*]] = fdiv <4 x float> [[TMP22]], [[TMP26]] -; SSE-NEXT: store <4 x float> [[TMP27]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 +; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 +; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 +; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 +; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 +; SSE-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr 
[[TMP0:%.*]], i64 16 +; SSE-NEXT: [[TMP10:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP11:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP12:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0 +; SSE-NEXT: [[TMP14:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP15:%.*]] = shufflevector <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x i32> +; SSE-NEXT: [[TMP16:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP17:%.*]] = shufflevector <4 x float> [[TMP15]], <4 x float> [[TMP16]], <4 x i32> +; SSE-NEXT: [[TMP18:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP19:%.*]] = shufflevector <4 x float> [[TMP17]], <4 x float> [[TMP18]], <4 x i32> +; SSE-NEXT: [[TMP20:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> [[TMP12]], <4 x i32> +; SSE-NEXT: [[TMP21:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> [[TMP14]], <4 x i32> +; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP8]], i64 3 +; SSE-NEXT: [[TMP23:%.*]] = fdiv <4 x float> [[TMP19]], [[TMP22]] +; SSE-NEXT: store <4 x float> [[TMP23]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 +; SSE-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 +; SSE-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 ; SSE-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 +; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr 
[[TMP1]], i64 120 ; SSE-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 +; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 ; SSE-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 +; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 ; SSE-NEXT: [[TMP35:%.*]] = load float, ptr [[TMP34]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 +; SSE-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 ; SSE-NEXT: [[TMP37:%.*]] = load float, ptr [[TMP36]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 +; SSE-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 ; SSE-NEXT: [[TMP39:%.*]] = load float, ptr [[TMP38]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 -; SSE-NEXT: [[TMP41:%.*]] = load float, ptr [[TMP40]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 -; SSE-NEXT: [[TMP43:%.*]] = load float, ptr [[TMP42]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP44:%.*]] = insertelement <4 x float> poison, float [[TMP29]], i64 0 -; SSE-NEXT: [[TMP45:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP33]], i64 1 -; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP37]], i64 2 -; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP41]], i64 3 -; SSE-NEXT: [[TMP48:%.*]] = insertelement <4 x float> poison, float [[TMP31]], i64 0 -; SSE-NEXT: [[TMP49:%.*]] = insertelement <4 x float> [[TMP48]], float [[TMP35]], i64 1 -; SSE-NEXT: [[TMP50:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP39]], i64 2 -; SSE-NEXT: [[TMP51:%.*]] = 
insertelement <4 x float> [[TMP50]], float [[TMP43]], i64 3 -; SSE-NEXT: [[TMP52:%.*]] = fdiv <4 x float> [[TMP47]], [[TMP51]] -; SSE-NEXT: store <4 x float> [[TMP52]], ptr [[TMP18]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP40:%.*]] = insertelement <4 x float> poison, float [[TMP25]], i64 0 +; SSE-NEXT: [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP29]], i64 1 +; SSE-NEXT: [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP33]], i64 2 +; SSE-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[TMP42]], float [[TMP37]], i64 3 +; SSE-NEXT: [[TMP44:%.*]] = insertelement <4 x float> poison, float [[TMP27]], i64 0 +; SSE-NEXT: [[TMP45:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP31]], i64 1 +; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP35]], i64 2 +; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP39]], i64 3 +; SSE-NEXT: [[TMP48:%.*]] = fdiv <4 x float> [[TMP43]], [[TMP47]] +; SSE-NEXT: store <4 x float> [[TMP48]], ptr [[TMP9]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_div( ; AVX-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 -; AVX-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 -; AVX-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 -; AVX-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 -; AVX-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44 -; AVX-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: 
[[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 56 -; AVX-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 -; AVX-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 -; AVX-NEXT: [[TMP19:%.*]] = load float, ptr [[TMP18]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 -; AVX-NEXT: [[TMP21:%.*]] = load float, ptr [[TMP20]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 -; AVX-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 -; AVX-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 -; AVX-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 -; AVX-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 -; AVX-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 -; AVX-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 -; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1 -; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i64 2 -; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i64 3 -; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float 
[[TMP19]], i64 4 -; AVX-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i64 5 -; AVX-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i64 6 -; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i64 7 -; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i64 0 -; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i64 1 -; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i64 2 -; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i64 3 -; AVX-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i64 4 -; AVX-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i64 5 -; AVX-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i64 6 -; AVX-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7 -; AVX-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]] -; AVX-NEXT: store <8 x float> [[TMP50]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 +; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 +; AVX-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 +; AVX-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 +; AVX-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 +; AVX-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[TBAA0]] 
+; AVX-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 +; AVX-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 +; AVX-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 +; AVX-NEXT: [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 +; AVX-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 +; AVX-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 +; AVX-NEXT: [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> +; AVX-NEXT: [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> +; AVX-NEXT: [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> +; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i64 4 +; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i64 5 +; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i64 6 +; AVX-NEXT: 
[[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i64 7 +; AVX-NEXT: [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> +; AVX-NEXT: [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> +; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i64 3 +; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i64 4 +; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i64 5 +; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i64 6 +; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i64 7 +; AVX-NEXT: [[TMP46:%.*]] = fdiv <8 x float> [[TMP38]], [[TMP45]] +; AVX-NEXT: store <8 x float> [[TMP46]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_div( ; AVX2-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 -; AVX2-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 -; AVX2-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 -; AVX2-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 -; AVX2-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44 -; AVX2-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 56 -; AVX2-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP16:%.*]] = 
getelementptr inbounds i8, ptr [[TMP1]], i64 176 -; AVX2-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 -; AVX2-NEXT: [[TMP19:%.*]] = load float, ptr [[TMP18]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 -; AVX2-NEXT: [[TMP21:%.*]] = load float, ptr [[TMP20]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 -; AVX2-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 -; AVX2-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 -; AVX2-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 -; AVX2-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 -; AVX2-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 -; AVX2-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 -; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1 -; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i64 2 -; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i64 3 -; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP19]], i64 4 -; AVX2-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i64 5 -; AVX2-NEXT: [[TMP40:%.*]] = insertelement <8 x 
float> [[TMP39]], float [[TMP27]], i64 6 -; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i64 7 -; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i64 0 -; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i64 1 -; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i64 2 -; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i64 3 -; AVX2-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i64 4 -; AVX2-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i64 5 -; AVX2-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i64 6 -; AVX2-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7 -; AVX2-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]] -; AVX2-NEXT: store <8 x float> [[TMP50]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 +; AVX2-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 +; AVX2-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 +; AVX2-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 +; AVX2-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 +; AVX2-NEXT: [[TMP16:%.*]] = load float, ptr 
[[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 +; AVX2-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 +; AVX2-NEXT: [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 +; AVX2-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 +; AVX2-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 +; AVX2-NEXT: [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> +; AVX2-NEXT: [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> +; AVX2-NEXT: [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> +; AVX2-NEXT: [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> +; AVX2-NEXT: [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> +; AVX2-NEXT: [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> +; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i64 4 +; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i64 5 +; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i64 6 +; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i64 7 +; AVX2-NEXT: 
[[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> +; AVX2-NEXT: [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> +; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i64 3 +; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i64 4 +; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i64 5 +; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i64 6 +; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i64 7 +; AVX2-NEXT: [[TMP46:%.*]] = fdiv <8 x float> [[TMP38]], [[TMP45]] +; AVX2-NEXT: store <8 x float> [[TMP46]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_div( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll index 249b51592760c..92a4095c7c57a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll @@ -71,120 +71,66 @@ define { i64, i64 } @compute_min(ptr nocapture noundef nonnull readonly align 2 ; ; AVX-LABEL: @compute_min( ; AVX-NEXT: entry: -; AVX-NEXT: [[TMP0:%.*]] = load i16, ptr [[Y:%.*]], align 2 -; AVX-NEXT: [[TMP1:%.*]] = load i16, ptr [[X:%.*]], align 2 -; AVX-NEXT: [[TMP2:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP0]], i16 [[TMP1]]) -; AVX-NEXT: [[ARRAYIDX_I_I_1:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 1 -; AVX-NEXT: [[ARRAYIDX_I_I10_1:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 1 -; AVX-NEXT: [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_1]], align 2 -; AVX-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_I_I_1]], align 2 -; AVX-NEXT: [[TMP5:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP3]], i16 [[TMP4]]) -; AVX-NEXT: [[ARRAYIDX_I_I_2:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], 
i64 0, i64 2 -; AVX-NEXT: [[ARRAYIDX_I_I10_2:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 2 -; AVX-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_2]], align 2 -; AVX-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_I_I_2]], align 2 -; AVX-NEXT: [[TMP8:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP6]], i16 [[TMP7]]) -; AVX-NEXT: [[ARRAYIDX_I_I_3:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 3 -; AVX-NEXT: [[ARRAYIDX_I_I10_3:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 3 -; AVX-NEXT: [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_3]], align 2 -; AVX-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_I_I_3]], align 2 -; AVX-NEXT: [[TMP11:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP9]], i16 [[TMP10]]) -; AVX-NEXT: [[ARRAYIDX_I_I_4:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 4 -; AVX-NEXT: [[ARRAYIDX_I_I10_4:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 4 -; AVX-NEXT: [[TMP12:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_4]], align 2 -; AVX-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX_I_I_4]], align 2 -; AVX-NEXT: [[TMP14:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP12]], i16 [[TMP13]]) -; AVX-NEXT: [[ARRAYIDX_I_I_5:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 5 -; AVX-NEXT: [[ARRAYIDX_I_I10_5:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 5 -; AVX-NEXT: [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_5]], align 2 -; AVX-NEXT: [[TMP16:%.*]] = load i16, ptr [[ARRAYIDX_I_I_5]], align 2 -; AVX-NEXT: [[TMP17:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP15]], i16 [[TMP16]]) -; AVX-NEXT: [[ARRAYIDX_I_I_6:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 6 -; AVX-NEXT: [[ARRAYIDX_I_I10_6:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 6 -; AVX-NEXT: [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_6]], align 2 -; AVX-NEXT: [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX_I_I_6]], align 2 -; AVX-NEXT: [[TMP20:%.*]] = tail call i16 
@llvm.smin.i16(i16 [[TMP18]], i16 [[TMP19]]) -; AVX-NEXT: [[ARRAYIDX_I_I_7:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 7 -; AVX-NEXT: [[ARRAYIDX_I_I10_7:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 7 -; AVX-NEXT: [[TMP21:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_7]], align 2 -; AVX-NEXT: [[TMP22:%.*]] = load i16, ptr [[ARRAYIDX_I_I_7]], align 2 -; AVX-NEXT: [[TMP23:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP21]], i16 [[TMP22]]) -; AVX-NEXT: [[RETVAL_SROA_4_0_INSERT_EXT:%.*]] = zext i16 [[TMP11]] to i64 -; AVX-NEXT: [[RETVAL_SROA_4_0_INSERT_SHIFT:%.*]] = shl nuw i64 [[RETVAL_SROA_4_0_INSERT_EXT]], 48 -; AVX-NEXT: [[RETVAL_SROA_3_0_INSERT_EXT:%.*]] = zext i16 [[TMP8]] to i64 -; AVX-NEXT: [[RETVAL_SROA_3_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_3_0_INSERT_EXT]], 32 -; AVX-NEXT: [[RETVAL_SROA_3_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_4_0_INSERT_SHIFT]], [[RETVAL_SROA_3_0_INSERT_SHIFT]] -; AVX-NEXT: [[RETVAL_SROA_2_0_INSERT_EXT:%.*]] = zext i16 [[TMP5]] to i64 -; AVX-NEXT: [[RETVAL_SROA_2_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_2_0_INSERT_EXT]], 16 -; AVX-NEXT: [[RETVAL_SROA_2_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_3_0_INSERT_INSERT]], [[RETVAL_SROA_2_0_INSERT_SHIFT]] -; AVX-NEXT: [[RETVAL_SROA_0_0_INSERT_EXT:%.*]] = zext i16 [[TMP2]] to i64 -; AVX-NEXT: [[RETVAL_SROA_0_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_2_0_INSERT_INSERT]], [[RETVAL_SROA_0_0_INSERT_EXT]] -; AVX-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[RETVAL_SROA_0_0_INSERT_INSERT]], 0 -; AVX-NEXT: [[RETVAL_SROA_9_8_INSERT_EXT:%.*]] = zext i16 [[TMP23]] to i64 -; AVX-NEXT: [[RETVAL_SROA_9_8_INSERT_SHIFT:%.*]] = shl nuw i64 [[RETVAL_SROA_9_8_INSERT_EXT]], 48 -; AVX-NEXT: [[RETVAL_SROA_8_8_INSERT_EXT:%.*]] = zext i16 [[TMP20]] to i64 -; AVX-NEXT: [[RETVAL_SROA_8_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_8_8_INSERT_EXT]], 32 -; AVX-NEXT: [[RETVAL_SROA_8_8_INSERT_INSERT:%.*]] = or i64 
[[RETVAL_SROA_9_8_INSERT_SHIFT]], [[RETVAL_SROA_8_8_INSERT_SHIFT]] -; AVX-NEXT: [[RETVAL_SROA_7_8_INSERT_EXT:%.*]] = zext i16 [[TMP17]] to i64 -; AVX-NEXT: [[RETVAL_SROA_7_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_7_8_INSERT_EXT]], 16 -; AVX-NEXT: [[RETVAL_SROA_7_8_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_8_8_INSERT_INSERT]], [[RETVAL_SROA_7_8_INSERT_SHIFT]] -; AVX-NEXT: [[RETVAL_SROA_5_8_INSERT_EXT:%.*]] = zext i16 [[TMP14]] to i64 -; AVX-NEXT: [[RETVAL_SROA_5_8_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_7_8_INSERT_INSERT]], [[RETVAL_SROA_5_8_INSERT_EXT]] -; AVX-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[RETVAL_SROA_5_8_INSERT_INSERT]], 1 +; AVX-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[Y:%.*]], align 2 +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[X:%.*]], align 2 +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; AVX-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP2]], <2 x i16> [[TMP3]]) +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> +; AVX-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; AVX-NEXT: [[TMP7:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP5]], <2 x i16> [[TMP6]]) +; AVX-NEXT: [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> +; AVX-NEXT: [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; AVX-NEXT: [[TMP10:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP8]], <2 x i16> [[TMP9]]) +; AVX-NEXT: [[TMP11:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> +; AVX-NEXT: [[TMP12:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; AVX-NEXT: [[TMP13:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP11]], <2 x i16> [[TMP12]]) +; AVX-NEXT: [[TMP14:%.*]] = zext <2 
x i16> [[TMP13]] to <2 x i64> +; AVX-NEXT: [[TMP15:%.*]] = shl nuw <2 x i64> [[TMP14]], +; AVX-NEXT: [[TMP16:%.*]] = zext <2 x i16> [[TMP10]] to <2 x i64> +; AVX-NEXT: [[TMP17:%.*]] = shl nuw <2 x i64> [[TMP16]], +; AVX-NEXT: [[TMP18:%.*]] = or <2 x i64> [[TMP15]], [[TMP17]] +; AVX-NEXT: [[TMP19:%.*]] = zext <2 x i16> [[TMP7]] to <2 x i64> +; AVX-NEXT: [[TMP20:%.*]] = shl nuw nsw <2 x i64> [[TMP19]], +; AVX-NEXT: [[TMP21:%.*]] = or <2 x i64> [[TMP18]], [[TMP20]] +; AVX-NEXT: [[TMP22:%.*]] = zext <2 x i16> [[TMP4]] to <2 x i64> +; AVX-NEXT: [[TMP23:%.*]] = or <2 x i64> [[TMP21]], [[TMP22]] +; AVX-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP23]], i32 0 +; AVX-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP24]], 0 +; AVX-NEXT: [[TMP25:%.*]] = extractelement <2 x i64> [[TMP23]], i32 1 +; AVX-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP25]], 1 ; AVX-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]] ; ; AVX2-LABEL: @compute_min( ; AVX2-NEXT: entry: -; AVX2-NEXT: [[TMP0:%.*]] = load i16, ptr [[Y:%.*]], align 2 -; AVX2-NEXT: [[TMP1:%.*]] = load i16, ptr [[X:%.*]], align 2 -; AVX2-NEXT: [[TMP2:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP0]], i16 [[TMP1]]) -; AVX2-NEXT: [[ARRAYIDX_I_I_1:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 1 -; AVX2-NEXT: [[ARRAYIDX_I_I10_1:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 1 -; AVX2-NEXT: [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_1]], align 2 -; AVX2-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_I_I_1]], align 2 -; AVX2-NEXT: [[TMP5:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP3]], i16 [[TMP4]]) -; AVX2-NEXT: [[ARRAYIDX_I_I_2:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 2 -; AVX2-NEXT: [[ARRAYIDX_I_I10_2:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 2 -; AVX2-NEXT: [[ARRAYIDX_I_I_4:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 4 -; AVX2-NEXT: [[ARRAYIDX_I_I10_4:%.*]] = 
getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 4 -; AVX2-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_4]], align 2 -; AVX2-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_I_I_4]], align 2 -; AVX2-NEXT: [[TMP8:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP6]], i16 [[TMP7]]) -; AVX2-NEXT: [[ARRAYIDX_I_I_5:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 5 -; AVX2-NEXT: [[ARRAYIDX_I_I10_5:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 5 -; AVX2-NEXT: [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_5]], align 2 -; AVX2-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_I_I_5]], align 2 -; AVX2-NEXT: [[TMP11:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP9]], i16 [[TMP10]]) -; AVX2-NEXT: [[ARRAYIDX_I_I_6:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 6 -; AVX2-NEXT: [[ARRAYIDX_I_I10_6:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 6 -; AVX2-NEXT: [[TMP12:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_I_I10_2]], align 2 -; AVX2-NEXT: [[TMP13:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_I_I_2]], align 2 -; AVX2-NEXT: [[TMP14:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP12]], <2 x i16> [[TMP13]]) -; AVX2-NEXT: [[TMP15:%.*]] = zext <2 x i16> [[TMP14]] to <2 x i64> -; AVX2-NEXT: [[TMP16:%.*]] = shl nuw <2 x i64> [[TMP15]], -; AVX2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP16]], i32 0 -; AVX2-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP16]], i32 1 -; AVX2-NEXT: [[RETVAL_SROA_3_0_INSERT_INSERT:%.*]] = or i64 [[TMP18]], [[TMP17]] -; AVX2-NEXT: [[RETVAL_SROA_2_0_INSERT_EXT:%.*]] = zext i16 [[TMP5]] to i64 -; AVX2-NEXT: [[RETVAL_SROA_2_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_2_0_INSERT_EXT]], 16 -; AVX2-NEXT: [[RETVAL_SROA_2_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_3_0_INSERT_INSERT]], [[RETVAL_SROA_2_0_INSERT_SHIFT]] -; AVX2-NEXT: [[RETVAL_SROA_0_0_INSERT_EXT:%.*]] = zext i16 [[TMP2]] to i64 -; AVX2-NEXT: [[RETVAL_SROA_0_0_INSERT_INSERT:%.*]] = or i64 
[[RETVAL_SROA_2_0_INSERT_INSERT]], [[RETVAL_SROA_0_0_INSERT_EXT]] -; AVX2-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[RETVAL_SROA_0_0_INSERT_INSERT]], 0 -; AVX2-NEXT: [[TMP19:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_I_I10_6]], align 2 -; AVX2-NEXT: [[TMP20:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_I_I_6]], align 2 -; AVX2-NEXT: [[TMP21:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP19]], <2 x i16> [[TMP20]]) -; AVX2-NEXT: [[TMP22:%.*]] = zext <2 x i16> [[TMP21]] to <2 x i64> -; AVX2-NEXT: [[TMP23:%.*]] = shl nuw <2 x i64> [[TMP22]], +; AVX2-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[Y:%.*]], align 2 +; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[X:%.*]], align 2 +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; AVX2-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP2]], <2 x i16> [[TMP3]]) +; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> +; AVX2-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; AVX2-NEXT: [[TMP7:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP5]], <2 x i16> [[TMP6]]) +; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> +; AVX2-NEXT: [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; AVX2-NEXT: [[TMP10:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP8]], <2 x i16> [[TMP9]]) +; AVX2-NEXT: [[TMP11:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> +; AVX2-NEXT: [[TMP12:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; AVX2-NEXT: [[TMP13:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP11]], <2 x i16> [[TMP12]]) +; AVX2-NEXT: [[TMP14:%.*]] = zext <2 x i16> [[TMP13]] to <2 x i64> +; AVX2-NEXT: [[TMP15:%.*]] = shl nuw <2 x i64> [[TMP14]], +; AVX2-NEXT: [[TMP16:%.*]] = 
zext <2 x i16> [[TMP10]] to <2 x i64> +; AVX2-NEXT: [[TMP17:%.*]] = shl nuw <2 x i64> [[TMP16]], +; AVX2-NEXT: [[TMP18:%.*]] = or <2 x i64> [[TMP15]], [[TMP17]] +; AVX2-NEXT: [[TMP19:%.*]] = zext <2 x i16> [[TMP7]] to <2 x i64> +; AVX2-NEXT: [[TMP20:%.*]] = shl nuw nsw <2 x i64> [[TMP19]], +; AVX2-NEXT: [[TMP21:%.*]] = or <2 x i64> [[TMP18]], [[TMP20]] +; AVX2-NEXT: [[TMP22:%.*]] = zext <2 x i16> [[TMP4]] to <2 x i64> +; AVX2-NEXT: [[TMP23:%.*]] = or <2 x i64> [[TMP21]], [[TMP22]] ; AVX2-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP23]], i32 0 +; AVX2-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP24]], 0 ; AVX2-NEXT: [[TMP25:%.*]] = extractelement <2 x i64> [[TMP23]], i32 1 -; AVX2-NEXT: [[RETVAL_SROA_8_8_INSERT_INSERT:%.*]] = or i64 [[TMP25]], [[TMP24]] -; AVX2-NEXT: [[RETVAL_SROA_7_8_INSERT_EXT:%.*]] = zext i16 [[TMP11]] to i64 -; AVX2-NEXT: [[RETVAL_SROA_7_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_7_8_INSERT_EXT]], 16 -; AVX2-NEXT: [[RETVAL_SROA_7_8_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_8_8_INSERT_INSERT]], [[RETVAL_SROA_7_8_INSERT_SHIFT]] -; AVX2-NEXT: [[RETVAL_SROA_5_8_INSERT_EXT:%.*]] = zext i16 [[TMP8]] to i64 -; AVX2-NEXT: [[RETVAL_SROA_5_8_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_7_8_INSERT_INSERT]], [[RETVAL_SROA_5_8_INSERT_EXT]] -; AVX2-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[RETVAL_SROA_5_8_INSERT_INSERT]], 1 +; AVX2-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP25]], 1 ; AVX2-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll index 8b2b15283601a..c0835fe56f727 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll @@ -25,10 +25,10 @@ define i32 @test(ptr nocapture readonly %p) { ; CHECK-NEXT: br label 
[[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr [[P:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]]) -; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP3]], [[SUM]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = mul <8 x i32> [[TMP0]], +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP1]]) +; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP2]], [[SUM]] ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: ret i32 [[OP_RDX]] @@ -97,11 +97,11 @@ define i32 @test2(ptr nocapture readonly %p, ptr nocapture readonly %q) { ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr [[P:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr [[Q:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = mul <8 x i32> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]]) -; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP5]], [[SUM]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr [[Q:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i32> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]]) +; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP3]], [[SUM]] ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: ret i32 [[OP_RDX]] @@ -186,12 +186,12 @@ define i32 @test3(ptr nocapture readonly %p, ptr nocapture readonly %q) { ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: 
[[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr [[P:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr [[Q:%.*]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = mul <8 x i32> [[TMP1]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]]) -; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP5]], [[SUM]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr [[Q:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = mul <8 x i32> [[TMP0]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP3]]) +; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP4]], [[SUM]] ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: ret i32 [[OP_RDX]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll index 83457cc4966f7..729d5fd5546dc 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll @@ -10,19 +10,39 @@ declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32 immarg define void @test(ptr nocapture readonly %arg, ptr nocapture readonly %arg1, ptr nocapture %arg2) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ARG:%.*]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, <8 x ptr> [[SHUFFLE]], <8 x i64> +; CHECK-NEXT: [[GEP1_0:%.*]] = getelementptr inbounds double, ptr 
[[ARG:%.*]], i64 1 +; CHECK-NEXT: [[LD1_0:%.*]] = load double, ptr [[GEP1_0]], align 8 ; CHECK-NEXT: [[GEP2_0:%.*]] = getelementptr inbounds double, ptr [[ARG1:%.*]], i64 16 -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> [[TMP1]], i32 8, <8 x i1> , <8 x double> poison) -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x double>, ptr [[GEP2_0]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <8 x double> [[TMP4]], [[TMP2]] -; CHECK-NEXT: [[TMP7:%.*]] = load <8 x double>, ptr [[ARG1]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <8 x double> [[TMP7]], [[TMP2]] -; CHECK-NEXT: [[TMP9:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP8]]) -; CHECK-NEXT: [[TMP10:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP5]]) -; CHECK-NEXT: [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP9]], i64 0 -; CHECK-NEXT: [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP10]], i64 1 +; CHECK-NEXT: [[GEP1_1:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 3 +; CHECK-NEXT: [[LD1_1:%.*]] = load double, ptr [[GEP1_1]], align 8 +; CHECK-NEXT: [[GEP1_2:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 5 +; CHECK-NEXT: [[LD1_2:%.*]] = load double, ptr [[GEP1_2]], align 8 +; CHECK-NEXT: [[GEP1_3:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 7 +; CHECK-NEXT: [[LD1_3:%.*]] = load double, ptr [[GEP1_3]], align 8 +; CHECK-NEXT: [[GEP1_4:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 9 +; CHECK-NEXT: [[LD1_4:%.*]] = load double, ptr [[GEP1_4]], align 8 +; CHECK-NEXT: [[GEP1_5:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 11 +; CHECK-NEXT: [[LD1_5:%.*]] = load double, ptr [[GEP1_5]], align 8 +; CHECK-NEXT: [[GEP1_6:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 13 +; CHECK-NEXT: [[LD1_6:%.*]] = load double, ptr [[GEP1_6]], align 8 +; CHECK-NEXT: [[GEP1_7:%.*]] = getelementptr inbounds double, ptr 
[[ARG]], i64 15 +; CHECK-NEXT: [[LD1_7:%.*]] = load double, ptr [[GEP1_7]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x double>, ptr [[ARG1]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x double> poison, double [[LD1_0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x double> [[TMP1]], double [[LD1_1]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x double> [[TMP2]], double [[LD1_2]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x double> [[TMP3]], double [[LD1_3]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x double> [[TMP4]], double [[LD1_4]], i32 4 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x double> [[TMP5]], double [[LD1_5]], i32 5 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x double> [[TMP6]], double [[LD1_6]], i32 6 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> [[TMP7]], double [[LD1_7]], i32 7 +; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <8 x double> [[TMP0]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP9]]) +; CHECK-NEXT: [[TMP11:%.*]] = load <8 x double>, ptr [[GEP2_0]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <8 x double> [[TMP11]], [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP12]]) +; CHECK-NEXT: [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP10]], i64 0 +; CHECK-NEXT: [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP13]], i64 1 ; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds double, ptr [[ARG2:%.*]], <2 x i64> ; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[I143]], <2 x ptr> [[P]], i32 8, <2 x i1> ) ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark-masked-loads-consecutive-loads-same-ptr.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark-masked-loads-consecutive-loads-same-ptr.ll index 40dcc79f79ffc..09a5ace101e64 100644 --- 
a/llvm/test/Transforms/SLPVectorizer/X86/remark-masked-loads-consecutive-loads-same-ptr.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark-masked-loads-consecutive-loads-same-ptr.ll @@ -8,19 +8,23 @@ ; YAML-NEXT: Function: test ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'Stores SLP vectorized with cost ' -; YAML-NEXT: - Cost: '-5' +; YAML-NEXT: - Cost: '-7' ; YAML-NEXT: - String: ' and with tree size ' -; YAML-NEXT: - TreeSize: '7' +; YAML-NEXT: - TreeSize: '5' define void @test(ptr noalias %p, ptr noalias %p1) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x ptr> poison, ptr [[P:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <4 x ptr> [[TMP1]], <4 x i64> -; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP2]], i32 4, <4 x i1> , <4 x i32> poison) -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[P]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[I:%.*]] = load i32, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr i32, ptr [[P]], i64 32 +; CHECK-NEXT: [[I2:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr i32, ptr [[P]], i64 33 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX11]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[I]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[I2]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP3]], <2 x i32> [[TMP0]], i64 2) +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP1]] ; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[P1:%.*]], align 4 ; CHECK-NEXT: ret void ; diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll index 26c4d55436d22..7f5d803391343 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll @@ -24,7 +24,7 @@ define i32 @test(ptr noalias %p, ptr noalias %addr) { ; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost ' ; YAML-NEXT: - Cost: '-1' ; YAML-NEXT: - String: ' and with tree size ' - ; YAML-NEXT: - TreeSize: '7' + ; YAML-NEXT: - TreeSize: '4' entry: %off0.1 = getelementptr inbounds i32, ptr %addr, i32 1 %idx0 = load i32, ptr %off0.1, align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-phi-operand.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-phi-operand.ll index 787bd39759dc7..228967e63d1ef 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-phi-operand.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-phi-operand.ll @@ -9,11 +9,11 @@ define ptr @test() { ; CHECK-NEXT: [[TMP1:%.*]] = fpext <2 x float> [[TMP0]] to <2 x double> ; CHECK-NEXT: br label [[BODY:%.*]] ; CHECK: body: -; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x double> [ [[TMP5:%.*]], [[BODY]] ], [ [[TMP1]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x double> [ [[TMP6:%.*]], [[BODY]] ], [ [[TMP1]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i16>, ptr null, align 2 ; CHECK-NEXT: [[TMP4:%.*]] = uitofp <2 x i16> [[TMP3]] to <2 x double> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP5]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> zeroinitializer, <2 x double> [[SHUFFLE]], <2 x double> [[TMP2]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP6]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> zeroinitializer, <2 x double> [[TMP5]], 
<2 x double> [[TMP2]]) ; CHECK-NEXT: br label [[BODY]] ; entry: @@ -54,8 +54,8 @@ define void @test1(ptr %agg.result, ptr %this) { ; CHECK: return: ; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x float> [ [[TMP1]], [[IF_END]] ], [ , [[LOR_LHS_FALSE]] ], [ , [[ENTRY:%.*]] ] ; CHECK-NEXT: [[C_I_I_I:%.*]] = getelementptr inbounds float, ptr [[AGG_RESULT:%.*]], i32 2 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: store <2 x float> [[SHUFFLE]], ptr [[C_I_I_I]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: store <2 x float> [[TMP3]], ptr [[C_I_I_I]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll index cfbbe14186b50..8786e1a92a326 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll @@ -5,13 +5,20 @@ define void @test() { ; CHECK-LABEL: define void @test( ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> zeroinitializer, <4 x i64> ), i32 4, <4 x i1> , <4 x i32> poison) +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr null, align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = 
mul <4 x i32> [[TMP2]], [[TMP0]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP6]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP0]] ; CHECK-NEXT: [[TMP4:%.*]] = ashr <4 x i32> [[TMP3]], zeroinitializer -; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16 ; CHECK-NEXT: ret void ; entry: @@ -57,15 +64,22 @@ define void @test1() { ; CHECK-LABEL: define void @test1( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 +; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> zeroinitializer, <4 x i64> ), i32 4, <4 x i1> , <4 x i32> poison) +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr null, align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], [[TMP0]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i32 2 +; 
CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP0]] ; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i32> [[TMP3]] to <4 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[TMP4]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = trunc <4 x i64> [[TMP5]] to <4 x i32> -; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16 ; CHECK-NEXT: ret void ; entry: @@ -111,12 +125,19 @@ define void @test_div() { ; CHECK-LABEL: define void @test_div( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> zeroinitializer, <4 x i64> ), i32 4, <4 x i1> , <4 x i32> poison) +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr null, align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], [[TMP0]] -; CHECK-NEXT: [[TMP6:%.*]] = udiv <4 x i32> [[TMP3]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP8]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP7]], i32 2 +; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i32> [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = udiv <4 x i32> 
[[TMP2]], +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16 ; CHECK-NEXT: ret void ; @@ -163,12 +184,19 @@ define void @test_rem() { ; CHECK-LABEL: define void @test_rem( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> zeroinitializer, <4 x i64> ), i32 4, <4 x i1> , <4 x i32> poison) +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr null, align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], [[TMP0]] -; CHECK-NEXT: [[TMP6:%.*]] = urem <4 x i32> [[TMP3]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP8]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP7]], i32 2 +; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i32> [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = urem <4 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll index 30f328293cdaa..c114c5dee78e9 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll @@ -6,18 +6,20 @@ target triple = "x86_64-unknown-linux-gnu" define void @"foo"(ptr addrspace(1) %0, ptr addrspace(1) %1) #0 { ; CHECK-LABEL: @foo( -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) [[TMP0:%.*]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x ptr addrspace(1)> [[TMP3]], <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, <4 x ptr addrspace(1)> [[TMP4]], <4 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0]], i64 8 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1:%.*]], i64 8 -; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p1(<4 x ptr addrspace(1)> [[TMP5]], i32 4, <4 x i1> , <4 x float> poison) -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = load <8 x float>, ptr addrspace(1) [[TMP7]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = fmul <8 x float> [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = fadd <8 x float> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> poison, <8 x i32> -; CHECK-NEXT: store <8 x float> [[TMP13]], ptr addrspace(1) [[TMP6]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0:%.*]], i64 8 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1:%.*]], i64 8 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0]], i64 24 +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP3]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, 
ptr addrspace(1) [[TMP5]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = load <8 x float>, ptr addrspace(1) [[TMP4]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP6]], i64 0) +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP10]], <2 x float> [[TMP8]], i64 2) +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP11]], <4 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = fmul <8 x float> [[TMP12]], [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = fadd <8 x float> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <8 x float> [[TMP14]], <8 x float> poison, <8 x i32> +; CHECK-NEXT: store <8 x float> [[TMP15]], ptr addrspace(1) [[TMP3]], align 4 ; CHECK-NEXT: ret void ; %3 = getelementptr inbounds i8, ptr addrspace(1) %0, i64 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll index f0e734d8c5aef..2658317e97927 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll @@ -9,8 +9,8 @@ define void @foo (ptr %A, ptr %B, ptr %Result) { ; CHECK-NEXT: [[TMP0:%.*]] = add i64 256, 0 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP18:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP20:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP20:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP19:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX:%.*]], ptr [[A:%.*]], i64 [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], ptr 
[[B:%.*]], i64 [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4 @@ -26,13 +26,13 @@ define void @foo (ptr %A, ptr %B, ptr %Result) { ; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <2 x float> [[TMP14]], <2 x float> poison, <2 x i32> ; CHECK-NEXT: [[TMP16:%.*]] = fsub <2 x float> [[TMP11]], [[TMP15]] ; CHECK-NEXT: [[TMP17:%.*]] = fadd <2 x float> [[TMP11]], [[TMP15]] -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> [[TMP17]], <2 x i32> -; CHECK-NEXT: [[TMP20]] = fadd <2 x float> [[TMP2]], [[TMP21]] -; CHECK-NEXT: [[TMP18]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP18]], [[TMP0]] -; CHECK-NEXT: br i1 [[TMP19]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> [[TMP17]], <2 x i32> +; CHECK-NEXT: [[TMP19]] = fadd <2 x float> [[TMP2]], [[TMP18]] +; CHECK-NEXT: [[TMP20]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], [[TMP0]] +; CHECK-NEXT: br i1 [[TMP21]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: -; CHECK-NEXT: store <2 x float> [[TMP20]], ptr [[RESULT:%.*]], align 4 +; CHECK-NEXT: store <2 x float> [[TMP19]], ptr [[RESULT:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll index 82085ade519e2..360b258f216c5 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll @@ -12,17 +12,17 @@ define void @test() { ; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x float> zeroinitializer, [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX10_I_I86]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr undef, align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> , float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = 
shufflevector <2 x float> [[TMP0]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> , <2 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP6]], <2 x float> [[TMP7]]) +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> , float [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP0]], float [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> , <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP6]]) ; CHECK-NEXT: br i1 false, label [[BB2:%.*]], label [[BB3:%.*]] ; CHECK: bb2: -; CHECK-NEXT: [[TMP9:%.*]] = fmul <2 x float> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x float> [[TMP7]], zeroinitializer ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[TMP10:%.*]] = phi <2 x float> [ [[TMP9]], [[BB2]] ], [ zeroinitializer, [[BB1]] ] +; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x float> [ [[TMP8]], [[BB2]] ], [ zeroinitializer, [[BB1]] ] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x float> [[TMP1]], [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = fsub <2 x float> [[TMP12]], zeroinitializer diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll b/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll index 9810d50beea73..8497493e0069c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll @@ -11,82 +11,58 @@ declare double @llvm.sin.f64(double) define void @test() { ; CHECK-LABEL: @test( -; CHECK-NEXT: [[A0:%.*]] = load double, ptr @src, align 8 -; CHECK-NEXT: 
[[A1:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), align 8 -; CHECK-NEXT: [[A2:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8 -; CHECK-NEXT: [[A3:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 3), align 8 -; CHECK-NEXT: [[A4:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8 -; CHECK-NEXT: [[A5:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 5), align 8 -; CHECK-NEXT: [[A6:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 6), align 8 -; CHECK-NEXT: [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A2]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A6]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = call fast <2 x double> @llvm.sin.v2f64(<2 x double> [[TMP2]]) -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[A3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A7]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr @src, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 6), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP4]], <2 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = call fast <2 x double> @llvm.sin.v2f64(<2 x double> [[TMP5]]) -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 
x double> [[TMP7]], double [[A4]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP8]]) -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> poison, double [[A1]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x double> [[TMP10]], double [[A5]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = call fast <2 x double> @llvm.sin.v2f64(<2 x double> [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP3]], <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP9]]) +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP3]], <2 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP11]]) -; CHECK-NEXT: [[TMP13:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP6]] -; CHECK-NEXT: [[TMP14:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = fadd fast <2 x double> [[TMP10]], [[TMP8]] +; CHECK-NEXT: [[TMP14:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP12]] ; CHECK-NEXT: [[TMP15:%.*]] = fadd fast <2 x double> [[TMP13]], [[TMP14]] ; CHECK-NEXT: store <2 x double> [[TMP15]], ptr @dst, align 8 ; CHECK-NEXT: ret void ; ; VECLIB-LABEL: @test( -; VECLIB-NEXT: [[A0:%.*]] = load double, ptr @src, align 8 -; VECLIB-NEXT: [[A1:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), align 8 -; VECLIB-NEXT: [[A2:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8 -; VECLIB-NEXT: [[A3:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 3), align 8 -; VECLIB-NEXT: [[A4:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8 -; VECLIB-NEXT: [[A5:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, 
i64 5), align 8 -; VECLIB-NEXT: [[A6:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 6), align 8 -; VECLIB-NEXT: [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8 -; VECLIB-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A2]], i32 0 -; VECLIB-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A6]], i32 1 -; VECLIB-NEXT: [[TMP3:%.*]] = call fast <2 x double> @__svml_sin2(<2 x double> [[TMP2]]) -; VECLIB-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[A3]], i32 0 -; VECLIB-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A7]], i32 1 +; VECLIB-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr @src, align 8 +; VECLIB-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8 +; VECLIB-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8 +; VECLIB-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 6), align 8 +; VECLIB-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP4]], <2 x i32> ; VECLIB-NEXT: [[TMP6:%.*]] = call fast <2 x double> @__svml_sin2(<2 x double> [[TMP5]]) -; VECLIB-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0 -; VECLIB-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A4]], i32 1 -; VECLIB-NEXT: [[TMP9:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP8]]) -; VECLIB-NEXT: [[TMP10:%.*]] = insertelement <2 x double> poison, double [[A1]], i32 0 -; VECLIB-NEXT: [[TMP11:%.*]] = insertelement <2 x double> [[TMP10]], double [[A5]], i32 1 +; VECLIB-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP4]], <2 x i32> +; VECLIB-NEXT: [[TMP8:%.*]] = call fast <2 x double> @__svml_sin2(<2 x double> [[TMP7]]) +; VECLIB-NEXT: [[TMP9:%.*]] = 
shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP3]], <2 x i32> +; VECLIB-NEXT: [[TMP10:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP9]]) +; VECLIB-NEXT: [[TMP11:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP3]], <2 x i32> ; VECLIB-NEXT: [[TMP12:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP11]]) -; VECLIB-NEXT: [[TMP13:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP6]] -; VECLIB-NEXT: [[TMP14:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP12]] +; VECLIB-NEXT: [[TMP13:%.*]] = fadd fast <2 x double> [[TMP10]], [[TMP8]] +; VECLIB-NEXT: [[TMP14:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP12]] ; VECLIB-NEXT: [[TMP15:%.*]] = fadd fast <2 x double> [[TMP13]], [[TMP14]] ; VECLIB-NEXT: store <2 x double> [[TMP15]], ptr @dst, align 8 ; VECLIB-NEXT: ret void ; ; AMDLIBM-LABEL: @test( -; AMDLIBM-NEXT: [[A0:%.*]] = load double, ptr @src, align 8 -; AMDLIBM-NEXT: [[A1:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), align 8 -; AMDLIBM-NEXT: [[A2:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8 -; AMDLIBM-NEXT: [[A3:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 3), align 8 -; AMDLIBM-NEXT: [[A4:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8 -; AMDLIBM-NEXT: [[A5:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 5), align 8 -; AMDLIBM-NEXT: [[A6:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 6), align 8 -; AMDLIBM-NEXT: [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8 -; AMDLIBM-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A2]], i32 0 -; AMDLIBM-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A6]], i32 1 -; AMDLIBM-NEXT: [[TMP3:%.*]] = call fast <2 x double> @amd_vrd2_sin(<2 x double> 
[[TMP2]]) -; AMDLIBM-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[A3]], i32 0 -; AMDLIBM-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A7]], i32 1 +; AMDLIBM-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr @src, align 8 +; AMDLIBM-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8 +; AMDLIBM-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8 +; AMDLIBM-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 6), align 8 +; AMDLIBM-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP4]], <2 x i32> ; AMDLIBM-NEXT: [[TMP6:%.*]] = call fast <2 x double> @amd_vrd2_sin(<2 x double> [[TMP5]]) -; AMDLIBM-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0 -; AMDLIBM-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A4]], i32 1 -; AMDLIBM-NEXT: [[TMP9:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP8]]) -; AMDLIBM-NEXT: [[TMP10:%.*]] = insertelement <2 x double> poison, double [[A1]], i32 0 -; AMDLIBM-NEXT: [[TMP11:%.*]] = insertelement <2 x double> [[TMP10]], double [[A5]], i32 1 +; AMDLIBM-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP4]], <2 x i32> +; AMDLIBM-NEXT: [[TMP8:%.*]] = call fast <2 x double> @amd_vrd2_sin(<2 x double> [[TMP7]]) +; AMDLIBM-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP3]], <2 x i32> +; AMDLIBM-NEXT: [[TMP10:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP9]]) +; AMDLIBM-NEXT: [[TMP11:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP3]], <2 x i32> ; AMDLIBM-NEXT: [[TMP12:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP11]]) -; AMDLIBM-NEXT: [[TMP13:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP6]] -; AMDLIBM-NEXT: [[TMP14:%.*]] = fadd fast <2 x 
double> [[TMP3]], [[TMP12]] +; AMDLIBM-NEXT: [[TMP13:%.*]] = fadd fast <2 x double> [[TMP10]], [[TMP8]] +; AMDLIBM-NEXT: [[TMP14:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP12]] ; AMDLIBM-NEXT: [[TMP15:%.*]] = fadd fast <2 x double> [[TMP13]], [[TMP14]] ; AMDLIBM-NEXT: store <2 x double> [[TMP15]], ptr @dst, align 8 ; AMDLIBM-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll index 6ca1f8119c1cf..202ec9633712f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll @@ -7,28 +7,18 @@ define dso_local void @_Z4testP1S(ptr %p) local_unnamed_addr { ; CHECK-LABEL: @_Z4testP1S( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[P:%.*]], i64 0, i32 1, i64 0 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 15 -; CHECK-NEXT: [[I1:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 6 -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 4 -; CHECK-NEXT: [[I7:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4 ; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 12 ; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 14 -; CHECK-NEXT: [[I13:%.*]] = load i32, ptr [[ARRAYIDX41]], align 4 -; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 5 -; CHECK-NEXT: [[I15:%.*]] = load i32, ptr [[ARRAYIDX48]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX27]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr 
[[ARRAYIDX27]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX41]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[I1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[I7]], i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[I13]], i32 6 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[I15]], i32 7 -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP8]], <2 x i32> [[TMP1]], i64 4) -; CHECK-NEXT: [[TMP10:%.*]] = add nsw <8 x i32> [[TMP9]], [[TMP2]] -; CHECK-NEXT: store <8 x i32> [[TMP10]], ptr [[P]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX20]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP3]], <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP0]], i64 4) +; CHECK-NEXT: [[TMP7:%.*]] = add nsw <8 x i32> [[TMP6]], [[TMP2]] +; CHECK-NEXT: store <8 x i32> [[TMP7]], ptr [[P]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll index ba83ff096c9ac..9778218df6816 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll @@ -4,20 +4,16 @@ define void @test(ptr noalias %p, ptr noalias %addr, ptr noalias %s) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x 
ptr> poison, ptr [[ADDR:%.*]], i32 0 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE1]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE1]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> , <8 x i32> poison) -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x ptr> poison, ptr [[P:%.*]], i32 0 -; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <8 x ptr> [[TMP4]], <8 x ptr> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE2]], <8 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> , <8 x i32> poison) -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP1]], i32 8, <8 x i1> , <8 x i32> poison) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE2]], <8 x i32> [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP8]], i32 4, <8 x i1> , <8 x i32> poison) -; CHECK-NEXT: [[TMP10:%.*]] = add nsw <8 x i32> [[TMP9]], [[TMP6]] -; CHECK-NEXT: store <8 x i32> [[TMP10]], ptr [[S:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[ADDR:%.*]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x ptr> poison, ptr [[P:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x ptr> [[TMP2]], <16 x ptr> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, <16 x ptr> [[TMP3]], <16 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[TMP4]], i32 4, <16 x i1> , <16 x i32> poison) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x 
i32> [[TMP5]], <16 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[TMP6]], [[TMP7]] +; CHECK-NEXT: store <8 x i32> [[TMP8]], ptr [[S:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll b/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll index 87063fc3f7a82..69ae26b9f2585 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll @@ -7,19 +7,13 @@ define void @test_supernode_add(ptr %Aarray, ptr %Barray, ptr %Carray, ptr %Sarray) { ; ENABLED-LABEL: @test_supernode_add( ; ENABLED-NEXT: entry: -; ENABLED-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, ptr [[AARRAY:%.*]], i64 1 -; ENABLED-NEXT: [[IDXC1:%.*]] = getelementptr inbounds double, ptr [[CARRAY:%.*]], i64 1 -; ENABLED-NEXT: [[A0:%.*]] = load double, ptr [[AARRAY]], align 8 -; ENABLED-NEXT: [[A1:%.*]] = load double, ptr [[IDXA1]], align 8 -; ENABLED-NEXT: [[C0:%.*]] = load double, ptr [[CARRAY]], align 8 -; ENABLED-NEXT: [[C1:%.*]] = load double, ptr [[IDXC1]], align 8 -; ENABLED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8 -; ENABLED-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0 -; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[C1]], i32 1 -; ENABLED-NEXT: [[TMP3:%.*]] = fadd fast <2 x double> [[TMP0]], [[TMP2]] -; ENABLED-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 -; ENABLED-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A1]], i32 1 -; ENABLED-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]] +; ENABLED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[AARRAY:%.*]], align 8 +; ENABLED-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8 +; ENABLED-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr 
[[CARRAY:%.*]], align 8 +; ENABLED-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> [[TMP2]], <2 x i32> +; ENABLED-NEXT: [[TMP4:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]] +; ENABLED-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP0]], <2 x i32> +; ENABLED-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP5]] ; ENABLED-NEXT: store <2 x double> [[TMP6]], ptr [[SARRAY:%.*]], align 8 ; ENABLED-NEXT: ret void ; @@ -54,19 +48,13 @@ entry: define void @test_supernode_addsub(ptr %Aarray, ptr %Barray, ptr %Carray, ptr %Sarray) { ; ENABLED-LABEL: @test_supernode_addsub( ; ENABLED-NEXT: entry: -; ENABLED-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, ptr [[AARRAY:%.*]], i64 1 -; ENABLED-NEXT: [[IDXC1:%.*]] = getelementptr inbounds double, ptr [[CARRAY:%.*]], i64 1 -; ENABLED-NEXT: [[A0:%.*]] = load double, ptr [[AARRAY]], align 8 -; ENABLED-NEXT: [[A1:%.*]] = load double, ptr [[IDXA1]], align 8 -; ENABLED-NEXT: [[C0:%.*]] = load double, ptr [[CARRAY]], align 8 -; ENABLED-NEXT: [[C1:%.*]] = load double, ptr [[IDXC1]], align 8 -; ENABLED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8 -; ENABLED-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0 -; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[C1]], i32 1 -; ENABLED-NEXT: [[TMP3:%.*]] = fsub fast <2 x double> [[TMP2]], [[TMP0]] -; ENABLED-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 -; ENABLED-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A1]], i32 1 -; ENABLED-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]] +; ENABLED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[AARRAY:%.*]], align 8 +; ENABLED-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8 +; ENABLED-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[CARRAY:%.*]], align 8 +; ENABLED-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x 
double> [[TMP2]], <2 x i32> +; ENABLED-NEXT: [[TMP4:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP1]] +; ENABLED-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP0]], <2 x i32> +; ENABLED-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP5]] ; ENABLED-NEXT: store <2 x double> [[TMP6]], ptr [[SARRAY:%.*]], align 8 ; ENABLED-NEXT: ret void ; @@ -102,22 +90,16 @@ entry: define void @test_supernode_addsub_alt(ptr %Aarray, ptr %Barray, ptr %Carray, ptr %Sarray) { ; ENABLED-LABEL: @test_supernode_addsub_alt( ; ENABLED-NEXT: entry: -; ENABLED-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, ptr [[AARRAY:%.*]], i64 1 -; ENABLED-NEXT: [[IDXC1:%.*]] = getelementptr inbounds double, ptr [[CARRAY:%.*]], i64 1 -; ENABLED-NEXT: [[A0:%.*]] = load double, ptr [[AARRAY]], align 8 -; ENABLED-NEXT: [[A1:%.*]] = load double, ptr [[IDXA1]], align 8 -; ENABLED-NEXT: [[C0:%.*]] = load double, ptr [[CARRAY]], align 8 -; ENABLED-NEXT: [[C1:%.*]] = load double, ptr [[IDXC1]], align 8 -; ENABLED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8 -; ENABLED-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0 -; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[C1]], i32 1 -; ENABLED-NEXT: [[TMP3:%.*]] = fsub fast <2 x double> [[TMP2]], [[TMP0]] -; ENABLED-NEXT: [[TMP4:%.*]] = fadd fast <2 x double> [[TMP2]], [[TMP0]] -; ENABLED-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <2 x i32> -; ENABLED-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 -; ENABLED-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[A1]], i32 1 -; ENABLED-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]] -; ENABLED-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP7]] +; ENABLED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[AARRAY:%.*]], align 8 +; ENABLED-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], 
align 8 +; ENABLED-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[CARRAY:%.*]], align 8 +; ENABLED-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> [[TMP2]], <2 x i32> +; ENABLED-NEXT: [[TMP4:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP1]] +; ENABLED-NEXT: [[TMP5:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP1]] +; ENABLED-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x i32> +; ENABLED-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP0]], <2 x i32> +; ENABLED-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP6]], [[TMP7]] +; ENABLED-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP7]] ; ENABLED-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP9]], <2 x i32> ; ENABLED-NEXT: store <2 x double> [[TMP10]], ptr [[SARRAY:%.*]], align 8 ; ENABLED-NEXT: ret void @@ -177,19 +159,15 @@ entry: define void @supernode_scheduling(ptr %Aarray, ptr %Barray, ptr %Carray, ptr %Darray, ptr %Sarray) { ; ENABLED-LABEL: @supernode_scheduling( ; ENABLED-NEXT: entry: -; ENABLED-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, ptr [[BARRAY:%.*]], i64 1 ; ENABLED-NEXT: [[C:%.*]] = load double, ptr [[CARRAY:%.*]], align 8 -; ENABLED-NEXT: [[B0:%.*]] = load double, ptr [[BARRAY]], align 8 -; ENABLED-NEXT: [[B1:%.*]] = load double, ptr [[IDXB1]], align 8 ; ENABLED-NEXT: [[D:%.*]] = load double, ptr [[DARRAY:%.*]], align 8 ; ENABLED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[AARRAY:%.*]], align 8 -; ENABLED-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0 -; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B1]], i32 1 +; ENABLED-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8 +; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[C]], i32 0 ; ENABLED-NEXT: [[TMP3:%.*]] = fadd fast <2 x double> [[TMP0]], [[TMP2]] -; ENABLED-NEXT: [[TMP4:%.*]] = insertelement <2 x double> 
poison, double [[B0]], i32 0 -; ENABLED-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[D]], i32 1 -; ENABLED-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]] -; ENABLED-NEXT: store <2 x double> [[TMP6]], ptr [[SARRAY:%.*]], align 8 +; ENABLED-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP1]], double [[D]], i32 1 +; ENABLED-NEXT: [[TMP5:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP4]] +; ENABLED-NEXT: store <2 x double> [[TMP5]], ptr [[SARRAY:%.*]], align 8 ; ENABLED-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll index 243087c6d8d95..fd3c1a57aff34 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll @@ -5,14 +5,12 @@ define void @vec3_vectorize_call(ptr %Colour, float %0) { ; NON-POW2-LABEL: @vec3_vectorize_call( ; NON-POW2-NEXT: entry: -; NON-POW2-NEXT: [[TMP1:%.*]] = load float, ptr [[COLOUR:%.*]], align 4 -; NON-POW2-NEXT: [[ARRAYIDX91_I:%.*]] = getelementptr float, ptr [[COLOUR]], i64 1 -; NON-POW2-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX91_I]], align 4 -; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> poison, float [[TMP0:%.*]], i32 2 -; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP1]], i32 0 -; NON-POW2-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP2]], i32 1 -; NON-POW2-NEXT: [[TMP6:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP5]], <3 x float> zeroinitializer, <3 x float> zeroinitializer) -; NON-POW2-NEXT: store <3 x float> [[TMP6]], ptr [[COLOUR]], align 4 +; NON-POW2-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[COLOUR:%.*]], align 4 +; NON-POW2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP0:%.*]], i32 2 +; NON-POW2-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <3 x i32> +; NON-POW2-NEXT: [[TMP4:%.*]] = 
shufflevector <3 x float> [[TMP2]], <3 x float> [[TMP3]], <3 x i32> +; NON-POW2-NEXT: [[TMP5:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP4]], <3 x float> zeroinitializer, <3 x float> zeroinitializer) +; NON-POW2-NEXT: store <3 x float> [[TMP5]], ptr [[COLOUR]], align 4 ; NON-POW2-NEXT: ret void ; ; POW2-ONLY-LABEL: @vec3_vectorize_call( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll index 7bcb2ece77921..f16d19fab2c9d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll @@ -13,11 +13,9 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T4:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 7 ; CHECK-NEXT: [[T5:%.*]] = load i32, ptr [[T4]], align 4 ; CHECK-NEXT: [[T8:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 1 -; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4 ; CHECK-NEXT: [[T10:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 6 ; CHECK-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4 ; CHECK-NEXT: [[T14:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 2 -; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4 ; CHECK-NEXT: [[T16:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 5 ; CHECK-NEXT: [[T17:%.*]] = load i32, ptr [[T16]], align 4 ; CHECK-NEXT: [[T20:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 3 @@ -27,10 +25,6 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T24:%.*]] = add nsw i32 [[T23]], [[T21]] ; CHECK-NEXT: [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]] ; CHECK-NEXT: [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]] -; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]] -; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] -; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433 -; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137 ; CHECK-NEXT: [[T37:%.*]] = add nsw i32 [[T25]], 
[[T11]] ; CHECK-NEXT: [[T38:%.*]] = add nsw i32 [[T17]], [[T5]] ; CHECK-NEXT: [[T39:%.*]] = add nsw i32 [[T37]], [[T38]] @@ -39,18 +33,26 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819 ; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069 ; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T15]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T40]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T27]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T47]], i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> , i32 [[T9]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[T48]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[T40]], i32 3 -; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = mul nsw <4 x i32> [[TMP4]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[T34]], i32 6 +; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4 +; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4 +; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]] +; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] +; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433 +; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T40]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T27]], i32 2 +; 
CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[T47]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> , <4 x i32> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T48]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[T40]], i32 3 +; CHECK-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[TMP5]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <4 x i32> [[TMP5]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[T34]], i32 6 ; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T71]], ; CHECK-NEXT: store <8 x i32> [[T76]], ptr [[T2]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll index 82f8aa5f9be1b..001ab613a6d57 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll @@ -13,11 +13,9 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T4:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 7 ; CHECK-NEXT: [[T5:%.*]] = load i32, ptr [[T4]], align 4 ; CHECK-NEXT: [[T8:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 1 -; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4 ; CHECK-NEXT: [[T10:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 6 ; CHECK-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4 ; CHECK-NEXT: [[T14:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 2 -; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4 ; CHECK-NEXT: [[T16:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 5 ; CHECK-NEXT: [[T17:%.*]] = load i32, ptr [[T16]], align 4 ; CHECK-NEXT: 
[[T20:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 3 @@ -27,10 +25,6 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T24:%.*]] = add nsw i32 [[T23]], [[T21]] ; CHECK-NEXT: [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]] ; CHECK-NEXT: [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]] -; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]] -; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] -; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433 -; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137 ; CHECK-NEXT: [[T37:%.*]] = add nsw i32 [[T25]], [[T11]] ; CHECK-NEXT: [[T38:%.*]] = add nsw i32 [[T17]], [[T5]] ; CHECK-NEXT: [[T39:%.*]] = add nsw i32 [[T37]], [[T38]] @@ -39,18 +33,26 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819 ; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069 ; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T15]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T40]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T27]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T47]], i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> , i32 [[T9]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[T48]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[T40]], i32 3 -; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = mul nsw <4 x i32> [[TMP4]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[T34]], i32 6 +; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr 
[[T8]], align 4 +; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4 +; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]] +; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] +; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433 +; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T40]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T27]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[T47]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> , <4 x i32> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T48]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[T40]], i32 3 +; CHECK-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[TMP5]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <4 x i32> [[TMP5]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[T34]], i32 6 ; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T71]], ; CHECK-NEXT: store <8 x i32> [[T76]], ptr [[T2]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll index 69ecf1852aedd..e52b29a7f681c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll @@ -7,11 +7,9 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: 
[[T4:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 7 ; CHECK-NEXT: [[T5:%.*]] = load i32, ptr [[T4]], align 4 ; CHECK-NEXT: [[T8:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 1 -; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4 ; CHECK-NEXT: [[T10:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 6 ; CHECK-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4 ; CHECK-NEXT: [[T14:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 2 -; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4 ; CHECK-NEXT: [[T16:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 5 ; CHECK-NEXT: [[T17:%.*]] = load i32, ptr [[T16]], align 4 ; CHECK-NEXT: [[T20:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 3 @@ -21,11 +19,7 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T24:%.*]] = add nsw i32 [[T23]], [[T21]] ; CHECK-NEXT: [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]] ; CHECK-NEXT: [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]] -; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]] -; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] -; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433 ; CHECK-NEXT: [[T32:%.*]] = mul nsw i32 [[T27]], 6270 -; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137 ; CHECK-NEXT: [[T37:%.*]] = add nsw i32 [[T25]], [[T11]] ; CHECK-NEXT: [[T38:%.*]] = add nsw i32 [[T17]], [[T5]] ; CHECK-NEXT: [[T39:%.*]] = add nsw i32 [[T37]], [[T38]] @@ -35,11 +29,17 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069 ; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196 ; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]] -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T15]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T40]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T9]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[T48]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x 
i32> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4 +; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4 +; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]] +; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] +; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433 +; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[T40]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T48]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP3]], [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> ; CHECK-NEXT: [[T67:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T32]], i32 2 ; CHECK-NEXT: [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3