From ad7773547682f7f8e11ae04b6d06e7a960cb6ad1 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 12 Jun 2025 12:20:38 +0100 Subject: [PATCH 1/2] [IA] Remove recursive [de]interleaving support Now that the loop vectorizer emits just a single llvm.vector.[de]interleaveN intrinsic, we can remove the need to recognise recursively [de]interleaved intrinsics. No in-tree target currently has instructions to emit an interleaved access with a factor > 8, and I'm not aware of any other passes that will emit recursive interleave patterns, so this code is effectively dead. Some tests have been converted from the recursive form to a single intrinsic, and some others were deleted that are no longer needed, e.g. to do with the recursive tree. This closes off the work started in #139893. --- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 199 ++--------- .../rvv/fixed-vectors-deinterleave-load.ll | 67 ---- .../rvv/fixed-vectors-interleave-store.ll | 34 -- .../rvv/fixed-vectors-interleaved-access.ll | 14 +- .../RISCV/rvv/vector-deinterleave-load.ll | 67 ---- .../RISCV/rvv/vector-interleave-store.ll | 34 -- .../RISCV/rvv/vp-vector-interleaved-access.ll | 314 ++++-------------- .../AArch64/sve-deinterleave4.ll | 90 ++--- .../AArch64/sve-interleave4.ll | 17 +- .../RISCV/interleaved-accesses.ll | 196 ----------- 10 files changed, 127 insertions(+), 905 deletions(-) diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 49f1504d244ed..9c4c86cebe7e5 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -629,173 +629,12 @@ static unsigned getIntrinsicFactor(const IntrinsicInst *II) { } } -// For an (de)interleave tree like this: -// -// A C B D -// |___| |___| -// |_____| -// | -// A B C D -// -// We will get ABCD at the end while the leaf operands/results -// are ACBD, which are also what we initially collected in -// getVectorInterleaveFactor / getVectorDeinterleaveFactor. But TLI -// hooks (e.g. lowerDeinterleaveIntrinsicToLoad) expect ABCD, so we need -// to reorder them by interleaving these values. -static void interleaveLeafValues(MutableArrayRef SubLeaves) { - unsigned NumLeaves = SubLeaves.size(); - assert(isPowerOf2_32(NumLeaves) && NumLeaves > 1); - if (NumLeaves == 2) - return; - - const unsigned HalfLeaves = NumLeaves / 2; - // Visit the sub-trees. - interleaveLeafValues(SubLeaves.take_front(HalfLeaves)); - interleaveLeafValues(SubLeaves.drop_front(HalfLeaves)); - - SmallVector Buffer; - // a0 a1 a2 a3 b0 b1 b2 b3 - // -> a0 b0 a1 b1 a2 b2 a3 b3 - for (unsigned i = 0U; i < NumLeaves; ++i) - Buffer.push_back(SubLeaves[i / 2 + (i % 2 ? HalfLeaves : 0)]); - - llvm::copy(Buffer, SubLeaves.begin()); -} - -static bool -getVectorInterleaveFactor(IntrinsicInst *II, SmallVectorImpl &Operands, - SmallVectorImpl &DeadInsts) { - assert(isInterleaveIntrinsic(II->getIntrinsicID())); - - // Visit with BFS - SmallVector Queue; - Queue.push_back(II); - while (!Queue.empty()) { - IntrinsicInst *Current = Queue.front(); - Queue.erase(Queue.begin()); - - // All the intermediate intrinsics will be deleted. - DeadInsts.push_back(Current); - - for (unsigned I = 0; I < getIntrinsicFactor(Current); ++I) { - Value *Op = Current->getOperand(I); - if (auto *OpII = dyn_cast(Op)) - if (OpII->getIntrinsicID() == Intrinsic::vector_interleave2) { - Queue.push_back(OpII); - continue; - } - - // If this is not a perfectly balanced tree, the leaf - // result types would be different. 
- if (!Operands.empty() && Op->getType() != Operands.back()->getType()) - return false; - - Operands.push_back(Op); - } - } - - const unsigned Factor = Operands.size(); - // Currently we only recognize factors 2...8 and other powers of 2. - // FIXME: should we assert here instead? - if (Factor <= 1 || - (!isPowerOf2_32(Factor) && Factor != getIntrinsicFactor(II))) - return false; - - // Recursively interleaved factors need to have their values reordered - // TODO: Remove once the loop vectorizer no longer recursively interleaves - // factors 4 + 8 - if (isPowerOf2_32(Factor) && getIntrinsicFactor(II) == 2) - interleaveLeafValues(Operands); - return true; -} - -static bool -getVectorDeinterleaveFactor(IntrinsicInst *II, - SmallVectorImpl &Results, - SmallVectorImpl &DeadInsts) { - assert(isDeinterleaveIntrinsic(II->getIntrinsicID())); - using namespace PatternMatch; - if (!II->hasNUses(getIntrinsicFactor(II))) - return false; - - // Visit with BFS - SmallVector Queue; - Queue.push_back(II); - while (!Queue.empty()) { - IntrinsicInst *Current = Queue.front(); - Queue.erase(Queue.begin()); - assert(Current->hasNUses(getIntrinsicFactor(Current))); - - // All the intermediate intrinsics will be deleted from the bottom-up. - DeadInsts.insert(DeadInsts.begin(), Current); - - SmallVector EVs(getIntrinsicFactor(Current), nullptr); - for (User *Usr : Current->users()) { - if (!isa(Usr)) - return 0; - - auto *EV = cast(Usr); - // Intermediate ExtractValue instructions will also be deleted. - DeadInsts.insert(DeadInsts.begin(), EV); - ArrayRef Indices = EV->getIndices(); - if (Indices.size() != 1) - return false; - - if (!EVs[Indices[0]]) - EVs[Indices[0]] = EV; - else - return false; - } - - // We have legal indices. At this point we're either going - // to continue the traversal or push the leaf values into Results. - for (ExtractValueInst *EV : EVs) { - // Continue the traversal. We're playing safe here and matching only the - // expression consisting of a perfectly balanced binary tree in which all - // intermediate values are only used once. - if (EV->hasOneUse() && - match(EV->user_back(), - m_Intrinsic()) && - EV->user_back()->hasNUses(2)) { - auto *EVUsr = cast(EV->user_back()); - Queue.push_back(EVUsr); - continue; - } - - // If this is not a perfectly balanced tree, the leaf - // result types would be different. - if (!Results.empty() && EV->getType() != Results.back()->getType()) - return false; - - // Save the leaf value. - Results.push_back(EV); - } - } - - const unsigned Factor = Results.size(); - // Currently we only recognize factors of 2...8 and other powers of 2. - // FIXME: should we assert here instead? 
- if (Factor <= 1 || - (!isPowerOf2_32(Factor) && Factor != getIntrinsicFactor(II))) - return 0; - - // Recursively interleaved factors need to have their values reordered - // TODO: Remove once the loop vectorizer no longer recursively interleaves - // factors 4 + 8 - if (isPowerOf2_32(Factor) && getIntrinsicFactor(II) == 2) - interleaveLeafValues(Results); - return true; -} - static Value *getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC) { if (auto *IMI = dyn_cast(WideMask)) { - SmallVector Operands; - SmallVector DeadInsts; - if (getVectorInterleaveFactor(IMI, Operands, DeadInsts)) { - assert(!Operands.empty()); - if (Operands.size() == Factor && llvm::all_equal(Operands)) - return Operands[0]; + if (isInterleaveIntrinsic(IMI->getIntrinsicID()) && + getIntrinsicFactor(IMI) == Factor && llvm::all_equal(IMI->args())) { + return IMI->getArgOperand(0); } } @@ -830,13 +669,19 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( if (!LoadedVal->hasOneUse() || !isa(LoadedVal)) return false; - SmallVector DeinterleaveValues; - SmallVector DeinterleaveDeadInsts; - if (!getVectorDeinterleaveFactor(DI, DeinterleaveValues, - DeinterleaveDeadInsts)) + const unsigned Factor = getIntrinsicFactor(DI); + if (!DI->hasNUses(Factor)) return false; - - const unsigned Factor = DeinterleaveValues.size(); + SmallVector DeinterleaveValues(Factor); + for (auto *User : DI->users()) { + auto *Extract = dyn_cast(User); + if (!Extract || Extract->getNumIndices() != 1) + return false; + unsigned Idx = Extract->getIndices()[0]; + if (DeinterleaveValues[Idx]) + return false; + DeinterleaveValues[Idx] = Extract; + } if (auto *VPLoad = dyn_cast(LoadedVal)) { if (VPLoad->getIntrinsicID() != Intrinsic::vp_load) @@ -869,7 +714,9 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( return false; } - DeadInsts.insert_range(DeinterleaveDeadInsts); + for (Value *V : DeinterleaveValues) + DeadInsts.insert(cast(V)); + DeadInsts.insert(DI); // We now have a target-specific load, so delete the old one. DeadInsts.insert(cast(LoadedVal)); return true; @@ -883,12 +730,8 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( if (!isa(StoredBy)) return false; - SmallVector InterleaveValues; - SmallVector InterleaveDeadInsts; - if (!getVectorInterleaveFactor(II, InterleaveValues, InterleaveDeadInsts)) - return false; - - const unsigned Factor = InterleaveValues.size(); + SmallVector InterleaveValues(II->args()); + const unsigned Factor = getIntrinsicFactor(II); if (auto *VPStore = dyn_cast(StoredBy)) { if (VPStore->getIntrinsicID() != Intrinsic::vp_store) @@ -922,7 +765,7 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( // We now have a target-specific store, so delete the old one. 
DeadInsts.insert(cast(StoredBy)); - DeadInsts.insert_range(InterleaveDeadInsts); + DeadInsts.insert(II); return true; } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll index c2ae1ce491389..3e822d357b667 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll @@ -293,31 +293,6 @@ define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_fact ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res3 } -; TODO: Remove once recursive deinterleaving support is removed -define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor4_recursive(ptr %p) { -; CHECK-LABEL: vector_deinterleave_load_factor4_recursive: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vlseg4e8.v v8, (a0) -; CHECK-NEXT: ret - %vec = load <32 x i8>, ptr %p - %d0 = call {<16 x i8>, <16 x i8>} @llvm.vector.deinterleave2.v32i8(<32 x i8> %vec) - %d0.0 = extractvalue { <16 x i8>, <16 x i8> } %d0, 0 - %d0.1 = extractvalue { <16 x i8>, <16 x i8> } %d0, 1 - %d1 = call {<8 x i8>, <8 x i8>} @llvm.vector.deinterleave2.v16i8(<16 x i8> %d0.0) - %t0 = extractvalue { <8 x i8>, <8 x i8> } %d1, 0 - %t2 = extractvalue { <8 x i8>, <8 x i8> } %d1, 1 - %d2 = call {<8 x i8>, <8 x i8>} @llvm.vector.deinterleave2.v16i8(<16 x i8> %d0.1) - %t1 = extractvalue { <8 x i8>, <8 x i8> } %d2, 0 - %t3 = extractvalue { <8 x i8>, <8 x i8> } %d2, 1 - - %res0 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } undef, <8 x i8> %t0, 0 - %res1 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res0, <8 x i8> %t1, 1 - %res2 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res1, <8 x i8> %t2, 2 - %res3 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res2, <8 x i8> %t3, 3 - ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res3 -} - define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor5(ptr %p) { ; CHECK-LABEL: vector_deinterleave_load_factor5: ; CHECK: # %bb.0: @@ -414,45 +389,3 @@ define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, < %res7 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res6, <8 x i8> %t6, 7 ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res7 } - -; TODO: Remove once recursive deinterleaving support is removed -define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave_load_factor8_recursive(ptr %ptr) { -; CHECK-LABEL: vector_deinterleave_load_factor8_recursive: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vlseg8e32.v v8, (a0) -; CHECK-NEXT: ret - %vec = load <16 x i32>, ptr %ptr - %d0 = call { <8 x i32>, <8 x i32> } @llvm.vector.deinterleave2.v16i32(<16 x i32> %vec) - %d0.0 = extractvalue { <8 x i32>, <8 x i32> } %d0, 0 - %d0.1 = extractvalue { <8 x i32>, <8 x i32> } %d0, 1 - %d1 = call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %d0.0) - %d1.0 = extractvalue { <4 x i32>, <4 x i32> } %d1, 0 - %d1.1 = extractvalue { <4 x i32>, <4 x i32> } %d1, 1 - %d2 = call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %d0.1) - %d2.0 = extractvalue { <4 x i32>, <4 x i32> } %d2, 0 - %d2.1 = extractvalue { <4 x i32>, <4 x i32> } %d2, 1 - - %d3 = call { <2 x i32>, <2 x i32> } 
@llvm.vector.deinterleave2.v4i32(<4 x i32> %d1.0) - %t0 = extractvalue { <2 x i32>, <2 x i32> } %d3, 0 - %t4 = extractvalue { <2 x i32>, <2 x i32> } %d3, 1 - %d4 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d1.1) - %t2 = extractvalue { <2 x i32>, <2 x i32> } %d4, 0 - %t6 = extractvalue { <2 x i32>, <2 x i32> } %d4, 1 - %d5 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d2.0) - %t1 = extractvalue { <2 x i32>, <2 x i32> } %d5, 0 - %t5 = extractvalue { <2 x i32>, <2 x i32> } %d5, 1 - %d6 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d2.1) - %t3 = extractvalue { <2 x i32>, <2 x i32> } %d6, 0 - %t7 = extractvalue { <2 x i32>, <2 x i32> } %d6, 1 - - %res0 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } undef, <2 x i32> %t0, 0 - %res1 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res0, <2 x i32> %t1, 1 - %res2 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res1, <2 x i32> %t2, 2 - %res3 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res2, <2 x i32> %t3, 3 - %res4 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res3, <2 x i32> %t4, 4 - %res5 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res4, <2 x i32> %t5, 5 - %res6 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res5, <2 x i32> %t6, 6 - %res7 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res6, <2 x i32> %t7, 7 - ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res7 -} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll index c394e7aa2e3e8..a49eeed3605c5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll @@ -203,20 +203,6 @@ define void @vector_interleave_store_factor4(<4 x i32> %a, <4 x i32> %b, <4 x i3 ret void } -; TODO: Remove once recursive interleaving support is removed -define void @vector_interleave_store_factor4_recursive(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, ptr %p) { -; CHECK-LABEL: vector_interleave_store_factor4_recursive: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vsseg4e32.v v8, (a0) -; CHECK-NEXT: ret - %v0 = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %c) - %v1 = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %b, <4 x i32> %d) - %v2 = call <16 x i32> @llvm.vector.interleave2.v16i32(<8 x i32> %v0, <8 x i32> %v1) - store <16 x i32> %v2, ptr %p - ret void -} - define void @vector_interleave_store_factor5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, ptr %p) { ; CHECK-LABEL: vector_interleave_store_factor5: ; CHECK: # %bb.0: @@ -260,23 +246,3 @@ define void @vector_interleave_store_factor8(<4 x i32> %a, <4 x i32> %b, <4 x i3 store <32 x i32> %v, ptr %p ret void } - -; TODO: Remove once recursive interleaving support is removed -define void @vector_interleave_store_factor8_recursive(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> 
%e, <4 x i32> %f, <4 x i32> %g, <4 x i32> %h, ptr %p) { -; CHECK-LABEL: vector_interleave_store_factor8_recursive: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vsseg8e32.v v8, (a0) -; CHECK-NEXT: ret - %v0 = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %e) - %v1 = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %c, <4 x i32> %g) - %v2 = call <16 x i32> @llvm.vector.interleave2.v16i32(<8 x i32> %v0, <8 x i32> %v1) - - %v3 = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %b, <4 x i32> %f) - %v4 = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %d, <4 x i32> %h) - %v5 = call <16 x i32> @llvm.vector.interleave2.v16i32(<8 x i32> %v3, <8 x i32> %v4) - - %v6 = call <32 x i32> @llvm.vector.interleave2.v32i32(<16 x i32> %v2, <16 x i32> %v5) - store <32 x i32> %v6, ptr %p - ret void -} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index 8ac4c7447c7d4..5e3ae2faf1a53 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -302,15 +302,11 @@ define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @vpload_factor4_intrinsics(p ; CHECK-NEXT: vlseg4e32.v v8, (a0) ; CHECK-NEXT: ret %wide.masked.load = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 8) - %d0 = call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %wide.masked.load) - %d0.0 = extractvalue { <4 x i32>, <4 x i32> } %d0, 0 - %d0.1 = extractvalue { <4 x i32>, <4 x i32> } %d0, 1 - %d1 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d0.0) - %t0 = extractvalue { <2 x i32>, <2 x i32> } %d1, 0 - %t2 = extractvalue { <2 x i32>, <2 x i32> } %d1, 1 - %d2 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d0.1) - %t1 = extractvalue { <2 x i32>, <2 x i32> } %d2, 0 - %t3 = extractvalue { <2 x i32>, <2 x i32> } %d2, 1 + %d = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.vector.deinterleave4.v8i32(<8 x i32> %wide.masked.load) + %t0 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %d, 0 + %t1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %d, 1 + %t2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %d, 2 + %t3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %d, 3 %res0 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } poison, <2 x i32> %t0, 0 %res1 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res0, <2 x i32> %t1, 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll index 9344c52098684..b11db3d61f693 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll @@ -380,31 +380,6 @@ define { , , , , , , } %res3 } -; TODO: Remove once recursive deinterleaving support is removed -define { , , , } @vector_deinterleave_load_factor4_recursive(ptr %p) { -; CHECK-LABEL: vector_deinterleave_load_factor4_recursive: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; CHECK-NEXT: vlseg4e8.v v8, (a0) -; CHECK-NEXT: ret - %vec = load , ptr %p - %d0 = call {, } @llvm.vector.deinterleave2.nxv32i8( %vec) - %d0.0 = extractvalue { , } %d0, 0 - %d0.1 = extractvalue { , } %d0, 1 - %d1 = call {, } @llvm.vector.deinterleave2.nxv16i8( %d0.0) - %t0 = extractvalue 
{ , } %d1, 0 - %t2 = extractvalue { , } %d1, 1 - %d2 = call {, } @llvm.vector.deinterleave2.nxv16i8( %d0.1) - %t1 = extractvalue { , } %d2, 0 - %t3 = extractvalue { , } %d2, 1 - - %res0 = insertvalue { , , , } undef, %t0, 0 - %res1 = insertvalue { , , , } %res0, %t1, 1 - %res2 = insertvalue { , , , } %res1, %t2, 2 - %res3 = insertvalue { , , , } %res2, %t3, 3 - ret { , , , } %res3 -} - define { , , , , } @vector_deinterleave_load_factor5(ptr %p) { ; CHECK-LABEL: vector_deinterleave_load_factor5: ; CHECK: # %bb.0: @@ -500,45 +475,3 @@ define { , , , , , , , , , , } %res6, %t7, 7 ret { , , , , , , , } %res7 } - -; TODO: Remove once recursive deinterleaving support is removed -define {, , , , , , , } @vector_deinterleave_load_factor8_recursive(ptr %ptr) { -; CHECK-LABEL: vector_deinterleave_load_factor8_recursive: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; CHECK-NEXT: vlseg8e32.v v8, (a0) -; CHECK-NEXT: ret - %vec = load , ptr %ptr - %d0 = call { , } @llvm.vector.deinterleave2.nxv16i32( %vec) - %d0.0 = extractvalue { , } %d0, 0 - %d0.1 = extractvalue { , } %d0, 1 - %d1 = call { , } @llvm.vector.deinterleave2.nxv8i32( %d0.0) - %d1.0 = extractvalue { , } %d1, 0 - %d1.1 = extractvalue { , } %d1, 1 - %d2 = call { , } @llvm.vector.deinterleave2.nxv8i32( %d0.1) - %d2.0 = extractvalue { , } %d2, 0 - %d2.1 = extractvalue { , } %d2, 1 - - %d3 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d1.0) - %t0 = extractvalue { , } %d3, 0 - %t4 = extractvalue { , } %d3, 1 - %d4 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d1.1) - %t2 = extractvalue { , } %d4, 0 - %t6 = extractvalue { , } %d4, 1 - %d5 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d2.0) - %t1 = extractvalue { , } %d5, 0 - %t5 = extractvalue { , } %d5, 1 - %d6 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d2.1) - %t3 = extractvalue { , } %d6, 0 - %t7 = extractvalue { , } %d6, 1 - - %res0 = insertvalue { , , , , , , , } undef, %t0, 0 - %res1 = insertvalue { , , , , , , , } %res0, %t1, 1 - %res2 = insertvalue { , , , , , , , } %res1, %t2, 2 - %res3 = insertvalue { , , , , , , , } %res2, %t3, 3 - %res4 = insertvalue { , , , , , , , } %res3, %t4, 4 - %res5 = insertvalue { , , , , , , , } %res4, %t5, 5 - %res6 = insertvalue { , , , , , , , } %res5, %t6, 6 - %res7 = insertvalue { , , , , , , , } %res6, %t7, 7 - ret { , , , , , , , } %res7 -} diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll index 3751967f18aa4..f0cbf6a006919 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll @@ -261,20 +261,6 @@ define void @vector_interleave_store_factor4( %a, %a, %b, %c, %d, ptr %p) { -; CHECK-LABEL: vector_interleave_store_factor4_recursive: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; CHECK-NEXT: vsseg4e32.v v8, (a0) -; CHECK-NEXT: ret - %v0 = call @llvm.vector.interleave2.nxv8i32( %a, %c) - %v1 = call @llvm.vector.interleave2.nxv8i32( %b, %d) - %v2 = call @llvm.vector.interleave2.nxv16i32( %v0, %v1) - store %v2, ptr %p - ret void -} - define void @vector_interleave_store_factor5( %a, %b, %c, %d, %e, ptr %p) { ; CHECK-LABEL: vector_interleave_store_factor5: ; CHECK: # %bb.0: @@ -318,23 +304,3 @@ define void @vector_interleave_store_factor8( %a, %v, ptr %p ret void } - -; TODO: Remove once recursive interleaving support is removed -define void @vector_interleave_store_factor8_recursive( %a, %b, %c, %d, %e, %f, %g, %h, ptr %p) { -; CHECK-LABEL: 
vector_interleave_store_factor8_recursive: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; CHECK-NEXT: vsseg8e32.v v8, (a0) -; CHECK-NEXT: ret - %v0 = call @llvm.vector.interleave2.nxv4i32( %a, %e) - %v1 = call @llvm.vector.interleave2.nxv4i32( %c, %g) - %v2 = call @llvm.vector.interleave2.nxv8i32( %v0, %v1) - - %v3 = call @llvm.vector.interleave2.nxv4i32( %b, %f) - %v4 = call @llvm.vector.interleave2.nxv4i32( %d, %h) - %v5 = call @llvm.vector.interleave2.nxv8i32( %v3, %v4) - - %v6 = call @llvm.vector.interleave2.nxv16i32( %v2, %v5) - store %v6, ptr %p - ret void -} diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll index 142ee5256f9e7..4e21fcf85c2c8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll @@ -84,15 +84,11 @@ define {, , , @llvm.vp.load.nxv8i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) - %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) - %d0.0 = extractvalue { , } %d0, 0 - %d0.1 = extractvalue { , } %d0, 1 - %d1 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.0) - %t0 = extractvalue { , } %d1, 0 - %t2 = extractvalue { , } %d1, 1 - %d2 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.1) - %t1 = extractvalue { , } %d2, 0 - %t3 = extractvalue { , } %d2, 1 + %d = call { , , , } @llvm.vector.deinterleave4.nxv8i32( %wide.masked.load) + %t0 = extractvalue { , , , } %d, 0 + %t1 = extractvalue { , , , } %d, 1 + %t2 = extractvalue { , , , } %d, 2 + %t3 = extractvalue { , , , } %d, 3 %res0 = insertvalue { , , , } poison, %t0, 0 %res1 = insertvalue { , , , } %res0, %t1, 1 @@ -214,28 +210,15 @@ define {, , , @llvm.vp.load.nxv16i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) - %d0 = call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) - %d0.0 = extractvalue { , } %d0, 0 - %d0.1 = extractvalue { , } %d0, 1 - %d1 = call { , } @llvm.vector.deinterleave2.nxv8i32( %d0.0) - %d1.0 = extractvalue { , } %d1, 0 - %d1.1 = extractvalue { , } %d1, 1 - %d2 = call { , } @llvm.vector.deinterleave2.nxv8i32( %d0.1) - %d2.0 = extractvalue { , } %d2, 0 - %d2.1 = extractvalue { , } %d2, 1 - - %d3 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d1.0) - %t0 = extractvalue { , } %d3, 0 - %t4 = extractvalue { , } %d3, 1 - %d4 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d1.1) - %t2 = extractvalue { , } %d4, 0 - %t6 = extractvalue { , } %d4, 1 - %d5 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d2.0) - %t1 = extractvalue { , } %d5, 0 - %t5 = extractvalue { , } %d5, 1 - %d6 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d2.1) - %t3 = extractvalue { , } %d6, 0 - %t7 = extractvalue { , } %d6, 1 + %d = call { , , , , , , , } @llvm.vector.deinterleave8.nxv16i32( %wide.masked.load) + %t0 = extractvalue { , , , , , , , } %d, 0 + %t1 = extractvalue { , , , , , , , } %d, 1 + %t2 = extractvalue { , , , , , , , } %d, 2 + %t3 = extractvalue { , , , , , , , } %d, 3 + %t4 = extractvalue { , , , , , , , } %d, 4 + %t5 = extractvalue { , , , , , , , } %d, 5 + %t6 = extractvalue { , , , , , , , } %d, 6 + %t7 = extractvalue { , , , , , , , } %d, 7 %res0 = insertvalue { , , , , , , , } poison, %t0, 0 %res1 = insertvalue { , , , , , , , } %res0, %t1, 1 @@ -323,10 +306,8 @@ define void @store_factor4_v2( %v0, %v1, pt ; RV64-NEXT: vsseg4e32.v v8, (a0) ; RV64-NEXT: ret %rvl = mul i32 %evl, 8 - %interleaved.vec0 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) - %interleaved.vec1 = call 
@llvm.vector.interleave2.nxv2i32( %v1, %v1) - %interleaved.vec2 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec0, %interleaved.vec1) - call void @llvm.vp.store.nxv4i32.p0( %interleaved.vec2, ptr %ptr, splat (i1 true), i32 %rvl) + %interleaved.vec = call @llvm.vector.interleave4.nxv4i32( %v0, %v1, %v0, %v1) + call void @llvm.vp.store.nxv4i32.p0( %interleaved.vec, ptr %ptr, splat (i1 true), i32 %rvl) ret void } @@ -430,14 +411,8 @@ define void @store_factor8_v2( %v0, %v1, pt ; RV64-NEXT: vsseg8e32.v v8, (a0) ; RV64-NEXT: ret %rvl = mul i32 %evl, 8 - %interleaved.vec0 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) - %interleaved.vec1 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) - %interleaved.vec2 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec0, %interleaved.vec1) - %interleaved.vec3 = call @llvm.vector.interleave2.nxv2i32( %v1, %v1) - %interleaved.vec4 = call @llvm.vector.interleave2.nxv2i32( %v1, %v1) - %interleaved.vec5 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec3, %interleaved.vec4) - %interleaved.vec6 = call @llvm.vector.interleave2.nxv8i32( %interleaved.vec2, %interleaved.vec5) - call void @llvm.vp.store.nxv8i32.p0( %interleaved.vec6, ptr %ptr, splat (i1 true), i32 %rvl) + %interleaved.vec = call @llvm.vector.interleave8.nxv8i32( %v0, %v1, %v0, %v1, %v0, %v1, %v0, %v1) + call void @llvm.vp.store.nxv8i32.p0( %interleaved.vec, ptr %ptr, splat (i1 true), i32 %rvl) ret void } @@ -485,19 +460,13 @@ define {, , , @llvm.vector.interleave2.nxv4i1( %mask, %mask) - %interleaved.mask1 = call @llvm.vector.interleave2.nxv4i1( %mask, %mask) - %interleaved.mask2 = call @llvm.vector.interleave2.nxv8i1( %interleaved.mask0, %interleaved.mask1) - %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, %interleaved.mask2, i32 %rvl) - %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) - %d0.0 = extractvalue { , } %d0, 0 - %d0.1 = extractvalue { , } %d0, 1 - %d1 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.0) - %t0 = extractvalue { , } %d1, 0 - %t2 = extractvalue { , } %d1, 1 - %d2 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.1) - %t1 = extractvalue { , } %d2, 0 - %t3 = extractvalue { , } %d2, 1 + %interleaved.mask = call @llvm.vector.interleave4.nxv8i1( %mask, %mask, %mask, %mask) + %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, %interleaved.mask, i32 %rvl) + %d = call { , , , } @llvm.vector.deinterleave4.nxv8i32( %wide.masked.load) + %t0 = extractvalue { , , , } %d, 0 + %t1 = extractvalue { , , , } %d, 1 + %t2 = extractvalue { , , , } %d, 2 + %t3 = extractvalue { , , , } %d, 3 %res0 = insertvalue { , , , } poison, %t0, 0 %res1 = insertvalue { , , , } %res0, %t1, 1 @@ -677,181 +646,14 @@ define void @masked_store_factor4_v2( %mask, ; RV64-NEXT: vsseg4e32.v v8, (a0), v0.t ; RV64-NEXT: ret %rvl = mul i32 %evl, 4 - %interleaved.mask0 = call @llvm.vector.interleave2.nxv2i1( %mask, %mask) - %interleaved.mask1 = call @llvm.vector.interleave2.nxv2i1( %mask, %mask) - %interleaved.mask2 = call @llvm.vector.interleave2.nxv4i1( %interleaved.mask0, %interleaved.mask1) - %interleaved.vec0 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) - %interleaved.vec1 = call @llvm.vector.interleave2.nxv2i32( %v1, %v1) - %interleaved.vec2 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec0, %interleaved.vec1) - call void @llvm.vp.store.nxv4i32.p0( %interleaved.vec2, ptr %ptr, %interleaved.mask2, i32 %rvl) + %interleaved.mask = call @llvm.vector.interleave4.nxv4i1( %mask, %mask, %mask, %mask) + %interleaved.vec = call 
@llvm.vector.interleave4.nxv2i32( %v0, %v1, %v0, %v1) + call void @llvm.vp.store.nxv4i32.p0( %interleaved.vec, ptr %ptr, %interleaved.mask, i32 %rvl) ret void } ; Negative tests -; We should not transform this function because the deinterleave tree is not in a desired form. -define {, , , } @incorrect_extract_value_index(ptr %ptr, i32 %evl) { -; RV32-LABEL: incorrect_extract_value_index: -; RV32: # %bb.0: -; RV32-NEXT: slli a1, a1, 2 -; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma -; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32-NEXT: vnsrl.wi v12, v8, 0 -; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32-NEXT: vnsrl.wx v9, v12, a0 -; RV32-NEXT: vnsrl.wi v8, v12, 0 -; RV32-NEXT: vmv.v.v v10, v9 -; RV32-NEXT: vmv.v.v v11, v9 -; RV32-NEXT: ret -; -; RV64-LABEL: incorrect_extract_value_index: -; RV64: # %bb.0: -; RV64-NEXT: slli a1, a1, 34 -; RV64-NEXT: srli a1, a1, 32 -; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma -; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV64-NEXT: vnsrl.wi v12, v8, 0 -; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV64-NEXT: vnsrl.wx v9, v12, a0 -; RV64-NEXT: vnsrl.wi v8, v12, 0 -; RV64-NEXT: vmv.v.v v10, v9 -; RV64-NEXT: vmv.v.v v11, v9 -; RV64-NEXT: ret - %rvl = mul i32 %evl, 4 - %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) - %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) - %d0.0 = extractvalue { , } %d0, 0 - %d0.1 = extractvalue { , } %d0, 0 - %d1 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.0) - %t0 = extractvalue { , } %d1, 0 - %t2 = extractvalue { , } %d1, 1 - %d2 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.1) - %t1 = extractvalue { , } %d2, 1 - %t3 = extractvalue { , } %d2, 1 - - %res0 = insertvalue { , , , } poison, %t0, 0 - %res1 = insertvalue { , , , } %res0, %t1, 1 - %res2 = insertvalue { , , , } %res1, %t2, 2 - %res3 = insertvalue { , , , } %res2, %t3, 3 - ret { , , , } %res3 -} - -; We should not transform this function because the expression is not a balanced tree. 
-define {, , , } @not_balanced_load_tree(ptr %ptr, i32 %evl) { -; RV32-LABEL: not_balanced_load_tree: -; RV32: # %bb.0: -; RV32-NEXT: slli a1, a1, 2 -; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma -; RV32-NEXT: vle32.v v12, (a0) -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32-NEXT: vnsrl.wx v8, v12, a0 -; RV32-NEXT: vnsrl.wi v16, v12, 0 -; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32-NEXT: vnsrl.wi v10, v16, 0 -; RV32-NEXT: vnsrl.wx v11, v16, a0 -; RV32-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; RV32-NEXT: vnsrl.wx v12, v11, a0 -; RV32-NEXT: vnsrl.wi v11, v11, 0 -; RV32-NEXT: ret -; -; RV64-LABEL: not_balanced_load_tree: -; RV64: # %bb.0: -; RV64-NEXT: slli a1, a1, 34 -; RV64-NEXT: srli a1, a1, 32 -; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma -; RV64-NEXT: vle32.v v12, (a0) -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV64-NEXT: vnsrl.wx v8, v12, a0 -; RV64-NEXT: vnsrl.wi v16, v12, 0 -; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV64-NEXT: vnsrl.wi v10, v16, 0 -; RV64-NEXT: vnsrl.wx v11, v16, a0 -; RV64-NEXT: vsetvli a1, zero, e32, mf2, ta, ma -; RV64-NEXT: vnsrl.wx v12, v11, a0 -; RV64-NEXT: vnsrl.wi v11, v11, 0 -; RV64-NEXT: ret - %rvl = mul i32 %evl, 4 - %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) - %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) - %d0.0 = extractvalue { , } %d0, 0 - %t0 = extractvalue { , } %d0, 1 - %d1 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.0) - %t1 = extractvalue { , } %d1, 0 - %d1.1 = extractvalue { , } %d1, 1 - %d2 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d1.1) - %t2 = extractvalue { , } %d2, 0 - %t3 = extractvalue { , } %d2, 1 - - %res0 = insertvalue { , , , } poison, %t0, 0 - %res1 = insertvalue { , , , } %res0, %t1, 1 - %res2 = insertvalue { , , , } %res1, %t2, 2 - %res3 = insertvalue { , , , } %res2, %t3, 3 - ret { , , , } %res3 -} - -define void @not_balanced_store_tree( %v0, %v1, %v2, ptr %ptr, i32 %evl) { -; RV32-LABEL: not_balanced_store_tree: -; RV32: # %bb.0: -; RV32-NEXT: slli a1, a1, 2 -; RV32-NEXT: vsetvli a2, zero, e32, mf2, ta, ma -; RV32-NEXT: vwaddu.vv v12, v8, v8 -; RV32-NEXT: li a2, -1 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: vwmaccu.vx v12, a2, v8 -; RV32-NEXT: srli a3, a3, 3 -; RV32-NEXT: vsetvli a4, zero, e32, m1, ta, ma -; RV32-NEXT: vslidedown.vx v8, v12, a3 -; RV32-NEXT: add a4, a3, a3 -; RV32-NEXT: vsetvli zero, a4, e32, m1, ta, ma -; RV32-NEXT: vslideup.vx v12, v8, a3 -; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, ma -; RV32-NEXT: vwaddu.vv v16, v12, v9 -; RV32-NEXT: vwmaccu.vx v16, a2, v9 -; RV32-NEXT: vsetvli a3, zero, e32, m2, ta, ma -; RV32-NEXT: vwaddu.vv v12, v16, v10 -; RV32-NEXT: vwmaccu.vx v12, a2, v10 -; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma -; RV32-NEXT: vse32.v v12, (a0) -; RV32-NEXT: ret -; -; RV64-LABEL: not_balanced_store_tree: -; RV64: # %bb.0: -; RV64-NEXT: vsetvli a2, zero, e32, mf2, ta, ma -; RV64-NEXT: vwaddu.vv v12, v8, v8 -; RV64-NEXT: li a2, -1 -; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: slli a1, a1, 34 -; RV64-NEXT: vwmaccu.vx v12, a2, v8 -; RV64-NEXT: srli a3, a3, 3 -; RV64-NEXT: vsetvli a4, zero, e32, m1, ta, ma -; RV64-NEXT: vslidedown.vx v8, v12, a3 -; RV64-NEXT: add a4, a3, a3 -; RV64-NEXT: vsetvli zero, a4, e32, m1, ta, ma -; RV64-NEXT: vslideup.vx v12, v8, a3 -; RV64-NEXT: vsetvli a3, zero, e32, m1, ta, ma -; RV64-NEXT: vwaddu.vv v16, v12, v9 -; RV64-NEXT: vwmaccu.vx v16, a2, v9 -; RV64-NEXT: vsetvli a3, zero, e32, m2, ta, ma -; 
RV64-NEXT: vwaddu.vv v12, v16, v10 -; RV64-NEXT: vwmaccu.vx v12, a2, v10 -; RV64-NEXT: srli a1, a1, 32 -; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma -; RV64-NEXT: vse32.v v12, (a0) -; RV64-NEXT: ret - %rvl = mul i32 %evl, 4 - %interleaved.vec0 = call @llvm.vector.interleave2.nxv2i32( %v0, %v0) - %interleaved.vec1 = call @llvm.vector.interleave2.nxv2i32( %interleaved.vec0, %v1) - %interleaved.vec2 = call @llvm.vector.interleave2.nxv4i32( %interleaved.vec1, %v2) - call void @llvm.vp.store.nxv8i32.p0( %interleaved.vec2, ptr %ptr, splat (i1 true), i32 %rvl) - ret void -} - define {, } @not_same_mask( %mask0, %mask1, ptr %ptr, i32 %evl) { ; RV32-LABEL: not_same_mask: ; RV32: # %bb.0: @@ -943,48 +745,58 @@ define {, } @not_same_mask( define {, , , } @invalid_evl(ptr %ptr, i32 %evl) { ; RV32-LABEL: invalid_evl: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 2 +; RV32-NEXT: sub sp, sp, a2 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb ; RV32-NEXT: ori a1, a1, 1 ; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma ; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV32-NEXT: vnsrl.wx v12, v8, a0 -; RV32-NEXT: vnsrl.wi v14, v8, 0 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs4r.v v8, (a0) ; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV32-NEXT: vnsrl.wx v10, v14, a0 -; RV32-NEXT: vnsrl.wi v8, v14, 0 -; RV32-NEXT: vnsrl.wx v11, v12, a0 -; RV32-NEXT: vnsrl.wi v9, v12, 0 +; RV32-NEXT: vlseg4e32.v v8, (a0) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 2 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: .cfi_def_cfa sp, 16 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: invalid_evl: ; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 2 +; RV64-NEXT: sub sp, sp, a2 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb ; RV64-NEXT: ori a1, a1, 1 ; RV64-NEXT: slli a1, a1, 32 ; RV64-NEXT: srli a1, a1, 32 ; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma ; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; RV64-NEXT: vnsrl.wx v12, v8, a0 -; RV64-NEXT: vnsrl.wi v14, v8, 0 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs4r.v v8, (a0) ; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; RV64-NEXT: vnsrl.wx v10, v14, a0 -; RV64-NEXT: vnsrl.wi v8, v14, 0 -; RV64-NEXT: vnsrl.wx v11, v12, a0 -; RV64-NEXT: vnsrl.wi v9, v12, 0 +; RV64-NEXT: vlseg4e32.v v8, (a0) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 2 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: .cfi_def_cfa sp, 16 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: .cfi_def_cfa_offset 0 ; RV64-NEXT: ret %rvl = or i32 %evl, 1 %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) - %d0 = call { , } @llvm.vector.deinterleave2.nxv8i32( %wide.masked.load) - %d0.0 = extractvalue { , } %d0, 0 - %d0.1 = extractvalue { , } %d0, 1 - %d1 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.0) - %t0 = extractvalue { , } %d1, 0 - %t2 = extractvalue { , } %d1, 1 - %d2 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d0.1) - %t1 = extractvalue { , } %d2, 0 - %t3 = extractvalue { , } %d2, 1 + %d = call { , , , } 
@llvm.vector.deinterleave4.nxv8i32( %wide.masked.load) + %t0 = extractvalue { , , , } %d, 0 + %t1 = extractvalue { , , , } %d, 1 + %t2 = extractvalue { , , , } %d, 2 + %t3 = extractvalue { , , , } %d, 3 %res0 = insertvalue { , , , } poison, %t0, 0 %res1 = insertvalue { , , , } %res0, %t1, 1 diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll index 61a68692ff5b9..c565066541d1d 100644 --- a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll +++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll @@ -16,17 +16,13 @@ define void @deinterleave4(ptr %src) { ; %load = load , ptr %src, align 4 - %deinterleave_src = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %load) - %3 = extractvalue { , } %deinterleave_src, 0 - %4 = extractvalue { , } %deinterleave_src, 1 - %deinterleave_half1 = tail call { , } @llvm.vector.deinterleave2.nxv8i32( %3) - %5 = extractvalue { , } %deinterleave_half1, 0 - %6 = extractvalue { , } %deinterleave_half1, 1 - %deinterleave_half2 = tail call { , } @llvm.vector.deinterleave2.nxv8i32( %4) - %7 = extractvalue { , } %deinterleave_half2, 0 - %8 = extractvalue { , } %deinterleave_half2, 1 - %sum = add %5, %7 - %sub = sub %6, %8 + %deinterleave = tail call { , , , } @llvm.vector.deinterleave4.nxv16i32( %load) + %1 = extractvalue { , , , } %deinterleave, 0 + %2 = extractvalue { , , , } %deinterleave, 1 + %3 = extractvalue { , , , } %deinterleave, 2 + %4 = extractvalue { , , , } %deinterleave, 3 + %sum = add %1, %2 + %sub = sub %3, %4 ret void } @@ -58,17 +54,13 @@ define void @wide_deinterleave4(ptr %src) { ; CHECK-NEXT: ret void ; %load = load , ptr %src, align 4 - %deinterleave_src = tail call { , } @llvm.vector.deinterleave2.nxv32i32( %load) - %3 = extractvalue { , } %deinterleave_src, 0 - %4 = extractvalue { , } %deinterleave_src, 1 - %deinterleave_half1 = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %3) - %5 = extractvalue { , } %deinterleave_half1, 0 - %6 = extractvalue { , } %deinterleave_half1, 1 - %deinterleave_half2 = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %4) - %7 = extractvalue { , } %deinterleave_half2, 0 - %8 = extractvalue { , } %deinterleave_half2, 1 - %sum = add %5, %7 - %sub = sub %6, %8 + %deinterleave = tail call { , , , } @llvm.vector.deinterleave4.nxv32i32( %load) + %1 = extractvalue { , , , } %deinterleave, 0 + %2 = extractvalue { , , , } %deinterleave, 1 + %3 = extractvalue { , , , } %deinterleave, 2 + %4 = extractvalue { , , , } %deinterleave, 3 + %sum = add %1, %2 + %sub = sub %3, %4 ret void } @@ -87,52 +79,36 @@ define void @mix_deinterleave4_deinterleave2(ptr %src) { ; %load = load , ptr %src, align 4 - %deinterleave_src = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %load) - %3 = extractvalue { , } %deinterleave_src, 0 - %4 = extractvalue { , } %deinterleave_src, 1 - %deinterleave_half1 = tail call { , } @llvm.vector.deinterleave2.nxv8i32( %3) - %5 = extractvalue { , } %deinterleave_half1, 0 - %6 = extractvalue { , } %deinterleave_half1, 1 - %deinterleave_half2 = tail call { , } @llvm.vector.deinterleave2.nxv8i32( %4) - %7 = extractvalue { , } %deinterleave_half2, 0 - %8 = extractvalue { , } %deinterleave_half2, 1 + %deinterleave = tail call { , , , } @llvm.vector.deinterleave4.nxv16i32( %load) + %1 = extractvalue { , , , } %deinterleave, 0 + %2 = extractvalue { , , , } %deinterleave, 1 + %3 = extractvalue { , , , } %deinterleave, 2 + %4 = extractvalue { , , , } %deinterleave, 3 
%load2 = load , ptr %src, align 4 - %deinterleave_src2 = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %load2) - %ld2_1 = extractvalue { , } %deinterleave_src2, 0 - %ld2_2 = extractvalue { , } %deinterleave_src2, 1 + %deinterleave2 = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %load2) + %ld2_1 = extractvalue { , } %deinterleave2, 0 + %ld2_2 = extractvalue { , } %deinterleave2, 1 ret void } define void @negative_deinterleave4_test(ptr %src) { ; CHECK-LABEL: define void @negative_deinterleave4_test ; CHECK-SAME: (ptr [[SRC:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[SRC]], i64 0 -; CHECK-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv4i32( splat (i1 true), ptr [[TMP1]]) -; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[LDN]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP2]], i64 0) -; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[LDN]], 1 -; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP4]], i64 0) -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr , ptr [[SRC]], i64 2 -; CHECK-NEXT: [[LDN1:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv4i32( splat (i1 true), ptr [[TMP6]]) -; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[LDN1]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP3]], [[TMP7]], i64 4) -; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[LDN1]], 1 -; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP5]], [[TMP9]], i64 4) -; CHECK-NEXT: [[DEINTERLEAVE_HALF1:%.*]] = tail call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP8]]) -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[DEINTERLEAVE_HALF1]], 0 -; CHECK-NEXT: [[DEINTERLEAVE_HALF2:%.*]] = tail call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP10]]) -; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[DEINTERLEAVE_HALF2]], 1 +; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[SRC]], align 4 +; CHECK-NEXT: [[DEINTERLEAVE:%.*]] = tail call { , , , } @llvm.vector.deinterleave4.nxv16i32( [[LOAD]]) +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[DEINTERLEAVE]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[DEINTERLEAVE]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[DEINTERLEAVE]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[DEINTERLEAVE]], 2 ; CHECK-NEXT: ret void ; %load = load , ptr %src, align 4 - %deinterleave_src = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %load) - %3 = extractvalue { , } %deinterleave_src, 0 - %4 = extractvalue { , } %deinterleave_src, 1 - %deinterleave_half1 = tail call { , } @llvm.vector.deinterleave2.nxv8i32( %3) - %5 = extractvalue { , } %deinterleave_half1, 0 - %deinterleave_half2 = tail call { , } @llvm.vector.deinterleave2.nxv8i32( %4) - %6 = extractvalue { , } %deinterleave_half2, 1 + %deinterleave = tail call { , , , } @llvm.vector.deinterleave4.nxv16i32( %load) + %1 = extractvalue { , , , } %deinterleave, 0 + %2 = extractvalue { , , , } %deinterleave, 1 + %3 = extractvalue { , , , } %deinterleave, 2 + %4 = extractvalue { , , , } %deinterleave, 2 ret void } diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll index 085089978d8f5..a61db6577d56d 100644 --- a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll +++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll @@ -8,9 +8,7 @@ define void @interleave4(ptr %dst, %a, %b, ; CHECK-NEXT: call void 
@llvm.aarch64.sve.st4.nxv4i32( [[A]], [[B]], [[C]], [[D]], splat (i1 true), ptr [[DST]]) ; CHECK-NEXT: ret void ; - %interleaved.half1 = tail call @llvm.vector.interleave2.nxv8i32( %a, %c) - %interleaved.half2 = tail call @llvm.vector.interleave2.nxv8i32( %b, %d) - %interleaved.vec = tail call @llvm.vector.interleave2.nxv16i32( %interleaved.half1, %interleaved.half2) + %interleaved.vec = tail call @llvm.vector.interleave4.nxv16i32( %a, %b, %c, %d) store %interleaved.vec, ptr %dst, align 4 ret void } @@ -32,9 +30,7 @@ define void @wide_interleave4(ptr %dst, %a, [[TMP7]], [[TMP8]], [[TMP9]], [[TMP10]], splat (i1 true), ptr [[TMP6]]) ; CHECK-NEXT: ret void ; - %interleaved.half1 = tail call @llvm.vector.interleave2.nxv16i32( %a, %c) - %interleaved.half2 = tail call @llvm.vector.interleave2.nxv16i32( %b, %d) - %interleaved.vec = tail call @llvm.vector.interleave2.nxv32i32( %interleaved.half1, %interleaved.half2) + %interleaved.vec = tail call @llvm.vector.interleave4.nxv32i32( %a, %b, %c, %d) store %interleaved.vec, ptr %dst, align 4 ret void } @@ -46,9 +42,7 @@ define void @mix_interleave4_interleave2(ptr %dst1, ptr %dst2, [[A]], [[C]], splat (i1 true), ptr [[DST2]]) ; CHECK-NEXT: ret void ; - %interleaved.half1 = tail call @llvm.vector.interleave2.nxv8i32( %a, %c) - %interleaved.half2 = tail call @llvm.vector.interleave2.nxv8i32( %b, %d) - %interleaved.vec = tail call @llvm.vector.interleave2.nxv16i32( %interleaved.half1, %interleaved.half2) + %interleaved.vec = tail call @llvm.vector.interleave4.nxv16i32( %a, %b, %c, %d) store %interleaved.vec, ptr %dst1, align 4 %interleaved = tail call @llvm.vector.interleave2.nxv8i32( %a, %c) @@ -64,8 +58,7 @@ define void @duplicate_by_interleave( %A, % ; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32( [[A]], [[A]], [[B]], [[B]], splat (i1 true), ptr [[AB_DUPLICATE]]) ; CHECK-NEXT: ret void ; - %interleave = tail call @llvm.vector.interleave2.nxv8i32( %A, %B) - %duplicate_by_interleave = tail call @llvm.vector.interleave2.nxv16i32( %interleave, %interleave) - store %duplicate_by_interleave, ptr %AB_duplicate, align 4 + %interleave = tail call @llvm.vector.interleave4.nxv16i32( %A, %A, %B, %B) + store %interleave, ptr %AB_duplicate, align 4 ret void } diff --git a/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll index 87b16d17aa5f0..72c1f22032bb7 100644 --- a/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll +++ b/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll @@ -175,53 +175,6 @@ define void @load_factor4_vscale(ptr %ptr) { ret void } -; TODO: Remove once recursive deinterleaving support is removed -define void @load_factor4_vscale_recursive(ptr %ptr) { -; RV32-LABEL: @load_factor4_vscale_recursive( -; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t.p0.i32(target("riscv.vector.tuple", , 4) poison, ptr [[PTR:%.*]], i32 -1, i32 5) -; RV32-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 0) -; RV32-NEXT: [[TMP3:%.*]] = insertvalue { , , , } poison, [[TMP2]], 0 -; RV32-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 1) -; RV32-NEXT: [[TMP5:%.*]] = insertvalue { , , , } [[TMP3]], [[TMP4]], 1 -; RV32-NEXT: [[TMP6:%.*]] = call 
@llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 2) -; RV32-NEXT: [[TMP7:%.*]] = insertvalue { , , , } [[TMP5]], [[TMP6]], 2 -; RV32-NEXT: [[TMP8:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 3) -; RV32-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP7]], [[TMP8]], 3 -; RV32-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP9]], 0 -; RV32-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP9]], 1 -; RV32-NEXT: [[TMP12:%.*]] = extractvalue { , , , } [[TMP9]], 2 -; RV32-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP9]], 3 -; RV32-NEXT: ret void -; -; RV64-LABEL: @load_factor4_vscale_recursive( -; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv16i8_4t.p0.i64(target("riscv.vector.tuple", , 4) poison, ptr [[PTR:%.*]], i64 -1, i64 5) -; RV64-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 0) -; RV64-NEXT: [[TMP3:%.*]] = insertvalue { , , , } poison, [[TMP2]], 0 -; RV64-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 1) -; RV64-NEXT: [[TMP5:%.*]] = insertvalue { , , , } [[TMP3]], [[TMP4]], 1 -; RV64-NEXT: [[TMP6:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 2) -; RV64-NEXT: [[TMP7:%.*]] = insertvalue { , , , } [[TMP5]], [[TMP6]], 2 -; RV64-NEXT: [[TMP8:%.*]] = call @llvm.riscv.tuple.extract.nxv4i32.triscv.vector.tuple_nxv16i8_4t(target("riscv.vector.tuple", , 4) [[TMP1]], i32 3) -; RV64-NEXT: [[TMP9:%.*]] = insertvalue { , , , } [[TMP7]], [[TMP8]], 3 -; RV64-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[TMP9]], 0 -; RV64-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[TMP9]], 1 -; RV64-NEXT: [[TMP12:%.*]] = extractvalue { , , , } [[TMP9]], 2 -; RV64-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[TMP9]], 3 -; RV64-NEXT: ret void -; - %interleaved.vec = load , ptr %ptr - %d0 = call { , } @llvm.vector.deinterleave2.nxv16i32( %interleaved.vec) - %d0.0 = extractvalue { , } %d0, 0 - %d0.1 = extractvalue { , } %d0, 1 - %d1 = call { , } @llvm.vector.deinterleave2.nxv8i32( %d0.0) - %t0 = extractvalue { , } %d1, 0 - %t1 = extractvalue { , } %d1, 1 - %d2 = call { , } @llvm.vector.deinterleave2.nxv8i32( %d0.1) - %t2 = extractvalue { , } %d2, 0 - %t3 = extractvalue { , } %d2, 1 - ret void -} - define void @load_factor5(ptr %ptr) { ; RV32-LABEL: @load_factor5( ; RV32-NEXT: [[TMP1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.riscv.seg5.load.mask.v4i32.p0.i32(ptr [[PTR:%.*]], <4 x i1> splat (i1 true), i32 4) @@ -590,91 +543,6 @@ define void @load_factor8_vscale(ptr %ptr) { ret void } -; TODO: Remove once recursive deinterleaving support is removed -define void @load_factor8_vscale_recursive(ptr %ptr) { -; RV32-LABEL: @load_factor8_vscale_recursive( -; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t.p0.i32(target("riscv.vector.tuple", , 8) poison, ptr [[PTR:%.*]], i32 -1, i32 5) -; RV32-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 0) -; RV32-NEXT: [[TMP3:%.*]] = insertvalue { , , , , , , , } poison, [[TMP2]], 0 -; RV32-NEXT: [[TMP4:%.*]] = call 
@llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 1) -; RV32-NEXT: [[TMP5:%.*]] = insertvalue { , , , , , , , } [[TMP3]], [[TMP4]], 1 -; RV32-NEXT: [[TMP6:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 2) -; RV32-NEXT: [[TMP7:%.*]] = insertvalue { , , , , , , , } [[TMP5]], [[TMP6]], 2 -; RV32-NEXT: [[TMP8:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 3) -; RV32-NEXT: [[TMP9:%.*]] = insertvalue { , , , , , , , } [[TMP7]], [[TMP8]], 3 -; RV32-NEXT: [[TMP10:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 4) -; RV32-NEXT: [[TMP11:%.*]] = insertvalue { , , , , , , , } [[TMP9]], [[TMP10]], 4 -; RV32-NEXT: [[TMP12:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 5) -; RV32-NEXT: [[TMP13:%.*]] = insertvalue { , , , , , , , } [[TMP11]], [[TMP12]], 5 -; RV32-NEXT: [[TMP14:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 6) -; RV32-NEXT: [[TMP15:%.*]] = insertvalue { , , , , , , , } [[TMP13]], [[TMP14]], 6 -; RV32-NEXT: [[TMP16:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 7) -; RV32-NEXT: [[TMP17:%.*]] = insertvalue { , , , , , , , } [[TMP15]], [[TMP16]], 7 -; RV32-NEXT: [[TMP18:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 0 -; RV32-NEXT: [[TMP19:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 1 -; RV32-NEXT: [[TMP20:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 2 -; RV32-NEXT: [[TMP21:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 3 -; RV32-NEXT: [[TMP22:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 4 -; RV32-NEXT: [[TMP23:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 5 -; RV32-NEXT: [[TMP24:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 6 -; RV32-NEXT: [[TMP25:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 7 -; RV32-NEXT: ret void -; -; RV64-LABEL: @load_factor8_vscale_recursive( -; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv8i8_8t.p0.i64(target("riscv.vector.tuple", , 8) poison, ptr [[PTR:%.*]], i64 -1, i64 5) -; RV64-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 0) -; RV64-NEXT: [[TMP3:%.*]] = insertvalue { , , , , , , , } poison, [[TMP2]], 0 -; RV64-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 1) -; RV64-NEXT: [[TMP5:%.*]] = insertvalue { , , , , , , , } [[TMP3]], [[TMP4]], 1 -; RV64-NEXT: [[TMP6:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 2) -; RV64-NEXT: [[TMP7:%.*]] = insertvalue { , , , , , , , } [[TMP5]], [[TMP6]], 2 -; RV64-NEXT: [[TMP8:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 3) -; RV64-NEXT: [[TMP9:%.*]] = insertvalue { , , , , , , , } [[TMP7]], [[TMP8]], 3 -; RV64-NEXT: [[TMP10:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 4) -; RV64-NEXT: [[TMP11:%.*]] = 
insertvalue { , , , , , , , } [[TMP9]], [[TMP10]], 4 -; RV64-NEXT: [[TMP12:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 5) -; RV64-NEXT: [[TMP13:%.*]] = insertvalue { , , , , , , , } [[TMP11]], [[TMP12]], 5 -; RV64-NEXT: [[TMP14:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 6) -; RV64-NEXT: [[TMP15:%.*]] = insertvalue { , , , , , , , } [[TMP13]], [[TMP14]], 6 -; RV64-NEXT: [[TMP16:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_8t(target("riscv.vector.tuple", , 8) [[TMP1]], i32 7) -; RV64-NEXT: [[TMP17:%.*]] = insertvalue { , , , , , , , } [[TMP15]], [[TMP16]], 7 -; RV64-NEXT: [[TMP18:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 0 -; RV64-NEXT: [[TMP19:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 1 -; RV64-NEXT: [[TMP20:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 2 -; RV64-NEXT: [[TMP21:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 3 -; RV64-NEXT: [[TMP22:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 4 -; RV64-NEXT: [[TMP23:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 5 -; RV64-NEXT: [[TMP24:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 6 -; RV64-NEXT: [[TMP25:%.*]] = extractvalue { , , , , , , , } [[TMP17]], 7 -; RV64-NEXT: ret void -; - %interleaved.vec = load , ptr %ptr - %d0 = call { , } @llvm.vector.deinterleave2.nxv16i32( %interleaved.vec) - %d0.0 = extractvalue { , } %d0, 0 - %d0.1 = extractvalue { , } %d0, 1 - - %d1 = call { , } @llvm.vector.deinterleave2.nxv8i32( %d0.0) - %d1.0 = extractvalue { , } %d1, 0 - %d1.1 = extractvalue { , } %d1, 1 - %d2 = call { , } @llvm.vector.deinterleave2.nxv8i32( %d0.1) - %d2.0 = extractvalue { , } %d2, 0 - %d2.1 = extractvalue { , } %d2, 1 - - %d3 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d1.0) - %t0 = extractvalue { , } %d3, 0 - %t1 = extractvalue { , } %d3, 1 - %d4 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d1.1) - %t2 = extractvalue { , } %d4, 0 - %t3 = extractvalue { , } %d4, 1 - %d5 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d2.0) - %t4 = extractvalue { , } %d5, 0 - %t5 = extractvalue { , } %d5, 1 - %d6 = call { , } @llvm.vector.deinterleave2.nxv4i32( %d2.1) - %t6 = extractvalue { , } %d6, 0 - %t7 = extractvalue { , } %d6, 1 - ret void -} - define void @store_factor2(ptr %ptr, <8 x i8> %v0, <8 x i8> %v1) { ; RV32-LABEL: @store_factor2( @@ -808,31 +676,6 @@ define void @store_factor4_vscale(ptr %ptr, %v0, %v0, %v1) { -; RV32-LABEL: @store_factor4_vscale_recursive( -; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) poison, [[V0:%.*]], i32 0) -; RV32-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP1]], [[V0]], i32 1) -; RV32-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP2]], [[V1:%.*]], i32 2) -; RV32-NEXT: [[TMP4:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP3]], [[V1]], i32 3) -; RV32-NEXT: call void @llvm.riscv.vsseg4.triscv.vector.tuple_nxv8i8_4t.p0.i32(target("riscv.vector.tuple", , 4) [[TMP4]], ptr [[PTR:%.*]], i32 -1, i32 3) -; RV32-NEXT: ret void -; -; RV64-LABEL: 
@store_factor4_vscale_recursive( -; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) poison, [[V0:%.*]], i32 0) -; RV64-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP1]], [[V0]], i32 1) -; RV64-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP2]], [[V1:%.*]], i32 2) -; RV64-NEXT: [[TMP4:%.*]] = call target("riscv.vector.tuple", , 4) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) [[TMP3]], [[V1]], i32 3) -; RV64-NEXT: call void @llvm.riscv.vsseg4.triscv.vector.tuple_nxv8i8_4t.p0.i64(target("riscv.vector.tuple", , 4) [[TMP4]], ptr [[PTR:%.*]], i64 -1, i64 3) -; RV64-NEXT: ret void -; - %i0 = call @llvm.vector.interleave2.nxv8i8( %v0, %v1) - %i1 = call @llvm.vector.interleave2.nxv8i8( %v0, %v1) - %i2 = call @llvm.vector.interleave2.nxv16i8( %i0, %i1) - store %i2, ptr %ptr, align 4 - ret void -} - define void @store_factor5_vscale(ptr %ptr, %v0, %v1, %v2, %v3, %v4) { ; RV32-LABEL: @store_factor5_vscale( ; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_5t.nxv8i8(target("riscv.vector.tuple", , 5) poison, [[V0:%.*]], i32 0) @@ -1013,45 +856,6 @@ define void @store_factor8_vscale(ptr %ptr, %v0, %v0, %v1, %v2, %v3) { -; RV32-LABEL: @store_factor8_vscale_recursive( -; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) poison, [[V0:%.*]], i32 0) -; RV32-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP1]], [[V2:%.*]], i32 1) -; RV32-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP2]], [[V0]], i32 2) -; RV32-NEXT: [[TMP4:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP3]], [[V2]], i32 3) -; RV32-NEXT: [[TMP5:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP4]], [[V1:%.*]], i32 4) -; RV32-NEXT: [[TMP6:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP5]], [[V3:%.*]], i32 5) -; RV32-NEXT: [[TMP7:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP6]], [[V1]], i32 6) -; RV32-NEXT: [[TMP8:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP7]], [[V3]], i32 7) -; RV32-NEXT: call void @llvm.riscv.vsseg8.triscv.vector.tuple_nxv8i8_8t.p0.i32(target("riscv.vector.tuple", , 8) [[TMP8]], ptr [[PTR:%.*]], i32 -1, i32 3) -; RV32-NEXT: ret void -; -; RV64-LABEL: @store_factor8_vscale_recursive( -; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) 
poison, [[V0:%.*]], i32 0)
-; RV64-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP1]], [[V2:%.*]], i32 1)
-; RV64-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP2]], [[V0]], i32 2)
-; RV64-NEXT: [[TMP4:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP3]], [[V2]], i32 3)
-; RV64-NEXT: [[TMP5:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP4]], [[V1:%.*]], i32 4)
-; RV64-NEXT: [[TMP6:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP5]], [[V3:%.*]], i32 5)
-; RV64-NEXT: [[TMP7:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP6]], [[V1]], i32 6)
-; RV64-NEXT: [[TMP8:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) [[TMP7]], [[V3]], i32 7)
-; RV64-NEXT: call void @llvm.riscv.vsseg8.triscv.vector.tuple_nxv8i8_8t.p0.i64(target("riscv.vector.tuple", , 8) [[TMP8]], ptr [[PTR:%.*]], i64 -1, i64 3)
-; RV64-NEXT: ret void
-;
- %i0 = call @llvm.vector.interleave2.nxv8i8( %v0, %v1)
- %i1 = call @llvm.vector.interleave2.nxv8i8( %v0, %v1)
- %i2 = call @llvm.vector.interleave2.nxv16i8( %i0, %i1)
-
- %i3 = call @llvm.vector.interleave2.nxv8i8( %v2, %v3)
- %i4 = call @llvm.vector.interleave2.nxv8i8( %v2, %v3)
- %i5 = call @llvm.vector.interleave2.nxv16i8( %i3, %i4)
-
- %i6 = call @llvm.vector.interleave2.nxv32i8( %i2, %i5)
- store %i6, ptr %ptr, align 4
- ret void
-}
-
 define void @load_factor2_fp128(ptr %ptr) {
 ; RV32-LABEL: @load_factor2_fp128(
 ; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = load <4 x fp128>, ptr [[PTR:%.*]], align 16

From 83e1cd76fa14b66e67d0c5057d59770c9d230385 Mon Sep 17 00:00:00 2001
From: Luke Lau
Date: Wed, 25 Jun 2025 11:18:40 +0100
Subject: [PATCH 2/2] Update test

---
 .../RISCV/rvv/vp-vector-interleaved-access.ll | 94 +++++++++----------
 1 file changed, 44 insertions(+), 50 deletions(-)

diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
index 4e21fcf85c2c8..35f01f608b56e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
@@ -536,38 +536,37 @@ define i32 @masked_load_store_factor2_v2_shared_mask_extract( %
 ; RV32: # %bb.0:
 ; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma
 ; RV32-NEXT: vmv1r.v v8, v0
-; RV32-NEXT: slli a2, a1, 1
 ; RV32-NEXT: vmv.v.i v9, 0
-; RV32-NEXT: li a1, -1
+; RV32-NEXT: li a2, -1
 ; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
 ; RV32-NEXT: vmv.v.i v10, 0
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
+; RV32-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
 ; RV32-NEXT: vmerge.vim v11, v9, 1, v0
-; RV32-NEXT: srli a3, a3, 2
 ; RV32-NEXT: vwaddu.vv v12, v11, v11
-; RV32-NEXT: vwmaccu.vx v12, a1, v11
+; RV32-NEXT: vwmaccu.vx v12, a2, v11
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: srli a2, a2, 2
 ; RV32-NEXT: vmsne.vi v0, v12, 0
-; RV32-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
-; RV32-NEXT: vslidedown.vx v11, v12, a3
+; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; RV32-NEXT: vslidedown.vx v11, v12, a2
 ; RV32-NEXT: vmerge.vim v10, v10, 1, v0
-; RV32-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
+; RV32-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
 ; RV32-NEXT: vmsne.vi v0, v11, 0
-; RV32-NEXT: add a1, a3, a3
+; RV32-NEXT: slli a3, a1, 1
 ; RV32-NEXT: vmerge.vim v9, v9, 1, v0
-; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; RV32-NEXT: vslideup.vx v10, v9, a3
-; RV32-NEXT: vsetvli zero, a2, e8, mf2, ta, ma
+; RV32-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; RV32-NEXT: vslideup.vx v10, v9, a2
+; RV32-NEXT: vsetvli zero, a3, e8, mf2, ta, ma
 ; RV32-NEXT: vmsne.vi v0, v10, 0
 ; RV32-NEXT: vle32.v v10, (a0), v0.t
 ; RV32-NEXT: li a1, 32
-; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, ma
+; RV32-NEXT: vsetvli a2, zero, e32, m1, ta, ma
 ; RV32-NEXT: vnsrl.wx v13, v10, a1
 ; RV32-NEXT: vmv.x.s a1, v10
 ; RV32-NEXT: vnsrl.wi v12, v10, 0
-; RV32-NEXT: srli a2, a2, 1
+; RV32-NEXT: srli a3, a3, 1
 ; RV32-NEXT: vmv1r.v v0, v8
-; RV32-NEXT: vsetvli zero, a2, e32, m1, ta, ma
+; RV32-NEXT: vsetvli zero, a3, e32, m1, ta, ma
 ; RV32-NEXT: vsseg2e32.v v12, (a0), v0.t
 ; RV32-NEXT: mv a0, a1
 ; RV32-NEXT: ret
@@ -580,26 +579,24 @@ define i32 @masked_load_store_factor2_v2_shared_mask_extract( %
 ; RV64-NEXT: li a2, -1
 ; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
 ; RV64-NEXT: vmv.v.i v10, 0
-; RV64-NEXT: csrr a3, vlenb
-; RV64-NEXT: slli a4, a1, 33
-; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
+; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
 ; RV64-NEXT: vmerge.vim v11, v9, 1, v0
-; RV64-NEXT: srli a3, a3, 2
 ; RV64-NEXT: vwaddu.vv v12, v11, v11
 ; RV64-NEXT: vwmaccu.vx v12, a2, v11
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: srli a2, a2, 2
 ; RV64-NEXT: vmsne.vi v0, v12, 0
-; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
-; RV64-NEXT: vslidedown.vx v11, v12, a3
+; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; RV64-NEXT: vslidedown.vx v11, v12, a2
 ; RV64-NEXT: vmerge.vim v10, v10, 1, v0
-; RV64-NEXT: vsetvli a1, zero, e8, mf4, ta, ma
+; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
 ; RV64-NEXT: vmsne.vi v0, v11, 0
-; RV64-NEXT: add a1, a3, a3
+; RV64-NEXT: slli a3, a1, 33
 ; RV64-NEXT: vmerge.vim v9, v9, 1, v0
-; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; RV64-NEXT: vslideup.vx v10, v9, a3
 ; RV64-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; RV64-NEXT: vslideup.vx v10, v9, a2
 ; RV64-NEXT: vmsne.vi v0, v10, 0
-; RV64-NEXT: srli a1, a4, 32
+; RV64-NEXT: srli a1, a3, 32
 ; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma
 ; RV64-NEXT: vle32.v v10, (a0), v0.t
 ; RV64-NEXT: li a1, 32
@@ -607,9 +604,9 @@ define i32 @masked_load_store_factor2_v2_shared_mask_extract( %
 ; RV64-NEXT: vnsrl.wx v13, v10, a1
 ; RV64-NEXT: vmv.x.s a1, v10
 ; RV64-NEXT: vnsrl.wi v12, v10, 0
-; RV64-NEXT: srli a4, a4, 33
+; RV64-NEXT: srli a3, a3, 33
 ; RV64-NEXT: vmv1r.v v0, v8
-; RV64-NEXT: vsetvli zero, a4, e32, m1, ta, ma
+; RV64-NEXT: vsetvli zero, a3, e32, m1, ta, ma
 ; RV64-NEXT: vsseg2e32.v v12, (a0), v0.t
 ; RV64-NEXT: mv a0, a1
 ; RV64-NEXT: ret
@@ -660,29 +657,28 @@ define {, } @not_same_mask(
 ; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma
 ; RV32-NEXT: vmv1r.v v9, v0
 ; RV32-NEXT: vmv1r.v v0, v8
-; RV32-NEXT: slli a1, a1, 1
 ; RV32-NEXT: vmv.v.i v8, 0
 ; RV32-NEXT: li a2, -1
 ; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
 ; RV32-NEXT: vmv.v.i v10, 0
-; RV32-NEXT: csrr a3, vlenb
-; RV32-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
+; RV32-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
 ; RV32-NEXT: vmerge.vim v11, v8, 1, v0
 ; RV32-NEXT: vmv1r.v v0, v9
 ; RV32-NEXT: vmerge.vim v9, v8, 1, v0
-; RV32-NEXT: srli a3, a3, 2
 ; RV32-NEXT: vwaddu.vv v12, v9, v11
 ; RV32-NEXT: vwmaccu.vx v12, a2, v11
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: srli a2, a2, 2
 ; RV32-NEXT: vmsne.vi v0, v12, 0
-; RV32-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
-; RV32-NEXT: vslidedown.vx v9, v12, a3
+; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; RV32-NEXT: vslidedown.vx v9, v12, a2
 ; RV32-NEXT: vmerge.vim v10, v10, 1, v0
-; RV32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma
+; RV32-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
 ; RV32-NEXT: vmsne.vi v0, v9, 0
-; RV32-NEXT: add a2, a3, a3
+; RV32-NEXT: slli a1, a1, 1
 ; RV32-NEXT: vmerge.vim v8, v8, 1, v0
-; RV32-NEXT: vsetvli zero, a2, e8, mf2, ta, ma
-; RV32-NEXT: vslideup.vx v10, v8, a3
+; RV32-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; RV32-NEXT: vslideup.vx v10, v8, a2
 ; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
 ; RV32-NEXT: vmsne.vi v0, v10, 0
 ; RV32-NEXT: vle32.v v10, (a0), v0.t
@@ -701,26 +697,24 @@ define {, } @not_same_mask(
 ; RV64-NEXT: li a2, -1
 ; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
 ; RV64-NEXT: vmv.v.i v10, 0
-; RV64-NEXT: csrr a3, vlenb
-; RV64-NEXT: slli a1, a1, 33
-; RV64-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
+; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
 ; RV64-NEXT: vmerge.vim v11, v8, 1, v0
 ; RV64-NEXT: vmv1r.v v0, v9
 ; RV64-NEXT: vmerge.vim v9, v8, 1, v0
-; RV64-NEXT: srli a3, a3, 2
 ; RV64-NEXT: vwaddu.vv v12, v9, v11
 ; RV64-NEXT: vwmaccu.vx v12, a2, v11
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: srli a2, a2, 2
 ; RV64-NEXT: vmsne.vi v0, v12, 0
-; RV64-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
-; RV64-NEXT: vslidedown.vx v9, v12, a3
+; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; RV64-NEXT: vslidedown.vx v9, v12, a2
 ; RV64-NEXT: vmerge.vim v10, v10, 1, v0
-; RV64-NEXT: vsetvli a2, zero, e8, mf4, ta, ma
+; RV64-NEXT: vsetvli a3, zero, e8, mf4, ta, ma
 ; RV64-NEXT: vmsne.vi v0, v9, 0
-; RV64-NEXT: add a2, a3, a3
+; RV64-NEXT: slli a1, a1, 33
 ; RV64-NEXT: vmerge.vim v8, v8, 1, v0
-; RV64-NEXT: vsetvli zero, a2, e8, mf2, ta, ma
-; RV64-NEXT: vslideup.vx v10, v8, a3
-; RV64-NEXT: vsetvli a2, zero, e8, mf2, ta, ma
+; RV64-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; RV64-NEXT: vslideup.vx v10, v8, a2
 ; RV64-NEXT: vmsne.vi v0, v10, 0
 ; RV64-NEXT: srli a1, a1, 32
 ; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma