From 69389a32fec23647acce371d2e5bc3740c0dcd6b Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Fri, 11 Apr 2025 11:45:19 -0700 Subject: [PATCH 01/12] [IA][RISCV] Add support for vp.load/vp.store with shufflevector --- llvm/include/llvm/CodeGen/TargetLowering.h | 9 +- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 177 +++++-- .../Target/AArch64/AArch64ISelLowering.cpp | 12 +- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 4 +- llvm/lib/Target/ARM/ARMISelLowering.cpp | 12 +- llvm/lib/Target/ARM/ARMISelLowering.h | 4 +- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 79 +++- llvm/lib/Target/RISCV/RISCVISelLowering.h | 4 +- llvm/lib/Target/X86/X86ISelLowering.h | 4 +- llvm/lib/Target/X86/X86InterleavedAccess.cpp | 12 +- .../rvv/fixed-vectors-interleaved-access.ll | 443 ++++++++++++++++-- 11 files changed, 661 insertions(+), 99 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 4f2f202f94841..5407bf8b2ba13 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3157,11 +3157,11 @@ class TargetLoweringBase { /// Lower an interleaved load to target specific intrinsics. Return /// true on success. /// - /// \p LI is the vector load instruction. + /// \p LoadOp is a vector load or vp.load instruction. /// \p Shuffles is the shufflevector list to DE-interleave the loaded vector. /// \p Indices is the corresponding indices for each shufflevector. /// \p Factor is the interleave factor. - virtual bool lowerInterleavedLoad(LoadInst *LI, + virtual bool lowerInterleavedLoad(Instruction *LoadOp, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const { @@ -3171,10 +3171,11 @@ class TargetLoweringBase { /// Lower an interleaved store to target specific intrinsics. Return /// true on success. /// - /// \p SI is the vector store instruction. + /// \p StoreOp is a vector store or vp.store instruction. /// \p SVI is the shufflevector to RE-interleave the stored vector. /// \p Factor is the interleave factor. - virtual bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, + virtual bool lowerInterleavedStore(Instruction *StoreOp, + ShuffleVectorInst *SVI, unsigned Factor) const { return false; } diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 9e47510e9cd1a..83bde96cc725a 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -45,6 +45,7 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" @@ -100,11 +101,11 @@ class InterleavedAccessImpl { unsigned MaxFactor = 0u; /// Transform an interleaved load into target specific intrinsics. - bool lowerInterleavedLoad(LoadInst *LI, + bool lowerInterleavedLoad(Instruction *LoadOp, SmallSetVector &DeadInsts); /// Transform an interleaved store into target specific intrinsics. - bool lowerInterleavedStore(StoreInst *SI, + bool lowerInterleavedStore(Instruction *StoreOp, SmallSetVector &DeadInsts); /// Transform a load and a deinterleave intrinsic into target specific @@ -131,7 +132,7 @@ class InterleavedAccessImpl { /// made. 
bool replaceBinOpShuffles(ArrayRef BinOpShuffles, SmallVectorImpl &Shuffles, - LoadInst *LI); + Instruction *LI); }; class InterleavedAccess : public FunctionPass { @@ -250,10 +251,23 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor, } bool InterleavedAccessImpl::lowerInterleavedLoad( - LoadInst *LI, SmallSetVector &DeadInsts) { - if (!LI->isSimple() || isa(LI->getType())) + Instruction *LoadOp, SmallSetVector &DeadInsts) { + if (isa(LoadOp->getType())) return false; + if (auto *LI = dyn_cast(LoadOp)) { + if (!LI->isSimple()) + return false; + } else if (auto *VPLoad = dyn_cast(LoadOp)) { + assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load); + // Require a constant mask and evl. + if (!isa(VPLoad->getArgOperand(1)) || + !isa(VPLoad->getArgOperand(2))) + return false; + } else { + llvm_unreachable("unsupported load operation"); + } + // Check if all users of this load are shufflevectors. If we encounter any // users that are extractelement instructions or binary operators, we save // them to later check if they can be modified to extract from one of the @@ -265,7 +279,7 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( // binop are the same load. SmallSetVector BinOpShuffles; - for (auto *User : LI->users()) { + for (auto *User : LoadOp->users()) { auto *Extract = dyn_cast(User); if (Extract && isa(Extract->getIndexOperand())) { Extracts.push_back(Extract); @@ -294,13 +308,31 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( unsigned Factor, Index; unsigned NumLoadElements = - cast(LI->getType())->getNumElements(); + cast(LoadOp->getType())->getNumElements(); auto *FirstSVI = Shuffles.size() > 0 ? Shuffles[0] : BinOpShuffles[0]; // Check if the first shufflevector is DE-interleave shuffle. if (!isDeInterleaveMask(FirstSVI->getShuffleMask(), Factor, Index, MaxFactor, NumLoadElements)) return false; + // If this is a vp.load, record its mask (NOT shuffle mask). + BitVector MaskedIndices(NumLoadElements); + if (auto *VPLoad = dyn_cast(LoadOp)) { + auto *Mask = cast(VPLoad->getArgOperand(1)); + assert(cast(Mask->getType())->getNumElements() == + NumLoadElements); + if (auto *Splat = Mask->getSplatValue()) { + // All-zeros mask, bail out early. + if (Splat->isZeroValue()) + return false; + } else { + for (unsigned i = 0U; i < NumLoadElements; ++i) { + if (Mask->getAggregateElement(i)->isZeroValue()) + MaskedIndices.set(i); + } + } + } + // Holds the corresponding index for each DE-interleave shuffle. SmallVector Indices; @@ -327,9 +359,9 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( assert(Shuffle->getShuffleMask().size() <= NumLoadElements); - if (cast(Shuffle->getOperand(0))->getOperand(0) == LI) + if (cast(Shuffle->getOperand(0))->getOperand(0) == LoadOp) Indices.push_back(Index); - if (cast(Shuffle->getOperand(0))->getOperand(1) == LI) + if (cast(Shuffle->getOperand(0))->getOperand(1) == LoadOp) Indices.push_back(Index); } @@ -339,25 +371,61 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( return false; bool BinOpShuffleChanged = - replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, LI); + replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, LoadOp); + + // Check if we extract only the unmasked elements. 
+ if (MaskedIndices.any()) { + if (any_of(Shuffles, [&](const auto *Shuffle) { + ArrayRef ShuffleMask = Shuffle->getShuffleMask(); + for (int Idx : ShuffleMask) { + if (Idx < 0) + continue; + if (MaskedIndices.test(unsigned(Idx))) + return true; + } + return false; + })) { + LLVM_DEBUG(dbgs() << "IA: trying to extract a masked element through " + << "shufflevector\n"); + return false; + } + } + // Check if we extract only the elements within evl. + if (auto *VPLoad = dyn_cast(LoadOp)) { + uint64_t EVL = cast(VPLoad->getArgOperand(2))->getZExtValue(); + if (any_of(Shuffles, [&](const auto *Shuffle) { + ArrayRef ShuffleMask = Shuffle->getShuffleMask(); + for (int Idx : ShuffleMask) { + if (Idx < 0) + continue; + if (unsigned(Idx) >= EVL) + return true; + } + return false; + })) { + LLVM_DEBUG( + dbgs() << "IA: trying to extract an element out of EVL range\n"); + return false; + } + } - LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LI << "\n"); + LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LoadOp << "\n"); // Try to create target specific intrinsics to replace the load and shuffles. - if (!TLI->lowerInterleavedLoad(LI, Shuffles, Indices, Factor)) { + if (!TLI->lowerInterleavedLoad(LoadOp, Shuffles, Indices, Factor)) { // If Extracts is not empty, tryReplaceExtracts made changes earlier. return !Extracts.empty() || BinOpShuffleChanged; } DeadInsts.insert_range(Shuffles); - DeadInsts.insert(LI); + DeadInsts.insert(LoadOp); return true; } bool InterleavedAccessImpl::replaceBinOpShuffles( ArrayRef BinOpShuffles, - SmallVectorImpl &Shuffles, LoadInst *LI) { + SmallVectorImpl &Shuffles, Instruction *LoadOp) { for (auto *SVI : BinOpShuffles) { BinaryOperator *BI = cast(SVI->getOperand(0)); Type *BIOp0Ty = BI->getOperand(0)->getType(); @@ -380,9 +448,9 @@ bool InterleavedAccessImpl::replaceBinOpShuffles( << "\n With : " << *NewSVI1 << "\n And : " << *NewSVI2 << "\n And : " << *NewBI << "\n"); RecursivelyDeleteTriviallyDeadInstructions(SVI); - if (NewSVI1->getOperand(0) == LI) + if (NewSVI1->getOperand(0) == LoadOp) Shuffles.push_back(NewSVI1); - if (NewSVI2->getOperand(0) == LI) + if (NewSVI2->getOperand(0) == LoadOp) Shuffles.push_back(NewSVI2); } @@ -454,27 +522,79 @@ bool InterleavedAccessImpl::tryReplaceExtracts( } bool InterleavedAccessImpl::lowerInterleavedStore( - StoreInst *SI, SmallSetVector &DeadInsts) { - if (!SI->isSimple()) - return false; + Instruction *StoreOp, SmallSetVector &DeadInsts) { + Value *StoredValue; + if (auto *SI = dyn_cast(StoreOp)) { + if (!SI->isSimple()) + return false; + StoredValue = SI->getValueOperand(); + } else if (auto *VPStore = dyn_cast(StoreOp)) { + assert(VPStore->getIntrinsicID() == Intrinsic::vp_store); + // Require a constant mask and evl. + if (!isa(VPStore->getArgOperand(2)) || + !isa(VPStore->getArgOperand(3))) + return false; + StoredValue = VPStore->getArgOperand(0); + } else { + llvm_unreachable("unsupported store operation"); + } - auto *SVI = dyn_cast(SI->getValueOperand()); + auto *SVI = dyn_cast(StoredValue); if (!SVI || !SVI->hasOneUse() || isa(SVI->getType())) return false; + unsigned NumStoredElements = + cast(SVI->getType())->getNumElements(); + // If this is a vp.store, record its mask (NOT shuffle mask). + BitVector MaskedIndices(NumStoredElements); + if (auto *VPStore = dyn_cast(StoreOp)) { + auto *Mask = cast(VPStore->getArgOperand(2)); + assert(cast(Mask->getType())->getNumElements() == + NumStoredElements); + if (auto *Splat = Mask->getSplatValue()) { + // All-zeros mask, bail out early. 
+ if (Splat->isZeroValue()) + return false; + } else { + for (unsigned i = 0U; i < NumStoredElements; ++i) { + if (Mask->getAggregateElement(i)->isZeroValue()) + MaskedIndices.set(i); + } + } + } + // Check if the shufflevector is RE-interleave shuffle. unsigned Factor; if (!isReInterleaveMask(SVI, Factor, MaxFactor)) return false; - LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *SI << "\n"); + // Check if we store only the unmasked elements. + if (MaskedIndices.any()) { + if (any_of(SVI->getShuffleMask(), [&](int Idx) { + return Idx >= 0 && MaskedIndices.test(unsigned(Idx)); + })) { + LLVM_DEBUG(dbgs() << "IA: trying to store a masked element\n"); + return false; + } + } + // Check if we store only the elements within evl. + if (auto *VPStore = dyn_cast(StoreOp)) { + uint64_t EVL = cast(VPStore->getArgOperand(3))->getZExtValue(); + if (any_of(SVI->getShuffleMask(), + [&](int Idx) { return Idx >= 0 && unsigned(Idx) >= EVL; })) { + LLVM_DEBUG(dbgs() << "IA: trying to store an element out of EVL range\n"); + return false; + } + } + + LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *StoreOp << "\n"); // Try to create target specific intrinsics to replace the store and shuffle. - if (!TLI->lowerInterleavedStore(SI, SVI, Factor)) + if (!TLI->lowerInterleavedStore(StoreOp, SVI, Factor)) return false; // Already have a new target specific interleaved store. Erase the old store. - DeadInsts.insert(SI); + DeadInsts.insert(StoreOp); DeadInsts.insert(SVI); return true; } @@ -766,12 +886,15 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) { SmallSetVector DeadInsts; bool Changed = false; + using namespace PatternMatch; for (auto &I : instructions(F)) { - if (auto *LI = dyn_cast(&I)) - Changed |= lowerInterleavedLoad(LI, DeadInsts); + if (match(&I, m_CombineOr(m_Load(m_Value()), + m_Intrinsic()))) + Changed |= lowerInterleavedLoad(&I, DeadInsts); - if (auto *SI = dyn_cast(&I)) - Changed |= lowerInterleavedStore(SI, DeadInsts); + if (match(&I, m_CombineOr(m_Store(m_Value(), m_Value()), + m_Intrinsic()))) + Changed |= lowerInterleavedStore(&I, DeadInsts); if (auto *II = dyn_cast(&I)) { // At present, we only have intrinsics to represent (de)interleaving diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index e366d7cb54490..d74cc3161684d 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -17176,7 +17176,7 @@ static Function *getStructuredStoreFunction(Module *M, unsigned Factor, /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 bool AArch64TargetLowering::lowerInterleavedLoad( - LoadInst *LI, ArrayRef Shuffles, + Instruction *LoadOp, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); @@ -17184,6 +17184,10 @@ bool AArch64TargetLowering::lowerInterleavedLoad( assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); + auto *LI = dyn_cast(LoadOp); + if (!LI) + return false; + const DataLayout &DL = LI->getDataLayout(); VectorType *VTy = Shuffles[0]->getType(); @@ -17359,13 +17363,17 @@ bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) { /// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35> /// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19> /// call void 
llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr) -bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, +bool AArch64TargetLowering::lowerInterleavedStore(Instruction *StoreOp, ShuffleVectorInst *SVI, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); + auto *SI = dyn_cast(StoreOp); + if (!SI) + return false; + auto *VecTy = cast(SVI->getType()); assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store"); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 0d51ef2be8631..34446abb1474c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -716,11 +716,11 @@ class AArch64TargetLowering : public TargetLowering { unsigned getMaxSupportedInterleaveFactor() const override { return 4; } - bool lowerInterleavedLoad(LoadInst *LI, + bool lowerInterleavedLoad(Instruction *LoadOp, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const override; - bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, + bool lowerInterleavedStore(Instruction *StoreOp, ShuffleVectorInst *SVI, unsigned Factor) const override; bool lowerDeinterleaveIntrinsicToLoad( diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 2290ac2728c6d..64d12a0eb1d9b 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21608,7 +21608,7 @@ unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const { /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1 bool ARMTargetLowering::lowerInterleavedLoad( - LoadInst *LI, ArrayRef Shuffles, + Instruction *LoadOp, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); @@ -21616,6 +21616,10 @@ bool ARMTargetLowering::lowerInterleavedLoad( assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); + auto *LI = dyn_cast(LoadOp); + if (!LI) + return false; + auto *VecTy = cast(Shuffles[0]->getType()); Type *EltTy = VecTy->getElementType(); @@ -21750,12 +21754,16 @@ bool ARMTargetLowering::lowerInterleavedLoad( /// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35> /// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19> /// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4) -bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, +bool ARMTargetLowering::lowerInterleavedStore(Instruction *StoreOp, ShuffleVectorInst *SVI, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); + auto *SI = dyn_cast(StoreOp); + if (!SI) + return false; + auto *VecTy = cast(SVI->getType()); assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store"); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 9fad056edd3f1..635a6cd226936 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -673,11 +673,11 @@ class VectorType; unsigned getMaxSupportedInterleaveFactor() const override; - bool lowerInterleavedLoad(LoadInst *LI, + bool lowerInterleavedLoad(Instruction *LoadOp, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const override; - bool 
lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, + bool lowerInterleavedStore(Instruction *StoreOp, ShuffleVectorInst *SVI, unsigned Factor) const override; bool shouldInsertFencesForAtomic(const Instruction *I) const override; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index f7d192756fd56..9558783963500 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -23376,19 +23376,36 @@ static const Intrinsic::ID FixedVlsegIntrIds[] = { /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 bool RISCVTargetLowering::lowerInterleavedLoad( - LoadInst *LI, ArrayRef Shuffles, + Instruction *LoadOp, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const { assert(Indices.size() == Shuffles.size()); - IRBuilder<> Builder(LI); + const DataLayout &DL = LoadOp->getDataLayout(); + IRBuilder<> Builder(LoadOp); auto *VTy = cast(Shuffles[0]->getType()); - if (!isLegalInterleavedAccessType(VTy, Factor, LI->getAlign(), - LI->getPointerAddressSpace(), - LI->getDataLayout())) + + Align PtrAlignment; + unsigned PtrAddrSpace; + Value *BaseAddr; + if (auto *LI = dyn_cast(LoadOp)) { + BaseAddr = LI->getPointerOperand(); + PtrAlignment = LI->getAlign(); + PtrAddrSpace = LI->getPointerAddressSpace(); + } else { + auto *VPLoad = cast(LoadOp); + assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load); + BaseAddr = VPLoad->getArgOperand(0); + PtrAlignment = VPLoad->getParamAlign(0).value_or( + DL.getABITypeAlign(VTy->getElementType())); + PtrAddrSpace = cast(BaseAddr->getType())->getAddressSpace(); + } + + if (!isLegalInterleavedAccessType(VTy, Factor, PtrAlignment, PtrAddrSpace, + DL)) return false; - auto *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen()); + auto *XLenTy = Type::getIntNTy(LoadOp->getContext(), Subtarget.getXLen()); // If the segment load is going to be performed segment at a time anyways // and there's only one element used, use a strided load instead. 
This @@ -23397,7 +23414,7 @@ bool RISCVTargetLowering::lowerInterleavedLoad( unsigned ScalarSizeInBytes = VTy->getScalarSizeInBits() / 8; Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes); Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes); - Value *BasePtr = Builder.CreatePtrAdd(LI->getPointerOperand(), Offset); + Value *BasePtr = Builder.CreatePtrAdd(BaseAddr, Offset); Value *Mask = Builder.getAllOnesMask(VTy->getElementCount()); Value *VL = Builder.getInt32(VTy->getNumElements()); @@ -23406,16 +23423,16 @@ bool RISCVTargetLowering::lowerInterleavedLoad( {VTy, BasePtr->getType(), Stride->getType()}, {BasePtr, Stride, Mask, VL}); CI->addParamAttr( - 0, Attribute::getWithAlignment(CI->getContext(), LI->getAlign())); + 0, Attribute::getWithAlignment(CI->getContext(), PtrAlignment)); Shuffles[0]->replaceAllUsesWith(CI); return true; }; Value *VL = ConstantInt::get(XLenTy, VTy->getNumElements()); - CallInst *VlsegN = Builder.CreateIntrinsic( - FixedVlsegIntrIds[Factor - 2], {VTy, LI->getPointerOperandType(), XLenTy}, - {LI->getPointerOperand(), VL}); + CallInst *VlsegN = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2], + {VTy, BaseAddr->getType(), XLenTy}, + {BaseAddr, VL}); for (unsigned i = 0; i < Shuffles.size(); i++) { Value *SubVec = Builder.CreateExtractValue(VlsegN, Indices[i]); @@ -23447,21 +23464,39 @@ static const Intrinsic::ID FixedVssegIntrIds[] = { /// /// Note that the new shufflevectors will be removed and we'll only generate one /// vsseg3 instruction in CodeGen. -bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI, +bool RISCVTargetLowering::lowerInterleavedStore(Instruction *StoreOp, ShuffleVectorInst *SVI, unsigned Factor) const { - IRBuilder<> Builder(SI); + const DataLayout &DL = StoreOp->getDataLayout(); + IRBuilder<> Builder(StoreOp); + auto Mask = SVI->getShuffleMask(); auto *ShuffleVTy = cast(SVI->getType()); // Given SVI : , then VTy : auto *VTy = FixedVectorType::get(ShuffleVTy->getElementType(), ShuffleVTy->getNumElements() / Factor); - if (!isLegalInterleavedAccessType(VTy, Factor, SI->getAlign(), - SI->getPointerAddressSpace(), - SI->getDataLayout())) + + Align PtrAlignment; + unsigned PtrAddrSpace; + Value *BaseAddr; + if (auto *SI = dyn_cast(StoreOp)) { + BaseAddr = SI->getPointerOperand(); + PtrAlignment = SI->getAlign(); + PtrAddrSpace = SI->getPointerAddressSpace(); + } else { + auto *VPStore = cast(StoreOp); + assert(VPStore->getIntrinsicID() == Intrinsic::vp_store); + BaseAddr = VPStore->getArgOperand(1); + PtrAlignment = VPStore->getParamAlign(1).value_or( + DL.getABITypeAlign(VTy->getElementType())); + PtrAddrSpace = cast(BaseAddr->getType())->getAddressSpace(); + } + + if (!isLegalInterleavedAccessType(VTy, Factor, PtrAlignment, PtrAddrSpace, + DL)) return false; - auto *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen()); + auto *XLenTy = Type::getIntNTy(StoreOp->getContext(), Subtarget.getXLen()); unsigned Index; // If the segment store only has one active lane (i.e. 
the interleave is @@ -23474,7 +23509,7 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI, auto *DataVTy = cast(Data->getType()); Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes); Value *Offset = ConstantInt::get(XLenTy, Index * ScalarSizeInBytes); - Value *BasePtr = Builder.CreatePtrAdd(SI->getPointerOperand(), Offset); + Value *BasePtr = Builder.CreatePtrAdd(BaseAddr, Offset); Value *Mask = Builder.getAllOnesMask(DataVTy->getElementCount()); Value *VL = Builder.getInt32(VTy->getNumElements()); @@ -23483,14 +23518,14 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI, {Data->getType(), BasePtr->getType(), Stride->getType()}, {Data, BasePtr, Stride, Mask, VL}); CI->addParamAttr( - 1, Attribute::getWithAlignment(CI->getContext(), SI->getAlign())); + 1, Attribute::getWithAlignment(CI->getContext(), PtrAlignment)); return true; } Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( - SI->getModule(), FixedVssegIntrIds[Factor - 2], - {VTy, SI->getPointerOperandType(), XLenTy}); + StoreOp->getModule(), FixedVssegIntrIds[Factor - 2], + {VTy, BaseAddr->getType(), XLenTy}); SmallVector Ops; SmallVector NewShuffleMask; @@ -23510,7 +23545,7 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI, // potentially under larger LMULs) because we checked that the fixed vector // type fits in isLegalInterleavedAccessType Value *VL = ConstantInt::get(XLenTy, VTy->getNumElements()); - Ops.append({SI->getPointerOperand(), VL}); + Ops.append({BaseAddr, VL}); Builder.CreateCall(VssegNFunc, Ops); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 5ebdbbd51f2b1..5df510c08e0da 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -903,12 +903,12 @@ class RISCVTargetLowering : public TargetLowering { bool fallBackToDAGISel(const Instruction &Inst) const override; - bool lowerInterleavedLoad(LoadInst *LI, + bool lowerInterleavedLoad(Instruction *LoadOp, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const override; - bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, + bool lowerInterleavedStore(Instruction *StoreOp, ShuffleVectorInst *SVI, unsigned Factor) const override; bool lowerDeinterleaveIntrinsicToLoad( diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 4a2b35e9efe7c..da5baaf1bf5af 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1650,14 +1650,14 @@ namespace llvm { /// Lower interleaved load(s) into target specific /// instructions/intrinsics. - bool lowerInterleavedLoad(LoadInst *LI, + bool lowerInterleavedLoad(Instruction *LoadOp, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const override; /// Lower interleaved store(s) into target specific /// instructions/intrinsics. - bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, + bool lowerInterleavedStore(Instruction *StoreOp, ShuffleVectorInst *SVI, unsigned Factor) const override; SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp index 1eb47e3b2cd18..99fbc249a1179 100644 --- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp +++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp @@ -801,7 +801,7 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() { // number of shuffles and ISA. 
// Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX. bool X86TargetLowering::lowerInterleavedLoad( - LoadInst *LI, ArrayRef Shuffles, + Instruction *LoadOp, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); @@ -809,6 +809,10 @@ bool X86TargetLowering::lowerInterleavedLoad( assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); + auto *LI = dyn_cast(LoadOp); + if (!LI) + return false; + // Create an interleaved access group. IRBuilder<> Builder(LI); X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget, @@ -817,7 +821,7 @@ bool X86TargetLowering::lowerInterleavedLoad( return Grp.isSupported() && Grp.lowerIntoOptimizedSequence(); } -bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI, +bool X86TargetLowering::lowerInterleavedStore(Instruction *StoreOp, ShuffleVectorInst *SVI, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && @@ -827,6 +831,10 @@ bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI, 0 && "Invalid interleaved store"); + auto *SI = dyn_cast(StoreOp); + if (!SI) + return false; + // Holds the indices of SVI that correspond to the starting index of each // interleaved shuffle. auto Mask = SVI->getShuffleMask(); diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index f6bdd45330384..dc7ca2317ec9a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -176,6 +176,164 @@ define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i ret {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res7 } +define {<4 x i32>, <4 x i32>} @vpload_factor2(ptr %ptr) { +; CHECK-LABEL: vpload_factor2: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vlseg2e32.v v8, (a0) +; CHECK-NEXT: ret + %interleaved.vec = tail call <8 x i32> @llvm.vp.load.v8i32.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 8) + %v0 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <4 x i32> + %v1 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + ret {<4 x i32>, <4 x i32>} %res1 +} + + +define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3(ptr %ptr) { +; CHECK-LABEL: vpload_factor3: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vlseg3e32.v v8, (a0) +; CHECK-NEXT: ret + %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> splat (i1 true), i32 12) + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 +} + +; Load a larger vector but only deinterleave a subset of the elements. 
+define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_v16i32(ptr %ptr) { +; CHECK-LABEL: vpload_factor3_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vlseg3e32.v v8, (a0) +; CHECK-NEXT: ret + %interleaved.vec = tail call <16 x i32> @llvm.vp.load.v16i32.p0(ptr %ptr, <16 x i1> , i32 12) + %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> + %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> + %v2 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 +} + +define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor4(ptr %ptr) { +; CHECK-LABEL: vpload_factor4: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vlseg4e32.v v8, (a0) +; CHECK-NEXT: ret + %interleaved.vec = tail call <16 x i32> @llvm.vp.load.v16i32.p0(ptr %ptr, <16 x i1> splat (i1 true), i32 16) + %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> + %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> + %v2 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> + %v3 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + %res3 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res2, <4 x i32> %v3, 3 + ret {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res3 +} + +define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor5(ptr %ptr) { +; CHECK-LABEL: vpload_factor5: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vlseg5e32.v v8, (a0) +; CHECK-NEXT: ret + %interleaved.vec = tail call <20 x i32> @llvm.vp.load.v20i32.p0(ptr %ptr, <20 x i1> splat (i1 true), i32 20) + %v0 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %v1 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %v2 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %v3 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %v4 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + %res3 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res2, <4 x i32> %v3, 3 + %res4 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res3, <4 x i32> %v4, 4 + ret {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res4 +} + +define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vpload_factor6(ptr %ptr) { +; CHECK-LABEL: vpload_factor6: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vlseg6e16.v v8, (a0) +; CHECK-NEXT: 
ret + %interleaved.vec = tail call <12 x i16> @llvm.vp.load.v12i16.p0(ptr %ptr, <12 x i1> splat (i1 true), i32 12) + %v0 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> + %v1 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> + %v2 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> + %v3 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> + %v4 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> + %v5 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> + %res0 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} undef, <2 x i16> %v0, 0 + %res1 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res0, <2 x i16> %v1, 1 + %res2 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res1, <2 x i16> %v2, 2 + %res3 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res2, <2 x i16> %v3, 3 + %res4 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res3, <2 x i16> %v4, 4 + %res5 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res4, <2 x i16> %v5, 5 + ret {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res5 +} + +define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vpload_factor7(ptr %ptr) { +; CHECK-LABEL: vpload_factor7: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vlseg7e16.v v8, (a0) +; CHECK-NEXT: ret + %interleaved.vec = tail call <14 x i16> @llvm.vp.load.v14i16.p0(ptr %ptr, <14 x i1> splat (i1 true), i32 14) + %v0 = shufflevector <14 x i16> %interleaved.vec, <14 x i16> poison, <2 x i32> + %v1 = shufflevector <14 x i16> %interleaved.vec, <14 x i16> poison, <2 x i32> + %v2 = shufflevector <14 x i16> %interleaved.vec, <14 x i16> poison, <2 x i32> + %v3 = shufflevector <14 x i16> %interleaved.vec, <14 x i16> poison, <2 x i32> + %v4 = shufflevector <14 x i16> %interleaved.vec, <14 x i16> poison, <2 x i32> + %v5 = shufflevector <14 x i16> %interleaved.vec, <14 x i16> poison, <2 x i32> + %v6 = shufflevector <14 x i16> %interleaved.vec, <14 x i16> poison, <2 x i32> + %res0 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} undef, <2 x i16> %v0, 0 + %res1 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res0, <2 x i16> %v1, 1 + %res2 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res1, <2 x i16> %v2, 2 + %res3 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res2, <2 x i16> %v3, 3 + %res4 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res3, <2 x i16> %v4, 4 + %res5 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res4, <2 x i16> %v5, 5 + %res6 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res5, <2 x i16> %v6, 6 + ret {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res6 +} + +define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vpload_factor8(ptr %ptr) { +; CHECK-LABEL: vpload_factor8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vlseg8e16.v v8, (a0) +; CHECK-NEXT: ret + 
%interleaved.vec = tail call <16 x i16> @llvm.vp.load.v16i16.p0(ptr %ptr, <16 x i1> splat (i1 true), i32 16) + %v0 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> + %v1 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> + %v2 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> + %v3 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> + %v4 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> + %v5 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> + %v6 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> + %v7 = shufflevector <16 x i16> %interleaved.vec, <16 x i16> poison, <2 x i32> + %res0 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} undef, <2 x i16> %v0, 0 + %res1 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res0, <2 x i16> %v1, 1 + %res2 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res1, <2 x i16> %v2, 2 + %res3 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res2, <2 x i16> %v3, 3 + %res4 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res3, <2 x i16> %v4, 4 + %res5 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res4, <2 x i16> %v5, 5 + %res6 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res5, <2 x i16> %v6, 6 + %res7 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res6, <2 x i16> %v7, 7 + ret {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res7 +} + ; LMUL * NF is > 8 here and so shouldn't be lowered to a vlseg define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_factor6_too_big(ptr %ptr) { ; RV32-LABEL: load_factor6_too_big: @@ -192,8 +350,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: li a2, 32 ; RV32-NEXT: lui a3, 12 ; RV32-NEXT: lui a6, 12291 -; RV32-NEXT: lui a7, %hi(.LCPI8_0) -; RV32-NEXT: addi a7, a7, %lo(.LCPI8_0) +; RV32-NEXT: lui a7, %hi(.LCPI16_0) +; RV32-NEXT: addi a7, a7, %lo(.LCPI16_0) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vle32.v v24, (a5) ; RV32-NEXT: vmv.s.x v0, a3 @@ -278,12 +436,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill ; RV32-NEXT: lui a7, 49164 -; RV32-NEXT: lui a1, %hi(.LCPI8_1) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_1) +; RV32-NEXT: lui a1, %hi(.LCPI16_1) +; RV32-NEXT: addi a1, a1, %lo(.LCPI16_1) ; RV32-NEXT: lui t2, 3 ; RV32-NEXT: lui t1, 196656 -; RV32-NEXT: lui a4, %hi(.LCPI8_3) -; RV32-NEXT: addi a4, a4, %lo(.LCPI8_3) +; RV32-NEXT: lui a4, %hi(.LCPI16_3) +; RV32-NEXT: addi a4, a4, %lo(.LCPI16_3) ; RV32-NEXT: lui t0, 786624 ; RV32-NEXT: li a5, 48 ; RV32-NEXT: lui a6, 768 @@ -462,8 +620,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV32-NEXT: vrgatherei16.vv v24, v8, v2 -; RV32-NEXT: lui a1, %hi(.LCPI8_2) -; RV32-NEXT: addi a1, a1, 
%lo(.LCPI8_2) +; RV32-NEXT: lui a1, %hi(.LCPI16_2) +; RV32-NEXT: addi a1, a1, %lo(.LCPI16_2) ; RV32-NEXT: lui a3, 3073 ; RV32-NEXT: addi a3, a3, -1024 ; RV32-NEXT: vmv.s.x v0, a3 @@ -527,16 +685,16 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vrgatherei16.vv v28, v8, v3 ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v28, v24 -; RV32-NEXT: lui a1, %hi(.LCPI8_4) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_4) -; RV32-NEXT: lui a2, %hi(.LCPI8_5) -; RV32-NEXT: addi a2, a2, %lo(.LCPI8_5) +; RV32-NEXT: lui a1, %hi(.LCPI16_4) +; RV32-NEXT: addi a1, a1, %lo(.LCPI16_4) +; RV32-NEXT: lui a2, %hi(.LCPI16_5) +; RV32-NEXT: addi a2, a2, %lo(.LCPI16_5) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v24, (a2) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV32-NEXT: vle16.v v8, (a1) -; RV32-NEXT: lui a1, %hi(.LCPI8_7) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_7) +; RV32-NEXT: lui a1, %hi(.LCPI16_7) +; RV32-NEXT: addi a1, a1, %lo(.LCPI16_7) ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle16.v v10, (a1) ; RV32-NEXT: csrr a1, vlenb @@ -564,14 +722,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vrgatherei16.vv v16, v0, v10 -; RV32-NEXT: lui a1, %hi(.LCPI8_6) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_6) -; RV32-NEXT: lui a2, %hi(.LCPI8_8) -; RV32-NEXT: addi a2, a2, %lo(.LCPI8_8) +; RV32-NEXT: lui a1, %hi(.LCPI16_6) +; RV32-NEXT: addi a1, a1, %lo(.LCPI16_6) +; RV32-NEXT: lui a2, %hi(.LCPI16_8) +; RV32-NEXT: addi a2, a2, %lo(.LCPI16_8) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV32-NEXT: vle16.v v4, (a1) -; RV32-NEXT: lui a1, %hi(.LCPI8_9) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_9) +; RV32-NEXT: lui a1, %hi(.LCPI16_9) +; RV32-NEXT: addi a1, a1, %lo(.LCPI16_9) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v6, (a1) ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma @@ -658,8 +816,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: li a4, 128 ; RV64-NEXT: lui a1, 1 ; RV64-NEXT: vle64.v v8, (a3) -; RV64-NEXT: lui a3, %hi(.LCPI8_0) -; RV64-NEXT: addi a3, a3, %lo(.LCPI8_0) +; RV64-NEXT: lui a3, %hi(.LCPI16_0) +; RV64-NEXT: addi a3, a3, %lo(.LCPI16_0) ; RV64-NEXT: vmv.s.x v0, a4 ; RV64-NEXT: csrr a4, vlenb ; RV64-NEXT: li a5, 61 @@ -847,8 +1005,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vslideup.vi v12, v16, 1, v0.t -; RV64-NEXT: lui a2, %hi(.LCPI8_1) -; RV64-NEXT: addi a2, a2, %lo(.LCPI8_1) +; RV64-NEXT: lui a2, %hi(.LCPI16_1) +; RV64-NEXT: addi a2, a2, %lo(.LCPI16_1) ; RV64-NEXT: li a3, 192 ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vle16.v v6, (a2) @@ -882,8 +1040,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vrgatherei16.vv v24, v16, v6 ; RV64-NEXT: addi a2, sp, 16 ; RV64-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill -; RV64-NEXT: lui a2, %hi(.LCPI8_2) -; RV64-NEXT: addi a2, a2, %lo(.LCPI8_2) +; RV64-NEXT: lui a2, %hi(.LCPI16_2) +; RV64-NEXT: addi a2, a2, %lo(.LCPI16_2) ; RV64-NEXT: li a3, 1040 ; RV64-NEXT: vmv.s.x v0, a3 ; RV64-NEXT: addi a1, a1, -2016 @@ -967,12 +1125,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x 
i64>} @load_ ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill -; RV64-NEXT: lui a1, %hi(.LCPI8_3) -; RV64-NEXT: addi a1, a1, %lo(.LCPI8_3) +; RV64-NEXT: lui a1, %hi(.LCPI16_3) +; RV64-NEXT: addi a1, a1, %lo(.LCPI16_3) ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vle16.v v20, (a1) -; RV64-NEXT: lui a1, %hi(.LCPI8_4) -; RV64-NEXT: addi a1, a1, %lo(.LCPI8_4) +; RV64-NEXT: lui a1, %hi(.LCPI16_4) +; RV64-NEXT: addi a1, a1, %lo(.LCPI16_4) ; RV64-NEXT: vle16.v v8, (a1) ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 77 @@ -1023,8 +1181,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vl2r.v v8, (a1) # vscale x 16-byte Folded Reload ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vrgatherei16.vv v0, v16, v8 -; RV64-NEXT: lui a1, %hi(.LCPI8_5) -; RV64-NEXT: addi a1, a1, %lo(.LCPI8_5) +; RV64-NEXT: lui a1, %hi(.LCPI16_5) +; RV64-NEXT: addi a1, a1, %lo(.LCPI16_5) ; RV64-NEXT: vle16.v v20, (a1) ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 61 @@ -1196,6 +1354,73 @@ define void @store_factor6(ptr %ptr, <2 x i16> %v0, <2 x i16> %v1, <2 x i16> %v2 ret void } +define void @vpstore_factor2(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1) { +; CHECK-LABEL: vpstore_factor2: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsseg2e32.v v8, (a0) +; CHECK-NEXT: ret + %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> + tail call void @llvm.vp.store.v8i32.p0(<8 x i32> %interleaved.vec, ptr %ptr, <8 x i1> splat (i1 true), i32 8) + ret void +} + +define void @vpstore_factor3(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) { +; CHECK-LABEL: vpstore_factor3: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsseg3e32.v v8, (a0) +; CHECK-NEXT: ret + %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> + %s1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <8 x i32> + %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <12 x i32> + tail call void @llvm.vp.store.v12i32.p0(<12 x i32> %interleaved.vec, ptr %ptr, <12 x i1> splat (i1 true), i32 12) + ret void +} + +define void @vpstore_factor4(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) { +; CHECK-LABEL: vpstore_factor4: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsseg4e32.v v8, (a0) +; CHECK-NEXT: ret + %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> + %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> + %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <16 x i32> + tail call void @llvm.vp.store.v16i32.p0(<16 x i32> %interleaved.vec, ptr %ptr, <16 x i1> splat (i1 true), i32 16) + ret void +} + +define void @vpstore_factor5(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4) { +; CHECK-LABEL: vpstore_factor5: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsseg5e32.v v8, (a0) +; CHECK-NEXT: ret + %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> + %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> + %s2 = shufflevector <8 x i32> %s0, <8 x i32> %s1, <16 x i32> + %s3 = shufflevector <4 x i32> %v4, <4 x i32> poison, <16 x i32> + %interleaved.vec = shufflevector <16 x i32> %s2, <16 x i32> %s3, <20 x i32> + tail call void @llvm.vp.store.v20i32.p0(<20 x i32> %interleaved.vec, ptr %ptr, <20 x i1> splat (i1 true), i32 20) + ret void +} + +define 
void @vpstore_factor6(ptr %ptr, <2 x i16> %v0, <2 x i16> %v1, <2 x i16> %v2, <2 x i16> %v3, <2 x i16> %v4, <2 x i16> %v5) { +; CHECK-LABEL: vpstore_factor6: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsseg6e16.v v8, (a0) +; CHECK-NEXT: ret + %s0 = shufflevector <2 x i16> %v0, <2 x i16> %v1, <4 x i32> + %s1 = shufflevector <2 x i16> %v2, <2 x i16> %v3, <4 x i32> + %s2 = shufflevector <4 x i16> %s0, <4 x i16> %s1, <8 x i32> + %s3 = shufflevector <2 x i16> %v4, <2 x i16> %v5, <8 x i32> + %interleaved.vec = shufflevector <8 x i16> %s2, <8 x i16> %s3, <12 x i32> + tail call void @llvm.vp.store.v12i16.p0(<12 x i16> %interleaved.vec, ptr %ptr, <12 x i1> splat (i1 true), i32 12) + ret void +} + define <4 x i32> @load_factor2_one_active(ptr %ptr) { ; CHECK-LABEL: load_factor2_one_active: @@ -1368,3 +1593,157 @@ define void @store_factor4_one_active_slidedown(ptr %ptr, <4 x i32> %v) { store <16 x i32> %v0, ptr %ptr ret void } + +; Negative tests + +define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_mask(ptr %ptr) { +; RV32-LABEL: invalid_vp_mask: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 73 +; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV32-NEXT: vmv.s.x v11, a1 +; RV32-NEXT: lui a1, 1 +; RV32-NEXT: vmv.v.i v10, 8 +; RV32-NEXT: addi a1, a1, -33 +; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: li a1, 146 +; RV32-NEXT: vsetivli zero, 12, e32, m4, ta, ma +; RV32-NEXT: vle32.v v12, (a0), v0.t +; RV32-NEXT: li a0, 36 +; RV32-NEXT: vmv.s.x v20, a1 +; RV32-NEXT: lui a1, %hi(.LCPI40_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI40_0) +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vle16.v v21, (a1) +; RV32-NEXT: vcompress.vm v8, v12, v11 +; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV32-NEXT: vslidedown.vi v16, v12, 8 +; RV32-NEXT: vmv1r.v v0, v10 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vrgather.vi v8, v16, 1, v0.t +; RV32-NEXT: vcompress.vm v14, v12, v20 +; RV32-NEXT: vrgather.vi v14, v16, 2, v0.t +; RV32-NEXT: vmv.s.x v0, a0 +; RV32-NEXT: vmerge.vvm v12, v16, v12, v0 +; RV32-NEXT: vrgatherei16.vv v10, v12, v21 +; RV32-NEXT: vmv1r.v v9, v14 +; RV32-NEXT: ret +; +; RV64-LABEL: invalid_vp_mask: +; RV64: # %bb.0: +; RV64-NEXT: li a1, 73 +; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64-NEXT: vmv.s.x v11, a1 +; RV64-NEXT: li a1, 146 +; RV64-NEXT: vmv.s.x v20, a1 +; RV64-NEXT: lui a1, 1 +; RV64-NEXT: vmv.v.i v10, 8 +; RV64-NEXT: addi a1, a1, -33 +; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: li a1, 36 +; RV64-NEXT: vsetivli zero, 12, e32, m4, ta, ma +; RV64-NEXT: vle32.v v12, (a0), v0.t +; RV64-NEXT: li a0, 3 +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: addi a0, a0, 5 +; RV64-NEXT: slli a0, a0, 16 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vcompress.vm v8, v12, v11 +; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV64-NEXT: vslidedown.vi v16, v12, 8 +; RV64-NEXT: vmv1r.v v0, v10 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64-NEXT: vrgather.vi v8, v16, 1, v0.t +; RV64-NEXT: vcompress.vm v14, v12, v20 +; RV64-NEXT: vrgather.vi v14, v16, 2, v0.t +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: addi a0, a0, 2 +; RV64-NEXT: vmerge.vvm v12, v16, v12, v0 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vrgatherei16.vv v10, v12, v9 +; RV64-NEXT: vmv1r.v v9, v14 +; RV64-NEXT: ret + %interleaved.vec = tail call <12 x i32> 
@llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> , i32 12) + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 +} + +define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_evl(ptr %ptr) { +; RV32-LABEL: invalid_vp_evl: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 10, e32, m4, ta, ma +; RV32-NEXT: vle32.v v12, (a0) +; RV32-NEXT: li a0, 73 +; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV32-NEXT: vmv.v.i v0, 8 +; RV32-NEXT: vmv.s.x v10, a0 +; RV32-NEXT: li a0, 146 +; RV32-NEXT: vmv.s.x v11, a0 +; RV32-NEXT: lui a0, %hi(.LCPI41_0) +; RV32-NEXT: addi a0, a0, %lo(.LCPI41_0) +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vle16.v v20, (a0) +; RV32-NEXT: li a0, 36 +; RV32-NEXT: vcompress.vm v8, v12, v10 +; RV32-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV32-NEXT: vslidedown.vi v16, v12, 8 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vrgather.vi v8, v16, 1, v0.t +; RV32-NEXT: vcompress.vm v14, v12, v11 +; RV32-NEXT: vrgather.vi v14, v16, 2, v0.t +; RV32-NEXT: vmv.s.x v0, a0 +; RV32-NEXT: vmerge.vvm v12, v16, v12, v0 +; RV32-NEXT: vrgatherei16.vv v10, v12, v20 +; RV32-NEXT: vmv1r.v v9, v14 +; RV32-NEXT: ret +; +; RV64-LABEL: invalid_vp_evl: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 10, e32, m4, ta, ma +; RV64-NEXT: vle32.v v12, (a0) +; RV64-NEXT: li a0, 73 +; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV64-NEXT: vmv.v.i v0, 8 +; RV64-NEXT: vmv.s.x v10, a0 +; RV64-NEXT: li a0, 146 +; RV64-NEXT: vmv.s.x v11, a0 +; RV64-NEXT: li a0, 36 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vcompress.vm v8, v12, v10 +; RV64-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; RV64-NEXT: vslidedown.vi v16, v12, 8 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64-NEXT: vrgather.vi v8, v16, 1, v0.t +; RV64-NEXT: vcompress.vm v14, v12, v11 +; RV64-NEXT: vrgather.vi v14, v16, 2, v0.t +; RV64-NEXT: vmv.s.x v0, a0 +; RV64-NEXT: li a0, 3 +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: addi a0, a0, 5 +; RV64-NEXT: slli a0, a0, 16 +; RV64-NEXT: addi a0, a0, 2 +; RV64-NEXT: vmerge.vvm v12, v16, v12, v0 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v9, a0 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vrgatherei16.vv v10, v12, v9 +; RV64-NEXT: vmv1r.v v9, v14 +; RV64-NEXT: ret + %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> splat (i1 true), i32 10) + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 +} From 66118fd0b8f1372bf573f8d710bf8d82ddbffdc2 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Wed, 16 Apr 2025 09:59:00 -0700 Subject: [PATCH 02/12] fixup! 
Propagate the mask to underlying intrinsics as well --- llvm/include/llvm/CodeGen/TargetLowering.h | 20 +- llvm/include/llvm/IR/IntrinsicsRISCV.td | 20 ++ llvm/lib/CodeGen/InterleavedAccessPass.cpp | 188 +++++------ .../Target/AArch64/AArch64ISelLowering.cpp | 12 +- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 4 +- llvm/lib/Target/ARM/ARMISelLowering.cpp | 12 +- llvm/lib/Target/ARM/ARMISelLowering.h | 4 +- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 313 +++++++++++------- llvm/lib/Target/RISCV/RISCVISelLowering.h | 15 +- llvm/lib/Target/X86/X86ISelLowering.h | 4 +- llvm/lib/Target/X86/X86InterleavedAccess.cpp | 12 +- .../rvv/fixed-vectors-interleaved-access.ll | 109 +++--- .../RISCV/rvv/vp-vector-interleaved-access.ll | 53 --- 13 files changed, 397 insertions(+), 369 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 5407bf8b2ba13..6eaa1bae1da97 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3157,11 +3157,11 @@ class TargetLoweringBase { /// Lower an interleaved load to target specific intrinsics. Return /// true on success. /// - /// \p LoadOp is a vector load or vp.load instruction. + /// \p LI is the vector load instruction. /// \p Shuffles is the shufflevector list to DE-interleave the loaded vector. /// \p Indices is the corresponding indices for each shufflevector. /// \p Factor is the interleave factor. - virtual bool lowerInterleavedLoad(Instruction *LoadOp, + virtual bool lowerInterleavedLoad(LoadInst *LI, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const { @@ -3171,24 +3171,23 @@ class TargetLoweringBase { /// Lower an interleaved store to target specific intrinsics. Return /// true on success. /// - /// \p StoreOp is a vector store or vp.store instruction. + /// \p SI is the vector store instruction. /// \p SVI is the shufflevector to RE-interleave the stored vector. /// \p Factor is the interleave factor. - virtual bool lowerInterleavedStore(Instruction *StoreOp, - ShuffleVectorInst *SVI, + virtual bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const { return false; } - /// Lower an interleaved load to target specific intrinsics. Return + /// Lower a deinterleaved load to target specific intrinsics. Return /// true on success. /// /// \p Load is a vp.load instruction. /// \p Mask is a mask value /// \p DeinterleaveRes is a list of deinterleaved results. virtual bool - lowerDeinterleavedIntrinsicToVPLoad(VPIntrinsic *Load, Value *Mask, - ArrayRef DeinterleaveRes) const { + lowerDeinterleavedVPLoad(VPIntrinsic *Load, Value *Mask, + ArrayRef DeinterleaveRes) const { return false; } @@ -3198,9 +3197,8 @@ class TargetLoweringBase { /// \p Store is the vp.store instruction. /// \p Mask is a mask value /// \p InterleaveOps is a list of values being interleaved. - virtual bool - lowerInterleavedIntrinsicToVPStore(VPIntrinsic *Store, Value *Mask, - ArrayRef InterleaveOps) const { + virtual bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask, + ArrayRef InterleaveOps) const { return false; } diff --git a/llvm/include/llvm/IR/IntrinsicsRISCV.td b/llvm/include/llvm/IR/IntrinsicsRISCV.td index 99cb557d9aa09..7da11b93f6b74 100644 --- a/llvm/include/llvm/IR/IntrinsicsRISCV.td +++ b/llvm/include/llvm/IR/IntrinsicsRISCV.td @@ -1705,12 +1705,23 @@ let TargetPrefix = "riscv" in { // Segment loads/stores for fixed vectors. 
foreach nf = [2, 3, 4, 5, 6, 7, 8] in { + // Input: (pointer, vl) def int_riscv_seg # nf # _load : DefaultAttrsIntrinsic, !add(nf, -1))), [llvm_anyptr_ty, llvm_anyint_ty], [NoCapture>, IntrReadMem]>; + // Input: (pointer, mask, vl) + def int_riscv_seg # nf # _load_mask + : DefaultAttrsIntrinsic, + !add(nf, -1))), + [llvm_ptr_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_anyint_ty], + [NoCapture>, IntrReadMem]>; + + // Input: (, pointer, vl) def int_riscv_seg # nf # _store : DefaultAttrsIntrinsic<[], !listconcat([llvm_anyvector_ty], @@ -1718,6 +1729,15 @@ let TargetPrefix = "riscv" in { !add(nf, -1)), [llvm_anyptr_ty, llvm_anyint_ty]), [NoCapture>, IntrWriteMem]>; + // Input: (, pointer, mask, vl) + def int_riscv_seg # nf # _store_mask + : DefaultAttrsIntrinsic<[], + !listconcat([llvm_anyvector_ty], + !listsplat(LLVMMatchType<0>, + !add(nf, -1)), + [llvm_ptr_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_anyint_ty]), + [NoCapture>, IntrWriteMem]>; } } // TargetPrefix = "riscv" diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 83bde96cc725a..4bf05d06256e0 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -250,6 +250,31 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor, return false; } +/// Return true if it's a non-all-zeros, interleaving mask. For instance, +/// 111000111000 is interleaved from three 1010 masks. +/// \p SubMask returns the mask of individual lane. +static bool isInterleavedConstantMask(unsigned Factor, ConstantVector *Mask, + SmallVectorImpl &LaneMask) { + unsigned LaneMaskLen = LaneMask.size(); + if (auto *Splat = Mask->getSplatValue()) { + // All-zeros mask. + if (Splat->isZeroValue()) + return false; + // All-ones mask. + std::fill(LaneMask.begin(), LaneMask.end(), + ConstantInt::getTrue(Mask->getContext())); + } else { + for (unsigned Idx = 0U, N = LaneMaskLen * Factor; Idx < N; ++Idx) { + Constant *Ref = Mask->getAggregateElement((Idx / Factor) * Factor); + if (Ref != Mask->getAggregateElement(Idx)) + return false; + LaneMask[Idx / Factor] = Ref; + } + } + + return true; +} + bool InterleavedAccessImpl::lowerInterleavedLoad( Instruction *LoadOp, SmallSetVector &DeadInsts) { if (isa(LoadOp->getType())) @@ -261,8 +286,7 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( } else if (auto *VPLoad = dyn_cast(LoadOp)) { assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load); // Require a constant mask and evl. - if (!isa(VPLoad->getArgOperand(1)) || - !isa(VPLoad->getArgOperand(2))) + if (!isa(VPLoad->getArgOperand(1))) return false; } else { llvm_unreachable("unsupported load operation"); @@ -315,24 +339,6 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( NumLoadElements)) return false; - // If this is a vp.load, record its mask (NOT shuffle mask). - BitVector MaskedIndices(NumLoadElements); - if (auto *VPLoad = dyn_cast(LoadOp)) { - auto *Mask = cast(VPLoad->getArgOperand(1)); - assert(cast(Mask->getType())->getNumElements() == - NumLoadElements); - if (auto *Splat = Mask->getSplatValue()) { - // All-zeros mask, bail out early. - if (Splat->isZeroValue()) - return false; - } else { - for (unsigned i = 0U; i < NumLoadElements; ++i) { - if (Mask->getAggregateElement(i)->isZeroValue()) - MaskedIndices.set(i); - } - } - } - // Holds the corresponding index for each DE-interleave shuffle. 
SmallVector Indices; @@ -373,48 +379,35 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( bool BinOpShuffleChanged = replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, LoadOp); - // Check if we extract only the unmasked elements. - if (MaskedIndices.any()) { - if (any_of(Shuffles, [&](const auto *Shuffle) { - ArrayRef ShuffleMask = Shuffle->getShuffleMask(); - for (int Idx : ShuffleMask) { - if (Idx < 0) - continue; - if (MaskedIndices.test(unsigned(Idx))) - return true; - } - return false; - })) { - LLVM_DEBUG(dbgs() << "IA: trying to extract a masked element through " - << "shufflevector\n"); - return false; - } - } - // Check if we extract only the elements within evl. + // Check if the de-interleaved vp.load masks are the same. + unsigned ShuffleMaskLen = Shuffles[0]->getShuffleMask().size(); + SmallVector LaneMask(ShuffleMaskLen, nullptr); if (auto *VPLoad = dyn_cast(LoadOp)) { - uint64_t EVL = cast(VPLoad->getArgOperand(2))->getZExtValue(); - if (any_of(Shuffles, [&](const auto *Shuffle) { - ArrayRef ShuffleMask = Shuffle->getShuffleMask(); - for (int Idx : ShuffleMask) { - if (Idx < 0) - continue; - if (unsigned(Idx) >= EVL) - return true; - } - return false; - })) { - LLVM_DEBUG( - dbgs() << "IA: trying to extract an element out of EVL range\n"); + if (!isInterleavedConstantMask( + Factor, cast(VPLoad->getArgOperand(1)), LaneMask)) return false; - } } LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LoadOp << "\n"); - // Try to create target specific intrinsics to replace the load and shuffles. - if (!TLI->lowerInterleavedLoad(LoadOp, Shuffles, Indices, Factor)) { - // If Extracts is not empty, tryReplaceExtracts made changes earlier. - return !Extracts.empty() || BinOpShuffleChanged; + if (auto *VPLoad = dyn_cast(LoadOp)) { + auto *MaskVec = ConstantVector::get(LaneMask); + // Sometimes the number of Shuffles might be less than Factor, we have to + // fill the gaps with null. Also, lowerDeinterleavedVPLoad + // expects them to be sorted. + SmallVector ShuffleValues(Factor, nullptr); + for (auto [Idx, ShuffleMaskIdx] : enumerate(Indices)) + ShuffleValues[ShuffleMaskIdx] = Shuffles[Idx]; + if (!TLI->lowerDeinterleavedVPLoad(VPLoad, MaskVec, ShuffleValues)) + // If Extracts is not empty, tryReplaceExtracts made changes earlier. + return !Extracts.empty() || BinOpShuffleChanged; + } else { + // Try to create target specific intrinsics to replace the load and + // shuffles. + if (!TLI->lowerInterleavedLoad(cast(LoadOp), Shuffles, Indices, + Factor)) + // If Extracts is not empty, tryReplaceExtracts made changes earlier. + return !Extracts.empty() || BinOpShuffleChanged; } DeadInsts.insert_range(Shuffles); @@ -530,9 +523,8 @@ bool InterleavedAccessImpl::lowerInterleavedStore( StoredValue = SI->getValueOperand(); } else if (auto *VPStore = dyn_cast(StoreOp)) { assert(VPStore->getIntrinsicID() == Intrinsic::vp_store); - // Require a constant mask and evl. - if (!isa(VPStore->getArgOperand(2)) || - !isa(VPStore->getArgOperand(3))) + // Require a constant mask. + if (!isa(VPStore->getArgOperand(2))) return false; StoredValue = VPStore->getArgOperand(0); } else { @@ -545,53 +537,53 @@ bool InterleavedAccessImpl::lowerInterleavedStore( unsigned NumStoredElements = cast(SVI->getType())->getNumElements(); - // If this is a vp.store, record its mask (NOT shuffle mask). 
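To make the new mask handling concrete: isInterleavedConstantMask above only accepts a wide vp.load/vp.store mask whose bits are uniform within each group of Factor elements, and the de-interleaved per-lane mask is what gets handed to the target hooks. The following standalone C++ sketch is not part of the patch; deinterleaveMask is a hypothetical name, it works on plain bools instead of LLVM Constants, and it skips the splat fast path. It only illustrates the check and the 111000111000 -> 1010 example from the comment above.

#include <cstddef>
#include <cstdio>
#include <optional>
#include <vector>

static std::optional<std::vector<bool>>
deinterleaveMask(const std::vector<bool> &WideMask, unsigned Factor) {
  if (Factor == 0 || WideMask.size() % Factor != 0)
    return std::nullopt;
  std::vector<bool> LaneMask(WideMask.size() / Factor);
  bool AnySet = false;
  for (std::size_t Idx = 0, N = WideMask.size(); Idx < N; ++Idx) {
    // Every bit in a group of Factor must agree with the group's first bit,
    // i.e. the wide mask is an interleave of Factor copies of the lane mask.
    if (WideMask[Idx] != WideMask[(Idx / Factor) * Factor])
      return std::nullopt;
    LaneMask[Idx / Factor] = WideMask[Idx];
    AnySet = AnySet || WideMask[Idx];
  }
  // Reject an all-zeros mask, mirroring the pass's early bail-out.
  if (!AnySet)
    return std::nullopt;
  return LaneMask;
}

int main() {
  // 111000111000 with Factor = 3 de-interleaves to the lane mask 1010.
  std::vector<bool> Wide = {true,  true,  true,  false, false, false,
                            true,  true,  true,  false, false, false};
  if (auto Lane = deinterleaveMask(Wide, 3)) {
    for (bool B : *Lane)
      std::printf("%d", B ? 1 : 0);
    std::printf("\n"); // prints 1010
  }
  return 0;
}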
- BitVector MaskedIndices(NumStoredElements); - if (auto *VPStore = dyn_cast(StoreOp)) { - auto *Mask = cast(VPStore->getArgOperand(2)); - assert(cast(Mask->getType())->getNumElements() == - NumStoredElements); - if (auto *Splat = Mask->getSplatValue()) { - // All-zeros mask, bail out early. - if (Splat->isZeroValue()) - return false; - } else { - for (unsigned i = 0U; i < NumStoredElements; ++i) { - if (Mask->getAggregateElement(i)->isZeroValue()) - MaskedIndices.set(i); - } - } - } - // Check if the shufflevector is RE-interleave shuffle. unsigned Factor; if (!isReInterleaveMask(SVI, Factor, MaxFactor)) return false; + assert(NumStoredElements % Factor == 0 && + "number of stored element should be a multiple of Factor"); - // Check if we store only the unmasked elements. - if (MaskedIndices.any()) { - if (any_of(SVI->getShuffleMask(), [&](int Idx) { - return Idx >= 0 && MaskedIndices.test(unsigned(Idx)); - })) { - LLVM_DEBUG(dbgs() << "IA: trying to store a masked element\n"); - return false; - } - } - // Check if we store only the elements within evl. + // Check if the de-interleaved vp.store masks are the same. + unsigned LaneMaskLen = NumStoredElements / Factor; + SmallVector LaneMask(LaneMaskLen, nullptr); if (auto *VPStore = dyn_cast(StoreOp)) { - uint64_t EVL = cast(VPStore->getArgOperand(3))->getZExtValue(); - if (any_of(SVI->getShuffleMask(), - [&](int Idx) { return Idx >= 0 && unsigned(Idx) >= EVL; })) { - LLVM_DEBUG(dbgs() << "IA: trying to store an element out of EVL range\n"); + if (!isInterleavedConstantMask( + Factor, cast(VPStore->getArgOperand(2)), LaneMask)) return false; - } } LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *StoreOp << "\n"); - // Try to create target specific intrinsics to replace the store and shuffle. - if (!TLI->lowerInterleavedStore(StoreOp, SVI, Factor)) - return false; + if (auto *VPStore = dyn_cast(StoreOp)) { + IRBuilder<> Builder(VPStore); + // We need to effectively de-interleave the shufflemask + // because lowerInterleavedVPStore expected individual de-interleaved + // values. + SmallVector NewShuffles; + SmallVector NewShuffleMask(LaneMaskLen); + auto ShuffleMask = SVI->getShuffleMask(); + + for (unsigned i = 0; i < Factor; i++) { + for (unsigned j = 0; j < LaneMaskLen; j++) + NewShuffleMask[j] = ShuffleMask[i + Factor * j]; + + NewShuffles.push_back(Builder.CreateShuffleVector( + SVI->getOperand(0), SVI->getOperand(1), NewShuffleMask)); + } + + // Try to create target specific intrinsics to replace the vp.store and + // shuffle. + if (!TLI->lowerInterleavedVPStore(VPStore, ConstantVector::get(LaneMask), + NewShuffles)) + // We already created new shuffles. + return true; + } else { + // Try to create target specific intrinsics to replace the store and + // shuffle. + if (!TLI->lowerInterleavedStore(cast(StoreOp), SVI, Factor)) + return false; + } // Already have a new target specific interleaved store. Erase the old store. DeadInsts.insert(StoreOp); @@ -806,8 +798,7 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( // Since lowerInterleaveLoad expects Shuffles and LoadInst, use special // TLI function to emit target-specific interleaved instruction. 
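The vp.store side above goes the other way around: instead of handing the wide re-interleaving shufflevector to the target, it first splits it into Factor narrow shuffles via NewShuffleMask[j] = ShuffleMask[i + Factor * j] and passes those to lowerInterleavedVPStore. A standalone sketch of that index arithmetic on plain integers follows; it is not part of the patch, and the mask used is just a typical factor-3 re-interleave pattern of the shape the vpstore_factor3 tests exercise.

#include <cstdio>
#include <vector>

int main() {
  // A factor-3 re-interleave mask over three 4-element fields, laid out as
  // a0 b0 c0 a1 b1 c1 ... in the stored vector.
  const unsigned Factor = 3, LaneLen = 4;
  const std::vector<int> ShuffleMask = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11};

  for (unsigned i = 0; i < Factor; ++i) {
    std::printf("field %u:", i);
    for (unsigned j = 0; j < LaneLen; ++j)
      std::printf(" %d", ShuffleMask[i + Factor * j]); // de-interleaved mask
    std::printf("\n");
  }
  // field 0: 0 1 2 3
  // field 1: 4 5 6 7
  // field 2: 8 9 10 11
  return 0;
}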
- if (!TLI->lowerDeinterleavedIntrinsicToVPLoad(VPLoad, Mask, - DeinterleaveValues)) + if (!TLI->lowerDeinterleavedVPLoad(VPLoad, Mask, DeinterleaveValues)) return false; } else { @@ -859,8 +850,7 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( // Since lowerInterleavedStore expects Shuffle and StoreInst, use special // TLI function to emit target-specific interleaved instruction. - if (!TLI->lowerInterleavedIntrinsicToVPStore(VPStore, Mask, - InterleaveValues)) + if (!TLI->lowerInterleavedVPStore(VPStore, Mask, InterleaveValues)) return false; } else { auto *SI = cast(StoredBy); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d74cc3161684d..e366d7cb54490 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -17176,7 +17176,7 @@ static Function *getStructuredStoreFunction(Module *M, unsigned Factor, /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 bool AArch64TargetLowering::lowerInterleavedLoad( - Instruction *LoadOp, ArrayRef Shuffles, + LoadInst *LI, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); @@ -17184,10 +17184,6 @@ bool AArch64TargetLowering::lowerInterleavedLoad( assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); - auto *LI = dyn_cast(LoadOp); - if (!LI) - return false; - const DataLayout &DL = LI->getDataLayout(); VectorType *VTy = Shuffles[0]->getType(); @@ -17363,17 +17359,13 @@ bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) { /// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35> /// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19> /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr) -bool AArch64TargetLowering::lowerInterleavedStore(Instruction *StoreOp, +bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); - auto *SI = dyn_cast(StoreOp); - if (!SI) - return false; - auto *VecTy = cast(SVI->getType()); assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store"); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 34446abb1474c..0d51ef2be8631 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -716,11 +716,11 @@ class AArch64TargetLowering : public TargetLowering { unsigned getMaxSupportedInterleaveFactor() const override { return 4; } - bool lowerInterleavedLoad(Instruction *LoadOp, + bool lowerInterleavedLoad(LoadInst *LI, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const override; - bool lowerInterleavedStore(Instruction *StoreOp, ShuffleVectorInst *SVI, + bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override; bool lowerDeinterleaveIntrinsicToLoad( diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 64d12a0eb1d9b..2290ac2728c6d 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21608,7 +21608,7 @@ unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const { /// %vec0 
= extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1 bool ARMTargetLowering::lowerInterleavedLoad( - Instruction *LoadOp, ArrayRef Shuffles, + LoadInst *LI, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); @@ -21616,10 +21616,6 @@ bool ARMTargetLowering::lowerInterleavedLoad( assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); - auto *LI = dyn_cast(LoadOp); - if (!LI) - return false; - auto *VecTy = cast(Shuffles[0]->getType()); Type *EltTy = VecTy->getElementType(); @@ -21754,16 +21750,12 @@ bool ARMTargetLowering::lowerInterleavedLoad( /// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35> /// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19> /// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4) -bool ARMTargetLowering::lowerInterleavedStore(Instruction *StoreOp, +bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); - auto *SI = dyn_cast(StoreOp); - if (!SI) - return false; - auto *VecTy = cast(SVI->getType()); assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store"); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 635a6cd226936..9fad056edd3f1 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -673,11 +673,11 @@ class VectorType; unsigned getMaxSupportedInterleaveFactor() const override; - bool lowerInterleavedLoad(Instruction *LoadOp, + bool lowerInterleavedLoad(LoadInst *LI, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const override; - bool lowerInterleavedStore(Instruction *StoreOp, ShuffleVectorInst *SVI, + bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override; bool shouldInsertFencesForAtomic(const Instruction *I) const override; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 9558783963500..df545f5e06c18 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1731,6 +1731,13 @@ bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::riscv_seg6_load: case Intrinsic::riscv_seg7_load: case Intrinsic::riscv_seg8_load: + case Intrinsic::riscv_seg2_load_mask: + case Intrinsic::riscv_seg3_load_mask: + case Intrinsic::riscv_seg4_load_mask: + case Intrinsic::riscv_seg5_load_mask: + case Intrinsic::riscv_seg6_load_mask: + case Intrinsic::riscv_seg7_load_mask: + case Intrinsic::riscv_seg8_load_mask: return SetRVVLoadStoreInfo(/*PtrOp*/ 0, /*IsStore*/ false, /*IsUnitStrided*/ false, /*UsePtrVal*/ true); case Intrinsic::riscv_seg2_store: @@ -1744,6 +1751,17 @@ bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 2, /*IsStore*/ true, /*IsUnitStrided*/ false, /*UsePtrVal*/ true); + case Intrinsic::riscv_seg2_store_mask: + case Intrinsic::riscv_seg3_store_mask: + case Intrinsic::riscv_seg4_store_mask: + case Intrinsic::riscv_seg5_store_mask: + case Intrinsic::riscv_seg6_store_mask: + case Intrinsic::riscv_seg7_store_mask: + case Intrinsic::riscv_seg8_store_mask: + // Operands are (vec, ..., vec, ptr, mask, vl) + 
return SetRVVLoadStoreInfo(/*PtrOp*/ I.arg_size() - 3, + /*IsStore*/ true, + /*IsUnitStrided*/ false, /*UsePtrVal*/ true); case Intrinsic::riscv_vle: case Intrinsic::riscv_vle_mask: case Intrinsic::riscv_vleff: @@ -10450,13 +10468,20 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::riscv_seg5_load: case Intrinsic::riscv_seg6_load: case Intrinsic::riscv_seg7_load: - case Intrinsic::riscv_seg8_load: { + case Intrinsic::riscv_seg8_load: + case Intrinsic::riscv_seg2_load_mask: + case Intrinsic::riscv_seg3_load_mask: + case Intrinsic::riscv_seg4_load_mask: + case Intrinsic::riscv_seg5_load_mask: + case Intrinsic::riscv_seg6_load_mask: + case Intrinsic::riscv_seg7_load_mask: + case Intrinsic::riscv_seg8_load_mask: { SDLoc DL(Op); static const Intrinsic::ID VlsegInts[7] = { - Intrinsic::riscv_vlseg2, Intrinsic::riscv_vlseg3, - Intrinsic::riscv_vlseg4, Intrinsic::riscv_vlseg5, - Intrinsic::riscv_vlseg6, Intrinsic::riscv_vlseg7, - Intrinsic::riscv_vlseg8}; + Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask, + Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask, + Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask, + Intrinsic::riscv_vlseg8_mask}; unsigned NF = Op->getNumValues() - 1; assert(NF >= 2 && NF <= 8 && "Unexpected seg number"); MVT XLenVT = Subtarget.getXLenVT(); @@ -10466,7 +10491,19 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, ContainerVT.getScalarSizeInBits(); EVT VecTupTy = MVT::getRISCVVectorTupleVT(Sz, NF); - SDValue VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT); + // Masked: (pointer, mask, vl) + // Non-masked: (pointer, vl) + bool IsMasked = Op.getNumOperands() > 4; + SDValue VL = Op.getOperand(Op.getNumOperands() - 1); + SDValue Mask = + IsMasked ? 
Op.getOperand(3) : getAllOnesMask(ContainerVT, VL, DL, DAG); + MVT MaskVT = Mask.getSimpleValueType(); + if (MaskVT.isFixedLengthVector()) { + MVT MaskContainerVT = + ::getContainerForFixedLengthVector(DAG, MaskVT, Subtarget); + Mask = convertToScalableVector(MaskContainerVT, Mask, DAG, Subtarget); + } + SDValue IntID = DAG.getTargetConstant(VlsegInts[NF - 2], DL, XLenVT); auto *Load = cast(Op); @@ -10476,7 +10513,10 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, IntID, DAG.getUNDEF(VecTupTy), Op.getOperand(2), + Mask, VL, + DAG.getTargetConstant( + RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC, DL, XLenVT), DAG.getTargetConstant(Log2_64(VT.getScalarSizeInBits()), DL, XLenVT)}; SDValue Result = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, @@ -10536,15 +10576,39 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op, case Intrinsic::riscv_seg5_store: case Intrinsic::riscv_seg6_store: case Intrinsic::riscv_seg7_store: - case Intrinsic::riscv_seg8_store: { + case Intrinsic::riscv_seg8_store: + case Intrinsic::riscv_seg2_store_mask: + case Intrinsic::riscv_seg3_store_mask: + case Intrinsic::riscv_seg4_store_mask: + case Intrinsic::riscv_seg5_store_mask: + case Intrinsic::riscv_seg6_store_mask: + case Intrinsic::riscv_seg7_store_mask: + case Intrinsic::riscv_seg8_store_mask: { SDLoc DL(Op); static const Intrinsic::ID VssegInts[] = { - Intrinsic::riscv_vsseg2, Intrinsic::riscv_vsseg3, - Intrinsic::riscv_vsseg4, Intrinsic::riscv_vsseg5, - Intrinsic::riscv_vsseg6, Intrinsic::riscv_vsseg7, - Intrinsic::riscv_vsseg8}; - // Operands are (chain, int_id, vec*, ptr, vl) - unsigned NF = Op->getNumOperands() - 4; + Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask, + Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask, + Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask, + Intrinsic::riscv_vsseg8_mask}; + + bool IsMasked = false; + switch (IntNo) { + case Intrinsic::riscv_seg2_store_mask: + case Intrinsic::riscv_seg3_store_mask: + case Intrinsic::riscv_seg4_store_mask: + case Intrinsic::riscv_seg5_store_mask: + case Intrinsic::riscv_seg6_store_mask: + case Intrinsic::riscv_seg7_store_mask: + case Intrinsic::riscv_seg8_store_mask: + IsMasked = true; + break; + default: + break; + } + + // Non-masked: (chain, int_id, vec*, ptr, vl) + // Masked: (chain, int_id, vec*, ptr, mask, vl) + unsigned NF = Op->getNumOperands() - (IsMasked ? 5 : 4); assert(NF >= 2 && NF <= 8 && "Unexpected seg number"); MVT XLenVT = Subtarget.getXLenVT(); MVT VT = Op->getOperand(2).getSimpleValueType(); @@ -10553,7 +10617,16 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op, ContainerVT.getScalarSizeInBits(); EVT VecTupTy = MVT::getRISCVVectorTupleVT(Sz, NF); - SDValue VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT); + SDValue VL = Op.getOperand(Op.getNumOperands() - 1); + SDValue Mask = IsMasked ? 
Op.getOperand(Op.getNumOperands() - 2) + : getAllOnesMask(ContainerVT, VL, DL, DAG); + MVT MaskVT = Mask.getSimpleValueType(); + if (MaskVT.isFixedLengthVector()) { + MVT MaskContainerVT = + ::getContainerForFixedLengthVector(DAG, MaskVT, Subtarget); + Mask = convertToScalableVector(MaskContainerVT, Mask, DAG, Subtarget); + } + SDValue IntID = DAG.getTargetConstant(VssegInts[NF - 2], DL, XLenVT); SDValue Ptr = Op->getOperand(NF + 2); @@ -10572,6 +10645,7 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op, IntID, StoredVal, Ptr, + Mask, VL, DAG.getTargetConstant(Log2_64(VT.getScalarSizeInBits()), DL, XLenVT)}; @@ -23376,36 +23450,19 @@ static const Intrinsic::ID FixedVlsegIntrIds[] = { /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 bool RISCVTargetLowering::lowerInterleavedLoad( - Instruction *LoadOp, ArrayRef Shuffles, + LoadInst *LI, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const { assert(Indices.size() == Shuffles.size()); - const DataLayout &DL = LoadOp->getDataLayout(); - IRBuilder<> Builder(LoadOp); + IRBuilder<> Builder(LI); auto *VTy = cast(Shuffles[0]->getType()); - - Align PtrAlignment; - unsigned PtrAddrSpace; - Value *BaseAddr; - if (auto *LI = dyn_cast(LoadOp)) { - BaseAddr = LI->getPointerOperand(); - PtrAlignment = LI->getAlign(); - PtrAddrSpace = LI->getPointerAddressSpace(); - } else { - auto *VPLoad = cast(LoadOp); - assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load); - BaseAddr = VPLoad->getArgOperand(0); - PtrAlignment = VPLoad->getParamAlign(0).value_or( - DL.getABITypeAlign(VTy->getElementType())); - PtrAddrSpace = cast(BaseAddr->getType())->getAddressSpace(); - } - - if (!isLegalInterleavedAccessType(VTy, Factor, PtrAlignment, PtrAddrSpace, - DL)) + if (!isLegalInterleavedAccessType(VTy, Factor, LI->getAlign(), + LI->getPointerAddressSpace(), + LI->getDataLayout())) return false; - auto *XLenTy = Type::getIntNTy(LoadOp->getContext(), Subtarget.getXLen()); + auto *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen()); // If the segment load is going to be performed segment at a time anyways // and there's only one element used, use a strided load instead. 
This @@ -23414,7 +23471,7 @@ bool RISCVTargetLowering::lowerInterleavedLoad( unsigned ScalarSizeInBytes = VTy->getScalarSizeInBits() / 8; Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes); Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes); - Value *BasePtr = Builder.CreatePtrAdd(BaseAddr, Offset); + Value *BasePtr = Builder.CreatePtrAdd(LI->getPointerOperand(), Offset); Value *Mask = Builder.getAllOnesMask(VTy->getElementCount()); Value *VL = Builder.getInt32(VTy->getNumElements()); @@ -23423,16 +23480,16 @@ bool RISCVTargetLowering::lowerInterleavedLoad( {VTy, BasePtr->getType(), Stride->getType()}, {BasePtr, Stride, Mask, VL}); CI->addParamAttr( - 0, Attribute::getWithAlignment(CI->getContext(), PtrAlignment)); + 0, Attribute::getWithAlignment(CI->getContext(), LI->getAlign())); Shuffles[0]->replaceAllUsesWith(CI); return true; }; Value *VL = ConstantInt::get(XLenTy, VTy->getNumElements()); - CallInst *VlsegN = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2], - {VTy, BaseAddr->getType(), XLenTy}, - {BaseAddr, VL}); + CallInst *VlsegN = Builder.CreateIntrinsic( + FixedVlsegIntrIds[Factor - 2], {VTy, LI->getPointerOperandType(), XLenTy}, + {LI->getPointerOperand(), VL}); for (unsigned i = 0; i < Shuffles.size(); i++) { Value *SubVec = Builder.CreateExtractValue(VlsegN, Indices[i]); @@ -23464,39 +23521,21 @@ static const Intrinsic::ID FixedVssegIntrIds[] = { /// /// Note that the new shufflevectors will be removed and we'll only generate one /// vsseg3 instruction in CodeGen. -bool RISCVTargetLowering::lowerInterleavedStore(Instruction *StoreOp, +bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const { - const DataLayout &DL = StoreOp->getDataLayout(); - IRBuilder<> Builder(StoreOp); - + IRBuilder<> Builder(SI); auto Mask = SVI->getShuffleMask(); auto *ShuffleVTy = cast(SVI->getType()); // Given SVI : , then VTy : auto *VTy = FixedVectorType::get(ShuffleVTy->getElementType(), ShuffleVTy->getNumElements() / Factor); - - Align PtrAlignment; - unsigned PtrAddrSpace; - Value *BaseAddr; - if (auto *SI = dyn_cast(StoreOp)) { - BaseAddr = SI->getPointerOperand(); - PtrAlignment = SI->getAlign(); - PtrAddrSpace = SI->getPointerAddressSpace(); - } else { - auto *VPStore = cast(StoreOp); - assert(VPStore->getIntrinsicID() == Intrinsic::vp_store); - BaseAddr = VPStore->getArgOperand(1); - PtrAlignment = VPStore->getParamAlign(1).value_or( - DL.getABITypeAlign(VTy->getElementType())); - PtrAddrSpace = cast(BaseAddr->getType())->getAddressSpace(); - } - - if (!isLegalInterleavedAccessType(VTy, Factor, PtrAlignment, PtrAddrSpace, - DL)) + if (!isLegalInterleavedAccessType(VTy, Factor, SI->getAlign(), + SI->getPointerAddressSpace(), + SI->getDataLayout())) return false; - auto *XLenTy = Type::getIntNTy(StoreOp->getContext(), Subtarget.getXLen()); + auto *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen()); unsigned Index; // If the segment store only has one active lane (i.e. 
the interleave is @@ -23509,7 +23548,7 @@ bool RISCVTargetLowering::lowerInterleavedStore(Instruction *StoreOp, auto *DataVTy = cast(Data->getType()); Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes); Value *Offset = ConstantInt::get(XLenTy, Index * ScalarSizeInBytes); - Value *BasePtr = Builder.CreatePtrAdd(BaseAddr, Offset); + Value *BasePtr = Builder.CreatePtrAdd(SI->getPointerOperand(), Offset); Value *Mask = Builder.getAllOnesMask(DataVTy->getElementCount()); Value *VL = Builder.getInt32(VTy->getNumElements()); @@ -23518,14 +23557,14 @@ bool RISCVTargetLowering::lowerInterleavedStore(Instruction *StoreOp, {Data->getType(), BasePtr->getType(), Stride->getType()}, {Data, BasePtr, Stride, Mask, VL}); CI->addParamAttr( - 1, Attribute::getWithAlignment(CI->getContext(), PtrAlignment)); + 1, Attribute::getWithAlignment(CI->getContext(), SI->getAlign())); return true; } Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( - StoreOp->getModule(), FixedVssegIntrIds[Factor - 2], - {VTy, BaseAddr->getType(), XLenTy}); + SI->getModule(), FixedVssegIntrIds[Factor - 2], + {VTy, SI->getPointerOperandType(), XLenTy}); SmallVector Ops; SmallVector NewShuffleMask; @@ -23545,7 +23584,7 @@ bool RISCVTargetLowering::lowerInterleavedStore(Instruction *StoreOp, // potentially under larger LMULs) because we checked that the fixed vector // type fits in isLegalInterleavedAccessType Value *VL = ConstantInt::get(XLenTy, VTy->getNumElements()); - Ops.append({BaseAddr, VL}); + Ops.append({SI->getPointerOperand(), VL}); Builder.CreateCall(VssegNFunc, Ops); @@ -23687,15 +23726,20 @@ static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) { if (N == 1) return true; + using namespace PatternMatch; + // Right now we're only recognizing the simplest pattern. + uint64_t C; + if (match(V, m_CombineOr(m_ConstantInt(C), + m_c_Mul(m_Value(), m_ConstantInt(C)))) && + C && C % N == 0) + return true; + if (isPowerOf2_32(N)) { KnownBits KB = llvm::computeKnownBits(V, DL); return KB.countMinTrailingZeros() >= Log2_32(N); } - using namespace PatternMatch; - // Right now we're only recognizing the simplest pattern. - uint64_t C; - return match(V, m_c_Mul(m_Value(), m_ConstantInt(C))) && C && C % N == 0; + return false; } /// Lower an interleaved vp.load into a vlsegN intrinsic. @@ -23727,7 +23771,7 @@ static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) { /// TODO: We probably can loosen the dependency on matching extractvalue when /// dealing with factor of 2 (extractvalue is still required for most of other /// factors though). -bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( +bool RISCVTargetLowering::lowerDeinterleavedVPLoad( VPIntrinsic *Load, Value *Mask, ArrayRef DeinterleaveResults) const { assert(Mask && "Expect a valid mask"); @@ -23736,26 +23780,20 @@ bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( const unsigned Factor = DeinterleaveResults.size(); - auto *WideVTy = dyn_cast(Load->getType()); - // TODO: Support fixed vectors. 
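isMultipleOfN above now recognizes a plain constant (or a multiply by a constant) before falling back to the known-bits argument, which only helps for power-of-two N; that is what lets a constant EVL such as 12 pass the divisibility check for factor 3, while the per-field VL handed to the segment intrinsic is computed as EVL / Factor a bit further down. A standalone sketch of the constant case only, not part of the patch: perFieldVL is a hypothetical name, and the real check also handles non-constant values.

#include <cstdio>
#include <optional>

static std::optional<unsigned> perFieldVL(unsigned WideEVL, unsigned Factor) {
  // Conservatively require an exact division; otherwise trailing elements
  // could be dropped by the transformation.
  if (Factor == 0 || WideEVL % Factor != 0)
    return std::nullopt;
  return WideEVL / Factor;
}

int main() {
  if (auto VL = perFieldVL(12, 3))
    std::printf("EVL 12, factor 3 -> per-field VL %u\n", *VL); // 4
  if (!perFieldVL(10, 3))
    std::printf("EVL 10, factor 3 -> not transformed\n"); // cf. invalid_vp_evl
  return 0;
}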
- if (!WideVTy) + auto *VTy = dyn_cast(DeinterleaveResults[0]->getType()); + if (!VTy) return false; - unsigned WideNumElements = WideVTy->getElementCount().getKnownMinValue(); - assert(WideNumElements % Factor == 0 && - "ElementCount of a wide load must be divisible by interleave factor"); - auto *VTy = - VectorType::get(WideVTy->getScalarType(), WideNumElements / Factor, - WideVTy->isScalableTy()); auto &DL = Load->getModule()->getDataLayout(); Align Alignment = Load->getParamAlign(0).value_or( - DL.getABITypeAlign(WideVTy->getElementType())); + DL.getABITypeAlign(VTy->getElementType())); if (!isLegalInterleavedAccessType( VTy, Factor, Alignment, Load->getArgOperand(0)->getType()->getPointerAddressSpace(), DL)) return false; IRBuilder<> Builder(Load); + Value *WideEVL = Load->getArgOperand(2); // Conservatively check if EVL is a multiple of factor, otherwise some // (trailing) elements might be lost after the transformation. @@ -23767,49 +23805,64 @@ bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)), XLenTy); - static const Intrinsic::ID IntrMaskIds[] = { - Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask, - Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask, - Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask, - Intrinsic::riscv_vlseg8_mask, - }; + Value *Return = nullptr; + if (auto *FVTy = dyn_cast(VTy)) { + static const Intrinsic::ID FixedMaskedVlsegIntrIds[] = { + Intrinsic::riscv_seg2_load_mask, Intrinsic::riscv_seg3_load_mask, + Intrinsic::riscv_seg4_load_mask, Intrinsic::riscv_seg5_load_mask, + Intrinsic::riscv_seg6_load_mask, Intrinsic::riscv_seg7_load_mask, + Intrinsic::riscv_seg8_load_mask}; + + Return = Builder.CreateIntrinsic(FixedMaskedVlsegIntrIds[Factor - 2], + {FVTy, XLenTy}, + {Load->getArgOperand(0), Mask, EVL}); + } else { + static const Intrinsic::ID IntrMaskIds[] = { + Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask, + Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask, + Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask, + Intrinsic::riscv_vlseg8_mask, + }; - unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType()); - unsigned NumElts = VTy->getElementCount().getKnownMinValue(); - Type *VecTupTy = TargetExtType::get( - Load->getContext(), "riscv.vector.tuple", - ScalableVectorType::get(Type::getInt8Ty(Load->getContext()), - NumElts * SEW / 8), - Factor); + unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType()); + unsigned NumElts = VTy->getElementCount().getKnownMinValue(); + Type *VecTupTy = TargetExtType::get( + Load->getContext(), "riscv.vector.tuple", + ScalableVectorType::get(Type::getInt8Ty(Load->getContext()), + NumElts * SEW / 8), + Factor); - Value *PoisonVal = PoisonValue::get(VecTupTy); + Value *PoisonVal = PoisonValue::get(VecTupTy); - Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( - Load->getModule(), IntrMaskIds[Factor - 2], - {VecTupTy, Mask->getType(), EVL->getType()}); + Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( + Load->getModule(), IntrMaskIds[Factor - 2], + {VecTupTy, Mask->getType(), EVL->getType()}); - Value *Operands[] = {PoisonVal, - Load->getArgOperand(0), - Mask, - EVL, - ConstantInt::get(XLenTy, RISCVVType::TAIL_AGNOSTIC | - RISCVVType::MASK_AGNOSTIC), - ConstantInt::get(XLenTy, Log2_64(SEW))}; + Value *Operands[] = { + PoisonVal, + Load->getArgOperand(0), + Mask, + EVL, + ConstantInt::get(XLenTy, + RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC), + 
ConstantInt::get(XLenTy, Log2_64(SEW))}; - CallInst *VlsegN = Builder.CreateCall(VlsegNFunc, Operands); + CallInst *VlsegN = Builder.CreateCall(VlsegNFunc, Operands); - SmallVector AggrTypes{Factor, VTy}; - Value *Return = - PoisonValue::get(StructType::get(Load->getContext(), AggrTypes)); - Function *VecExtractFunc = Intrinsic::getOrInsertDeclaration( - Load->getModule(), Intrinsic::riscv_tuple_extract, {VTy, VecTupTy}); - for (unsigned i = 0; i < Factor; ++i) { - Value *VecExtract = - Builder.CreateCall(VecExtractFunc, {VlsegN, Builder.getInt32(i)}); - Return = Builder.CreateInsertValue(Return, VecExtract, i); + SmallVector AggrTypes{Factor, VTy}; + Return = PoisonValue::get(StructType::get(Load->getContext(), AggrTypes)); + Function *VecExtractFunc = Intrinsic::getOrInsertDeclaration( + Load->getModule(), Intrinsic::riscv_tuple_extract, {VTy, VecTupTy}); + for (unsigned i = 0; i < Factor; ++i) { + Value *VecExtract = + Builder.CreateCall(VecExtractFunc, {VlsegN, Builder.getInt32(i)}); + Return = Builder.CreateInsertValue(Return, VecExtract, i); + } } for (auto [Idx, DIO] : enumerate(DeinterleaveResults)) { + if (!DIO) + continue; // We have to create a brand new ExtractValue to replace each // of these old ExtractValue instructions. Value *NewEV = @@ -23840,7 +23893,7 @@ bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( /// %load2, ptr %ptr, /// %mask, /// i64 %rvl) -bool RISCVTargetLowering::lowerInterleavedIntrinsicToVPStore( +bool RISCVTargetLowering::lowerInterleavedVPStore( VPIntrinsic *Store, Value *Mask, ArrayRef InterleaveOperands) const { assert(Mask && "Expect a valid mask"); @@ -23849,8 +23902,7 @@ bool RISCVTargetLowering::lowerInterleavedIntrinsicToVPStore( const unsigned Factor = InterleaveOperands.size(); - auto *VTy = dyn_cast(InterleaveOperands[0]->getType()); - // TODO: Support fixed vectors. 
+ auto *VTy = dyn_cast(InterleaveOperands[0]->getType()); if (!VTy) return false; @@ -23874,6 +23926,21 @@ bool RISCVTargetLowering::lowerInterleavedIntrinsicToVPStore( Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)), XLenTy); + if (auto *FVTy = dyn_cast(VTy)) { + static const Intrinsic::ID FixedMaskedVssegIntrIds[] = { + Intrinsic::riscv_seg2_store_mask, Intrinsic::riscv_seg3_store_mask, + Intrinsic::riscv_seg4_store_mask, Intrinsic::riscv_seg5_store_mask, + Intrinsic::riscv_seg6_store_mask, Intrinsic::riscv_seg7_store_mask, + Intrinsic::riscv_seg8_store_mask}; + + SmallVector Operands(InterleaveOperands.begin(), + InterleaveOperands.end()); + Operands.append({Store->getArgOperand(1), Mask, EVL}); + Builder.CreateIntrinsic(FixedMaskedVssegIntrIds[Factor - 2], {FVTy, XLenTy}, + Operands); + return true; + } + static const Intrinsic::ID IntrMaskIds[] = { Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask, Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask, diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 5df510c08e0da..8413a60b56cda 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -903,12 +903,12 @@ class RISCVTargetLowering : public TargetLowering { bool fallBackToDAGISel(const Instruction &Inst) const override; - bool lowerInterleavedLoad(Instruction *LoadOp, + bool lowerInterleavedLoad(LoadInst *LI, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const override; - bool lowerInterleavedStore(Instruction *StoreOp, ShuffleVectorInst *SVI, + bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override; bool lowerDeinterleaveIntrinsicToLoad( @@ -917,13 +917,12 @@ class RISCVTargetLowering : public TargetLowering { bool lowerInterleaveIntrinsicToStore( StoreInst *SI, ArrayRef InterleaveValues) const override; - bool lowerDeinterleavedIntrinsicToVPLoad( - VPIntrinsic *Load, Value *Mask, - ArrayRef DeinterleaveRes) const override; + bool + lowerDeinterleavedVPLoad(VPIntrinsic *Load, Value *Mask, + ArrayRef DeinterleaveRes) const override; - bool lowerInterleavedIntrinsicToVPStore( - VPIntrinsic *Store, Value *Mask, - ArrayRef InterleaveOps) const override; + bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask, + ArrayRef InterleaveOps) const override; bool supportKCFIBundles() const override { return true; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index da5baaf1bf5af..4a2b35e9efe7c 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1650,14 +1650,14 @@ namespace llvm { /// Lower interleaved load(s) into target specific /// instructions/intrinsics. - bool lowerInterleavedLoad(Instruction *LoadOp, + bool lowerInterleavedLoad(LoadInst *LI, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const override; /// Lower interleaved store(s) into target specific /// instructions/intrinsics. 
- bool lowerInterleavedStore(Instruction *StoreOp, ShuffleVectorInst *SVI, + bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override; SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp index 99fbc249a1179..1eb47e3b2cd18 100644 --- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp +++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp @@ -801,7 +801,7 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() { // number of shuffles and ISA. // Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX. bool X86TargetLowering::lowerInterleavedLoad( - Instruction *LoadOp, ArrayRef Shuffles, + LoadInst *LI, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); @@ -809,10 +809,6 @@ bool X86TargetLowering::lowerInterleavedLoad( assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); - auto *LI = dyn_cast(LoadOp); - if (!LI) - return false; - // Create an interleaved access group. IRBuilder<> Builder(LI); X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget, @@ -821,7 +817,7 @@ bool X86TargetLowering::lowerInterleavedLoad( return Grp.isSupported() && Grp.lowerIntoOptimizedSequence(); } -bool X86TargetLowering::lowerInterleavedStore(Instruction *StoreOp, +bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && @@ -831,10 +827,6 @@ bool X86TargetLowering::lowerInterleavedStore(Instruction *StoreOp, 0 && "Invalid interleaved store"); - auto *SI = dyn_cast(StoreOp); - if (!SI) - return false; - // Holds the indices of SVI that correspond to the starting index of each // interleaved shuffle. 
auto Mask = SVI->getShuffleMask(); diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index dc7ca2317ec9a..0eb2fc3152b64 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -224,6 +224,23 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_v16i32(ptr %ptr) { ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 } +define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_mask(ptr %ptr) { +; CHECK-LABEL: vpload_factor3_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.i v0, 10 +; CHECK-NEXT: vlseg3e32.v v8, (a0), v0.t +; CHECK-NEXT: ret + %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> , i32 12) + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 +} + define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor4(ptr %ptr) { ; CHECK-LABEL: vpload_factor4: ; CHECK: # %bb.0: @@ -350,8 +367,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: li a2, 32 ; RV32-NEXT: lui a3, 12 ; RV32-NEXT: lui a6, 12291 -; RV32-NEXT: lui a7, %hi(.LCPI16_0) -; RV32-NEXT: addi a7, a7, %lo(.LCPI16_0) +; RV32-NEXT: lui a7, %hi(.LCPI17_0) +; RV32-NEXT: addi a7, a7, %lo(.LCPI17_0) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vle32.v v24, (a5) ; RV32-NEXT: vmv.s.x v0, a3 @@ -436,12 +453,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill ; RV32-NEXT: lui a7, 49164 -; RV32-NEXT: lui a1, %hi(.LCPI16_1) -; RV32-NEXT: addi a1, a1, %lo(.LCPI16_1) +; RV32-NEXT: lui a1, %hi(.LCPI17_1) +; RV32-NEXT: addi a1, a1, %lo(.LCPI17_1) ; RV32-NEXT: lui t2, 3 ; RV32-NEXT: lui t1, 196656 -; RV32-NEXT: lui a4, %hi(.LCPI16_3) -; RV32-NEXT: addi a4, a4, %lo(.LCPI16_3) +; RV32-NEXT: lui a4, %hi(.LCPI17_3) +; RV32-NEXT: addi a4, a4, %lo(.LCPI17_3) ; RV32-NEXT: lui t0, 786624 ; RV32-NEXT: li a5, 48 ; RV32-NEXT: lui a6, 768 @@ -620,8 +637,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV32-NEXT: vrgatherei16.vv v24, v8, v2 -; RV32-NEXT: lui a1, %hi(.LCPI16_2) -; RV32-NEXT: addi a1, a1, %lo(.LCPI16_2) +; RV32-NEXT: lui a1, %hi(.LCPI17_2) +; RV32-NEXT: addi a1, a1, %lo(.LCPI17_2) ; RV32-NEXT: lui a3, 3073 ; RV32-NEXT: addi a3, a3, -1024 ; RV32-NEXT: vmv.s.x v0, a3 @@ -685,16 +702,16 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vrgatherei16.vv v28, v8, v3 ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v28, v24 -; RV32-NEXT: lui a1, %hi(.LCPI16_4) -; RV32-NEXT: addi a1, a1, %lo(.LCPI16_4) -; RV32-NEXT: lui a2, %hi(.LCPI16_5) -; RV32-NEXT: addi a2, a2, %lo(.LCPI16_5) +; RV32-NEXT: lui a1, %hi(.LCPI17_4) +; 
RV32-NEXT: addi a1, a1, %lo(.LCPI17_4) +; RV32-NEXT: lui a2, %hi(.LCPI17_5) +; RV32-NEXT: addi a2, a2, %lo(.LCPI17_5) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v24, (a2) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV32-NEXT: vle16.v v8, (a1) -; RV32-NEXT: lui a1, %hi(.LCPI16_7) -; RV32-NEXT: addi a1, a1, %lo(.LCPI16_7) +; RV32-NEXT: lui a1, %hi(.LCPI17_7) +; RV32-NEXT: addi a1, a1, %lo(.LCPI17_7) ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle16.v v10, (a1) ; RV32-NEXT: csrr a1, vlenb @@ -722,14 +739,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vrgatherei16.vv v16, v0, v10 -; RV32-NEXT: lui a1, %hi(.LCPI16_6) -; RV32-NEXT: addi a1, a1, %lo(.LCPI16_6) -; RV32-NEXT: lui a2, %hi(.LCPI16_8) -; RV32-NEXT: addi a2, a2, %lo(.LCPI16_8) +; RV32-NEXT: lui a1, %hi(.LCPI17_6) +; RV32-NEXT: addi a1, a1, %lo(.LCPI17_6) +; RV32-NEXT: lui a2, %hi(.LCPI17_8) +; RV32-NEXT: addi a2, a2, %lo(.LCPI17_8) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV32-NEXT: vle16.v v4, (a1) -; RV32-NEXT: lui a1, %hi(.LCPI16_9) -; RV32-NEXT: addi a1, a1, %lo(.LCPI16_9) +; RV32-NEXT: lui a1, %hi(.LCPI17_9) +; RV32-NEXT: addi a1, a1, %lo(.LCPI17_9) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v6, (a1) ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma @@ -816,8 +833,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: li a4, 128 ; RV64-NEXT: lui a1, 1 ; RV64-NEXT: vle64.v v8, (a3) -; RV64-NEXT: lui a3, %hi(.LCPI16_0) -; RV64-NEXT: addi a3, a3, %lo(.LCPI16_0) +; RV64-NEXT: lui a3, %hi(.LCPI17_0) +; RV64-NEXT: addi a3, a3, %lo(.LCPI17_0) ; RV64-NEXT: vmv.s.x v0, a4 ; RV64-NEXT: csrr a4, vlenb ; RV64-NEXT: li a5, 61 @@ -1005,8 +1022,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vslideup.vi v12, v16, 1, v0.t -; RV64-NEXT: lui a2, %hi(.LCPI16_1) -; RV64-NEXT: addi a2, a2, %lo(.LCPI16_1) +; RV64-NEXT: lui a2, %hi(.LCPI17_1) +; RV64-NEXT: addi a2, a2, %lo(.LCPI17_1) ; RV64-NEXT: li a3, 192 ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vle16.v v6, (a2) @@ -1040,8 +1057,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vrgatherei16.vv v24, v16, v6 ; RV64-NEXT: addi a2, sp, 16 ; RV64-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill -; RV64-NEXT: lui a2, %hi(.LCPI16_2) -; RV64-NEXT: addi a2, a2, %lo(.LCPI16_2) +; RV64-NEXT: lui a2, %hi(.LCPI17_2) +; RV64-NEXT: addi a2, a2, %lo(.LCPI17_2) ; RV64-NEXT: li a3, 1040 ; RV64-NEXT: vmv.s.x v0, a3 ; RV64-NEXT: addi a1, a1, -2016 @@ -1125,12 +1142,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill -; RV64-NEXT: lui a1, %hi(.LCPI16_3) -; RV64-NEXT: addi a1, a1, %lo(.LCPI16_3) +; RV64-NEXT: lui a1, %hi(.LCPI17_3) +; RV64-NEXT: addi a1, a1, %lo(.LCPI17_3) ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vle16.v v20, (a1) -; RV64-NEXT: lui a1, %hi(.LCPI16_4) -; RV64-NEXT: addi a1, a1, %lo(.LCPI16_4) +; RV64-NEXT: lui a1, %hi(.LCPI17_4) +; RV64-NEXT: addi a1, a1, %lo(.LCPI17_4) ; RV64-NEXT: vle16.v v8, (a1) ; RV64-NEXT: csrr a1, 
vlenb ; RV64-NEXT: li a2, 77 @@ -1181,8 +1198,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vl2r.v v8, (a1) # vscale x 16-byte Folded Reload ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vrgatherei16.vv v0, v16, v8 -; RV64-NEXT: lui a1, %hi(.LCPI16_5) -; RV64-NEXT: addi a1, a1, %lo(.LCPI16_5) +; RV64-NEXT: lui a1, %hi(.LCPI17_5) +; RV64-NEXT: addi a1, a1, %lo(.LCPI17_5) ; RV64-NEXT: vle16.v v20, (a1) ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 61 @@ -1378,6 +1395,20 @@ define void @vpstore_factor3(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> % ret void } +define void @vpstore_factor3_mask(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) { +; CHECK-LABEL: vpstore_factor3_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.i v0, 5 +; CHECK-NEXT: vsseg3e32.v v8, (a0), v0.t +; CHECK-NEXT: ret + %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> + %s1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <8 x i32> + %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <12 x i32> + tail call void @llvm.vp.store.v12i32.p0(<12 x i32> %interleaved.vec, ptr %ptr, <12 x i1> , i32 12) + ret void +} + define void @vpstore_factor4(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) { ; CHECK-LABEL: vpstore_factor4: ; CHECK: # %bb.0: @@ -1604,7 +1635,7 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_mask(ptr %ptr) { ; RV32-NEXT: vmv.s.x v11, a1 ; RV32-NEXT: lui a1, 1 ; RV32-NEXT: vmv.v.i v10, 8 -; RV32-NEXT: addi a1, a1, -33 +; RV32-NEXT: addi a1, a1, -43 ; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: li a1, 146 @@ -1612,8 +1643,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_mask(ptr %ptr) { ; RV32-NEXT: vle32.v v12, (a0), v0.t ; RV32-NEXT: li a0, 36 ; RV32-NEXT: vmv.s.x v20, a1 -; RV32-NEXT: lui a1, %hi(.LCPI40_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI40_0) +; RV32-NEXT: lui a1, %hi(.LCPI42_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI42_0) ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vle16.v v21, (a1) ; RV32-NEXT: vcompress.vm v8, v12, v11 @@ -1639,7 +1670,7 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_mask(ptr %ptr) { ; RV64-NEXT: vmv.s.x v20, a1 ; RV64-NEXT: lui a1, 1 ; RV64-NEXT: vmv.v.i v10, 8 -; RV64-NEXT: addi a1, a1, -33 +; RV64-NEXT: addi a1, a1, -43 ; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64-NEXT: vmv.s.x v0, a1 ; RV64-NEXT: li a1, 36 @@ -1667,7 +1698,7 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_mask(ptr %ptr) { ; RV64-NEXT: vrgatherei16.vv v10, v12, v9 ; RV64-NEXT: vmv1r.v v9, v14 ; RV64-NEXT: ret - %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> , i32 12) + %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> , i32 12) %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> @@ -1688,8 +1719,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_evl(ptr %ptr) { ; RV32-NEXT: vmv.s.x v10, a0 ; RV32-NEXT: li a0, 146 ; RV32-NEXT: vmv.s.x v11, a0 -; RV32-NEXT: lui a0, %hi(.LCPI41_0) -; RV32-NEXT: addi a0, a0, %lo(.LCPI41_0) +; RV32-NEXT: lui a0, %hi(.LCPI43_0) +; RV32-NEXT: addi a0, a0, %lo(.LCPI43_0) ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vle16.v v20, (a0) ; RV32-NEXT: li 
a0, 36 diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll index d6e1af59e6341..d0f35aa8b85e9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll @@ -616,59 +616,6 @@ define void @not_balanced_store_tree( %v0, ret void } -; We only support scalable vectors for now. -define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @not_scalable_vectors(ptr %ptr, i32 %evl) { -; RV32-LABEL: not_scalable_vectors: -; RV32: # %bb.0: -; RV32-NEXT: slli a1, a1, 2 -; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: li a0, 32 -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vnsrl.wx v12, v8, a0 -; RV32-NEXT: vnsrl.wi v11, v8, 0 -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vnsrl.wx v10, v11, a0 -; RV32-NEXT: vnsrl.wi v8, v11, 0 -; RV32-NEXT: vnsrl.wx v11, v12, a0 -; RV32-NEXT: vnsrl.wi v9, v12, 0 -; RV32-NEXT: ret -; -; RV64-LABEL: not_scalable_vectors: -; RV64: # %bb.0: -; RV64-NEXT: slli a1, a1, 34 -; RV64-NEXT: srli a1, a1, 32 -; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: li a0, 32 -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vnsrl.wx v12, v8, a0 -; RV64-NEXT: vnsrl.wi v11, v8, 0 -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV64-NEXT: vnsrl.wx v10, v11, a0 -; RV64-NEXT: vnsrl.wi v8, v11, 0 -; RV64-NEXT: vnsrl.wx v11, v12, a0 -; RV64-NEXT: vnsrl.wi v9, v12, 0 -; RV64-NEXT: ret - %rvl = mul i32 %evl, 4 - %wide.masked.load = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 %rvl) - %d0 = call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %wide.masked.load) - %d0.0 = extractvalue { <4 x i32>, <4 x i32> } %d0, 0 - %d0.1 = extractvalue { <4 x i32>, <4 x i32> } %d0, 1 - %d1 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d0.0) - %t0 = extractvalue { <2 x i32>, <2 x i32> } %d1, 0 - %t2 = extractvalue { <2 x i32>, <2 x i32> } %d1, 1 - %d2 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d0.1) - %t1 = extractvalue { <2 x i32>, <2 x i32> } %d2, 0 - %t3 = extractvalue { <2 x i32>, <2 x i32> } %d2, 1 - - %res0 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } poison, <2 x i32> %t0, 0 - %res1 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res0, <2 x i32> %t1, 1 - %res2 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res1, <2 x i32> %t2, 2 - %res3 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res2, <2 x i32> %t3, 3 - ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res3 -} - define {, } @not_same_mask( %mask0, %mask1, ptr %ptr, i32 %evl) { ; RV32-LABEL: not_same_mask: ; RV32: # %bb.0: From 9fd8830f6a368b2019633964492a932f0c4b7d70 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Fri, 18 Apr 2025 10:37:21 -0700 Subject: [PATCH 03/12] fixup! 
Minor fixes and add more tests --- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 1 - .../rvv/fixed-vectors-interleaved-access.ll | 107 ++++++++++++------ 2 files changed, 71 insertions(+), 37 deletions(-) diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 4bf05d06256e0..ce54eb130bb4b 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -45,7 +45,6 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index 0eb2fc3152b64..3df75060808b1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -207,6 +207,22 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3(ptr %ptr) { ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 } +; We only extract some of the fields. +define {<4 x i32>, <4 x i32>} @vpload_factor3_partial(ptr %ptr) { +; CHECK-LABEL: vpload_factor3_partial: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vlseg3e32.v v7, (a0) +; CHECK-NEXT: vmv1r.v v8, v7 +; CHECK-NEXT: ret + %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> splat (i1 true), i32 12) + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>} poison, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v2, 1 + ret {<4 x i32>, <4 x i32>} %res1 +} + ; Load a larger vector but only deinterleave a subset of the elements. define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_v16i32(ptr %ptr) { ; CHECK-LABEL: vpload_factor3_v16i32: @@ -224,6 +240,7 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_v16i32(ptr %ptr) { ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 } +; Make sure the mask is propagated. define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_mask(ptr %ptr) { ; CHECK-LABEL: vpload_factor3_mask: ; CHECK: # %bb.0: @@ -241,6 +258,24 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_mask(ptr %ptr) { ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 } +; Poison/undef in the shuffle mask shouldn't affect anything. 
+define {<4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor3_poison_shufflemask(ptr %ptr) { +; CHECK-LABEL: vpload_factor3_poison_shufflemask: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.i v0, 10 +; CHECK-NEXT: vlseg3e32.v v8, (a0), v0.t +; CHECK-NEXT: ret + %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> , i32 12) + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 +} + define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor4(ptr %ptr) { ; CHECK-LABEL: vpload_factor4: ; CHECK: # %bb.0: @@ -367,8 +402,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: li a2, 32 ; RV32-NEXT: lui a3, 12 ; RV32-NEXT: lui a6, 12291 -; RV32-NEXT: lui a7, %hi(.LCPI17_0) -; RV32-NEXT: addi a7, a7, %lo(.LCPI17_0) +; RV32-NEXT: lui a7, %hi(.LCPI19_0) +; RV32-NEXT: addi a7, a7, %lo(.LCPI19_0) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vle32.v v24, (a5) ; RV32-NEXT: vmv.s.x v0, a3 @@ -453,12 +488,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill ; RV32-NEXT: lui a7, 49164 -; RV32-NEXT: lui a1, %hi(.LCPI17_1) -; RV32-NEXT: addi a1, a1, %lo(.LCPI17_1) +; RV32-NEXT: lui a1, %hi(.LCPI19_1) +; RV32-NEXT: addi a1, a1, %lo(.LCPI19_1) ; RV32-NEXT: lui t2, 3 ; RV32-NEXT: lui t1, 196656 -; RV32-NEXT: lui a4, %hi(.LCPI17_3) -; RV32-NEXT: addi a4, a4, %lo(.LCPI17_3) +; RV32-NEXT: lui a4, %hi(.LCPI19_3) +; RV32-NEXT: addi a4, a4, %lo(.LCPI19_3) ; RV32-NEXT: lui t0, 786624 ; RV32-NEXT: li a5, 48 ; RV32-NEXT: lui a6, 768 @@ -637,8 +672,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV32-NEXT: vrgatherei16.vv v24, v8, v2 -; RV32-NEXT: lui a1, %hi(.LCPI17_2) -; RV32-NEXT: addi a1, a1, %lo(.LCPI17_2) +; RV32-NEXT: lui a1, %hi(.LCPI19_2) +; RV32-NEXT: addi a1, a1, %lo(.LCPI19_2) ; RV32-NEXT: lui a3, 3073 ; RV32-NEXT: addi a3, a3, -1024 ; RV32-NEXT: vmv.s.x v0, a3 @@ -702,16 +737,16 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vrgatherei16.vv v28, v8, v3 ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v28, v24 -; RV32-NEXT: lui a1, %hi(.LCPI17_4) -; RV32-NEXT: addi a1, a1, %lo(.LCPI17_4) -; RV32-NEXT: lui a2, %hi(.LCPI17_5) -; RV32-NEXT: addi a2, a2, %lo(.LCPI17_5) +; RV32-NEXT: lui a1, %hi(.LCPI19_4) +; RV32-NEXT: addi a1, a1, %lo(.LCPI19_4) +; RV32-NEXT: lui a2, %hi(.LCPI19_5) +; RV32-NEXT: addi a2, a2, %lo(.LCPI19_5) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v24, (a2) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV32-NEXT: vle16.v v8, (a1) -; RV32-NEXT: lui a1, %hi(.LCPI17_7) -; RV32-NEXT: addi a1, a1, %lo(.LCPI17_7) +; RV32-NEXT: lui a1, %hi(.LCPI19_7) +; RV32-NEXT: addi a1, a1, %lo(.LCPI19_7) ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma 
; RV32-NEXT: vle16.v v10, (a1) ; RV32-NEXT: csrr a1, vlenb @@ -739,14 +774,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vrgatherei16.vv v16, v0, v10 -; RV32-NEXT: lui a1, %hi(.LCPI17_6) -; RV32-NEXT: addi a1, a1, %lo(.LCPI17_6) -; RV32-NEXT: lui a2, %hi(.LCPI17_8) -; RV32-NEXT: addi a2, a2, %lo(.LCPI17_8) +; RV32-NEXT: lui a1, %hi(.LCPI19_6) +; RV32-NEXT: addi a1, a1, %lo(.LCPI19_6) +; RV32-NEXT: lui a2, %hi(.LCPI19_8) +; RV32-NEXT: addi a2, a2, %lo(.LCPI19_8) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV32-NEXT: vle16.v v4, (a1) -; RV32-NEXT: lui a1, %hi(.LCPI17_9) -; RV32-NEXT: addi a1, a1, %lo(.LCPI17_9) +; RV32-NEXT: lui a1, %hi(.LCPI19_9) +; RV32-NEXT: addi a1, a1, %lo(.LCPI19_9) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v6, (a1) ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma @@ -833,8 +868,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: li a4, 128 ; RV64-NEXT: lui a1, 1 ; RV64-NEXT: vle64.v v8, (a3) -; RV64-NEXT: lui a3, %hi(.LCPI17_0) -; RV64-NEXT: addi a3, a3, %lo(.LCPI17_0) +; RV64-NEXT: lui a3, %hi(.LCPI19_0) +; RV64-NEXT: addi a3, a3, %lo(.LCPI19_0) ; RV64-NEXT: vmv.s.x v0, a4 ; RV64-NEXT: csrr a4, vlenb ; RV64-NEXT: li a5, 61 @@ -1022,8 +1057,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vslideup.vi v12, v16, 1, v0.t -; RV64-NEXT: lui a2, %hi(.LCPI17_1) -; RV64-NEXT: addi a2, a2, %lo(.LCPI17_1) +; RV64-NEXT: lui a2, %hi(.LCPI19_1) +; RV64-NEXT: addi a2, a2, %lo(.LCPI19_1) ; RV64-NEXT: li a3, 192 ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vle16.v v6, (a2) @@ -1057,8 +1092,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vrgatherei16.vv v24, v16, v6 ; RV64-NEXT: addi a2, sp, 16 ; RV64-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill -; RV64-NEXT: lui a2, %hi(.LCPI17_2) -; RV64-NEXT: addi a2, a2, %lo(.LCPI17_2) +; RV64-NEXT: lui a2, %hi(.LCPI19_2) +; RV64-NEXT: addi a2, a2, %lo(.LCPI19_2) ; RV64-NEXT: li a3, 1040 ; RV64-NEXT: vmv.s.x v0, a3 ; RV64-NEXT: addi a1, a1, -2016 @@ -1142,12 +1177,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill -; RV64-NEXT: lui a1, %hi(.LCPI17_3) -; RV64-NEXT: addi a1, a1, %lo(.LCPI17_3) +; RV64-NEXT: lui a1, %hi(.LCPI19_3) +; RV64-NEXT: addi a1, a1, %lo(.LCPI19_3) ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vle16.v v20, (a1) -; RV64-NEXT: lui a1, %hi(.LCPI17_4) -; RV64-NEXT: addi a1, a1, %lo(.LCPI17_4) +; RV64-NEXT: lui a1, %hi(.LCPI19_4) +; RV64-NEXT: addi a1, a1, %lo(.LCPI19_4) ; RV64-NEXT: vle16.v v8, (a1) ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 77 @@ -1198,8 +1233,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vl2r.v v8, (a1) # vscale x 16-byte Folded Reload ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vrgatherei16.vv v0, v16, v8 -; RV64-NEXT: lui a1, %hi(.LCPI17_5) -; RV64-NEXT: addi a1, a1, %lo(.LCPI17_5) +; RV64-NEXT: lui a1, %hi(.LCPI19_5) +; RV64-NEXT: addi a1, a1, %lo(.LCPI19_5) ; RV64-NEXT: vle16.v v20, (a1) ; RV64-NEXT: 
csrr a1, vlenb ; RV64-NEXT: li a2, 61 @@ -1643,8 +1678,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_mask(ptr %ptr) { ; RV32-NEXT: vle32.v v12, (a0), v0.t ; RV32-NEXT: li a0, 36 ; RV32-NEXT: vmv.s.x v20, a1 -; RV32-NEXT: lui a1, %hi(.LCPI42_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI42_0) +; RV32-NEXT: lui a1, %hi(.LCPI44_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI44_0) ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vle16.v v21, (a1) ; RV32-NEXT: vcompress.vm v8, v12, v11 @@ -1719,8 +1754,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_evl(ptr %ptr) { ; RV32-NEXT: vmv.s.x v10, a0 ; RV32-NEXT: li a0, 146 ; RV32-NEXT: vmv.s.x v11, a0 -; RV32-NEXT: lui a0, %hi(.LCPI43_0) -; RV32-NEXT: addi a0, a0, %lo(.LCPI43_0) +; RV32-NEXT: lui a0, %hi(.LCPI45_0) +; RV32-NEXT: addi a0, a0, %lo(.LCPI45_0) ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vle16.v v20, (a0) ; RV32-NEXT: li a0, 36 From ae3113201afc3e1d3fe656a969a0def0dc44a1ba Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Fri, 18 Apr 2025 13:41:24 -0700 Subject: [PATCH 04/12] fixup! Address reviewer comments --- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index ce54eb130bb4b..66c1cd98ef602 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -249,22 +249,17 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor, return false; } -/// Return true if it's a non-all-zeros, interleaving mask. For instance, -/// 111000111000 is interleaved from three 1010 masks. -/// \p SubMask returns the mask of individual lane. +/// Return true if it's an interleaving mask. For instance, 111000111000 is +/// interleaved from three 1010 masks. \p SubMask returns the mask of individual +/// lane. static bool isInterleavedConstantMask(unsigned Factor, ConstantVector *Mask, SmallVectorImpl &LaneMask) { unsigned LaneMaskLen = LaneMask.size(); if (auto *Splat = Mask->getSplatValue()) { - // All-zeros mask. - if (Splat->isZeroValue()) - return false; - // All-ones mask. - std::fill(LaneMask.begin(), LaneMask.end(), - ConstantInt::getTrue(Mask->getContext())); + std::fill(LaneMask.begin(), LaneMask.end(), Splat); } else { for (unsigned Idx = 0U, N = LaneMaskLen * Factor; Idx < N; ++Idx) { - Constant *Ref = Mask->getAggregateElement((Idx / Factor) * Factor); + Constant *Ref = Mask->getAggregateElement(alignDown(Idx, Factor)); if (Ref != Mask->getAggregateElement(Idx)) return false; LaneMask[Idx / Factor] = Ref; From 4a8852f8971dc78d194bd4ac736f077ef0de8415 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Mon, 21 Apr 2025 10:34:55 -0700 Subject: [PATCH 05/12] fixup! Address review comments --- .../rvv/fixed-vectors-interleaved-access.ll | 97 ++++++++++++------- 1 file changed, 61 insertions(+), 36 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index 3df75060808b1..c0a6456f01075 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -294,6 +294,31 @@ define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor4(ptr %ptr) { ret {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res3 } +; TODO: Add more tests for vp.load/store + (de)interleave intrinsics with fixed vectors. 
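The vp.load/vp.store tests in this file exercise the invariant the pass checks for constant masks: a wide mask is only accepted when it is an interleaving of a single per-lane mask, e.g. for factor 3 the wide mask 111000111000 de-interleaves into the lane mask 1010, while a mask whose groups of Factor elements are not homogeneous (as in the invalid_vp_mask test) is rejected. Below is a minimal standalone C++ sketch of that grouping check; the helper name is made up and plain bools stand in for the Constant* elements the pass actually inspects, so it illustrates the logic only, not the LLVM implementation.

#include <cassert>
#include <cstdio>
#include <optional>
#include <vector>

// De-interleave a constant wide mask into one per-lane mask. Mirrors the
// element-wise walk in the pass: every group of Factor consecutive elements
// must be identical, and group g becomes element g of the lane mask.
static std::optional<std::vector<bool>>
deinterleaveConstantMask(const std::vector<bool> &WideMask, unsigned Factor) {
  assert(Factor > 1 && WideMask.size() % Factor == 0);
  std::vector<bool> LaneMask(WideMask.size() / Factor);
  for (size_t Idx = 0; Idx < WideMask.size(); ++Idx) {
    // Reference element is the first element of this group of Factor lanes,
    // i.e. index alignDown(Idx, Factor).
    bool Ref = WideMask[(Idx / Factor) * Factor];
    if (WideMask[Idx] != Ref)
      return std::nullopt; // Not an interleaving of one lane mask; bail out.
    LaneMask[Idx / Factor] = Ref;
  }
  return LaneMask;
}

int main() {
  // Factor 3, wide mask 111000111000 -> lane mask 1010.
  std::vector<bool> Wide = {1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0};
  if (auto Lane = deinterleaveConstantMask(Wide, 3)) {
    for (bool B : *Lane)
      std::printf("%d", B ? 1 : 0); // prints 1010
    std::printf("\n");
  }
  return 0;
}

Splat masks (all-ones, and in the later getMask form all-zeros as well) are handled separately in the pass before this element-wise walk, so they never reach the homogeneity loop sketched above.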
+define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @vpload_factor4_intrinsics(ptr %ptr) { +; CHECK-LABEL: vpload_factor4_intrinsics: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vlseg4e32.v v8, (a0) +; CHECK-NEXT: ret + %wide.masked.load = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr %ptr, <8 x i1> splat (i1 true), i32 8) + %d0 = call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %wide.masked.load) + %d0.0 = extractvalue { <4 x i32>, <4 x i32> } %d0, 0 + %d0.1 = extractvalue { <4 x i32>, <4 x i32> } %d0, 1 + %d1 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d0.0) + %t0 = extractvalue { <2 x i32>, <2 x i32> } %d1, 0 + %t2 = extractvalue { <2 x i32>, <2 x i32> } %d1, 1 + %d2 = call { <2 x i32>, <2 x i32> } @llvm.vector.deinterleave2.v4i32(<4 x i32> %d0.1) + %t1 = extractvalue { <2 x i32>, <2 x i32> } %d2, 0 + %t3 = extractvalue { <2 x i32>, <2 x i32> } %d2, 1 + + %res0 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } poison, <2 x i32> %t0, 0 + %res1 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res0, <2 x i32> %t1, 1 + %res2 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res1, <2 x i32> %t2, 2 + %res3 = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res2, <2 x i32> %t3, 3 + ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %res3 +} + define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @vpload_factor5(ptr %ptr) { ; CHECK-LABEL: vpload_factor5: ; CHECK: # %bb.0: @@ -402,8 +427,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: li a2, 32 ; RV32-NEXT: lui a3, 12 ; RV32-NEXT: lui a6, 12291 -; RV32-NEXT: lui a7, %hi(.LCPI19_0) -; RV32-NEXT: addi a7, a7, %lo(.LCPI19_0) +; RV32-NEXT: lui a7, %hi(.LCPI20_0) +; RV32-NEXT: addi a7, a7, %lo(.LCPI20_0) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vle32.v v24, (a5) ; RV32-NEXT: vmv.s.x v0, a3 @@ -488,12 +513,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill ; RV32-NEXT: lui a7, 49164 -; RV32-NEXT: lui a1, %hi(.LCPI19_1) -; RV32-NEXT: addi a1, a1, %lo(.LCPI19_1) +; RV32-NEXT: lui a1, %hi(.LCPI20_1) +; RV32-NEXT: addi a1, a1, %lo(.LCPI20_1) ; RV32-NEXT: lui t2, 3 ; RV32-NEXT: lui t1, 196656 -; RV32-NEXT: lui a4, %hi(.LCPI19_3) -; RV32-NEXT: addi a4, a4, %lo(.LCPI19_3) +; RV32-NEXT: lui a4, %hi(.LCPI20_3) +; RV32-NEXT: addi a4, a4, %lo(.LCPI20_3) ; RV32-NEXT: lui t0, 786624 ; RV32-NEXT: li a5, 48 ; RV32-NEXT: lui a6, 768 @@ -672,8 +697,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV32-NEXT: vrgatherei16.vv v24, v8, v2 -; RV32-NEXT: lui a1, %hi(.LCPI19_2) -; RV32-NEXT: addi a1, a1, %lo(.LCPI19_2) +; RV32-NEXT: lui a1, %hi(.LCPI20_2) +; RV32-NEXT: addi a1, a1, %lo(.LCPI20_2) ; RV32-NEXT: lui a3, 3073 ; RV32-NEXT: addi a3, a3, -1024 ; RV32-NEXT: vmv.s.x v0, a3 @@ -737,16 +762,16 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vrgatherei16.vv v28, v8, v3 ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v28, v24 -; RV32-NEXT: lui a1, %hi(.LCPI19_4) -; RV32-NEXT: addi a1, a1, %lo(.LCPI19_4) -; RV32-NEXT: lui a2, %hi(.LCPI19_5) -; RV32-NEXT: addi a2, a2, %lo(.LCPI19_5) +; RV32-NEXT: lui a1, %hi(.LCPI20_4) 
+; RV32-NEXT: addi a1, a1, %lo(.LCPI20_4) +; RV32-NEXT: lui a2, %hi(.LCPI20_5) +; RV32-NEXT: addi a2, a2, %lo(.LCPI20_5) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v24, (a2) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV32-NEXT: vle16.v v8, (a1) -; RV32-NEXT: lui a1, %hi(.LCPI19_7) -; RV32-NEXT: addi a1, a1, %lo(.LCPI19_7) +; RV32-NEXT: lui a1, %hi(.LCPI20_7) +; RV32-NEXT: addi a1, a1, %lo(.LCPI20_7) ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vle16.v v10, (a1) ; RV32-NEXT: csrr a1, vlenb @@ -774,14 +799,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vrgatherei16.vv v16, v0, v10 -; RV32-NEXT: lui a1, %hi(.LCPI19_6) -; RV32-NEXT: addi a1, a1, %lo(.LCPI19_6) -; RV32-NEXT: lui a2, %hi(.LCPI19_8) -; RV32-NEXT: addi a2, a2, %lo(.LCPI19_8) +; RV32-NEXT: lui a1, %hi(.LCPI20_6) +; RV32-NEXT: addi a1, a1, %lo(.LCPI20_6) +; RV32-NEXT: lui a2, %hi(.LCPI20_8) +; RV32-NEXT: addi a2, a2, %lo(.LCPI20_8) ; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RV32-NEXT: vle16.v v4, (a1) -; RV32-NEXT: lui a1, %hi(.LCPI19_9) -; RV32-NEXT: addi a1, a1, %lo(.LCPI19_9) +; RV32-NEXT: lui a1, %hi(.LCPI20_9) +; RV32-NEXT: addi a1, a1, %lo(.LCPI20_9) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV32-NEXT: vle16.v v6, (a1) ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma @@ -868,8 +893,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: li a4, 128 ; RV64-NEXT: lui a1, 1 ; RV64-NEXT: vle64.v v8, (a3) -; RV64-NEXT: lui a3, %hi(.LCPI19_0) -; RV64-NEXT: addi a3, a3, %lo(.LCPI19_0) +; RV64-NEXT: lui a3, %hi(.LCPI20_0) +; RV64-NEXT: addi a3, a3, %lo(.LCPI20_0) ; RV64-NEXT: vmv.s.x v0, a4 ; RV64-NEXT: csrr a4, vlenb ; RV64-NEXT: li a5, 61 @@ -1057,8 +1082,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vslideup.vi v12, v16, 1, v0.t -; RV64-NEXT: lui a2, %hi(.LCPI19_1) -; RV64-NEXT: addi a2, a2, %lo(.LCPI19_1) +; RV64-NEXT: lui a2, %hi(.LCPI20_1) +; RV64-NEXT: addi a2, a2, %lo(.LCPI20_1) ; RV64-NEXT: li a3, 192 ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vle16.v v6, (a2) @@ -1092,8 +1117,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vrgatherei16.vv v24, v16, v6 ; RV64-NEXT: addi a2, sp, 16 ; RV64-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill -; RV64-NEXT: lui a2, %hi(.LCPI19_2) -; RV64-NEXT: addi a2, a2, %lo(.LCPI19_2) +; RV64-NEXT: lui a2, %hi(.LCPI20_2) +; RV64-NEXT: addi a2, a2, %lo(.LCPI20_2) ; RV64-NEXT: li a3, 1040 ; RV64-NEXT: vmv.s.x v0, a3 ; RV64-NEXT: addi a1, a1, -2016 @@ -1177,12 +1202,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill -; RV64-NEXT: lui a1, %hi(.LCPI19_3) -; RV64-NEXT: addi a1, a1, %lo(.LCPI19_3) +; RV64-NEXT: lui a1, %hi(.LCPI20_3) +; RV64-NEXT: addi a1, a1, %lo(.LCPI20_3) ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vle16.v v20, (a1) -; RV64-NEXT: lui a1, %hi(.LCPI19_4) -; RV64-NEXT: addi a1, a1, %lo(.LCPI19_4) +; RV64-NEXT: lui a1, %hi(.LCPI20_4) +; RV64-NEXT: addi a1, a1, %lo(.LCPI20_4) ; RV64-NEXT: vle16.v v8, (a1) ; RV64-NEXT: csrr a1, 
vlenb ; RV64-NEXT: li a2, 77 @@ -1233,8 +1258,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vl2r.v v8, (a1) # vscale x 16-byte Folded Reload ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vrgatherei16.vv v0, v16, v8 -; RV64-NEXT: lui a1, %hi(.LCPI19_5) -; RV64-NEXT: addi a1, a1, %lo(.LCPI19_5) +; RV64-NEXT: lui a1, %hi(.LCPI20_5) +; RV64-NEXT: addi a1, a1, %lo(.LCPI20_5) ; RV64-NEXT: vle16.v v20, (a1) ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 61 @@ -1678,8 +1703,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_mask(ptr %ptr) { ; RV32-NEXT: vle32.v v12, (a0), v0.t ; RV32-NEXT: li a0, 36 ; RV32-NEXT: vmv.s.x v20, a1 -; RV32-NEXT: lui a1, %hi(.LCPI44_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI44_0) +; RV32-NEXT: lui a1, %hi(.LCPI45_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI45_0) ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vle16.v v21, (a1) ; RV32-NEXT: vcompress.vm v8, v12, v11 @@ -1754,8 +1779,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_evl(ptr %ptr) { ; RV32-NEXT: vmv.s.x v10, a0 ; RV32-NEXT: li a0, 146 ; RV32-NEXT: vmv.s.x v11, a0 -; RV32-NEXT: lui a0, %hi(.LCPI45_0) -; RV32-NEXT: addi a0, a0, %lo(.LCPI45_0) +; RV32-NEXT: lui a0, %hi(.LCPI46_0) +; RV32-NEXT: addi a0, a0, %lo(.LCPI46_0) ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vle16.v v20, (a0) ; RV32-NEXT: li a0, 36 From 5a09f293f97c2f9d3717212f0731c6c948ed494d Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Fri, 25 Apr 2025 10:35:45 -0700 Subject: [PATCH 06/12] fixup! Address review comments --- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 422 ++++++++++---------- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 3 +- 2 files changed, 211 insertions(+), 214 deletions(-) diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 66c1cd98ef602..06598d14e8db0 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -249,26 +249,200 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor, return false; } -/// Return true if it's an interleaving mask. For instance, 111000111000 is -/// interleaved from three 1010 masks. \p SubMask returns the mask of individual -/// lane. -static bool isInterleavedConstantMask(unsigned Factor, ConstantVector *Mask, - SmallVectorImpl &LaneMask) { - unsigned LaneMaskLen = LaneMask.size(); - if (auto *Splat = Mask->getSplatValue()) { - std::fill(LaneMask.begin(), LaneMask.end(), Splat); - } else { - for (unsigned Idx = 0U, N = LaneMaskLen * Factor; Idx < N; ++Idx) { - Constant *Ref = Mask->getAggregateElement(alignDown(Idx, Factor)); - if (Ref != Mask->getAggregateElement(Idx)) +// For an (de)interleave tree like this: +// +// A C B D +// |___| |___| +// |_____| +// | +// A B C D +// +// We will get ABCD at the end while the leaf operands/results +// are ACBD, which are also what we initially collected in +// getVectorInterleaveFactor / getVectorDeinterleaveFactor. But TLI +// hooks (e.g. lowerDeinterleaveIntrinsicToLoad) expect ABCD, so we need +// to reorder them by interleaving these values. +static void interleaveLeafValues(MutableArrayRef SubLeaves) { + unsigned NumLeaves = SubLeaves.size(); + if (NumLeaves == 2) + return; + + assert(isPowerOf2_32(NumLeaves) && NumLeaves > 1); + + const unsigned HalfLeaves = NumLeaves / 2; + // Visit the sub-trees. 
+ interleaveLeafValues(SubLeaves.take_front(HalfLeaves)); + interleaveLeafValues(SubLeaves.drop_front(HalfLeaves)); + + SmallVector Buffer; + // a0 a1 a2 a3 b0 b1 b2 b3 + // -> a0 b0 a1 b1 a2 b2 a3 b3 + for (unsigned i = 0U; i < NumLeaves; ++i) + Buffer.push_back(SubLeaves[i / 2 + (i % 2 ? HalfLeaves : 0)]); + + llvm::copy(Buffer, SubLeaves.begin()); +} + +static bool +getVectorInterleaveFactor(IntrinsicInst *II, SmallVectorImpl &Operands, + SmallVectorImpl &DeadInsts) { + assert(II->getIntrinsicID() == Intrinsic::vector_interleave2); + + // Visit with BFS + SmallVector Queue; + Queue.push_back(II); + while (!Queue.empty()) { + IntrinsicInst *Current = Queue.front(); + Queue.erase(Queue.begin()); + + // All the intermediate intrinsics will be deleted. + DeadInsts.push_back(Current); + + for (unsigned I = 0; I < 2; ++I) { + Value *Op = Current->getOperand(I); + if (auto *OpII = dyn_cast(Op)) + if (OpII->getIntrinsicID() == Intrinsic::vector_interleave2) { + Queue.push_back(OpII); + continue; + } + + // If this is not a perfectly balanced tree, the leaf + // result types would be different. + if (!Operands.empty() && Op->getType() != Operands.back()->getType()) + return false; + + Operands.push_back(Op); + } + } + + const unsigned Factor = Operands.size(); + // Currently we only recognize power-of-two factors. + // FIXME: should we assert here instead? + if (Factor <= 1 || !isPowerOf2_32(Factor)) + return false; + + interleaveLeafValues(Operands); + return true; +} + +static bool +getVectorDeinterleaveFactor(IntrinsicInst *II, + SmallVectorImpl &Results, + SmallVectorImpl &DeadInsts) { + assert(II->getIntrinsicID() == Intrinsic::vector_deinterleave2); + using namespace PatternMatch; + if (!II->hasNUses(2)) + return false; + + // Visit with BFS + SmallVector Queue; + Queue.push_back(II); + while (!Queue.empty()) { + IntrinsicInst *Current = Queue.front(); + Queue.erase(Queue.begin()); + assert(Current->hasNUses(2)); + + // All the intermediate intrinsics will be deleted from the bottom-up. + DeadInsts.insert(DeadInsts.begin(), Current); + + ExtractValueInst *LHS = nullptr, *RHS = nullptr; + for (User *Usr : Current->users()) { + if (!isa(Usr)) + return 0; + + auto *EV = cast(Usr); + // Intermediate ExtractValue instructions will also be deleted. + DeadInsts.insert(DeadInsts.begin(), EV); + ArrayRef Indices = EV->getIndices(); + if (Indices.size() != 1) + return false; + + if (Indices[0] == 0 && !LHS) + LHS = EV; + else if (Indices[0] == 1 && !RHS) + RHS = EV; + else + return false; + } + + // We have legal indices. At this point we're either going + // to continue the traversal or push the leaf values into Results. + for (ExtractValueInst *EV : {LHS, RHS}) { + // Continue the traversal. We're playing safe here and matching only the + // expression consisting of a perfectly balanced binary tree in which all + // intermediate values are only used once. + if (EV->hasOneUse() && + match(EV->user_back(), + m_Intrinsic()) && + EV->user_back()->hasNUses(2)) { + auto *EVUsr = cast(EV->user_back()); + Queue.push_back(EVUsr); + continue; + } + + // If this is not a perfectly balanced tree, the leaf + // result types would be different. + if (!Results.empty() && EV->getType() != Results.back()->getType()) return false; - LaneMask[Idx / Factor] = Ref; + + // Save the leaf value. + Results.push_back(EV); } } + const unsigned Factor = Results.size(); + // Currently we only recognize power-of-two factors. + // FIXME: should we assert here instead? 
+ if (Factor <= 1 || !isPowerOf2_32(Factor)) + return 0; + + interleaveLeafValues(Results); return true; } +// Return the corresponded deinterleaved mask, or nullptr if there is no valid +// mask. +static Value *getMask(Value *WideMask, unsigned Factor, + ElementCount LeafValueEC) { + using namespace llvm::PatternMatch; + if (auto *IMI = dyn_cast(WideMask)) { + SmallVector Operands; + SmallVector DeadInsts; + if (getVectorInterleaveFactor(IMI, Operands, DeadInsts)) { + assert(!Operands.empty()); + if (Operands.size() == Factor && llvm::all_equal(Operands)) + return Operands[0]; + } + } + + if (auto *ConstMask = dyn_cast(WideMask)) { + if (auto *Splat = ConstMask->getSplatValue()) { + // All-ones or all-zeros mask. + return ConstantVector::getSplat(LeafValueEC, Splat); + } else if (LeafValueEC.isFixed()) { + unsigned LeafMaskLen = LeafValueEC.getFixedValue(); + SmallVector LeafMask(LeafMaskLen, nullptr); + // If this is a fixed-length constant mask, each lane / leaf has to + // use the same mask. This is done by checking if every group with Factor + // number of elements in the interleaved mask has homogeneous values. + for (unsigned Idx = 0U, N = LeafMaskLen * Factor; Idx < N; ++Idx) { + Constant *Ref = ConstMask->getAggregateElement(alignDown(Idx, Factor)); + if (Ref != ConstMask->getAggregateElement(Idx)) + return nullptr; + LeafMask[Idx / Factor] = Ref; + } + + return ConstantVector::get(LeafMask); + } + } + + return nullptr; +} +static Value *getMask(Value *WideMask, unsigned Factor, + VectorType *LeafValueTy) { + return getMask(WideMask, Factor, LeafValueTy->getElementCount()); +} + bool InterleavedAccessImpl::lowerInterleavedLoad( Instruction *LoadOp, SmallSetVector &DeadInsts) { if (isa(LoadOp->getType())) @@ -279,7 +453,7 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( return false; } else if (auto *VPLoad = dyn_cast(LoadOp)) { assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load); - // Require a constant mask and evl. + // Require a constant mask. if (!isa(VPLoad->getArgOperand(1))) return false; } else { @@ -373,29 +547,27 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( bool BinOpShuffleChanged = replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, LoadOp); - // Check if the de-interleaved vp.load masks are the same. - unsigned ShuffleMaskLen = Shuffles[0]->getShuffleMask().size(); - SmallVector LaneMask(ShuffleMaskLen, nullptr); if (auto *VPLoad = dyn_cast(LoadOp)) { - if (!isInterleavedConstantMask( - Factor, cast(VPLoad->getArgOperand(1)), LaneMask)) + Value *LaneMask = getMask(VPLoad->getArgOperand(1), Factor, + cast(Shuffles[0]->getType())); + if (!LaneMask) return false; - } - LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LoadOp << "\n"); + LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load: " << *LoadOp + << "\n"); - if (auto *VPLoad = dyn_cast(LoadOp)) { - auto *MaskVec = ConstantVector::get(LaneMask); // Sometimes the number of Shuffles might be less than Factor, we have to // fill the gaps with null. Also, lowerDeinterleavedVPLoad // expects them to be sorted. SmallVector ShuffleValues(Factor, nullptr); for (auto [Idx, ShuffleMaskIdx] : enumerate(Indices)) ShuffleValues[ShuffleMaskIdx] = Shuffles[Idx]; - if (!TLI->lowerDeinterleavedVPLoad(VPLoad, MaskVec, ShuffleValues)) + if (!TLI->lowerDeinterleavedVPLoad(VPLoad, LaneMask, ShuffleValues)) // If Extracts is not empty, tryReplaceExtracts made changes earlier. 
return !Extracts.empty() || BinOpShuffleChanged; } else { + LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LoadOp << "\n"); + // Try to create target specific intrinsics to replace the load and // shuffles. if (!TLI->lowerInterleavedLoad(cast(LoadOp), Shuffles, Indices, @@ -539,17 +711,19 @@ bool InterleavedAccessImpl::lowerInterleavedStore( "number of stored element should be a multiple of Factor"); // Check if the de-interleaved vp.store masks are the same. - unsigned LaneMaskLen = NumStoredElements / Factor; - SmallVector LaneMask(LaneMaskLen, nullptr); if (auto *VPStore = dyn_cast(StoreOp)) { - if (!isInterleavedConstantMask( - Factor, cast(VPStore->getArgOperand(2)), LaneMask)) - return false; } - LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *StoreOp << "\n"); - if (auto *VPStore = dyn_cast(StoreOp)) { + unsigned LaneMaskLen = NumStoredElements / Factor; + Value *LaneMask = getMask(VPStore->getArgOperand(2), Factor, + ElementCount::getFixed(LaneMaskLen)); + if (!LaneMask) + return false; + + LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store: " << *StoreOp + << "\n"); + IRBuilder<> Builder(VPStore); // We need to effectively de-interleave the shufflemask // because lowerInterleavedVPStore expected individual de-interleaved @@ -568,11 +742,13 @@ bool InterleavedAccessImpl::lowerInterleavedStore( // Try to create target specific intrinsics to replace the vp.store and // shuffle. - if (!TLI->lowerInterleavedVPStore(VPStore, ConstantVector::get(LaneMask), - NewShuffles)) + if (!TLI->lowerInterleavedVPStore(VPStore, LaneMask, NewShuffles)) // We already created new shuffles. return true; } else { + LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *StoreOp + << "\n"); + // Try to create target specific intrinsics to replace the store and // shuffle. if (!TLI->lowerInterleavedStore(cast(StoreOp), SVI, Factor)) @@ -585,184 +761,6 @@ bool InterleavedAccessImpl::lowerInterleavedStore( return true; } -// For an (de)interleave tree like this: -// -// A C B D -// |___| |___| -// |_____| -// | -// A B C D -// -// We will get ABCD at the end while the leaf operands/results -// are ACBD, which are also what we initially collected in -// getVectorInterleaveFactor / getVectorDeinterleaveFactor. But TLI -// hooks (e.g. lowerDeinterleaveIntrinsicToLoad) expect ABCD, so we need -// to reorder them by interleaving these values. -static void interleaveLeafValues(MutableArrayRef SubLeaves) { - unsigned NumLeaves = SubLeaves.size(); - if (NumLeaves == 2) - return; - - assert(isPowerOf2_32(NumLeaves) && NumLeaves > 1); - - const unsigned HalfLeaves = NumLeaves / 2; - // Visit the sub-trees. - interleaveLeafValues(SubLeaves.take_front(HalfLeaves)); - interleaveLeafValues(SubLeaves.drop_front(HalfLeaves)); - - SmallVector Buffer; - // a0 a1 a2 a3 b0 b1 b2 b3 - // -> a0 b0 a1 b1 a2 b2 a3 b3 - for (unsigned i = 0U; i < NumLeaves; ++i) - Buffer.push_back(SubLeaves[i / 2 + (i % 2 ? HalfLeaves : 0)]); - - llvm::copy(Buffer, SubLeaves.begin()); -} - -static bool -getVectorInterleaveFactor(IntrinsicInst *II, SmallVectorImpl &Operands, - SmallVectorImpl &DeadInsts) { - assert(II->getIntrinsicID() == Intrinsic::vector_interleave2); - - // Visit with BFS - SmallVector Queue; - Queue.push_back(II); - while (!Queue.empty()) { - IntrinsicInst *Current = Queue.front(); - Queue.erase(Queue.begin()); - - // All the intermediate intrinsics will be deleted. 
- DeadInsts.push_back(Current); - - for (unsigned I = 0; I < 2; ++I) { - Value *Op = Current->getOperand(I); - if (auto *OpII = dyn_cast(Op)) - if (OpII->getIntrinsicID() == Intrinsic::vector_interleave2) { - Queue.push_back(OpII); - continue; - } - - // If this is not a perfectly balanced tree, the leaf - // result types would be different. - if (!Operands.empty() && Op->getType() != Operands.back()->getType()) - return false; - - Operands.push_back(Op); - } - } - - const unsigned Factor = Operands.size(); - // Currently we only recognize power-of-two factors. - // FIXME: should we assert here instead? - if (Factor <= 1 || !isPowerOf2_32(Factor)) - return false; - - interleaveLeafValues(Operands); - return true; -} - -static bool -getVectorDeinterleaveFactor(IntrinsicInst *II, - SmallVectorImpl &Results, - SmallVectorImpl &DeadInsts) { - assert(II->getIntrinsicID() == Intrinsic::vector_deinterleave2); - using namespace PatternMatch; - if (!II->hasNUses(2)) - return false; - - // Visit with BFS - SmallVector Queue; - Queue.push_back(II); - while (!Queue.empty()) { - IntrinsicInst *Current = Queue.front(); - Queue.erase(Queue.begin()); - assert(Current->hasNUses(2)); - - // All the intermediate intrinsics will be deleted from the bottom-up. - DeadInsts.insert(DeadInsts.begin(), Current); - - ExtractValueInst *LHS = nullptr, *RHS = nullptr; - for (User *Usr : Current->users()) { - if (!isa(Usr)) - return 0; - - auto *EV = cast(Usr); - // Intermediate ExtractValue instructions will also be deleted. - DeadInsts.insert(DeadInsts.begin(), EV); - ArrayRef Indices = EV->getIndices(); - if (Indices.size() != 1) - return false; - - if (Indices[0] == 0 && !LHS) - LHS = EV; - else if (Indices[0] == 1 && !RHS) - RHS = EV; - else - return false; - } - - // We have legal indices. At this point we're either going - // to continue the traversal or push the leaf values into Results. - for (ExtractValueInst *EV : {LHS, RHS}) { - // Continue the traversal. We're playing safe here and matching only the - // expression consisting of a perfectly balanced binary tree in which all - // intermediate values are only used once. - if (EV->hasOneUse() && - match(EV->user_back(), - m_Intrinsic()) && - EV->user_back()->hasNUses(2)) { - auto *EVUsr = cast(EV->user_back()); - Queue.push_back(EVUsr); - continue; - } - - // If this is not a perfectly balanced tree, the leaf - // result types would be different. - if (!Results.empty() && EV->getType() != Results.back()->getType()) - return false; - - // Save the leaf value. - Results.push_back(EV); - } - } - - const unsigned Factor = Results.size(); - // Currently we only recognize power-of-two factors. - // FIXME: should we assert here instead? - if (Factor <= 1 || !isPowerOf2_32(Factor)) - return 0; - - interleaveLeafValues(Results); - return true; -} - -// Return the corresponded deinterleaved mask, or nullptr if there is no valid -// mask. -static Value *getMask(Value *WideMask, unsigned Factor, - VectorType *LeafValueTy) { - using namespace llvm::PatternMatch; - if (auto *IMI = dyn_cast(WideMask)) { - SmallVector Operands; - SmallVector DeadInsts; - if (getVectorInterleaveFactor(IMI, Operands, DeadInsts)) { - assert(!Operands.empty()); - if (Operands.size() == Factor && llvm::all_equal(Operands)) - return Operands[0]; - } - } - - if (match(WideMask, m_AllOnes())) { - // Scale the vector length of all-ones mask. 
- ElementCount OrigEC = - cast(WideMask->getType())->getElementCount(); - assert(OrigEC.getKnownMinValue() % Factor == 0); - return ConstantVector::getSplat(OrigEC.divideCoefficientBy(Factor), - cast(WideMask)->getSplatValue()); - } - - return nullptr; -} - bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( IntrinsicInst *DI, SmallSetVector &DeadInsts) { Value *LoadedVal = DI->getOperand(0); @@ -780,7 +778,7 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( if (auto *VPLoad = dyn_cast(LoadedVal)) { if (VPLoad->getIntrinsicID() != Intrinsic::vp_load) return false; - // Check mask operand. Handle both all-true and interleaved mask. + // Check mask operand. Handle both all-true/false and interleaved mask. Value *WideMask = VPLoad->getOperand(1); Value *Mask = getMask(WideMask, Factor, cast(DeinterleaveValues[0]->getType())); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index df545f5e06c18..797ec6f8e0492 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -23933,8 +23933,7 @@ bool RISCVTargetLowering::lowerInterleavedVPStore( Intrinsic::riscv_seg6_store_mask, Intrinsic::riscv_seg7_store_mask, Intrinsic::riscv_seg8_store_mask}; - SmallVector Operands(InterleaveOperands.begin(), - InterleaveOperands.end()); + SmallVector Operands(InterleaveOperands); Operands.append({Store->getArgOperand(1), Mask, EVL}); Builder.CreateIntrinsic(FixedMaskedVssegIntrIds[Factor - 2], {FVTy, XLenTy}, Operands); From a41bfb1ef6db37029e7184e41c62a45373175cf9 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Fri, 25 Apr 2025 11:56:30 -0700 Subject: [PATCH 07/12] fixup! Fix an error caused by the merge --- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 49a8959b7e5dd..06598d14e8db0 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -780,7 +780,8 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( return false; // Check mask operand. Handle both all-true/false and interleaved mask. Value *WideMask = VPLoad->getOperand(1); - Value *Mask = getMask(WideMask, Factor); + Value *Mask = getMask(WideMask, Factor, + cast(DeinterleaveValues[0]->getType())); if (!Mask) return false; @@ -831,7 +832,8 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( return false; Value *WideMask = VPStore->getOperand(2); - Value *Mask = getMask(WideMask, Factor); + Value *Mask = getMask(WideMask, Factor, + cast(InterleaveValues[0]->getType())); if (!Mask) return false; From e46d311a2d1dde60640d25d99214f802a683daf7 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Wed, 30 Apr 2025 14:08:27 -0700 Subject: [PATCH 08/12] fixup! 
Address review comments --- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 377 +++++++++++---------- 1 file changed, 190 insertions(+), 187 deletions(-) diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 06598d14e8db0..9273e42e90716 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -249,195 +249,9 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor, return false; } -// For an (de)interleave tree like this: -// -// A C B D -// |___| |___| -// |_____| -// | -// A B C D -// -// We will get ABCD at the end while the leaf operands/results -// are ACBD, which are also what we initially collected in -// getVectorInterleaveFactor / getVectorDeinterleaveFactor. But TLI -// hooks (e.g. lowerDeinterleaveIntrinsicToLoad) expect ABCD, so we need -// to reorder them by interleaving these values. -static void interleaveLeafValues(MutableArrayRef SubLeaves) { - unsigned NumLeaves = SubLeaves.size(); - if (NumLeaves == 2) - return; - - assert(isPowerOf2_32(NumLeaves) && NumLeaves > 1); - - const unsigned HalfLeaves = NumLeaves / 2; - // Visit the sub-trees. - interleaveLeafValues(SubLeaves.take_front(HalfLeaves)); - interleaveLeafValues(SubLeaves.drop_front(HalfLeaves)); - - SmallVector Buffer; - // a0 a1 a2 a3 b0 b1 b2 b3 - // -> a0 b0 a1 b1 a2 b2 a3 b3 - for (unsigned i = 0U; i < NumLeaves; ++i) - Buffer.push_back(SubLeaves[i / 2 + (i % 2 ? HalfLeaves : 0)]); - - llvm::copy(Buffer, SubLeaves.begin()); -} - -static bool -getVectorInterleaveFactor(IntrinsicInst *II, SmallVectorImpl &Operands, - SmallVectorImpl &DeadInsts) { - assert(II->getIntrinsicID() == Intrinsic::vector_interleave2); - - // Visit with BFS - SmallVector Queue; - Queue.push_back(II); - while (!Queue.empty()) { - IntrinsicInst *Current = Queue.front(); - Queue.erase(Queue.begin()); - - // All the intermediate intrinsics will be deleted. - DeadInsts.push_back(Current); - - for (unsigned I = 0; I < 2; ++I) { - Value *Op = Current->getOperand(I); - if (auto *OpII = dyn_cast(Op)) - if (OpII->getIntrinsicID() == Intrinsic::vector_interleave2) { - Queue.push_back(OpII); - continue; - } - - // If this is not a perfectly balanced tree, the leaf - // result types would be different. - if (!Operands.empty() && Op->getType() != Operands.back()->getType()) - return false; - - Operands.push_back(Op); - } - } - - const unsigned Factor = Operands.size(); - // Currently we only recognize power-of-two factors. - // FIXME: should we assert here instead? - if (Factor <= 1 || !isPowerOf2_32(Factor)) - return false; - - interleaveLeafValues(Operands); - return true; -} - -static bool -getVectorDeinterleaveFactor(IntrinsicInst *II, - SmallVectorImpl &Results, - SmallVectorImpl &DeadInsts) { - assert(II->getIntrinsicID() == Intrinsic::vector_deinterleave2); - using namespace PatternMatch; - if (!II->hasNUses(2)) - return false; - - // Visit with BFS - SmallVector Queue; - Queue.push_back(II); - while (!Queue.empty()) { - IntrinsicInst *Current = Queue.front(); - Queue.erase(Queue.begin()); - assert(Current->hasNUses(2)); - - // All the intermediate intrinsics will be deleted from the bottom-up. - DeadInsts.insert(DeadInsts.begin(), Current); - - ExtractValueInst *LHS = nullptr, *RHS = nullptr; - for (User *Usr : Current->users()) { - if (!isa(Usr)) - return 0; - - auto *EV = cast(Usr); - // Intermediate ExtractValue instructions will also be deleted. 
- DeadInsts.insert(DeadInsts.begin(), EV); - ArrayRef Indices = EV->getIndices(); - if (Indices.size() != 1) - return false; - - if (Indices[0] == 0 && !LHS) - LHS = EV; - else if (Indices[0] == 1 && !RHS) - RHS = EV; - else - return false; - } - - // We have legal indices. At this point we're either going - // to continue the traversal or push the leaf values into Results. - for (ExtractValueInst *EV : {LHS, RHS}) { - // Continue the traversal. We're playing safe here and matching only the - // expression consisting of a perfectly balanced binary tree in which all - // intermediate values are only used once. - if (EV->hasOneUse() && - match(EV->user_back(), - m_Intrinsic()) && - EV->user_back()->hasNUses(2)) { - auto *EVUsr = cast(EV->user_back()); - Queue.push_back(EVUsr); - continue; - } - - // If this is not a perfectly balanced tree, the leaf - // result types would be different. - if (!Results.empty() && EV->getType() != Results.back()->getType()) - return false; - - // Save the leaf value. - Results.push_back(EV); - } - } - - const unsigned Factor = Results.size(); - // Currently we only recognize power-of-two factors. - // FIXME: should we assert here instead? - if (Factor <= 1 || !isPowerOf2_32(Factor)) - return 0; - - interleaveLeafValues(Results); - return true; -} - -// Return the corresponded deinterleaved mask, or nullptr if there is no valid -// mask. static Value *getMask(Value *WideMask, unsigned Factor, - ElementCount LeafValueEC) { - using namespace llvm::PatternMatch; - if (auto *IMI = dyn_cast(WideMask)) { - SmallVector Operands; - SmallVector DeadInsts; - if (getVectorInterleaveFactor(IMI, Operands, DeadInsts)) { - assert(!Operands.empty()); - if (Operands.size() == Factor && llvm::all_equal(Operands)) - return Operands[0]; - } - } - - if (auto *ConstMask = dyn_cast(WideMask)) { - if (auto *Splat = ConstMask->getSplatValue()) { - // All-ones or all-zeros mask. - return ConstantVector::getSplat(LeafValueEC, Splat); - } else if (LeafValueEC.isFixed()) { - unsigned LeafMaskLen = LeafValueEC.getFixedValue(); - SmallVector LeafMask(LeafMaskLen, nullptr); - // If this is a fixed-length constant mask, each lane / leaf has to - // use the same mask. This is done by checking if every group with Factor - // number of elements in the interleaved mask has homogeneous values. - for (unsigned Idx = 0U, N = LeafMaskLen * Factor; Idx < N; ++Idx) { - Constant *Ref = ConstMask->getAggregateElement(alignDown(Idx, Factor)); - if (Ref != ConstMask->getAggregateElement(Idx)) - return nullptr; - LeafMask[Idx / Factor] = Ref; - } + ElementCount LeafValueEC); - return ConstantVector::get(LeafMask); - } - } - - return nullptr; -} static Value *getMask(Value *WideMask, unsigned Factor, VectorType *LeafValueTy) { return getMask(WideMask, Factor, LeafValueTy->getElementCount()); @@ -761,6 +575,195 @@ bool InterleavedAccessImpl::lowerInterleavedStore( return true; } +// For an (de)interleave tree like this: +// +// A C B D +// |___| |___| +// |_____| +// | +// A B C D +// +// We will get ABCD at the end while the leaf operands/results +// are ACBD, which are also what we initially collected in +// getVectorInterleaveFactor / getVectorDeinterleaveFactor. But TLI +// hooks (e.g. lowerDeinterleaveIntrinsicToLoad) expect ABCD, so we need +// to reorder them by interleaving these values. 
+static void interleaveLeafValues(MutableArrayRef SubLeaves) { + unsigned NumLeaves = SubLeaves.size(); + if (NumLeaves == 2) + return; + + assert(isPowerOf2_32(NumLeaves) && NumLeaves > 1); + + const unsigned HalfLeaves = NumLeaves / 2; + // Visit the sub-trees. + interleaveLeafValues(SubLeaves.take_front(HalfLeaves)); + interleaveLeafValues(SubLeaves.drop_front(HalfLeaves)); + + SmallVector Buffer; + // a0 a1 a2 a3 b0 b1 b2 b3 + // -> a0 b0 a1 b1 a2 b2 a3 b3 + for (unsigned i = 0U; i < NumLeaves; ++i) + Buffer.push_back(SubLeaves[i / 2 + (i % 2 ? HalfLeaves : 0)]); + + llvm::copy(Buffer, SubLeaves.begin()); +} + +static bool +getVectorInterleaveFactor(IntrinsicInst *II, SmallVectorImpl &Operands, + SmallVectorImpl &DeadInsts) { + assert(II->getIntrinsicID() == Intrinsic::vector_interleave2); + + // Visit with BFS + SmallVector Queue; + Queue.push_back(II); + while (!Queue.empty()) { + IntrinsicInst *Current = Queue.front(); + Queue.erase(Queue.begin()); + + // All the intermediate intrinsics will be deleted. + DeadInsts.push_back(Current); + + for (unsigned I = 0; I < 2; ++I) { + Value *Op = Current->getOperand(I); + if (auto *OpII = dyn_cast(Op)) + if (OpII->getIntrinsicID() == Intrinsic::vector_interleave2) { + Queue.push_back(OpII); + continue; + } + + // If this is not a perfectly balanced tree, the leaf + // result types would be different. + if (!Operands.empty() && Op->getType() != Operands.back()->getType()) + return false; + + Operands.push_back(Op); + } + } + + const unsigned Factor = Operands.size(); + // Currently we only recognize power-of-two factors. + // FIXME: should we assert here instead? + if (Factor <= 1 || !isPowerOf2_32(Factor)) + return false; + + interleaveLeafValues(Operands); + return true; +} + +static bool +getVectorDeinterleaveFactor(IntrinsicInst *II, + SmallVectorImpl &Results, + SmallVectorImpl &DeadInsts) { + assert(II->getIntrinsicID() == Intrinsic::vector_deinterleave2); + using namespace PatternMatch; + if (!II->hasNUses(2)) + return false; + + // Visit with BFS + SmallVector Queue; + Queue.push_back(II); + while (!Queue.empty()) { + IntrinsicInst *Current = Queue.front(); + Queue.erase(Queue.begin()); + assert(Current->hasNUses(2)); + + // All the intermediate intrinsics will be deleted from the bottom-up. + DeadInsts.insert(DeadInsts.begin(), Current); + + ExtractValueInst *LHS = nullptr, *RHS = nullptr; + for (User *Usr : Current->users()) { + if (!isa(Usr)) + return 0; + + auto *EV = cast(Usr); + // Intermediate ExtractValue instructions will also be deleted. + DeadInsts.insert(DeadInsts.begin(), EV); + ArrayRef Indices = EV->getIndices(); + if (Indices.size() != 1) + return false; + + if (Indices[0] == 0 && !LHS) + LHS = EV; + else if (Indices[0] == 1 && !RHS) + RHS = EV; + else + return false; + } + + // We have legal indices. At this point we're either going + // to continue the traversal or push the leaf values into Results. + for (ExtractValueInst *EV : {LHS, RHS}) { + // Continue the traversal. We're playing safe here and matching only the + // expression consisting of a perfectly balanced binary tree in which all + // intermediate values are only used once. + if (EV->hasOneUse() && + match(EV->user_back(), + m_Intrinsic()) && + EV->user_back()->hasNUses(2)) { + auto *EVUsr = cast(EV->user_back()); + Queue.push_back(EVUsr); + continue; + } + + // If this is not a perfectly balanced tree, the leaf + // result types would be different. 
+ if (!Results.empty() && EV->getType() != Results.back()->getType()) + return false; + + // Save the leaf value. + Results.push_back(EV); + } + } + + const unsigned Factor = Results.size(); + // Currently we only recognize power-of-two factors. + // FIXME: should we assert here instead? + if (Factor <= 1 || !isPowerOf2_32(Factor)) + return 0; + + interleaveLeafValues(Results); + return true; +} + +// Return the corresponded deinterleaved mask, or nullptr if there is no valid +// mask. +static Value *getMask(Value *WideMask, unsigned Factor, + ElementCount LeafValueEC) { + if (auto *IMI = dyn_cast(WideMask)) { + SmallVector Operands; + SmallVector DeadInsts; + if (getVectorInterleaveFactor(IMI, Operands, DeadInsts)) { + assert(!Operands.empty()); + if (Operands.size() == Factor && llvm::all_equal(Operands)) + return Operands[0]; + } + } + + if (auto *ConstMask = dyn_cast(WideMask)) { + if (auto *Splat = ConstMask->getSplatValue()) { + // All-ones or all-zeros mask. + return ConstantVector::getSplat(LeafValueEC, Splat); + } else if (LeafValueEC.isFixed()) { + unsigned LeafMaskLen = LeafValueEC.getFixedValue(); + SmallVector LeafMask(LeafMaskLen, nullptr); + // If this is a fixed-length constant mask, each lane / leaf has to + // use the same mask. This is done by checking if every group with Factor + // number of elements in the interleaved mask has homogeneous values. + for (unsigned Idx = 0U; Idx < LeafMaskLen * Factor; ++Idx) { + Constant *Ref = ConstMask->getAggregateElement(alignDown(Idx, Factor)); + if (Ref != ConstMask->getAggregateElement(Idx)) + return nullptr; + LeafMask[Idx / Factor] = Ref; + } + + return ConstantVector::get(LeafMask); + } + } + + return nullptr; +} + bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( IntrinsicInst *DI, SmallSetVector &DeadInsts) { Value *LoadedVal = DI->getOperand(0); From b3838516c45848e7ec26fb9c20aa0c0fe482fc71 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Wed, 30 Apr 2025 14:10:29 -0700 Subject: [PATCH 09/12] fixup! fixup! Address review comments --- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 9273e42e90716..7b77efbcf0774 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -249,6 +249,8 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor, return false; } +// Return the corresponded deinterleaved mask, or nullptr if there is no valid +// mask. static Value *getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC); @@ -726,8 +728,6 @@ getVectorDeinterleaveFactor(IntrinsicInst *II, return true; } -// Return the corresponded deinterleaved mask, or nullptr if there is no valid -// mask. static Value *getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC) { if (auto *IMI = dyn_cast(WideMask)) { From 6c03ffa91dae4c625f6d2bc41eefbd4afe738f68 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Thu, 1 May 2025 19:09:53 -0700 Subject: [PATCH 10/12] fixup! 
Address review comments --- llvm/include/llvm/CodeGen/TargetLowering.h | 7 +- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 79 ++++++++++----------- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 +- llvm/lib/Target/RISCV/RISCVISelLowering.h | 5 +- 4 files changed, 43 insertions(+), 50 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 3eee28741bf17..03099e9ad44dc 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3210,15 +3210,14 @@ class TargetLoweringBase { return false; } - /// Lower a deinterleaved load to target specific intrinsics. Return + /// Lower an interleaved load to target specific intrinsics. Return /// true on success. /// /// \p Load is a vp.load instruction. /// \p Mask is a mask value /// \p DeinterleaveRes is a list of deinterleaved results. - virtual bool - lowerDeinterleavedVPLoad(VPIntrinsic *Load, Value *Mask, - ArrayRef DeinterleaveRes) const { + virtual bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask, + ArrayRef DeinterleaveRes) const { return false; } diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 7b77efbcf0774..f70b8924ef21b 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -100,11 +100,11 @@ class InterleavedAccessImpl { unsigned MaxFactor = 0u; /// Transform an interleaved load into target specific intrinsics. - bool lowerInterleavedLoad(Instruction *LoadOp, + bool lowerInterleavedLoad(Instruction *Load, SmallSetVector &DeadInsts); /// Transform an interleaved store into target specific intrinsics. - bool lowerInterleavedStore(Instruction *StoreOp, + bool lowerInterleavedStore(Instruction *Store, SmallSetVector &DeadInsts); /// Transform a load and a deinterleave intrinsic into target specific @@ -260,14 +260,14 @@ static Value *getMask(Value *WideMask, unsigned Factor, } bool InterleavedAccessImpl::lowerInterleavedLoad( - Instruction *LoadOp, SmallSetVector &DeadInsts) { - if (isa(LoadOp->getType())) + Instruction *Load, SmallSetVector &DeadInsts) { + if (isa(Load->getType())) return false; - if (auto *LI = dyn_cast(LoadOp)) { + if (auto *LI = dyn_cast(Load)) { if (!LI->isSimple()) return false; - } else if (auto *VPLoad = dyn_cast(LoadOp)) { + } else if (auto *VPLoad = dyn_cast(Load)) { assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load); // Require a constant mask. if (!isa(VPLoad->getArgOperand(1))) @@ -287,7 +287,7 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( // binop are the same load. SmallSetVector BinOpShuffles; - for (auto *User : LoadOp->users()) { + for (auto *User : Load->users()) { auto *Extract = dyn_cast(User); if (Extract && isa(Extract->getIndexOperand())) { Extracts.push_back(Extract); @@ -316,7 +316,7 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( unsigned Factor, Index; unsigned NumLoadElements = - cast(LoadOp->getType())->getNumElements(); + cast(Load->getType())->getNumElements(); auto *FirstSVI = Shuffles.size() > 0 ? Shuffles[0] : BinOpShuffles[0]; // Check if the first shufflevector is DE-interleave shuffle. 
if (!isDeInterleaveMask(FirstSVI->getShuffleMask(), Factor, Index, MaxFactor, @@ -349,9 +349,9 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( assert(Shuffle->getShuffleMask().size() <= NumLoadElements); - if (cast(Shuffle->getOperand(0))->getOperand(0) == LoadOp) + if (cast(Shuffle->getOperand(0))->getOperand(0) == Load) Indices.push_back(Index); - if (cast(Shuffle->getOperand(0))->getOperand(1) == LoadOp) + if (cast(Shuffle->getOperand(0))->getOperand(1) == Load) Indices.push_back(Index); } @@ -361,32 +361,31 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( return false; bool BinOpShuffleChanged = - replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, LoadOp); + replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load); - if (auto *VPLoad = dyn_cast(LoadOp)) { + if (auto *VPLoad = dyn_cast(Load)) { Value *LaneMask = getMask(VPLoad->getArgOperand(1), Factor, cast(Shuffles[0]->getType())); if (!LaneMask) return false; - LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load: " << *LoadOp - << "\n"); + LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load: " << *Load << "\n"); // Sometimes the number of Shuffles might be less than Factor, we have to - // fill the gaps with null. Also, lowerDeinterleavedVPLoad + // fill the gaps with null. Also, lowerInterleavedVPLoad // expects them to be sorted. SmallVector ShuffleValues(Factor, nullptr); for (auto [Idx, ShuffleMaskIdx] : enumerate(Indices)) ShuffleValues[ShuffleMaskIdx] = Shuffles[Idx]; - if (!TLI->lowerDeinterleavedVPLoad(VPLoad, LaneMask, ShuffleValues)) + if (!TLI->lowerInterleavedVPLoad(VPLoad, LaneMask, ShuffleValues)) // If Extracts is not empty, tryReplaceExtracts made changes earlier. return !Extracts.empty() || BinOpShuffleChanged; } else { - LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LoadOp << "\n"); + LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n"); // Try to create target specific intrinsics to replace the load and // shuffles. - if (!TLI->lowerInterleavedLoad(cast(LoadOp), Shuffles, Indices, + if (!TLI->lowerInterleavedLoad(cast(Load), Shuffles, Indices, Factor)) // If Extracts is not empty, tryReplaceExtracts made changes earlier. 
return !Extracts.empty() || BinOpShuffleChanged; @@ -394,13 +393,13 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( DeadInsts.insert_range(Shuffles); - DeadInsts.insert(LoadOp); + DeadInsts.insert(Load); return true; } bool InterleavedAccessImpl::replaceBinOpShuffles( ArrayRef BinOpShuffles, - SmallVectorImpl &Shuffles, Instruction *LoadOp) { + SmallVectorImpl &Shuffles, Instruction *Load) { for (auto *SVI : BinOpShuffles) { BinaryOperator *BI = cast(SVI->getOperand(0)); Type *BIOp0Ty = BI->getOperand(0)->getType(); @@ -423,9 +422,9 @@ bool InterleavedAccessImpl::replaceBinOpShuffles( << "\n With : " << *NewSVI1 << "\n And : " << *NewSVI2 << "\n And : " << *NewBI << "\n"); RecursivelyDeleteTriviallyDeadInstructions(SVI); - if (NewSVI1->getOperand(0) == LoadOp) + if (NewSVI1->getOperand(0) == Load) Shuffles.push_back(NewSVI1); - if (NewSVI2->getOperand(0) == LoadOp) + if (NewSVI2->getOperand(0) == Load) Shuffles.push_back(NewSVI2); } @@ -497,13 +496,13 @@ bool InterleavedAccessImpl::tryReplaceExtracts( } bool InterleavedAccessImpl::lowerInterleavedStore( - Instruction *StoreOp, SmallSetVector &DeadInsts) { + Instruction *Store, SmallSetVector &DeadInsts) { Value *StoredValue; - if (auto *SI = dyn_cast(StoreOp)) { + if (auto *SI = dyn_cast(Store)) { if (!SI->isSimple()) return false; StoredValue = SI->getValueOperand(); - } else if (auto *VPStore = dyn_cast(StoreOp)) { + } else if (auto *VPStore = dyn_cast(Store)) { assert(VPStore->getIntrinsicID() == Intrinsic::vp_store); // Require a constant mask. if (!isa(VPStore->getArgOperand(2))) @@ -526,23 +525,19 @@ bool InterleavedAccessImpl::lowerInterleavedStore( assert(NumStoredElements % Factor == 0 && "number of stored element should be a multiple of Factor"); - // Check if the de-interleaved vp.store masks are the same. - if (auto *VPStore = dyn_cast(StoreOp)) { - } - - if (auto *VPStore = dyn_cast(StoreOp)) { + if (auto *VPStore = dyn_cast(Store)) { unsigned LaneMaskLen = NumStoredElements / Factor; Value *LaneMask = getMask(VPStore->getArgOperand(2), Factor, ElementCount::getFixed(LaneMaskLen)); if (!LaneMask) return false; - LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store: " << *StoreOp + LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store: " << *Store << "\n"); IRBuilder<> Builder(VPStore); // We need to effectively de-interleave the shufflemask - // because lowerInterleavedVPStore expected individual de-interleaved + // because lowerInterleavedVPStore expects individual de-interleaved // values. SmallVector NewShuffles; SmallVector NewShuffleMask(LaneMaskLen); @@ -562,17 +557,16 @@ bool InterleavedAccessImpl::lowerInterleavedStore( // We already created new shuffles. return true; } else { - LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *StoreOp - << "\n"); + LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *Store << "\n"); // Try to create target specific intrinsics to replace the store and // shuffle. - if (!TLI->lowerInterleavedStore(cast(StoreOp), SVI, Factor)) + if (!TLI->lowerInterleavedStore(cast(Store), SVI, Factor)) return false; } // Already have a new target specific interleaved store. Erase the old store. - DeadInsts.insert(StoreOp); + DeadInsts.insert(Store); DeadInsts.insert(SVI); return true; } @@ -741,20 +735,21 @@ static Value *getMask(Value *WideMask, unsigned Factor, } if (auto *ConstMask = dyn_cast(WideMask)) { - if (auto *Splat = ConstMask->getSplatValue()) { + if (auto *Splat = ConstMask->getSplatValue()) // All-ones or all-zeros mask. 
return ConstantVector::getSplat(LeafValueEC, Splat); - } else if (LeafValueEC.isFixed()) { + + if (LeafValueEC.isFixed()) { unsigned LeafMaskLen = LeafValueEC.getFixedValue(); SmallVector LeafMask(LeafMaskLen, nullptr); // If this is a fixed-length constant mask, each lane / leaf has to // use the same mask. This is done by checking if every group with Factor // number of elements in the interleaved mask has homogeneous values. for (unsigned Idx = 0U; Idx < LeafMaskLen * Factor; ++Idx) { - Constant *Ref = ConstMask->getAggregateElement(alignDown(Idx, Factor)); - if (Ref != ConstMask->getAggregateElement(Idx)) + Constant *C = ConstMask->getAggregateElement(Idx); + if (LeafMask[Idx / Factor] && LeafMask[Idx / Factor] != C) return nullptr; - LeafMask[Idx / Factor] = Ref; + LeafMask[Idx / Factor] = C; } return ConstantVector::get(LeafMask); @@ -793,7 +788,7 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( // Since lowerInterleaveLoad expects Shuffles and LoadInst, use special // TLI function to emit target-specific interleaved instruction. - if (!TLI->lowerDeinterleavedVPLoad(VPLoad, Mask, DeinterleaveValues)) + if (!TLI->lowerInterleavedVPLoad(VPLoad, Mask, DeinterleaveValues)) return false; } else { diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 7ea64c9b7ca7b..d7c7b54f559b1 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -23999,7 +23999,7 @@ static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) { /// TODO: We probably can loosen the dependency on matching extractvalue when /// dealing with factor of 2 (extractvalue is still required for most of other /// factors though). -bool RISCVTargetLowering::lowerDeinterleavedVPLoad( +bool RISCVTargetLowering::lowerInterleavedVPLoad( VPIntrinsic *Load, Value *Mask, ArrayRef DeinterleaveResults) const { assert(Mask && "Expect a valid mask"); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index a77c6f1208643..7806724b263b3 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -931,9 +931,8 @@ class RISCVTargetLowering : public TargetLowering { bool lowerInterleaveIntrinsicToStore( StoreInst *SI, ArrayRef InterleaveValues) const override; - bool - lowerDeinterleavedVPLoad(VPIntrinsic *Load, Value *Mask, - ArrayRef DeinterleaveRes) const override; + bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask, + ArrayRef DeinterleaveRes) const override; bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask, ArrayRef InterleaveOps) const override; From 92e8ad64dcc9e9033df2402460977d3432fc4212 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Wed, 7 May 2025 13:52:42 -0700 Subject: [PATCH 11/12] fixup! Address review comments --- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 10 +++++----- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index f70b8924ef21b..04d89d61cb6a9 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -270,7 +270,7 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( } else if (auto *VPLoad = dyn_cast(Load)) { assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load); // Require a constant mask. 
From 92e8ad64dcc9e9033df2402460977d3432fc4212 Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu
Date: Wed, 7 May 2025 13:52:42 -0700
Subject: [PATCH 11/12] fixup! Address review comments

---
 llvm/lib/CodeGen/InterleavedAccessPass.cpp  | 10 +++++-----
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index f70b8924ef21b..04d89d61cb6a9 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -270,7 +270,7 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
   } else if (auto *VPLoad = dyn_cast<VPIntrinsic>(Load)) {
     assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load);
     // Require a constant mask.
-    if (!isa<Constant>(VPLoad->getArgOperand(1)))
+    if (!isa<Constant>(VPLoad->getMaskParam()))
       return false;
   } else {
     llvm_unreachable("unsupported load operation");
@@ -364,8 +364,8 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
       replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load);
 
   if (auto *VPLoad = dyn_cast<VPIntrinsic>(Load)) {
-    Value *LaneMask = getMask(VPLoad->getArgOperand(1), Factor,
-                              cast<VectorType>(Shuffles[0]->getType()));
+    Value *LaneMask =
+        getMask(VPLoad->getMaskParam(), Factor, cast<VectorType>(VecTy));
     if (!LaneMask)
       return false;
 
@@ -505,7 +505,7 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
   } else if (auto *VPStore = dyn_cast<VPIntrinsic>(Store)) {
     assert(VPStore->getIntrinsicID() == Intrinsic::vp_store);
     // Require a constant mask.
-    if (!isa<Constant>(VPStore->getArgOperand(2)))
+    if (!isa<Constant>(VPStore->getMaskParam()))
       return false;
     StoredValue = VPStore->getArgOperand(0);
   } else {
@@ -527,7 +527,7 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
   if (auto *VPStore = dyn_cast<VPIntrinsic>(Store)) {
     unsigned LaneMaskLen = NumStoredElements / Factor;
-    Value *LaneMask = getMask(VPStore->getArgOperand(2), Factor,
+    Value *LaneMask = getMask(VPStore->getMaskParam(), Factor,
                               ElementCount::getFixed(LaneMaskLen));
     if (!LaneMask)
       return false;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index d7c7b54f559b1..0aa86ad8ebd58 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -24022,7 +24022,7 @@ bool RISCVTargetLowering::lowerInterleavedVPLoad(
 
   IRBuilder<> Builder(Load);
 
-  Value *WideEVL = Load->getArgOperand(2);
+  Value *WideEVL = Load->getVectorLengthParam();
   // Conservatively check if EVL is a multiple of factor, otherwise some
   // (trailing) elements might be lost after the transformation.
   if (!isMultipleOfN(WideEVL, Load->getDataLayout(), Factor))
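The fixup above moves to the VPIntrinsic accessors (getMaskParam, getVectorLengthParam) and keeps the conservative EVL handling noted in the RISC-V comment: the wide EVL counts interleaved elements, so the lowering only proceeds when it divides evenly by the factor, and each segment then uses the quotient. The following is a minimal standalone sketch of that check; segmentEVL is a hypothetical name, not the actual helper.

#include <cassert>
#include <cstdint>
#include <optional>

// The wide vp.load/vp.store EVL counts interleaved elements, so it is only
// safe to lower when it is a multiple of the factor; the segment access then
// operates on EVL / Factor elements per member.
static std::optional<uint64_t> segmentEVL(uint64_t WideEVL, unsigned Factor) {
  assert(Factor != 0 && "interleave factor must be non-zero");
  if (WideEVL % Factor != 0)
    return std::nullopt; // Trailing elements would be lost; keep the vp op.
  return WideEVL / Factor;
}

int main() {
  // Factor 4: EVL 32 lowers to a segment EVL of 8; EVL 30 is rejected.
  return (segmentEVL(32, 4) == 8 && !segmentEVL(30, 4)) ? 0 : 1;
}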
From 1c0a4a190b77ed498294a310160f53410a604a77 Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu
Date: Wed, 7 May 2025 14:27:07 -0700
Subject: [PATCH 12/12] fixup! Add more tests for factor of 7 and 8

---
 .../rvv/fixed-vectors-interleaved-access.ll | 75 ++++++++++++++++++-
 1 file changed, 71 insertions(+), 4 deletions(-)

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index c0a6456f01075..8ac4c7447c7d4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -1431,6 +1431,40 @@ define void @store_factor6(ptr %ptr, <2 x i16> %v0, <2 x i16> %v1, <2 x i16> %v2
   ret void
 }
 
+define void @store_factor7(ptr %ptr, <2 x i16> %v0, <2 x i16> %v1, <2 x i16> %v2, <2 x i16> %v3, <2 x i16> %v4, <2 x i16> %v5, <2 x i16> %v6) {
+; CHECK-LABEL: store_factor7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vsseg7e16.v v8, (a0)
+; CHECK-NEXT:    ret
+  %s0 = shufflevector <2 x i16> %v0, <2 x i16> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s1 = shufflevector <2 x i16> %v2, <2 x i16> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s2 = shufflevector <2 x i16> %v4, <2 x i16> %v5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s3 = shufflevector <4 x i16> %s0, <4 x i16> %s1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s4 = shufflevector <2 x i16> %v6, <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s5 = shufflevector <4 x i16> %s2, <4 x i16> %s4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %interleaved.vec = shufflevector <8 x i16> %s3, <8 x i16> %s5, <14 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13>
+  store <14 x i16> %interleaved.vec, ptr %ptr
+  ret void
+}
+
+define void @store_factor8(ptr %ptr, <2 x i16> %v0, <2 x i16> %v1, <2 x i16> %v2, <2 x i16> %v3, <2 x i16> %v4, <2 x i16> %v5, <2 x i16> %v6, <2 x i16> %v7) {
+; CHECK-LABEL: store_factor8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vsseg8e16.v v8, (a0)
+; CHECK-NEXT:    ret
+  %s0 = shufflevector <2 x i16> %v0, <2 x i16> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s1 = shufflevector <2 x i16> %v2, <2 x i16> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s2 = shufflevector <2 x i16> %v4, <2 x i16> %v5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s3 = shufflevector <4 x i16> %s0, <4 x i16> %s1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s4 = shufflevector <2 x i16> %v6, <2 x i16> %v7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s5 = shufflevector <4 x i16> %s2, <4 x i16> %s4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %interleaved.vec = shufflevector <8 x i16> %s3, <8 x i16> %s5, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  store <16 x i16> %interleaved.vec, ptr %ptr
+  ret void
+}
+
 define void @vpstore_factor2(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: vpstore_factor2:
 ; CHECK:       # %bb.0:
@@ -1512,6 +1546,39 @@ define void @vpstore_factor6(ptr %ptr, <2 x i16> %v0, <2 x i16> %v1, <2 x i16> %
   ret void
 }
 
+define void @vpstore_factor7(ptr %ptr, <2 x i16> %v0, <2 x i16> %v1, <2 x i16> %v2, <2 x i16> %v3, <2 x i16> %v4, <2 x i16> %v5, <2 x i16> %v6) {
+; CHECK-LABEL: vpstore_factor7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vsseg7e16.v v8, (a0)
+; CHECK-NEXT:    ret
+  %s0 = shufflevector <2 x i16> %v0, <2 x i16> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s1 = shufflevector <2 x i16> %v2, <2 x i16> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s2 = shufflevector <2 x i16> %v4, <2 x i16> %v5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s3 = shufflevector <4 x i16> %s0, <4 x i16> %s1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s4 = shufflevector <2 x i16> %v6, <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s5 = shufflevector <4 x i16> %s2, <4 x i16> %s4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %interleaved.vec = shufflevector <8 x i16> %s3, <8 x i16> %s5, <14 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13>
+  tail call void @llvm.vp.store.v14i16.p0(<14 x i16> %interleaved.vec, ptr %ptr, <14 x i1> splat (i1 true), i32 14)
+  ret void
+}
+
+define void @vpstore_factor8(ptr %ptr, <2 x i16> %v0, <2 x i16> %v1, <2 x i16> %v2, <2 x i16> %v3, <2 x i16> %v4, <2 x i16> %v5, <2 x i16> %v6, <2 x i16> %v7) {
+; CHECK-LABEL: vpstore_factor8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vsseg8e16.v v8, (a0)
+; CHECK-NEXT:    ret
+  %s0 = shufflevector <2 x i16> %v0, <2 x i16> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s1 = shufflevector <2 x i16> %v2, <2 x i16> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s2 = shufflevector <2 x i16> %v4, <2 x i16> %v5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s3 = shufflevector <4 x i16> %s0, <4 x i16> %s1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s4 = shufflevector <2 x i16> %v6, <2 x i16> %v7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s5 = shufflevector <4 x i16> %s2, <4 x i16> %s4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %interleaved.vec = shufflevector <8 x i16> %s3, <8 x i16> %s5, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  tail call void @llvm.vp.store.v16i16.p0(<16 x i16> %interleaved.vec, ptr %ptr, <16 x i1> splat (i1 true), i32 16)
+  ret void
+}
 
 define <4 x i32> @load_factor2_one_active(ptr %ptr) {
 ; CHECK-LABEL: load_factor2_one_active:
@@ -1703,8 +1770,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_mask(ptr %ptr) {
 ; RV32-NEXT:    vle32.v v12, (a0), v0.t
 ; RV32-NEXT:    li a0, 36
 ; RV32-NEXT:    vmv.s.x v20, a1
-; RV32-NEXT:    lui a1, %hi(.LCPI45_0)
-; RV32-NEXT:    addi a1, a1, %lo(.LCPI45_0)
+; RV32-NEXT:    lui a1, %hi(.LCPI49_0)
+; RV32-NEXT:    addi a1, a1, %lo(.LCPI49_0)
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vle16.v v21, (a1)
 ; RV32-NEXT:    vcompress.vm v8, v12, v11
@@ -1779,8 +1846,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_evl(ptr %ptr) {
 ; RV32-NEXT:    vmv.s.x v10, a0
 ; RV32-NEXT:    li a0, 146
 ; RV32-NEXT:    vmv.s.x v11, a0
-; RV32-NEXT:    lui a0, %hi(.LCPI46_0)
-; RV32-NEXT:    addi a0, a0, %lo(.LCPI46_0)
+; RV32-NEXT:    lui a0, %hi(.LCPI50_0)
+; RV32-NEXT:    addi a0, a0, %lo(.LCPI50_0)
 ; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT:    vle16.v v20, (a0)
 ; RV32-NEXT:    li a0, 36