
Conversation

@ram-NK (Contributor) commented Oct 17, 2025

  • [AArch64]: Interleaved access store can handle more elements than target supported maximum interleaved factor with shuffles.

Motivation:
Given the following pack_LUT() function,

#include <cstdint>

#define lowbit(x) ((x) & (-(x)))

// M is the number of 4-byte sub-queries; it is assumed to be a compile-time
// constant defined elsewhere (the discussion below uses the M == 16 case).
inline void pack_LUT(uint8_t *byte_query, uint8_t *LUT) {
  constexpr uint32_t pos[16] = {
      3 /*0000*/, 3 /*0001*/, 2 /*0010*/, 3 /*0011*/,
      1 /*0100*/, 3 /*0101*/, 2 /*0110*/, 3 /*0111*/,
      0 /*1000*/, 3 /*1001*/, 2 /*1010*/, 3 /*1011*/,
      1 /*1100*/, 3 /*1101*/, 2 /*1110*/, 3 /*1111*/,
  };
  for (int i = 0; i < M; i++) {
    LUT[0] = 0;
    for (int j = 1; j < 16; j++) {
      LUT[j] = LUT[j - lowbit(j)] + byte_query[pos[j]];
    }
    LUT        += 16;
    byte_query += 4;
  }
}

The IR just before loop vectorization is shown below. The inner loop has been fully unrolled, and the 16 consecutive output bytes are written with 16 separate store instructions.

for.body:                                         ; preds = %entry, %for.body
  %i.033 = phi i32 [ 0, %entry ], [ %inc17, %for.body ]
  %out.addr.032 = phi ptr [ %out, %entry ], [ %add.ptr, %for.body ]
  %in.addr.031 = phi ptr [ %in, %entry ], [ %add.ptr15, %for.body ]
  store i8 0, ptr %out.addr.032, align 1
  %arrayidx10 = getelementptr inbounds nuw i8, ptr %in.addr.031, i64 3
  %0 = load i8, ptr %arrayidx10, align 1
  %arrayidx14 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 1
  store i8 %0, ptr %arrayidx14, align 1
  %arrayidx10.1 = getelementptr inbounds nuw i8, ptr %in.addr.031, i64 2
  %1 = load i8, ptr %arrayidx10.1, align 1
  %arrayidx14.1 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 2
  store i8 %1, ptr %arrayidx14.1, align 1
  %add.2 = add i8 %0, %1
  %arrayidx14.2 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 3
  store i8 %add.2, ptr %arrayidx14.2, align 1
  %arrayidx10.3 = getelementptr inbounds nuw i8, ptr %in.addr.031, i64 1
  %2 = load i8, ptr %arrayidx10.3, align 1
  %arrayidx14.3 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 4
  store i8 %2, ptr %arrayidx14.3, align 1
  %add.4 = add i8 %0, %2
  %arrayidx14.4 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 5
  store i8 %add.4, ptr %arrayidx14.4, align 1
  %add.5 = add i8 %1, %2
  %arrayidx14.5 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 6
  store i8 %add.5, ptr %arrayidx14.5, align 1
  %add.6 = add i8 %0, %add.5
  %arrayidx14.6 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 7
  store i8 %add.6, ptr %arrayidx14.6, align 1
  %3 = load i8, ptr %in.addr.031, align 1
  %arrayidx14.7 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 8
  store i8 %3, ptr %arrayidx14.7, align 1
  %add.8 = add i8 %0, %3
  %arrayidx14.8 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 9
  store i8 %add.8, ptr %arrayidx14.8, align 1
  %add.9 = add i8 %1, %3
  %arrayidx14.9 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 10
  store i8 %add.9, ptr %arrayidx14.9, align 1
  %add.10 = add i8 %0, %add.9
  %arrayidx14.10 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 11
  store i8 %add.10, ptr %arrayidx14.10, align 1
  %add.11 = add i8 %2, %3
  %arrayidx14.11 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 12
  store i8 %add.11, ptr %arrayidx14.11, align 1
  %add.12 = add i8 %0, %add.11
  %arrayidx14.12 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 13
  store i8 %add.12, ptr %arrayidx14.12, align 1
  %add.13 = add i8 %1, %add.11
  %arrayidx14.13 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 14
  store i8 %add.13, ptr %arrayidx14.13, align 1
  %add.14 = add i8 %0, %add.13
  %arrayidx14.14 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 15
  store i8 %add.14, ptr %arrayidx14.14, align 1
  %add.ptr = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 16
  %add.ptr15 = getelementptr inbounds nuw i8, ptr %in.addr.031, i64 4
  %inc17 = add nuw nsw i32 %i.033, 1
  %exitcond.not = icmp eq i32 %inc17, 32
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0

If these 16 stores can be interleaved, the outer loop can be vectorized, which improves the performance of the loop by more than 2%.
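
To make the pattern concrete: once the outer loop is vectorized (VF = 16 in the assembly shown below), the 16 strided stores become a single wide store fed by one re-interleave shufflevector, which is the shape isReInterleaveMask() in the InterleavedAccess pass has to recognize. Below is a reduced sketch of that shape, using factor 8 over <4 x i32> lanes instead of factor 16 over i8 so the mask stays readable; it mirrors the store_factor8 test added by this patch, and the function name is only illustrative.

define void @reinterleave_sketch(ptr %ptr, <16 x i32> %s0, <16 x i32> %s1) {
  ; %s0 holds fields v0..v3 and %s1 holds fields v4..v7, four lanes each;
  ; the mask places lane i of field f at result position i*8 + f.
  %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1,
      <32 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28,
                  i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29,
                  i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30,
                  i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  store <32 x i32> %interleaved.vec, ptr %ptr, align 4
  ret void
}

Before this patch, a factor above getMaxSupportedInterleaveFactor() (4 on AArch64) made the pass give up on such a store; the change lets it accept factors of 8 and 16 and lower them with extra shuffles.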

Assembly of the loop after it has been vectorized and interleaved by a factor of 16:

.LBB0_5:                                // =>This Inner Loop Header: Depth=1
        add     x11, x8, x9
        add     x9, x9, #64
        ld4     { v3.16b, v4.16b, v5.16b, v6.16b }, [x11]
        mov     x11, x10
        cmp     x9, #128
        add     v7.16b, v3.16b, v4.16b
        add     v1.16b, v4.16b, v5.16b
        add     v16.16b, v3.16b, v5.16b
        add     v17.16b, v4.16b, v6.16b
        add     v18.16b, v3.16b, v6.16b
        add     v20.16b, v5.16b, v6.16b
        zip1    v21.16b, v0.16b, v3.16b
        zip2    v3.16b, v0.16b, v3.16b
        add     v2.16b, v7.16b, v5.16b
        add     v19.16b, v7.16b, v6.16b
        add     v22.16b, v1.16b, v6.16b
        add     v23.16b, v16.16b, v6.16b
        zip1    v24.16b, v4.16b, v7.16b
        zip1    v26.16b, v6.16b, v18.16b
        zip1    v28.16b, v5.16b, v16.16b
        zip2    v7.16b, v4.16b, v7.16b
        zip2    v18.16b, v6.16b, v18.16b
        add     v25.16b, v2.16b, v6.16b
        zip1    v27.16b, v17.16b, v19.16b
        zip1    v29.16b, v1.16b, v2.16b
        zip1    v30.16b, v20.16b, v23.16b
        zip2    v16.16b, v5.16b, v16.16b
        zip2    v5.16b, v17.16b, v19.16b
        zip1    v8.16b, v21.16b, v24.16b
        zip2    v1.16b, v1.16b, v2.16b
        zip2    v4.16b, v20.16b, v23.16b
        zip1    v31.16b, v22.16b, v25.16b
        zip2    v12.16b, v21.16b, v24.16b
        zip2    v2.16b, v22.16b, v25.16b
        zip1    v9.16b, v26.16b, v27.16b
        zip2    v13.16b, v26.16b, v27.16b
        zip1    v19.16b, v3.16b, v7.16b
        zip1    v10.16b, v28.16b, v29.16b
        zip2    v14.16b, v28.16b, v29.16b
        zip1    v20.16b, v18.16b, v5.16b
        zip2    v23.16b, v3.16b, v7.16b
        zip1    v21.16b, v16.16b, v1.16b
        zip1    v11.16b, v30.16b, v31.16b
        zip2    v15.16b, v30.16b, v31.16b
        zip2    v24.16b, v18.16b, v5.16b
        zip1    v22.16b, v4.16b, v2.16b
        zip2    v25.16b, v16.16b, v1.16b
        zip2    v26.16b, v4.16b, v2.16b
        st4     { v8.16b, v9.16b, v10.16b, v11.16b }, [x11], #64
        st4     { v12.16b, v13.16b, v14.16b, v15.16b }, [x11]
        add     x11, x10, #128
        st4     { v19.16b, v20.16b, v21.16b, v22.16b }, [x11]
        add     x11, x10, #192
        add     x10, x10, #256
        st4     { v23.16b, v24.16b, v25.16b, v26.16b }, [x11]
        b.ne    .LBB0_5

For the M == 16 case:
  • The non-vectorized loop needs 16 x 16 = 256 scalar stores.
  • The vectorized loop needs 16 zip1 + 16 zip2 + 4 st4 instructions.
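
For reference, here is a sketch of the shuffle-based lowering itself, again reduced to an assumed factor-8 case over <4 x i32> lanes (matching the store_factor8 test in the patch; the value names and exact intrinsic mangling are illustrative). One level of zip1/zip2 shuffles regroups the eight fields into two st4-sized groups; the factor-16 case simply applies a second zip level and ends with four st4 stores.

define void @zip_lowering_sketch(ptr %ptr,
                                 <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3,
                                 <4 x i32> %v4, <4 x i32> %v5, <4 x i32> %v6, <4 x i32> %v7) {
  ; zip1 (lower-half interleave mask) of each field with its partner Factor/2 away.
  %lo0 = shufflevector <4 x i32> %v0, <4 x i32> %v4, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %lo1 = shufflevector <4 x i32> %v1, <4 x i32> %v5, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %lo2 = shufflevector <4 x i32> %v2, <4 x i32> %v6, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %lo3 = shufflevector <4 x i32> %v3, <4 x i32> %v7, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ; zip2 (upper-half interleave mask) of the same pairs.
  %hi0 = shufflevector <4 x i32> %v0, <4 x i32> %v4, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %hi1 = shufflevector <4 x i32> %v1, <4 x i32> %v5, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %hi2 = shufflevector <4 x i32> %v2, <4 x i32> %v6, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %hi3 = shufflevector <4 x i32> %v3, <4 x i32> %v7, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  ; First st4 writes interleaved output elements 0..15, the second the next 16.
  call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> %lo0, <4 x i32> %lo1, <4 x i32> %lo2, <4 x i32> %lo3, ptr %ptr)
  %ptr.hi = getelementptr inbounds i32, ptr %ptr, i64 16
  call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> %hi0, <4 x i32> %hi1, <4 x i32> %hi2, <4 x i32> %hi3, ptr %ptr.hi)
  ret void
}

declare void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, ptr)

The zip1 mask <0, 4, 1, 5> and zip2 mask <2, 6, 3, 7> are the lower and upper halves of createInterleaveMask(4, 2), which is exactly how lowerInterleavedStoreWithShuffle() in the patch builds them.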

@llvmbot (Member) commented Oct 17, 2025

@llvm/pr-subscribers-vectorizers
@llvm/pr-subscribers-llvm-transforms

@llvm/pr-subscribers-backend-aarch64

Author: Ramkrishnan (ram-NK)

Changes
  • [AArch64]: Interleaved access store can handle more elements than target supported maximum interleaved factor with shuffles.

Patch is 34.24 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/164000.diff

9 Files Affected:

  • (modified) llvm/include/llvm/CodeGen/TargetLowering.h (+5)
  • (modified) llvm/lib/CodeGen/InterleavedAccessPass.cpp (+11-2)
  • (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+138-2)
  • (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.h (+7)
  • (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (+32-5)
  • (modified) llvm/test/CodeGen/AArch64/vldn_shuffle.ll (+103)
  • (added) llvm/test/Transforms/LoopVectorize/AArch64/interleaved_store.ll (+117)
  • (modified) llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll (+1-1)
  • (modified) llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll (+8-8)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 73f2c55a71125..86956d1c64451 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3206,6 +3206,11 @@ class LLVM_ABI TargetLoweringBase {
   /// Default to be the minimum interleave factor: 2.
   virtual unsigned getMaxSupportedInterleaveFactor() const { return 2; }
 
+  /// Return true if the target can interleave data with shuffles.
+  virtual bool isProfitableToInterleaveWithGatherScatter() const {
+    return false;
+  }
+
   /// Lower an interleaved load to target specific intrinsics. Return
   /// true on success.
   ///
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index a6a9b5058ad94..c7d44c01f99f3 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -239,7 +239,8 @@ static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
 /// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...>
 /// E.g. For a Factor of 2 (LaneLen=4): <0, 4, 1, 5, 2, 6, 3, 7>
 static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
-                               unsigned MaxFactor) {
+                               unsigned MaxFactor,
+                               bool InterleaveWithShuffles) {
   unsigned NumElts = SVI->getShuffleMask().size();
   if (NumElts < 4)
     return false;
@@ -250,6 +251,13 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
       return true;
   }
 
+  if (InterleaveWithShuffles) {
+    for (unsigned i = 1; MaxFactor * i <= 16; i *= 2) {
+      Factor = i * MaxFactor;
+      if (SVI->isInterleave(Factor))
+        return true;
+    }
+  }
   return false;
 }
 
@@ -530,7 +538,8 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
       cast<FixedVectorType>(SVI->getType())->getNumElements();
   // Check if the shufflevector is RE-interleave shuffle.
   unsigned Factor;
-  if (!isReInterleaveMask(SVI, Factor, MaxFactor))
+  if (!isReInterleaveMask(SVI, Factor, MaxFactor,
+                          TLI->isProfitableToInterleaveWithGatherScatter()))
     return false;
   assert(NumStoredElements % Factor == 0 &&
          "number of stored element should be a multiple of Factor");
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 662d84b7a60a8..f26eef3ab61e6 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18023,11 +18023,17 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
                                                   unsigned Factor,
                                                   const APInt &GapMask) const {
 
-  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
-         "Invalid interleave factor");
   auto *SI = dyn_cast<StoreInst>(Store);
   if (!SI)
     return false;
+
+  if (isProfitableToInterleaveWithGatherScatter() &&
+      Factor > getMaxSupportedInterleaveFactor())
+    return lowerInterleavedStoreWithShuffle(SI, SVI, Factor);
+
+  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
+         "Invalid interleave factor");
+
   assert(!LaneMask && GapMask.popcount() == Factor &&
          "Unexpected mask on store");
 
@@ -18173,6 +18179,136 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
   return true;
 }
 
+/// If the number of interleaved vector elements is greater than the supported
+/// MaxFactor, the data can still be interleaved by inserting additional
+/// shuffles.
+/// Below shows how 8 interleaved vectors are shuffled so that they can be
+/// stored with stN instructions. The data must be stored in the order
+/// v0,v1,v2,v3,v4,v5,v6,v7.
+///      v0      v4      v2      v6      v1      v5      v3      v7
+///      |       |       |       |       |       |       |       |
+///       \     /         \     /         \     /         \     /
+///     [zip v0,v4]      [zip v2,v6]    [zip v1,v5]      [zip v3,v7]==> stN = 4
+///          |               |              |                 |
+///           \             /                \               /
+///            \           /                  \             /
+///             \         /                    \           /
+///         [zip [v0,v2,v4,v6]]            [zip [v1,v3,v5,v7]]     ==> stN = 2
+///
+/// At the stN = 4 level, the upper half of the interleaved data (V0,V1,V2,V3)
+/// is stored with one st4 instruction and the lower half (V4,V5,V6,V7) with
+/// another st4.
+///
+/// At the stN = 2 level, the first pair of interleaved data (V0,V1) is stored
+/// with one st2 instruction and the second pair (V2,V3) with another st2; a
+/// total of 4 st2 instructions are required.
+bool AArch64TargetLowering::lowerInterleavedStoreWithShuffle(
+    StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const {
+  unsigned MaxSupportedFactor = getMaxSupportedInterleaveFactor();
+
+  auto *VecTy = cast<FixedVectorType>(SVI->getType());
+  assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
+
+  unsigned LaneLen = VecTy->getNumElements() / Factor;
+  Type *EltTy = VecTy->getElementType();
+  auto *SubVecTy = FixedVectorType::get(EltTy, Factor);
+
+  const DataLayout &DL = SI->getModule()->getDataLayout();
+  bool UseScalable;
+
+  // Skip if we do not have NEON and skip illegal vector types. We can
+  // "legalize" wide vector types into multiple interleaved accesses as long as
+  // the vector types are divisible by 128.
+  if (!Subtarget->hasNEON() ||
+      !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
+    return false;
+
+  if (UseScalable)
+    return false;
+
+  SmallVector<Value *, 8> Shufflelist;
+  Shufflelist.push_back(SVI);
+  unsigned ConcatLevel = Factor;
+  while (ConcatLevel > 1) {
+    SmallVector<Value *, 8> ShufflelistIntermediate;
+    ShufflelistIntermediate = Shufflelist;
+    Shufflelist.clear();
+    while (!ShufflelistIntermediate.empty()) {
+      ShuffleVectorInst *SFL =
+          dyn_cast<ShuffleVectorInst>(ShufflelistIntermediate[0]);
+      if (!SFL)
+        break;
+      ShufflelistIntermediate.erase(ShufflelistIntermediate.begin());
+
+      Value *Op0 = SFL->getOperand(0);
+      Value *Op1 = SFL->getOperand(1);
+
+      Shufflelist.push_back(dyn_cast<Value>(Op0));
+      Shufflelist.push_back(dyn_cast<Value>(Op1));
+    }
+    if (!ShufflelistIntermediate.empty()) {
+      Shufflelist = ShufflelistIntermediate;
+      break;
+    }
+    ConcatLevel = ConcatLevel >> 1;
+  }
+
+  if (Shufflelist.size() != Factor)
+    return false;
+
+  IRBuilder<> Builder(SI);
+  auto Mask = createInterleaveMask(LaneLen, 2);
+  SmallVector<int, 16> UpperHalfMask, LowerHalfMask;
+  for (unsigned i = 0; i < (2 * LaneLen); i++)
+    if (i < LaneLen)
+      LowerHalfMask.push_back(Mask[i]);
+    else
+      UpperHalfMask.push_back(Mask[i]);
+
+  unsigned InterleaveFactor = Factor >> 1;
+  while (InterleaveFactor >= MaxSupportedFactor) {
+    SmallVector<Value *, 8> ShufflelistIntermediate;
+    for (unsigned j = 0; j < Factor; j += (InterleaveFactor * 2)) {
+      for (unsigned i = 0; i < InterleaveFactor; i++) {
+        auto *Shuffle = Builder.CreateShuffleVector(
+            Shufflelist[i + j], Shufflelist[i + j + InterleaveFactor],
+            LowerHalfMask);
+        ShufflelistIntermediate.push_back(Shuffle);
+      }
+      for (unsigned i = 0; i < InterleaveFactor; i++) {
+        auto *Shuffle = Builder.CreateShuffleVector(
+            Shufflelist[i + j], Shufflelist[i + j + InterleaveFactor],
+            UpperHalfMask);
+        ShufflelistIntermediate.push_back(Shuffle);
+      }
+    }
+
+    Shufflelist = ShufflelistIntermediate;
+    InterleaveFactor = InterleaveFactor >> 1;
+  }
+
+  Type *PtrTy = SI->getPointerOperandType();
+  auto *STVTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
+
+  Value *BaseAddr = SI->getPointerOperand();
+  Function *StNFunc = getStructuredStoreFunction(
+      SI->getModule(), MaxSupportedFactor, UseScalable, STVTy, PtrTy);
+  for (unsigned i = 0; i < (Factor / MaxSupportedFactor); i++) {
+    SmallVector<Value *, 5> Ops;
+    for (unsigned j = 0; j < MaxSupportedFactor; j++)
+      Ops.push_back(Shufflelist[i * MaxSupportedFactor + j]);
+
+    if (i > 0) {
+      // We will compute the pointer operand of each store from the original
+      // base address using GEPs. Cast the base address to a pointer to the
+      // scalar  element type.
+      BaseAddr = Builder.CreateConstGEP1_32(
+          SubVecTy->getElementType(), BaseAddr, LaneLen * MaxSupportedFactor);
+    }
+    Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
+    Builder.CreateCall(StNFunc, Ops);
+  }
+  return true;
+}
+
 bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
     Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
   const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 9495c9ffc47aa..867e01664eaae 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -229,6 +229,10 @@ class AArch64TargetLowering : public TargetLowering {
 
   bool hasPairedLoad(EVT LoadedType, Align &RequiredAlignment) const override;
 
+  bool isProfitableToInterleaveWithGatherScatter() const override {
+    return true;
+  }
+
   unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
 
   bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
@@ -239,6 +243,9 @@ class AArch64TargetLowering : public TargetLowering {
                              ShuffleVectorInst *SVI, unsigned Factor,
                              const APInt &GapMask) const override;
 
+  bool lowerInterleavedStoreWithShuffle(StoreInst *SI, ShuffleVectorInst *SVI,
+                                        unsigned Factor) const;
+
   bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
                                         IntrinsicInst *DI) const override;
 
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 479e34515fc8a..25055598a58f5 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4801,11 +4801,35 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
   if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
     return InstructionCost::getInvalid();
 
-  if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
+  unsigned NumLoadStores = 1;
+  InstructionCost ShuffleCost = 0;
+  bool isInterleaveWithShuffle = false;
+  unsigned MaxSupportedFactor = TLI->getMaxSupportedInterleaveFactor();
+
+  auto *SubVecTy =
+      VectorType::get(VecVTy->getElementType(),
+                      VecVTy->getElementCount().divideCoefficientBy(Factor));
+
+  if (TLI->isProfitableToInterleaveWithGatherScatter() &&
+      Opcode == Instruction::Store && (0 == Factor % MaxSupportedFactor) &&
+      Factor > MaxSupportedFactor) {
+    isInterleaveWithShuffle = true;
+    SmallVector<int, 16> Mask;
+    // preparing interleave Mask.
+    for (unsigned i = 0; i < VecVTy->getElementCount().getKnownMinValue() / 2;
+         i++)
+      for (unsigned j = 0; j < 2; j++)
+        Mask.push_back(j * Factor + i);
+
+    NumLoadStores = Factor / MaxSupportedFactor;
+    ShuffleCost =
+        (Factor * getShuffleCost(TargetTransformInfo::SK_Splice, VecVTy, VecVTy,
+                                 Mask, CostKind, 0, SubVecTy));
+  }
+
+  if (!UseMaskForGaps &&
+      (Factor <= MaxSupportedFactor || isInterleaveWithShuffle)) {
     unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
-    auto *SubVecTy =
-        VectorType::get(VecVTy->getElementType(),
-                        VecVTy->getElementCount().divideCoefficientBy(Factor));
 
     // ldN/stN only support legal vector types of size 64 or 128 in bits.
     // Accesses having vector types that are a multiple of 128 bits can be
@@ -4813,7 +4837,10 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
     bool UseScalable;
     if (MinElts % Factor == 0 &&
         TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
-      return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
+      return (Factor *
+              TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable) *
+              NumLoadStores) +
+             ShuffleCost;
   }
 
   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
diff --git a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
index 3685e9cf85bd6..6d0a0300e0a91 100644
--- a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
@@ -730,6 +730,109 @@ entry:
   ret void
 }
 
+define void @store_factor8(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3,
+                                     <4 x i32> %a4, <4 x i32> %a5, <4 x i32> %a6, <4 x i32> %a7) {
+; CHECK-LABEL: store_factor8:
+; CHECK:       .Lfunc_begin17:
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:  // %bb.0:
+; CHECK:  zip1	[[V1:.*s]], [[I1:.*s]], [[I5:.*s]]
+; CHECK-NEXT:  zip2	[[V5:.*s]], [[I1]], [[I5]]
+; CHECK-NEXT:  zip1	[[V2:.*s]], [[I2:.*s]], [[I6:.*s]]
+; CHECK-NEXT:  zip2 [[V6:.*s]], [[I2]], [[I6]]
+; CHECK-NEXT:  zip1	[[V3:.*s]], [[I3:.*s]], [[I7:.*s]]
+; CHECK-NEXT:  zip2	[[V7:.*s]], [[I3]], [[I7]]
+; CHECK-NEXT:  zip1	[[V4:.*s]], [[I4:.*s]], [[I8:.*s]]
+; CHECK-NEXT:  zip2	[[V8:.*s]], [[I4]], [[I8]]
+; CHECK-NEXT:  st4 { [[V1]], [[V2]], [[V3]], [[V4]] }, [x0], #64
+; CHECK-NEXT:  st4 { [[V5]], [[V6]], [[V7]], [[V8]] }, [x0]
+; CHECK-NEXT:  ret
+
+  %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v1 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v2 = shufflevector <4 x i32> %a4, <4 x i32> %a5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v3 = shufflevector <4 x i32> %a6, <4 x i32> %a7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+  %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+  %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+  store <32 x i32> %interleaved.vec, ptr %ptr, align 4
+  ret void
+}
+
+define void @store_factor16(ptr %ptr, <4 x i32> %a0,  <4 x i32> %a1,  <4 x i32> %a2,  <4 x i32> %a3,
+                                      <4 x i32> %a4,  <4 x i32> %a5,  <4 x i32> %a6,  <4 x i32> %a7,
+                                      <4 x i32> %a8,  <4 x i32> %a9,  <4 x i32> %a10, <4 x i32> %a11,
+                                      <4 x i32> %a12, <4 x i32> %a13, <4 x i32> %a14, <4 x i32> %a15) {
+; CHECK-LABEL: store_factor16:
+; CHECK:       .Lfunc_begin18:
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:  // %bb.0:
+; CHECK:      	zip1	[[V05:.*s]], [[I05:.*s]], [[I13:.*s]]
+; CHECK-NEXT:  	zip1	[[V01:.*s]], [[I01:.*s]], [[I09:.*s]]
+; CHECK-NEXT:  	zip1	[[V02:.*s]], [[I02:.*s]], [[I10:.*s]]
+; CHECK-NEXT:  	zip1	[[V06:.*s]], [[I06:.*s]], [[I14:.*s]]
+; CHECK-NEXT:  	zip1	[[V07:.*s]], [[I07:.*s]], [[I15:.*s]]
+; CHECK-NEXT:  	zip1	[[V08:.*s]], [[I08:.*s]], [[I16:.*s]]
+; CHECK-NEXT:  	zip2	[[V09:.*s]], [[I01]], [[I09]]
+; CHECK-NEXT:  	zip1	[[V03:.*s]], [[I03:.*s]], [[I11:.*s]]
+; CHECK-NEXT:  	zip1	[[V04:.*s]], [[I04:.*s]], [[I12:.*s]]
+; CHECK-NEXT:  	zip2	[[V11:.*s]], [[I03]], [[I11]]
+; CHECK-NEXT:  	zip2	[[V12:.*s]], [[I04]], [[I12]]
+; CHECK-NEXT:  	zip2	[[V13:.*s]], [[I05]], [[I13]]
+; CHECK-NEXT:  	zip2	[[V10:.*s]], [[I02]], [[I10]]
+; CHECK-NEXT:  	zip1	[[V17:.*s]], [[V01]], [[V05]]
+; CHECK-NEXT:  	zip2	[[V21:.*s]], [[V01]], [[V05]]
+; CHECK-NEXT:  	zip2	[[V14:.*s]], [[I06]], [[I14]]
+; CHECK-NEXT:  	zip1	[[V18:.*s]], [[V02]], [[V06]]
+; CHECK-NEXT:  	zip2	[[V22:.*s]], [[V02]], [[V06]]
+; CHECK-NEXT:  	zip2	[[V15:.*s]], [[I07]], [[I15]]
+; CHECK-NEXT:  	zip1	[[V19:.*s]], [[V03]], [[V07]]
+; CHECK-NEXT:  	zip2	[[V23:.*s]], [[V03]], [[V07]]
+; CHECK-NEXT:  	zip2	[[V16:.*s]], [[I08]], [[I16]]
+; CHECK-NEXT:  	zip1	[[V20:.*s]], [[V04]], [[V08]]
+; CHECK-NEXT:  	zip2	[[V24:.*s]], [[V04]], [[V08]]
+; CHECK-NEXT:  	zip1	[[V25:.*s]], [[V09]], [[V13]]
+; CHECK-NEXT:  	zip1	[[V26:.*s]], [[V10]], [[V14]]
+; CHECK-NEXT:  	zip1	[[V27:.*s]], [[V11]], [[V15]]
+; CHECK-NEXT:  	zip1	[[V28:.*s]], [[V12]], [[V16]]
+; CHECK-NEXT:  	st4	{ [[V17]], [[V18]], [[V19]], [[V20]] }, [x8], #64
+; CHECK-NEXT:  	ldp	d9, d8, [sp, #16]               // 16-byte Folded Reload
+; CHECK-NEXT:  	st4	{ [[V21]], [[V22]], [[V23]], [[V24]] }, [x8]
+; CHECK-NEXT:  	zip2	 [[V29:.*s]], [[V09]], [[V13]]
+; CHECK-NEXT:  	add	x8, x0, #128
+; CHECK-NEXT:  	zip2	[[V30:.*s]], [[V10]], [[V14]]
+; CHECK-NEXT:  	zip2	[[V31:.*s]], [[V11]], [[V15]]
+; CHECK-NEXT:  	zip2	[[V32:.*s]], [[V12]], [[V16]]
+; CHECK-NEXT:  	st4	{ [[V25]], [[V26]], [[V27]], [[V28]] }, [x8]
+; CHECK-NEXT:  	add	x8, x0, #192
+; CHECK-NEXT:  	st4	{ [[V29]], [[V30]], [[V31]], [[V32]] }, [x8]
+; CHECK-NEXT:  	ldp	d11, d10, [sp], #32             // 16-byte Folded Reload
+; CHECK-NEXT:  	ret
+
+  %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v1 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v2 = shufflevector <4 x i32> %a4, <4 x i32> %a5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v3 = shufflevector <4 x i32> %a6, <4 x i32> %a7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v4 = shufflevector <4 x i32> %a8, <4 x i32> %a9, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v5 = shufflevector <4 x i32> %a10, <4 x i32> %a11, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v6 = shufflevector <4 x i32> %a12, <4 x i32> %a13, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %v7 = shufflevector <4 x i32> %a14, <4 x i32> %a15, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+  %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %s2 = shufflevector <8 x i32> %v4, <8 x i32> %v5, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %s3 = shufflevector <8 x i32> %v6, <8 x i32> %v7, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+  %d0 = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %d1 = shufflevector <16 x i32> %s2, <16 x i32> %s3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+
+  %interleaved.vec = shufflevector <32 x i32> %d0, <32 x i32> %d1, <64 x i32>  <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, ...
[truncated]

@Rajveer100 (Member) left a comment

I haven't taken a deep look yet, but some initial thoughts.

Also, any particular place of motivation where this pattern is used extensively?

- [AArch64]: Interleaved access store can handle more elements than
        target supported maximum interleaved factor with shuffles.
@ram-NK force-pushed the interleaved-store-with-shuffle branch from 53cb483 to 45eb570 on October 20, 2025 at 16:04
@ram-NK (Contributor, Author) commented Oct 20, 2025

> I haven't taken a deep look yet, but some initial thoughts.
>
> Also, any particular place of motivation where this pattern is used extensively?

I added details in the PR description.

