
Commit 441f883

Improve strided access vectorization for AArch64 SVE
Currently, LLVM vectorizes strided accesses with SVE as follows.

```c
void func(double* restrict a, double* b, int n) {
  for (int i = 0; i < n; i++) {
    a[i] = b[i * 10] + 1;
  }
}
```

=>

```
  ...
  index z1.d, #0, #1
loop:
  add  z2.d, z1.d, z0.d
  mul  z1.d, z1.d, #80
  ld1d { z1.d }, p0/z, [x1, z1.d]
  ...
  mov  z1.d, z2.d
  ...
```

The generated code is inefficient because it performs the address calculation inside the loop with vector instructions, which can degrade performance. Ideally, we want to keep the offset vector `z1` constant and update the base register `x1` with a scalar instruction:

```
  ...
  index z1.d, #0, #10
loop:
  ld1d z2.d, p7/z, [x1, z1.d, lsl 3]
  ...
  add  x1, x1, x2
  ...
```

This patch enables strided accesses to be vectorized efficiently, as shown above. It is based on llvm#147297, which detects strided accesses and converts them into stride recipes; this patch then lowers those recipes into a legal and efficient sequence of recipes for AArch64.

I am making this patch a draft for the following reasons:
- I have not yet been able to create sufficient test cases for this patch.
- I have not yet been able to confirm that there are no performance regressions.
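For reference, here is a minimal scalar model of the addressing pattern the legalized plan computes for the `b[i * 10]` example above, assuming the recipe sequence introduced below (step vector × stride, wide pointer add, widened gather load): the per-lane byte offsets are loop-invariant, and only the scalar base pointer advances each vector iteration, which is the shape the AArch64 backend can select into the `ld1d [x1, z1.d, lsl 3]` form. The function and parameter names in this sketch are illustrative only, not part of the patch.

```cpp
// Scalar model (not LLVM code) of the legalized strided load for a[i] = b[i*10] + 1.
#include <cstddef>
#include <cstdint>

void strided_gather_model(double *dst, const double *b, size_t vf,
                          size_t iterations, int64_t stride_bytes /* 80 */) {
  const char *base = reinterpret_cast<const char *>(b);
  for (size_t it = 0; it < iterations; ++it) {
    for (size_t lane = 0; lane < vf; ++lane) {
      // Offset vector: lane * stride_bytes, computed once and kept constant.
      const double *elt =
          reinterpret_cast<const double *>(base + lane * stride_bytes);
      dst[it * vf + lane] = *elt + 1.0;
    }
    base += vf * stride_bytes; // scalar base update, like `add x1, x1, x2`
  }
}
```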

11 files changed: +270 additions, −33 deletions

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 5 additions & 0 deletions
```diff
@@ -852,6 +852,11 @@ class TargetTransformInfo {
   /// Return true if the target supports strided load.
   LLVM_ABI bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const;
 
+  /// Return true if the target benefits from the generation of a more
+  /// efficient instruction sequence for strided accesses.
+  LLVM_ABI bool preferToUseStrideRecipesForVectorization(Type *DataType,
+                                                         Align Alignment) const;
+
   /// Return true is the target supports interleaved access for the given vector
   /// type \p VTy, interleave factor \p Factor, alignment \p Alignment and
   /// address space \p AddrSpace.
```

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 5 additions & 0 deletions
```diff
@@ -374,6 +374,11 @@ class TargetTransformInfoImplBase {
     return false;
   }
 
+  virtual bool preferToUseStrideRecipesForVectorization(Type *DataType,
+                                                        Align Alignment) const {
+    return false;
+  }
+
   virtual bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor,
                                             Align Alignment,
                                             unsigned AddrSpace) const {
```

llvm/lib/Analysis/TargetTransformInfo.cpp

Lines changed: 5 additions & 0 deletions
```diff
@@ -532,6 +532,11 @@ bool TargetTransformInfo::isLegalStridedLoadStore(Type *DataType,
   return TTIImpl->isLegalStridedLoadStore(DataType, Alignment);
 }
 
+bool TargetTransformInfo::preferToUseStrideRecipesForVectorization(
+    Type *DataType, Align Alignment) const {
+  return TTIImpl->preferToUseStrideRecipesForVectorization(DataType, Alignment);
+}
+
 bool TargetTransformInfo::isLegalInterleavedAccessType(
     VectorType *VTy, unsigned Factor, Align Alignment,
     unsigned AddrSpace) const {
```

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Lines changed: 6 additions & 0 deletions
```diff
@@ -346,6 +346,12 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
     return isLegalMaskedGatherScatter(DataType);
   }
 
+  bool
+  preferToUseStrideRecipesForVectorization(Type *DataType,
+                                           Align Alignment) const override {
+    return isLegalMaskedGatherScatter(DataType);
+  }
+
   bool isLegalBroadcastLoad(Type *ElementTy,
                             ElementCount NumElements) const override {
     // Return true if we can generate a `ld1r` splat load instruction.
```
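A hypothetical sketch of how another target could opt into this path via the new hook, even without a native strided load/store. `MyTargetTTIImpl` is a made-up name; real targets derive from `BasicTTIImplBase` and need the usual constructors and the rest of the TTI implementation, all omitted here for brevity. This is not part of the patch.

```cpp
#include "llvm/Analysis/TargetTransformInfoImpl.h"

namespace llvm {

// Illustrative only: request stride recipes when the masked gather that the
// legalization below falls back to is itself legal for this data type.
class MyTargetTTIImpl : public TargetTransformInfoImplBase {
public:
  bool
  preferToUseStrideRecipesForVectorization(Type *DataType,
                                           Align Alignment) const override {
    // Same idea as the AArch64 override above, expressed via the generic
    // masked-gather legality hook.
    return isLegalMaskedGather(DataType, Alignment);
  }
};

} // namespace llvm
```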

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 3 additions & 0 deletions
```diff
@@ -8627,6 +8627,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   VPlanTransforms::runPass(VPlanTransforms::convertToStridedAccesses, *Plan,
                            CostCtx, Range);
 
+  VPlanTransforms::runPass(VPlanTransforms::legalizeStridedAccess, *Plan,
+                           CostCtx, Range);
+
   for (ElementCount VF : Range)
     Plan->addVF(VF);
   Plan->setName("Initial VPlan");
```

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 65 additions & 8 deletions
```diff
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "VPlanTransforms.h"
+#include "LoopVectorizationPlanner.h"
 #include "VPRecipeBuilder.h"
 #include "VPlan.h"
 #include "VPlanAnalysis.h"
@@ -20,6 +21,7 @@
 #include "VPlanHelpers.h"
 #include "VPlanPatternMatch.h"
 #include "VPlanUtils.h"
+#include "VPlanValue.h"
 #include "VPlanVerifier.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/PostOrderIterator.h"
@@ -31,11 +33,17 @@
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
 #include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instruction.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/TypeSize.h"
 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+#include <cstdint>
+#include <limits>
+#include <optional>
 
 using namespace llvm;
 using namespace VPlanPatternMatch;
@@ -4336,14 +4344,20 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
   auto IsProfitable = [&](ElementCount VF) -> bool {
     Type *DataTy = toVectorTy(getLoadStoreType(&Ingredient), VF);
     const Align Alignment = getLoadStoreAlignment(&Ingredient);
-    if (!Ctx.TTI.isLegalStridedLoadStore(DataTy, Alignment))
-      return false;
-    const InstructionCost CurrentCost = MemR->computeCost(VF, Ctx);
-    const InstructionCost StridedLoadStoreCost =
-        Ctx.TTI.getStridedMemoryOpCost(Instruction::Load, DataTy, PtrUV,
-                                       MemR->isMasked(), Alignment,
-                                       Ctx.CostKind, &Ingredient);
-    return StridedLoadStoreCost < CurrentCost;
+    if (Ctx.TTI.isLegalStridedLoadStore(DataTy, Alignment)) {
+      const InstructionCost CurrentCost = MemR->computeCost(VF, Ctx);
+      const InstructionCost StridedLoadStoreCost =
+          Ctx.TTI.getStridedMemoryOpCost(Instruction::Load, DataTy, PtrUV,
+                                         MemR->isMasked(), Alignment,
+                                         Ctx.CostKind, &Ingredient);
+      return StridedLoadStoreCost < CurrentCost;
+    }
+
+    if (Ctx.TTI.preferToUseStrideRecipesForVectorization(DataTy,
+                                                         Alignment)) {
+      return true;
+    }
+    return false;
   };
 
   if (!LoopVectorizationPlanner::getDecisionAndClampRange(IsProfitable,
@@ -4393,3 +4407,46 @@ void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
   for (auto *V : PossiblyDead)
     recursivelyDeleteDeadRecipes(V);
 }
+
+void VPlanTransforms::legalizeStridedAccess(VPlan &Plan, VPCostContext &Ctx,
+                                            VFRange &Range) {
+  VPTypeAnalysis TypeInfo(Plan);
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
+    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+      auto *StrideR = dyn_cast<VPWidenStridedLoadRecipe>(&R);
+      if (!StrideR)
+        continue;
+
+      Instruction &Ingredient = StrideR->getIngredient();
+      auto NeedsLegalize = [&](ElementCount VF) -> bool {
+        Type *DataTy = toVectorTy(getLoadStoreType(&Ingredient), VF);
+        const Align Alignment = getLoadStoreAlignment(&Ingredient);
+        if (!Ctx.TTI.isLegalStridedLoadStore(DataTy, Alignment))
+          return true;
+        return false;
+      };
+
+      if (!LoopVectorizationPlanner::getDecisionAndClampRange(NeedsLegalize,
+                                                              Range))
+        continue;
+
+      auto *Ptr = cast<VPVectorPointerRecipe>(StrideR->getAddr());
+      auto *Stride = StrideR->getStride();
+      Type *StrideTy = TypeInfo.inferScalarType(Stride);
+
+      VPBuilder Builder(StrideR);
+      auto *Step =
+          Builder.createNaryOp(VPInstruction::StepVector, {}, StrideTy);
+      VPValue *Offset = Builder.createNaryOp(Instruction::Mul, {Step, Stride});
+      VPValue *GEP = Builder.createWidePtrAdd(Ptr->getOperand(0), Offset);
+
+      auto *LoadR = new VPWidenLoadRecipe(*cast<LoadInst>(&Ingredient), GEP,
+                                          StrideR->getMask(), false, false,
+                                          *StrideR, StrideR->getDebugLoc());
+      Builder.insert(LoadR);
+      StrideR->replaceAllUsesWith(LoadR);
+      StrideR->eraseFromParent();
+    }
+  }
+}
```

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 5 additions & 0 deletions
```diff
@@ -252,6 +252,11 @@ struct VPlanTransforms {
   static void convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx,
                                        VFRange &Range);
 
+  /// Legalize strided access recipes for targets that do not support
+  /// them natively.
+  static void legalizeStridedAccess(VPlan &Plan, VPCostContext &Ctx,
+                                    VFRange &Range);
+
   /// Remove dead recipes from \p Plan.
   static void removeDeadRecipes(VPlan &Plan);
```

Lines changed: 130 additions & 0 deletions
```diff
@@ -0,0 +1,130 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize,dce,instcombine -mtriple aarch64-linux-gnu -mattr=+sve -force-target-instruction-cost=1 -S %s | FileCheck %s
+
+; Test case with strided access (fixed 80-byte stride)
+
+; void constant_stride_i64(double* a, double* b, int n) {
+;   for (int i = 0; i < n; i++) {
+;     a[i] = b[i * 10] + 1;
+;   }
+; }
+
+
+define void @constant_stride_i64(ptr noalias nocapture writeonly %a, ptr noalias nocapture readonly %b, i64 %n) #0 {
+; CHECK-LABEL: @constant_stride_i64(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2
+; CHECK-NEXT:    [[DOTNOT:%.*]] = sub nsw i64 0, [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], [[DOTNOT]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul <vscale x 4 x i64> [[TMP4]], splat (i64 80)
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw nsw i64 [[INDEX]], 80
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[B:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], <vscale x 4 x i64> [[TMP5]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i64> @llvm.masked.gather.nxv4i64.nxv4p0(<vscale x 4 x ptr> [[TMP8]], i32 8, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i64> poison)
+; CHECK-NEXT:    [[TMP9:%.*]] = add nsw <vscale x 4 x i64> [[WIDE_MASKED_GATHER]], splat (i64 1)
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw i64, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    store <vscale x 4 x i64> [[TMP9]], ptr [[TMP10]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+;
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx.idx = mul nuw nsw i64 %indvars.iv, 80
+  %arrayidx = getelementptr inbounds nuw i8, ptr %b, i64 %arrayidx.idx
+  %0 = load i64, ptr %arrayidx, align 8
+  %add = add nsw i64 %0, 1
+  %arrayidx2 = getelementptr inbounds nuw i64, ptr %a, i64 %indvars.iv
+  store i64 %add, ptr %arrayidx2, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+}
+
+; Test stride requiring scaling (10 x i64 stride)
+
+; void constant_stride_i64_scaled(double* a, double* b, int n) {
+;   for (int i = 0; i < n; i++) {
+;     a[i] = b[i * 10] + 1;
+;   }
+; }
+
+define void @constant_stride_i64_scaled(ptr noalias nocapture writeonly %a, ptr noalias nocapture readonly %b, i64 %n) #0 {
+; CHECK-LABEL: @constant_stride_i64_scaled(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2
+; CHECK-NEXT:    [[DOTNOT:%.*]] = sub nsw i64 0, [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], [[DOTNOT]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul <vscale x 4 x i64> [[TMP4]], splat (i64 80)
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[IDX:%.*]] = mul i64 [[INDEX]], 80
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[IDX]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], <vscale x 4 x i64> [[TMP5]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i64> @llvm.masked.gather.nxv4i64.nxv4p0(<vscale x 4 x ptr> [[TMP7]], i32 8, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i64> poison)
+; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <vscale x 4 x i64> [[WIDE_MASKED_GATHER]], splat (i64 1)
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i64, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    store <vscale x 4 x i64> [[TMP8]], ptr [[TMP9]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+;
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx.idx = mul nuw nsw i64 %indvars.iv, 10
+  %arrayidx = getelementptr i64, ptr %b, i64 %arrayidx.idx
+  %0 = load i64, ptr %arrayidx, align 8
+  %add = add nsw i64 %0, 1
+  %arrayidx2 = getelementptr inbounds nuw i64, ptr %a, i64 %indvars.iv
+  store i64 %add, ptr %arrayidx2, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+}
+
+attributes #0 = { vscale_range(1, 16) }
+
+!0 = distinct !{!0, !1, !2, !3, !4, !5}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!4 = !{!"llvm.loop.interleave.count", i32 1}
+!5 = !{!"llvm.loop.vectorize.enable", i1 true}
```

llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll

Lines changed: 20 additions & 18 deletions
```diff
@@ -101,36 +101,38 @@ define void @test_array_load2_i16_store2(i32 %C, i32 %D) #1 {
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = shl <vscale x 4 x i64> %2, splat (i64 12)
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[C:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[D:%.*]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl <vscale x 4 x i64> [[TMP2]], splat (i64 1)
+; CHECK-NEXT:    [[TMP4:%.*]] = shl <vscale x 4 x i64> [[TMP2]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP0]], 3
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP5]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr @AB_i16, <vscale x 4 x i64> [[VEC_IND]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP6]], i32 2, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i16> poison)
-; CHECK-NEXT:    [[TMP7:%.*]] = or disjoint <vscale x 4 x i64> [[VEC_IND]], splat (i64 1)
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, ptr @AB_i16, <vscale x 4 x i64> [[TMP7]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP8]], i32 2, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i16> poison)
-; CHECK-NEXT:    [[TMP9:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER]] to <vscale x 4 x i32>
-; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[TMP9]]
-; CHECK-NEXT:    [[DOTIDX:%.*]] = shl i64 [[INDEX]], 3
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr @CD, i64 [[DOTIDX]]
-; CHECK-NEXT:    [[TMP11:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER1]] to <vscale x 4 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = mul nsw <vscale x 4 x i32> [[BROADCAST_SPLAT3]], [[TMP11]]
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP10]], <vscale x 4 x i32> [[TMP12]])
-; CHECK-NEXT:    store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP14]], align 4
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr @AB_i16, i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], <vscale x 4 x i64> [[TMP3]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP7]], i32 2, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i16> poison)
+; CHECK-NEXT:    [[TMP8:%.*]] = or disjoint <vscale x 4 x i64> [[VEC_IND]], splat (i64 1)
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, ptr @AB_i16, <vscale x 4 x i64> [[TMP8]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP9]], i32 2, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i16> poison)
+; CHECK-NEXT:    [[TMP10:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[TMP11:%.*]] = add nsw <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr @CD, i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP13:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER1]] to <vscale x 4 x i32>
+; CHECK-NEXT:    [[TMP14:%.*]] = mul nsw <vscale x 4 x i32> [[BROADCAST_SPLAT3]], [[TMP13]]
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP11]], <vscale x 4 x i32> [[TMP14]])
+; CHECK-NEXT:    store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP12]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
+; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH:%.*]]
 ; CHECK:       scalar.ph:
```
