lukel97
diff --git a/‎llvm/lib/Transforms/Vectorize/VPlan.cpp‎
Lines changed: 0 additions & 15 deletions b/‎llvm/lib/Transforms/Vectorize/VPlan.cpp‎
Lines changed: 0 additions & 15 deletions
diff --git a/‎llvm/lib/Transforms/Vectorize/VPlan.h‎
Lines changed: 5 additions & 11 deletions b/‎llvm/lib/Transforms/Vectorize/VPlan.h‎
Lines changed: 5 additions & 11 deletions
diff --git a/‎llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp‎
Lines changed: 0 additions & 86 deletions b/‎llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp‎
Lines changed: 0 additions & 86 deletions
diff --git a/‎llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp‎
Lines changed: 107 additions & 0 deletions b/‎llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp‎
Lines changed: 107 additions & 0 deletions
diff --git a/‎llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll‎
Lines changed: 7 additions & 10 deletions b/‎llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll‎
Lines changed: 7 additions & 10 deletions
diff --git a/‎llvm/test/Transforms/LoopVectorize/ARM/mve-reg-pressure-vmla.ll‎
Lines changed: 9 additions & 9 deletions b/‎llvm/test/Transforms/LoopVectorize/ARM/mve-reg-pressure-vmla.ll‎
Lines changed: 9 additions & 9 deletions
@@ -1043,21 +1043,6 @@ void VPlan::execute(VPTransformState *State) {
     if (isa<VPWidenPHIRecipe>(&R))
       continue;
 
-    if (auto *WidenPhi = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
-      assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) &&
-             "recipe generating only scalars should have been replaced");
-      auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi));
-      PHINode *Phi = cast<PHINode>(GEP->getPointerOperand());
-
-      Phi->setIncomingBlock(1, VectorLatchBB);
-
-      // Move the last step to the end of the latch block. This ensures
-      // consistent placement of all induction updates.
-      Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
-      Inc->moveBefore(std::prev(VectorLatchBB->getTerminator()->getIterator()));
-      continue;
-    }
-
     auto *PhiR = cast<VPSingleDefRecipe>(&R);
     // VPInstructions currently model scalar Phis only.
     bool NeedsScalar = isa<VPInstruction>(PhiR) ||
 
@@ -2066,8 +2066,7 @@ class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe {
   }
 };
 
-class VPWidenPointerInductionRecipe : public VPWidenInductionRecipe,
-                                      public VPUnrollPartAccessor<4> {
+class VPWidenPointerInductionRecipe : public VPWidenInductionRecipe {
   bool IsScalarAfterVectorization;
 
 public:
@@ -2095,19 +2094,14 @@ class VPWidenPointerInductionRecipe : public VPWidenInductionRecipe,
 
   VP_CLASSOF_IMPL(VPDef::VPWidenPointerInductionSC)
 
-  /// Generate vector values for the pointer induction.
-  void execute(VPTransformState &State) override;
+  void execute(VPTransformState &State) override {
+    llvm_unreachable("cannot execute this recipe, should be expanded via "
+                     "expandVPWidenIntOrFpInductionRecipe");
+  };
 
   /// Returns true if only scalar values will be generated.
   bool onlyScalarsGenerated(bool IsScalable);
 
-  /// Returns the VPValue representing the value of this induction at
-  /// the first unrolled part, if it exists. Returns itself if unrolling did not
-  /// take place.
-  VPValue *getFirstUnrolledPartOperand() {
-    return getUnrollPart(*this) == 0 ? this : getOperand(3);
-  }
-
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
 
@@ -3686,87 +3686,6 @@ bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
          (!IsScalable || vputils::onlyFirstLaneUsed(this));
 }
 
-void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
-  assert(getInductionDescriptor().getKind() ==
-             InductionDescriptor::IK_PtrInduction &&
-         "Not a pointer induction according to InductionDescriptor!");
-  assert(State.TypeAnalysis.inferScalarType(this)->isPointerTy() &&
-         "Unexpected type.");
-  assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
-         "Recipe should have been replaced");
-
-  unsigned CurrentPart = getUnrollPart(*this);
-
-  // Build a pointer phi
-  Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
-  Type *ScStValueType = ScalarStartValue->getType();
-
-  BasicBlock *VectorPH =
-      State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
-  PHINode *NewPointerPhi = nullptr;
-  if (CurrentPart == 0) {
-    IRBuilder<>::InsertPointGuard Guard(State.Builder);
-    if (State.Builder.GetInsertPoint() !=
-        State.Builder.GetInsertBlock()->getFirstNonPHIIt())
-      State.Builder.SetInsertPoint(
-          State.Builder.GetInsertBlock()->getFirstNonPHIIt());
-    NewPointerPhi = State.Builder.CreatePHI(ScStValueType, 2, "pointer.phi");
-    NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
-    NewPointerPhi->setDebugLoc(getDebugLoc());
-  } else {
-    // The recipe has been unrolled. In that case, fetch the single pointer phi
-    // shared among all unrolled parts of the recipe.
-    auto *GEP =
-        cast<GetElementPtrInst>(State.get(getFirstUnrolledPartOperand()));
-    NewPointerPhi = cast<PHINode>(GEP->getPointerOperand());
-  }
-
-  // A pointer induction, performed by using a gep
-  BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
-  Value *ScalarStepValue = State.get(getStepValue(), VPLane(0));
-  Type *PhiType = State.TypeAnalysis.inferScalarType(getStepValue());
-  Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
-  // Add induction update using an incorrect block temporarily. The phi node
-  // will be fixed after VPlan execution. Note that at this point the latch
-  // block cannot be used, as it does not exist yet.
-  // TODO: Model increment value in VPlan, by turning the recipe into a
-  // multi-def and a subclass of VPHeaderPHIRecipe.
-  if (CurrentPart == 0) {
-    // The recipe represents the first part of the pointer induction. Create the
-    // GEP to increment the phi across all unrolled parts.
-    Value *NumUnrolledElems = State.get(getOperand(2), true);
-
-    Value *InductionGEP = GetElementPtrInst::Create(
-        State.Builder.getInt8Ty(), NewPointerPhi,
-        State.Builder.CreateMul(
-            ScalarStepValue,
-            State.Builder.CreateTrunc(NumUnrolledElems, PhiType)),
-        "ptr.ind", InductionLoc);
-
-    NewPointerPhi->addIncoming(InductionGEP, VectorPH);
-  }
-
-  // Create actual address geps that use the pointer phi as base and a
-  // vectorized version of the step value (<step*0, ..., step*N>) as offset.
-  Type *VecPhiType = VectorType::get(PhiType, State.VF);
-  Value *StartOffsetScalar = State.Builder.CreateMul(
-      RuntimeVF, ConstantInt::get(PhiType, CurrentPart));
-  Value *StartOffset =
-      State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
-  // Create a vector of consecutive numbers from zero to VF.
-  StartOffset = State.Builder.CreateAdd(
-      StartOffset, State.Builder.CreateStepVector(VecPhiType));
-
-  assert(ScalarStepValue == State.get(getOperand(1), VPLane(0)) &&
-         "scalar step must be the same across all parts");
-  Value *GEP = State.Builder.CreateGEP(
-      State.Builder.getInt8Ty(), NewPointerPhi,
-      State.Builder.CreateMul(StartOffset, State.Builder.CreateVectorSplat(
-                                               State.VF, ScalarStepValue)),
-      "vector.gep");
-  State.set(this, GEP);
-}
-
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                           VPSlotTracker &SlotTracker) const {
@@ -3925,11 +3844,6 @@ void VPWidenPHIRecipe::execute(VPTransformState &State) {
   Value *Op0 = State.get(getOperand(0));
   Type *VecTy = Op0->getType();
   Instruction *VecPhi = State.Builder.CreatePHI(VecTy, 2, Name);
-  // Manually move it with the other PHIs in case PHI recipes above this one
-  // also inserted non-phi instructions.
-  // TODO: Remove once VPWidenPointerInductionRecipe is also expanded in
-  // convertToConcreteRecipes.
-  VecPhi->moveBefore(State.Builder.GetInsertBlock()->getFirstNonPHIIt());
   State.set(this, VecPhi);
 }
 
 
@@ -2675,6 +2675,107 @@ expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR,
   WidenIVR->replaceAllUsesWith(WidePHI);
 }
 
+/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
+/// initial value, phi and backedge value. In the following example:
+///
+///  <x1> vector loop: {
+///    vector.body:
+///      EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
+///      ...
+///      EMIT branch-on-count ...
+///  }
+///
+/// WIDEN-POINTER-INDUCTION will get expanded to:
+///
+///  <x1> vector loop: {
+///    vector.body:
+///      EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
+///      EMIT %mul = mul %stepvector, %step
+///      EMIT %vector.gep = ptradd %pointer.phi, %add
+///      ...
+///      EMIT %ptr.ind = ptradd %pointer.phi, %vf
+///      EMIT branch-on-count ...
+///  }
+static void
+expandVPWidenPointerInductionRecipe(VPWidenPointerInductionRecipe *R,
+                                    VPTypeAnalysis &TypeInfo) {
+  VPlan *Plan = R->getParent()->getPlan();
+
+  assert(R->getInductionDescriptor().getKind() ==
+             InductionDescriptor::IK_PtrInduction &&
+         "Not a pointer induction according to InductionDescriptor!");
+  assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
+  assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
+         "Recipe should have been replaced");
+
+  unsigned CurrentPart = 0;
+  if (R->getNumOperands() > 3)
+    CurrentPart =
+        cast<ConstantInt>(R->getOperand(4)->getLiveInIRValue())->getZExtValue();
+
+  VPBuilder Builder(R);
+  DebugLoc DL = R->getDebugLoc();
+
+  // Build a pointer phi
+  VPPhi *Phi;
+  if (CurrentPart == 0) {
+    Phi = Builder.createScalarPhi({R->getStartValue()}, R->getDebugLoc(),
+                                  "pointer.phi");
+  } else {
+    // The recipe has been unrolled. In that case, fetch the single pointer phi
+    // shared among all unrolled parts of the recipe.
+    auto *PtrAdd = cast<VPInstruction>(R->getOperand(3));
+    Phi = cast<VPPhi>(PtrAdd->getOperand(0)->getDefiningRecipe());
+  }
+
+  Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
+
+  // A pointer induction, performed by using a gep
+  Type *PhiType = TypeInfo.inferScalarType(R->getStepValue());
+  VPValue *RuntimeVF = Builder.createScalarZExtOrTrunc(
+      &Plan->getVF(), PhiType, TypeInfo.inferScalarType(&Plan->getVF()), DL);
+  if (CurrentPart == 0) {
+    // The recipe represents the first part of the pointer induction. Create the
+    // GEP to increment the phi across all unrolled parts.
+    VPValue *NumUnrolledElems = Builder.createScalarZExtOrTrunc(
+        R->getOperand(2), PhiType, TypeInfo.inferScalarType(R->getOperand(2)),
+        DL);
+    VPValue *Offset = Builder.createNaryOp(
+        Instruction::Mul, {R->getStepValue(), NumUnrolledElems});
+
+    VPBuilder::InsertPointGuard Guard(Builder);
+    VPBasicBlock *ExitingBB =
+        Plan->getVectorLoopRegion()->getExitingBasicBlock();
+    Builder.setInsertPoint(ExitingBB,
+                           ExitingBB->getTerminator()->getIterator());
+
+    VPValue *InductionGEP = Builder.createPtrAdd(Phi, Offset, DL, "ptr.ind");
+    Phi->addOperand(InductionGEP);
+  }
+
+  VPValue *CurrentPartV =
+      Plan->getOrAddLiveIn(ConstantInt::get(PhiType, CurrentPart));
+
+  // Create actual address geps that use the pointer phi as base and a
+  // vectorized version of the step value (<step*0, ..., step*N>) as offset.
+  VPValue *StartOffsetScalar =
+      Builder.createNaryOp(Instruction::Mul, {RuntimeVF, CurrentPartV});
+  VPValue *StartOffset =
+      Builder.createNaryOp(VPInstruction::Broadcast, StartOffsetScalar);
+  // Create a vector of consecutive numbers from zero to VF.
+  StartOffset = Builder.createNaryOp(
+      Instruction::Add,
+      {StartOffset,
+       Builder.createNaryOp(VPInstruction::StepVector, {}, PhiType)});
+
+  VPValue *PtrAdd = Builder.createPtrAdd(
+      Phi,
+      Builder.createNaryOp(Instruction::Mul, {StartOffset, R->getStepValue()}),
+      DL, "vector.gep");
+
+  R->replaceAllUsesWith(PtrAdd);
+}
+
 void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) {
   // Replace loop regions with explicity CFG.
   SmallVector<VPRegionBlock *> LoopRegions;
@@ -2711,6 +2812,12 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
         continue;
       }
 
+      if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
+        expandVPWidenPointerInductionRecipe(WidenIVR, TypeInfo);
+        ToRemove.push_back(WidenIVR);
+        continue;
+      }
+
       if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
         Expr->decompose();
         ToRemove.push_back(Expr);
 
@@ -67,10 +67,8 @@ define void @pointer_induction_used_as_vector(ptr noalias %start.1, ptr noalias
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[START_2]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 2
 ; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 1, [[TMP6]]
-; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP10]], 0
+; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP6]], 0
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP12]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP13:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
@@ -159,17 +157,16 @@ define void @pointer_induction(ptr noalias %start, i64 %N) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 2
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 1, [[TMP6]]
-; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP8]], 0
+; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP6]], 0
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP12:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
-; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = mul <vscale x 2 x i64> [[TMP13]], splat (i64 1)
-; CHECK-NEXT:    [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 2 x i64> [[TMP14]]
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <vscale x 2 x ptr> [[VECTOR_GEP]], i32 0
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <vscale x 2 x i64> [[DOTSPLAT]], i32 0
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <vscale x 2 x i64> [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[TMP20]], [[TMP21]]
+; CHECK-NEXT:    [[TMP14:%.*]] = mul i64 [[TMP13]], 1
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP14]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[TMP15]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, ptr [[TMP16]], align 1
 ; CHECK-NEXT:    [[TMP17:%.*]] = add <vscale x 2 x i8> [[WIDE_LOAD]], splat (i8 1)
 
@@ -29,14 +29,14 @@ define void @fn(i32 noundef %n, ptr %in, ptr %out) #0 {
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[IN]], %[[VECTOR_PH]] ], [ [[PTR_IND:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[POINTER_PHI2:%.*]] = phi ptr [ [[OUT]], %[[VECTOR_PH]] ], [ [[PTR_IND3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[POINTER_PHI2:%.*]] = phi ptr [ [[IN]], %[[VECTOR_PH]] ], [ [[PTR_IND3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[POINTER_PHI:%.*]] = phi ptr [ [[OUT]], %[[VECTOR_PH]] ], [ [[PTR_IND6:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i32> <i32 0, i32 3, i32 6, i32 9>
 ; CHECK-NEXT:    [[VECTOR_GEP4:%.*]] = getelementptr i8, ptr [[POINTER_PHI2]], <4 x i32> <i32 0, i32 3, i32 6, i32 9>
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 [[N]])
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, <4 x ptr> [[VECTOR_GEP]], i32 1
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> [[VECTOR_GEP]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison), !alias.scope [[META0:![0-9]+]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw i8, <4 x ptr> [[VECTOR_GEP]], i32 2
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, <4 x ptr> [[VECTOR_GEP4]], i32 1
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> [[VECTOR_GEP4]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison), !alias.scope [[META0:![0-9]+]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw i8, <4 x ptr> [[VECTOR_GEP4]], i32 2
 ; CHECK-NEXT:    [[WIDE_MASKED_GATHER5:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> [[TMP1]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison), !alias.scope [[META0]]
 ; CHECK-NEXT:    [[WIDE_MASKED_GATHER6:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> [[TMP2]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i8> poison), !alias.scope [[META0]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i8> [[WIDE_MASKED_GATHER]] to <4 x i32>
@@ -66,14 +66,14 @@ define void @fn(i32 noundef %n, ptr %in, ptr %out) #0 {
 ; CHECK-NEXT:    [[TMP27:%.*]] = add nuw <4 x i32> [[TMP26]], [[TMP24]]
 ; CHECK-NEXT:    [[TMP28:%.*]] = lshr <4 x i32> [[TMP27]], splat (i32 16)
 ; CHECK-NEXT:    [[TMP29:%.*]] = trunc <4 x i32> [[TMP28]] to <4 x i8>
-; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds nuw i8, <4 x ptr> [[VECTOR_GEP4]], i32 1
-; CHECK-NEXT:    call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP13]], <4 x ptr> [[VECTOR_GEP4]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]]), !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
-; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds nuw i8, <4 x ptr> [[VECTOR_GEP4]], i32 2
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds nuw i8, <4 x ptr> [[VECTOR_GEP]], i32 1
+; CHECK-NEXT:    call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP13]], <4 x ptr> [[VECTOR_GEP]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]]), !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
+; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr inbounds nuw i8, <4 x ptr> [[VECTOR_GEP]], i32 2
 ; CHECK-NEXT:    call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP21]], <4 x ptr> [[TMP30]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]]), !alias.scope [[META3]], !noalias [[META0]]
 ; CHECK-NEXT:    call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP29]], <4 x ptr> [[TMP31]], i32 1, <4 x i1> [[ACTIVE_LANE_MASK]]), !alias.scope [[META3]], !noalias [[META0]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i32 12
 ; CHECK-NEXT:    [[PTR_IND3]] = getelementptr i8, ptr [[POINTER_PHI2]], i32 12
+; CHECK-NEXT:    [[PTR_IND6]] = getelementptr i8, ptr [[POINTER_PHI]], i32 12
 ; CHECK-NEXT:    [[TMP32:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]: