14 changes: 12 additions & 2 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5679,6 +5679,18 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
if (CostKind != TTI::TCK_RecipThroughput)
return Invalid;

unsigned Ratio =
AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();

// A ratio of 1 would mean it's similar to a regular add, e.g.
// v4i64 partial.reduce(v4i64 %acc, v4i64 %vec)
// <=> add v4i64 %acc, %vec
if (Ratio == 1) {
Contributor:
Rather than checking for Ratio == 1, you should replace either:

  1. https://github.com/llvm/llvm-project/blob/main/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp#L323 with getArithmeticInstrCost (I believe this line is simply checking for the simple add reduction)

  OR

  2. https://github.com/llvm/llvm-project/blob/main/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp#L3552 with getArithmeticInstrCost
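
For illustration, a rough sketch of what that alternative might look like on the VPlan side. The code at the linked lines is not part of this diff, so the names used here (ScaleFactor, PhiTy, VF, Ctx) and the exact call site are assumptions, not the actual implementation:

  // Hypothetical sketch only: when the accumulator and input element types
  // match (scale factor 1), cost the recipe as a plain vector add via the
  // generic TTI hook instead of the partial-reduction hook.
  if (ScaleFactor == 1) {
    auto *VecTy = VectorType::get(PhiTy, VF);
    return Ctx.TTI.getArithmeticInstrCost(Instruction::Add, VecTy,
                                          Ctx.CostKind);
  }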

auto *T = VectorType::get(AccumType, VF);
return getArithmeticInstrCost(Opcode, T, CostKind) +
(BinOp ? getArithmeticInstrCost(*BinOp, T, CostKind) : 0);
}

if (VF.isFixed() && !ST->isSVEorStreamingSVEAvailable() &&
(!ST->isNeonAvailable() || !ST->hasDotProd()))
return Invalid;
@@ -5700,8 +5712,6 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
if (IsUSDot && !ST->hasMatMulInt8())
return Invalid;

unsigned Ratio =
AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
if (VF.getKnownMinValue() <= Ratio)
return Invalid;

2 changes: 2 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2378,6 +2378,8 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
/// Get the factor that the VF of this recipe's output should be scaled by.
unsigned getVFScaleFactor() const { return VFScaleFactor; }

void setVFScaleFactor(unsigned F) { VFScaleFactor = F; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
67 changes: 63 additions & 4 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -40,6 +40,8 @@
using namespace llvm;
using namespace VPlanPatternMatch;

#define DEBUG_TYPE "loop-vectorize"

static cl::opt<bool> EnableWideActiveLaneMask(
"enable-wide-lane-mask", cl::init(false), cl::Hidden,
cl::desc("Enable use of wide get active lane mask instructions"));
@@ -3761,7 +3763,7 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,

/// This function tries to create abstract recipes from the reduction recipe for
/// following optimizations and cost estimation.
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
static bool tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
VPCostContext &Ctx,
VFRange &Range) {
VPExpressionRecipe *AbstractR = nullptr;
@@ -3773,19 +3775,76 @@ static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
AbstractR = ExtRed;
// Cannot create abstract inloop reduction recipes.
if (!AbstractR)
return;
return false;

AbstractR->insertBefore(*VPBB, IP);
Red->replaceAllUsesWith(AbstractR);
return true;
Contributor:
Need to add Red->eraseFromParent(); before this?
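
If the erase were added as asked, the tail of the helper would read roughly like this (a sketch only; whether the erase is actually needed depends on whether the expression recipe already detaches the recipes it bundles):

  AbstractR->insertBefore(*VPBB, IP);
  Red->replaceAllUsesWith(AbstractR);
  // Sketch of the suggestion: drop the now-unused reduction recipe once all
  // of its users have been rewired to the abstract recipe.
  Red->eraseFromParent();
  return true;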

}

/// Lower a partial reduction back to a regular reduction, by
/// changing the in-loop partial reduction to a binop and removing
/// the scale factor from the PHI node.
static void lowerPartialReduction(VPlan &Plan, VPPartialReductionRecipe *Red,
VPCostContext &Ctx) {
VPRecipeBase *Acc = Red->getChainOp()->getDefiningRecipe();
if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Acc)) {
PhiR->setVFScaleFactor(1);

// We also need to update the scale factor of the reduction-start-vector
// operand.
VPValue *StartV, *IdentityV;
if (!match(PhiR->getOperand(0),
m_VPInstruction<VPInstruction::ReductionStartVector>(
m_VPValue(StartV), m_VPValue(IdentityV), m_VPValue())))
llvm_unreachable("Unexpected operand for a partial reduction");
Type *I32Ty = IntegerType::getInt32Ty(Plan.getContext());
auto *ScaleFactorVPV = Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, 1));
cast<VPInstruction>(PhiR->getOperand(0))->setOperand(2, ScaleFactorVPV);
}

if (auto *R = dyn_cast<VPPartialReductionRecipe>(Acc))
if (R->getVFScaleFactor() != 1)
lowerPartialReduction(Plan, R, Ctx);

LLVM_DEBUG(
dbgs() << "LV: Lowering " << *Red
<< " back to regular reduction, because it is not profitable\n");
Collaborator:
I think we want an "a" before "regular".


// Lower the partial reduction to a regular binop.
VPBuilder Builder(Red);
VPInstruction *Add = Builder.createNaryOp(
RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
{Red->getChainOp(), Red->getVecOp()});
if (Red->isConditional())
Add = Builder.createSelect(Red->getCondOp(), Add, Red->getChainOp());

Red->replaceAllUsesWith(Add);
Red->eraseFromParent();
}

void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
VFRange &Range) {
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
if (auto *Red = dyn_cast<VPReductionRecipe>(&R))
tryToCreateAbstractReductionRecipe(Red, Ctx, Range);
auto *Red = dyn_cast<VPReductionRecipe>(&R);
if (!Red)
continue;

if (!tryToCreateAbstractReductionRecipe(Red, Ctx, Range) &&
Contributor:
If 'Red' is converted to an AbstractRecipe, Red should be null. Can you check for Red instead of returning bool from tryToCreateAbstractReductionRecipe?
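
One way to read this suggestion (an interpretation, not something stated in the patch) is to have the helper return the recipe it created, or nullptr on failure, and key the caller off that instead of a bool:

  // Hypothetical signature change: return the created expression recipe,
  // or nullptr when no profitable abstract recipe could be formed.
  static VPExpressionRecipe *
  tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx,
                                     VFRange &Range);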

isa<VPPartialReductionRecipe>(Red)) {
// If there isn't a profitable VPExpression for a partial reduction,
// then that suggests using a partial reduction is not profitable
// for this VPlan. It seems better to resort to a regular (middle-block)
// reduction, so that the this plan is still profitable to consider.
Collaborator:
Extra "the" here.

// Otherwise, the plan might be discarded in favour of a smaller VF.
//
// FIXME: There's a lot to unpick when it comes to partial
// reductions, but this should provide a temporary stop-gap until we
// reimplement the logic for creating partial reductions.
lowerPartialReduction(Plan, cast<VPPartialReductionRecipe>(Red), Ctx);
Collaborator:
I think we should only call this if the reduction actually is partial, otherwise we'll waste some time essentially doing nothing in the lower function.
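
A sketch of one way to add such a guard, using the getVFScaleFactor() accessor that the patch already relies on in the recursive case (the exact placement is an assumption):

  // Sketch: only lower when the reduction really is partial (scale factor > 1),
  // so lowerPartialReduction is not invoked just to rebuild an equivalent add.
  auto *PRed = dyn_cast<VPPartialReductionRecipe>(Red);
  if (PRed && PRed->getVFScaleFactor() != 1)
    lowerPartialReduction(Plan, PRed, Ctx);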

Contributor:
It would be great if we can lower this here, basically after this

}
}
}
}
@@ -482,29 +482,29 @@ define i64 @partial_reduction_mul_two_users(i64 %n, ptr %a, i16 %b, i32 %c) {
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT]] to <8 x i32>
; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i32> [[TMP1]], [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i32> [[TMP2]] to <8 x i64>
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[A]], align 2
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i32> [[TMP2]] to <8 x i64>
; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i64> @llvm.vector.partial.reduce.add.v4i64.v8i64(<4 x i64> [[VEC_PHI]], <8 x i64> [[TMP3]])
; CHECK-NEXT: [[TMP8]] = add <8 x i64> [[VEC_PHI]], [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32>
; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i32> [[TMP5]] to <8 x i64>
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[PARTIAL_REDUCE]])
; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP8]])
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP6]], i32 7
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -520,9 +520,9 @@ define i64 @partial_reduction_mul_two_users(i64 %n, ptr %a, i16 %b, i32 %c) {
; CHECK-NEXT: [[LOAD_EXT:%.*]] = sext i16 [[LOAD]] to i32
; CHECK-NEXT: [[LOAD_EXT_EXT]] = sext i32 [[LOAD_EXT]] to i64
; CHECK-NEXT: [[EXITCOND740_NOT:%.*]] = icmp eq i64 [[IV]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND740_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK-NEXT: br i1 [[EXITCOND740_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], %[[LOOP]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i64 [[ADD_LCSSA]]
;
entry:
@@ -0,0 +1,136 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt -S -mcpu=neoverse-v2 -passes=loop-vectorize -mtriple=aarch64 < %s | FileCheck %s
target triple = "aarch64"

; Check that a partial reduction is reverted back to a regular reduction,
Collaborator:
Could you add some check statements that make sure that the reversion to a normal reduction happened? I think we'll also want a test with a subtract, as well as chained add and chained subtract.

; so that we compare "the VPlan with the best kind of reduction for <range>"
; vs "the VPlan with the best kind of reduction for <other range>",

; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable vscale_range(1,16)
define dso_local i64 @foo(ptr noundef readonly captures(none) %0, i32 noundef %1) local_unnamed_addr #0 {
Collaborator:
Could you rename this test to something like revert_add and remove local_unnamed_addr #0?

; CHECK-LABEL: define dso_local i64 @foo(
; CHECK-SAME: ptr noundef readonly captures(none) [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[TMP1]], 0
; CHECK-NEXT: br i1 [[TMP3]], label %[[ITER_CHECK:.*]], label %[[BB27:.*]]
; CHECK: [[ITER_CHECK]]:
; CHECK-NEXT: [[TMP4:%.*]] = zext nneg i32 [[TMP1]] to i64
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP4]], 4
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP4]], 16
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 16
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP0]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i32 4
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i32 8
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i32 12
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4
; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP8]], align 4
; CHECK-NEXT: [[TMP9:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
; CHECK-NEXT: [[TMP10:%.*]] = sext <4 x i32> [[WIDE_LOAD5]] to <4 x i64>
; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i32> [[WIDE_LOAD6]] to <4 x i64>
; CHECK-NEXT: [[TMP12:%.*]] = sext <4 x i32> [[WIDE_LOAD7]] to <4 x i64>
; CHECK-NEXT: [[TMP13]] = add <4 x i64> [[VEC_PHI]], [[TMP9]]
; CHECK-NEXT: [[TMP14]] = add <4 x i64> [[VEC_PHI2]], [[TMP10]]
; CHECK-NEXT: [[TMP15]] = add <4 x i64> [[VEC_PHI3]], [[TMP11]]
; CHECK-NEXT: [[TMP16]] = add <4 x i64> [[VEC_PHI4]], [[TMP12]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
Contributor:
You don't need to check anything after the middle block. You can use something like '--filter-out-after' when generating the test.

; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i64> [[TMP14]], [[TMP13]]
; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <4 x i64> [[TMP15]], [[BIN_RDX]]
; CHECK-NEXT: [[BIN_RDX9:%.*]] = add <4 x i64> [[TMP16]], [[BIN_RDX8]]
; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[BIN_RDX9]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[BB25:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; CHECK: [[VEC_EPILOG_PH]]:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP18]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: [[N_MOD_VF10:%.*]] = urem i64 [[TMP4]], 4
; CHECK-NEXT: [[N_VEC11:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF10]]
; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI13:%.*]] = phi <4 x i64> [ [[TMP19]], %[[VEC_EPILOG_PH]] ], [ [[TMP22:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP0]], i64 [[INDEX12]]
; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i32>, ptr [[TMP20]], align 4
; CHECK-NEXT: [[TMP21:%.*]] = sext <4 x i32> [[WIDE_LOAD14]] to <4 x i64>
; CHECK-NEXT: [[TMP22]] = add <4 x i64> [[VEC_PHI13]], [[TMP21]]
; CHECK-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX12]], 4
; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC11]]
; CHECK-NEXT: br i1 [[TMP23]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP22]])
; CHECK-NEXT: [[CMP_N16:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC11]]
; CHECK-NEXT: br i1 [[CMP_N16]], label %[[BB25]], label %[[VEC_EPILOG_SCALAR_PH]]
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
; CHECK-NEXT: [[BC_MERGE_RDX17:%.*]] = phi i64 [ [[TMP24]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP18]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
; CHECK-NEXT: br label %[[BB29:.*]]
; CHECK: [[BB25]]:
; CHECK-NEXT: [[TMP26:%.*]] = phi i64 [ [[TMP35:%.*]], %[[BB29]] ], [ [[TMP18]], %[[MIDDLE_BLOCK]] ], [ [[TMP24]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label %[[BB27]]
; CHECK: [[BB27]]:
; CHECK-NEXT: [[TMP28:%.*]] = phi i64 [ 0, [[TMP2:%.*]] ], [ [[TMP26]], %[[BB25]] ]
; CHECK-NEXT: ret i64 [[TMP28]]
; CHECK: [[BB29]]:
; CHECK-NEXT: [[TMP30:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[TMP36:%.*]], %[[BB29]] ]
; CHECK-NEXT: [[TMP31:%.*]] = phi i64 [ [[BC_MERGE_RDX17]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[TMP35]], %[[BB29]] ]
; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP0]], i64 [[TMP30]]
; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4
; CHECK-NEXT: [[TMP34:%.*]] = sext i32 [[TMP33]] to i64
; CHECK-NEXT: [[TMP35]] = add i64 [[TMP31]], [[TMP34]]
; CHECK-NEXT: [[TMP36]] = add nuw nsw i64 [[TMP30]], 1
; CHECK-NEXT: [[TMP37:%.*]] = icmp eq i64 [[TMP36]], [[TMP4]]
; CHECK-NEXT: br i1 [[TMP37]], label %[[BB25]], label %[[BB29]], !llvm.loop [[LOOP5:![0-9]+]]
;
%3 = icmp sgt i32 %1, 0
Contributor:
Could you rename some of the labels and variables to have more meaningful names?

Contributor:
I think you can simplify the CFG here a little by removing the check for sgt 0.

br i1 %3, label %4, label %8

4: ; preds = %2
%5 = zext nneg i32 %1 to i64
Contributor:
Could you just pass in an i64 as %1 instead? That way you can remove the zext. I don't think the test really relies upon it.

br label %10

6: ; preds = %10
%7 = phi i64 [ %16, %10 ]
br label %8

8: ; preds = %6, %2
%9 = phi i64 [ 0, %2 ], [ %7, %6 ]
ret i64 %9

10: ; preds = %4, %10
%11 = phi i64 [ 0, %4 ], [ %17, %10 ]
%12 = phi i64 [ 0, %4 ], [ %16, %10 ]
%13 = getelementptr inbounds nuw i32, ptr %0, i64 %11
%14 = load i32, ptr %13, align 4
%15 = sext i32 %14 to i64
%16 = add i64 %12, %15
%17 = add nuw nsw i64 %11, 1
%18 = icmp eq i64 %17, %5
br i1 %18, label %6, label %10
}
;.
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
; CHECK: [[PROF3]] = !{!"branch_weights", i32 4, i32 12}
; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
;.
Comment on lines +130 to +136
Collaborator:
You can add --check-globals none to the update script to avoid these being added.
