Commit 92e7821

[LV] Choose best reduction for VPlan
The way partial reductions currently work is as follows:

* Reductions are analysed to see if they are suitable as partial reductions, and if so a VPlan is constructed with partial reductions.
* When creating VPExpressions, the LV tries to see if it is beneficial to bundle the operation into a VPExpression. If the cost of a partial reduction is too high, the answer is 'no' and it remains unbundled. This means the LV may end up calculating too high a cost for a partial-reduction VPlan, because it still includes the cost of the extends.
* When the cost of a VPlan with partial reductions is higher than that of a VPlan without partial reductions, the LV favours the plan without partial reductions. But this is often a plan with a lower VF, because partial reductions get the extends for free (and to do this for a full vector, a higher VF would be needed).
* This means that if the cost of a partial reduction is too high, the LV picks a lower VF rather than falling back onto a regular reduction (possibly with the same VF).

This PR is a workaround and not the full solution, but there are so many things to unpick with partial reductions that I think this is a good intermediary step before changing how we create partial-reduction VPlans. The better solution would be to defer the decision on which style of reduction to choose until the cost of the VPExpressions is known, since that analysis also determines what kind of expression it is and whether the extends can be folded into the operation.

This aims to address the issue reported in #165226.
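To illustrate the trade-off the message describes (a sketch with made-up types, not code from this commit): a partial reduction folds a wide, extended input into a narrower accumulator, so the extends come for free at a high VF, whereas a regular reduction needs an accumulator as wide as the extended input (or a lower VF):

; Partial reduction: a <16 x i8> input is extended and folded into a
; <4 x i32> accumulator (e.g. via a dot-product on AArch64).
%in.ext   = sext <16 x i8> %in to <16 x i32>
%acc.next = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %in.ext)

; Regular reduction of the same input: the accumulator has to be as wide as
; the extended input, with one horizontal add left for the middle block.
%acc.wide.next = add <16 x i32> %acc.wide, %in.ext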
1 parent 69c8231 commit 92e7821

6 files changed (+241, -34 lines)


llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 12 additions & 2 deletions
@@ -5679,6 +5679,18 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
   if (CostKind != TTI::TCK_RecipThroughput)
     return Invalid;
 
+  unsigned Ratio =
+      AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
+
+  // A ratio of 1 would mean it's similar to a regular add, e.g.
+  //   v4i64 partial.reduce(v4i64 %acc, v4i64 %vec)
+  //   <=> add v4i64 %acc, %vec
+  if (Ratio == 1) {
+    auto *T = VectorType::get(AccumType, VF);
+    return getArithmeticInstrCost(Opcode, T, CostKind) +
+           (BinOp ? getArithmeticInstrCost(*BinOp, T, CostKind) : 0);
+  }
+
   if (VF.isFixed() && !ST->isSVEorStreamingSVEAvailable() &&
       (!ST->isNeonAvailable() || !ST->hasDotProd()))
     return Invalid;
@@ -5700,8 +5712,6 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
   if (IsUSDot && !ST->hasMatMulInt8())
     return Invalid;
 
-  unsigned Ratio =
-      AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
   if (VF.getKnownMinValue() <= Ratio)
     return Invalid;
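For reference (illustrative types, not taken from the patch): Ratio is the element-width ratio between the accumulator and the pre-extension input, i.e. how many input lanes fold into each accumulator lane. With equal element widths the ratio is 1 and the partial reduction degenerates into an ordinary vector add, which the code above now costs as such:

; Ratio = 4: i8 input lanes accumulated into i32 accumulator lanes (32 / 8).
%wide     = sext <16 x i8> %a to <16 x i32>
%acc.next = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %wide)

; Ratio = 1: same element width on both sides, equivalent to a plain add.
%acc.next1 = call <4 x i64> @llvm.vector.partial.reduce.add.v4i64.v4i64(<4 x i64> %acc64, <4 x i64> %vec64)
%acc.next2 = add <4 x i64> %acc64, %vec64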

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 2 additions & 0 deletions
@@ -2378,6 +2378,8 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
   /// Get the factor that the VF of this recipe's output should be scaled by.
   unsigned getVFScaleFactor() const { return VFScaleFactor; }
 
+  void setVFScaleFactor(unsigned F) { VFScaleFactor = F; }
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 63 additions & 4 deletions
@@ -40,6 +40,8 @@
 using namespace llvm;
 using namespace VPlanPatternMatch;
 
+#define DEBUG_TYPE "loop-vectorize"
+
 static cl::opt<bool> EnableWideActiveLaneMask(
     "enable-wide-lane-mask", cl::init(false), cl::Hidden,
     cl::desc("Enable use of wide get active lane mask instructions"));
@@ -3761,7 +3763,7 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
 
 /// This function tries to create abstract recipes from the reduction recipe for
 /// following optimizations and cost estimation.
-static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
+static bool tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
                                                VPCostContext &Ctx,
                                                VFRange &Range) {
   VPExpressionRecipe *AbstractR = nullptr;
@@ -3773,19 +3775,76 @@ static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
     AbstractR = ExtRed;
   // Cannot create abstract inloop reduction recipes.
   if (!AbstractR)
-    return;
+    return false;
 
   AbstractR->insertBefore(*VPBB, IP);
   Red->replaceAllUsesWith(AbstractR);
+  return true;
+}
+
+/// Lower a partial reduction back to a regular reduction, by
+/// changing the in-loop partial reduction to a binop and removing
+/// the scale factor from the PHI node.
+static void lowerPartialReduction(VPlan &Plan, VPPartialReductionRecipe *Red,
+                                  VPCostContext &Ctx) {
+  VPRecipeBase *Acc = Red->getChainOp()->getDefiningRecipe();
+  if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Acc)) {
+    PhiR->setVFScaleFactor(1);
+
+    // We also need to update the scale factor of the reduction-start-vector
+    // operand.
+    VPValue *StartV, *IdentityV;
+    if (!match(PhiR->getOperand(0),
+               m_VPInstruction<VPInstruction::ReductionStartVector>(
+                   m_VPValue(StartV), m_VPValue(IdentityV), m_VPValue())))
+      llvm_unreachable("Unexpected operand for a partial reduction");
+    Type *I32Ty = IntegerType::getInt32Ty(Plan.getContext());
+    auto *ScaleFactorVPV = Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, 1));
+    cast<VPInstruction>(PhiR->getOperand(0))->setOperand(2, ScaleFactorVPV);
+  }
+
+  if (auto *R = dyn_cast<VPPartialReductionRecipe>(Acc))
+    if (R->getVFScaleFactor() != 1)
+      lowerPartialReduction(Plan, R, Ctx);
+
+  LLVM_DEBUG(
+      dbgs() << "LV: Lowering " << *Red
+             << " back to regular reduction, because it is not profitable\n");
+
+  // Lower the partial reduction to a regular binop.
+  VPBuilder Builder(Red);
+  VPInstruction *Add = Builder.createNaryOp(
+      RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
+      {Red->getChainOp(), Red->getVecOp()});
+  if (Red->isConditional())
+    Add = Builder.createSelect(Red->getCondOp(), Add, Red->getChainOp());
+
+  Red->replaceAllUsesWith(Add);
+  Red->eraseFromParent();
 }
 
 void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
                                                VFRange &Range) {
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
            vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
-      if (auto *Red = dyn_cast<VPReductionRecipe>(&R))
-        tryToCreateAbstractReductionRecipe(Red, Ctx, Range);
+      auto *Red = dyn_cast<VPReductionRecipe>(&R);
+      if (!Red)
+        continue;
+
+      if (!tryToCreateAbstractReductionRecipe(Red, Ctx, Range) &&
+          isa<VPPartialReductionRecipe>(Red)) {
+        // If there isn't a profitable VPExpression for a partial reduction,
+        // then that suggests using a partial reduction is not profitable
+        // for this VPlan. It seems better to resort to a regular (middle-block)
+        // reduction, so that this plan is still profitable to consider.
+        // Otherwise, the plan might be discarded in favour of a smaller VF.
+        //
+        // FIXME: There's a lot to unpick when it comes to partial
+        // reductions, but this should provide a temporary stop-gap until we
+        // reimplement the logic for creating partial reductions.
+        lowerPartialReduction(Plan, cast<VPPartialReductionRecipe>(Red), Ctx);
+      }
     }
   }
 }
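A condensed sketch of the effect of lowerPartialReduction on the emitted IR (mirroring the first test update below; value names are illustrative): the scaled-down accumulator and the partial.reduce intrinsic become a full-width accumulator and a plain add, leaving only the final horizontal reduction in the middle block:

; Before: the accumulator has VF/2 lanes and is fed by a partial reduction.
%vec.phi     = phi <4 x i64> [ zeroinitializer, %vector.ph ], [ %partial.rdx, %vector.body ]
%partial.rdx = call <4 x i64> @llvm.vector.partial.reduce.add.v4i64.v8i64(<4 x i64> %vec.phi, <8 x i64> %ext)
%rdx         = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %partial.rdx)

; After: full-width accumulator and a regular binop inside the loop.
%vec.phi = phi <8 x i64> [ zeroinitializer, %vector.ph ], [ %sum, %vector.body ]
%sum     = add <8 x i64> %vec.phi, %ext
%rdx     = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %sum)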

llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll

Lines changed: 8 additions & 8 deletions
@@ -482,29 +482,29 @@ define i64 @partial_reduction_mul_two_users(i64 %n, ptr %a, i16 %b, i32 %c) {
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT]] to <8 x i32>
 ; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i32> [[TMP1]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i32> [[TMP2]] to <8 x i64>
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[A]], align 2
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i64> @llvm.vector.partial.reduce.add.v4i64.v8i64(<4 x i64> [[VEC_PHI]], <8 x i64> [[TMP3]])
+; CHECK-NEXT: [[TMP8]] = add <8 x i64> [[VEC_PHI]], [[TMP3]]
 ; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32>
 ; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i32> [[TMP5]] to <8 x i64>
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[PARTIAL_REDUCE]])
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP8]])
 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP6]], i32 7
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK: [[SCALAR_PH]]:
 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT: br label %[[LOOP:.*]]
 ; CHECK: [[LOOP]]:
 ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -520,9 +520,9 @@ define i64 @partial_reduction_mul_two_users(i64 %n, ptr %a, i16 %b, i32 %c) {
 ; CHECK-NEXT: [[LOAD_EXT:%.*]] = sext i16 [[LOAD]] to i32
 ; CHECK-NEXT: [[LOAD_EXT_EXT]] = sext i32 [[LOAD_EXT]] to i64
 ; CHECK-NEXT: [[EXITCOND740_NOT:%.*]] = icmp eq i64 [[IV]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND740_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND740_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], %[[LOOP]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret i64 [[ADD_LCSSA]]
 ;
 entry:
Lines changed: 136 additions & 0 deletions
@@ -0,0 +1,136 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mcpu=grace -passes=loop-vectorize -mtriple=aarch64 < %s | FileCheck %s
+target triple = "aarch64"
+
+; Check that a partial reduction is reverted back to a regular reduction,
+; so that we compare "the VPlan with the best kind of reduction for <range>"
+; vs "the VPlan with the best kind of reduction for <other range>",
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable vscale_range(1,16)
+define dso_local i64 @foo(ptr noundef readonly captures(none) %0, i32 noundef %1) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local i64 @foo(
+; CHECK-SAME: ptr noundef readonly captures(none) [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[TMP3]], label %[[ITER_CHECK:.*]], label %[[BB27:.*]]
+; CHECK: [[ITER_CHECK]]:
+; CHECK-NEXT: [[TMP4:%.*]] = zext nneg i32 [[TMP1]] to i64
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP4]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
+; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP4]], 16
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 16
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP0]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i32 4
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i32 8
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i32 12
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
+; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
+; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-NEXT: [[TMP10:%.*]] = sext <4 x i32> [[WIDE_LOAD5]] to <4 x i64>
+; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i32> [[WIDE_LOAD6]] to <4 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = sext <4 x i32> [[WIDE_LOAD7]] to <4 x i64>
+; CHECK-NEXT: [[TMP13]] = add <4 x i64> [[VEC_PHI]], [[TMP9]]
+; CHECK-NEXT: [[TMP14]] = add <4 x i64> [[VEC_PHI2]], [[TMP10]]
+; CHECK-NEXT: [[TMP15]] = add <4 x i64> [[VEC_PHI3]], [[TMP11]]
+; CHECK-NEXT: [[TMP16]] = add <4 x i64> [[VEC_PHI4]], [[TMP12]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i64> [[TMP14]], [[TMP13]]
+; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <4 x i64> [[TMP15]], [[BIN_RDX]]
+; CHECK-NEXT: [[BIN_RDX9:%.*]] = add <4 x i64> [[TMP16]], [[BIN_RDX8]]
+; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[BIN_RDX9]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[BB25:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
+; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
+; CHECK: [[VEC_EPILOG_PH]]:
+; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP18]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[N_MOD_VF10:%.*]] = urem i64 [[TMP4]], 4
+; CHECK-NEXT: [[N_VEC11:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF10]]
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI13:%.*]] = phi <4 x i64> [ [[TMP19]], %[[VEC_EPILOG_PH]] ], [ [[TMP22:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP0]], i64 [[INDEX12]]
+; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i32>, ptr [[TMP20]], align 4
+; CHECK-NEXT: [[TMP21:%.*]] = sext <4 x i32> [[WIDE_LOAD14]] to <4 x i64>
+; CHECK-NEXT: [[TMP22]] = add <4 x i64> [[VEC_PHI13]], [[TMP21]]
+; CHECK-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX12]], 4
+; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC11]]
+; CHECK-NEXT: br i1 [[TMP23]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP22]])
+; CHECK-NEXT: [[CMP_N16:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC11]]
+; CHECK-NEXT: br i1 [[CMP_N16]], label %[[BB25]], label %[[VEC_EPILOG_SCALAR_PH]]
+; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX17:%.*]] = phi i64 [ [[TMP24]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP18]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT: br label %[[BB29:.*]]
+; CHECK: [[BB25]]:
+; CHECK-NEXT: [[TMP26:%.*]] = phi i64 [ [[TMP35:%.*]], %[[BB29]] ], [ [[TMP18]], %[[MIDDLE_BLOCK]] ], [ [[TMP24]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br label %[[BB27]]
+; CHECK: [[BB27]]:
+; CHECK-NEXT: [[TMP28:%.*]] = phi i64 [ 0, [[TMP2:%.*]] ], [ [[TMP26]], %[[BB25]] ]
+; CHECK-NEXT: ret i64 [[TMP28]]
+; CHECK: [[BB29]]:
+; CHECK-NEXT: [[TMP30:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[TMP36:%.*]], %[[BB29]] ]
+; CHECK-NEXT: [[TMP31:%.*]] = phi i64 [ [[BC_MERGE_RDX17]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[TMP35]], %[[BB29]] ]
+; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP0]], i64 [[TMP30]]
+; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4
+; CHECK-NEXT: [[TMP34:%.*]] = sext i32 [[TMP33]] to i64
+; CHECK-NEXT: [[TMP35]] = add i64 [[TMP31]], [[TMP34]]
+; CHECK-NEXT: [[TMP36]] = add nuw nsw i64 [[TMP30]], 1
+; CHECK-NEXT: [[TMP37:%.*]] = icmp eq i64 [[TMP36]], [[TMP4]]
+; CHECK-NEXT: br i1 [[TMP37]], label %[[BB25]], label %[[BB29]], !llvm.loop [[LOOP5:![0-9]+]]
+;
+  %3 = icmp sgt i32 %1, 0
+  br i1 %3, label %4, label %8
+
+4:                                                ; preds = %2
+  %5 = zext nneg i32 %1 to i64
+  br label %10
+
+6:                                                ; preds = %10
+  %7 = phi i64 [ %16, %10 ]
+  br label %8
+
+8:                                                ; preds = %6, %2
+  %9 = phi i64 [ 0, %2 ], [ %7, %6 ]
+  ret i64 %9
+
+10:                                               ; preds = %4, %10
+  %11 = phi i64 [ 0, %4 ], [ %17, %10 ]
+  %12 = phi i64 [ 0, %4 ], [ %16, %10 ]
+  %13 = getelementptr inbounds nuw i32, ptr %0, i64 %11
+  %14 = load i32, ptr %13, align 4
+  %15 = sext i32 %14 to i64
+  %16 = add i64 %12, %15
+  %17 = add nuw nsw i64 %11, 1
+  %18 = icmp eq i64 %17, %5
+  br i1 %18, label %6, label %10
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[PROF3]] = !{!"branch_weights", i32 4, i32 12}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+;.
