Commit 92e7821

[LV] Choose best reduction for VPlan
The way partial reductions currently work is as follows:

* Reductions are analysed to see if they are suitable as partial reductions, and if so a VPlan is constructed with partial reductions.
* When creating VPExpressions, the LV tries to see if it is beneficial to bundle the operation into a VPExpression. If the cost of a partial reduction is too high, the answer is 'no' and it remains unbundled. This means the LV may end up calculating too high a cost for a partial-reduction VPlan, because it still includes the cost of the extends.
* When the cost of a VPlan with partial reductions is higher than that of a VPlan without partial reductions, the LV favours the plan without partial reductions. But this is often a plan with a lower VF, because partial reductions get the extends for free (and to do this for a full vector, a higher VF would be needed).
* This means that if the cost of a partial reduction is too high, the LV picks a lower VF rather than falling back onto a regular reduction (possibly with the same VF).

This PR is a workaround and not the full solution, but there are so many things to unpick with partial reductions that I think this is a good intermediary step before changing how we create partial-reduction VPlans. The better solution would be to defer the decision on which style of reduction to choose until the cost of the VPExpressions is known, since that analysis also determines what kind of expression it is and whether the extends can be folded into the operation.

This aims to address the issue reported in #165226.
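To illustrate the trade-off the message describes (a sketch with made-up types, not code from this commit): a partial reduction folds a wide, extended input into a narrower accumulator, so the extends come for free at a high VF, whereas a regular reduction needs an accumulator as wide as the extended input (or a lower VF):

; Partial reduction: a <16 x i8> input is extended and folded into a
; <4 x i32> accumulator (e.g. via a dot-product on AArch64).
%in.ext   = sext <16 x i8> %in to <16 x i32>
%acc.next = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %in.ext)

; Regular reduction of the same input: the accumulator has to be as wide as
; the extended input, with one horizontal add left for the middle block.
%acc.wide.next = add <16 x i32> %acc.wide, %in.ext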
1 parent 69c8231 commit 92e7821

6 files changed (+241, -34 lines)


llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 12 additions & 2 deletions
@@ -5679,6 +5679,18 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
   if (CostKind != TTI::TCK_RecipThroughput)
     return Invalid;
 
+  unsigned Ratio =
+      AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
+
+  // A ratio of 1 would mean it's similar to a regular add, e.g.
+  //   v4i64 partial.reduce(v4i64 %acc, v4i64 %vec)
+  //   <=> add v4i64 %acc, %vec
+  if (Ratio == 1) {
+    auto *T = VectorType::get(AccumType, VF);
+    return getArithmeticInstrCost(Opcode, T, CostKind) +
+           (BinOp ? getArithmeticInstrCost(*BinOp, T, CostKind) : 0);
+  }
+
   if (VF.isFixed() && !ST->isSVEorStreamingSVEAvailable() &&
       (!ST->isNeonAvailable() || !ST->hasDotProd()))
     return Invalid;
@@ -5700,8 +5712,6 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
   if (IsUSDot && !ST->hasMatMulInt8())
     return Invalid;
 
-  unsigned Ratio =
-      AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
   if (VF.getKnownMinValue() <= Ratio)
     return Invalid;
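For reference (illustrative types, not taken from the patch): Ratio is the element-width ratio between the accumulator and the pre-extension input, i.e. how many input lanes fold into each accumulator lane. With equal element widths the ratio is 1 and the partial reduction degenerates into an ordinary vector add, which the code above now costs as such:

; Ratio = 4: i8 input lanes accumulated into i32 accumulator lanes (32 / 8).
%wide     = sext <16 x i8> %a to <16 x i32>
%acc.next = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %wide)

; Ratio = 1: same element width on both sides, equivalent to a plain add.
%acc.next1 = call <4 x i64> @llvm.vector.partial.reduce.add.v4i64.v4i64(<4 x i64> %acc64, <4 x i64> %vec64)
%acc.next2 = add <4 x i64> %acc64, %vec64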

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 2 additions & 0 deletions
@@ -2378,6 +2378,8 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
   /// Get the factor that the VF of this recipe's output should be scaled by.
   unsigned getVFScaleFactor() const { return VFScaleFactor; }
 
+  void setVFScaleFactor(unsigned F) { VFScaleFactor = F; }
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 63 additions & 4 deletions
@@ -40,6 +40,8 @@
 using namespace llvm;
 using namespace VPlanPatternMatch;
 
+#define DEBUG_TYPE "loop-vectorize"
+
 static cl::opt<bool> EnableWideActiveLaneMask(
     "enable-wide-lane-mask", cl::init(false), cl::Hidden,
     cl::desc("Enable use of wide get active lane mask instructions"));
@@ -3761,7 +3763,7 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
 
 /// This function tries to create abstract recipes from the reduction recipe for
 /// following optimizations and cost estimation.
-static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
+static bool tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
                                                VPCostContext &Ctx,
                                                VFRange &Range) {
   VPExpressionRecipe *AbstractR = nullptr;
@@ -3773,19 +3775,76 @@ static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red,
     AbstractR = ExtRed;
   // Cannot create abstract inloop reduction recipes.
   if (!AbstractR)
-    return;
+    return false;
 
   AbstractR->insertBefore(*VPBB, IP);
   Red->replaceAllUsesWith(AbstractR);
+  return true;
+}
+
+/// Lower a partial reduction back to a regular reduction, by
+/// changing the in-loop partial reduction to a binop and removing
+/// the scale factor from the PHI node.
+static void lowerPartialReduction(VPlan &Plan, VPPartialReductionRecipe *Red,
+                                  VPCostContext &Ctx) {
+  VPRecipeBase *Acc = Red->getChainOp()->getDefiningRecipe();
+  if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Acc)) {
+    PhiR->setVFScaleFactor(1);
+
+    // We also need to update the scale factor of the reduction-start-vector
+    // operand.
+    VPValue *StartV, *IdentityV;
+    if (!match(PhiR->getOperand(0),
+               m_VPInstruction<VPInstruction::ReductionStartVector>(
+                   m_VPValue(StartV), m_VPValue(IdentityV), m_VPValue())))
+      llvm_unreachable("Unexpected operand for a partial reduction");
+    Type *I32Ty = IntegerType::getInt32Ty(Plan.getContext());
+    auto *ScaleFactorVPV = Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, 1));
+    cast<VPInstruction>(PhiR->getOperand(0))->setOperand(2, ScaleFactorVPV);
+  }
+
+  if (auto *R = dyn_cast<VPPartialReductionRecipe>(Acc))
+    if (R->getVFScaleFactor() != 1)
+      lowerPartialReduction(Plan, R, Ctx);
+
+  LLVM_DEBUG(
+      dbgs() << "LV: Lowering " << *Red
+             << " back to regular reduction, because it is not profitable\n");
+
+  // Lower the partial reduction to a regular binop.
+  VPBuilder Builder(Red);
+  VPInstruction *Add = Builder.createNaryOp(
+      RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
+      {Red->getChainOp(), Red->getVecOp()});
+  if (Red->isConditional())
+    Add = Builder.createSelect(Red->getCondOp(), Add, Red->getChainOp());
+
+  Red->replaceAllUsesWith(Add);
+  Red->eraseFromParent();
 }
 
 void VPlanTransforms::convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx,
                                                VFRange &Range) {
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
            vp_depth_first_deep(Plan.getVectorLoopRegion()))) {
     for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
-      if (auto *Red = dyn_cast<VPReductionRecipe>(&R))
-        tryToCreateAbstractReductionRecipe(Red, Ctx, Range);
+      auto *Red = dyn_cast<VPReductionRecipe>(&R);
+      if (!Red)
+        continue;
+
+      if (!tryToCreateAbstractReductionRecipe(Red, Ctx, Range) &&
+          isa<VPPartialReductionRecipe>(Red)) {
+        // If there isn't a profitable VPExpression for a partial reduction,
+        // then that suggests using a partial reduction is not profitable
+        // for this VPlan. It seems better to resort to a regular (middle-block)
+        // reduction, so that this plan is still profitable to consider.
+        // Otherwise, the plan might be discarded in favour of a smaller VF.
+        //
+        // FIXME: There's a lot to unpick when it comes to partial
+        // reductions, but this should provide a temporary stop-gap until we
+        // reimplement the logic for creating partial reductions.
+        lowerPartialReduction(Plan, cast<VPPartialReductionRecipe>(Red), Ctx);
+      }
     }
   }
 }
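A condensed sketch of the effect of lowerPartialReduction on the emitted IR (mirroring the first test update below; value names are illustrative): the scaled-down accumulator and the partial.reduce intrinsic become a full-width accumulator and a plain add, leaving only the final horizontal reduction in the middle block:

; Before: the accumulator has VF/2 lanes and is fed by a partial reduction.
%vec.phi     = phi <4 x i64> [ zeroinitializer, %vector.ph ], [ %partial.rdx, %vector.body ]
%partial.rdx = call <4 x i64> @llvm.vector.partial.reduce.add.v4i64.v8i64(<4 x i64> %vec.phi, <8 x i64> %ext)
%rdx         = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %partial.rdx)

; After: full-width accumulator and a regular binop inside the loop.
%vec.phi = phi <8 x i64> [ zeroinitializer, %vector.ph ], [ %sum, %vector.body ]
%sum     = add <8 x i64> %vec.phi, %ext
%rdx     = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %sum)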

llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll

Lines changed: 8 additions & 8 deletions
@@ -482,29 +482,29 @@ define i64 @partial_reduction_mul_two_users(i64 %n, ptr %a, i16 %b, i32 %c) {
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT]] to <8 x i32>
 ; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i32> [[TMP1]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i32> [[TMP2]] to <8 x i64>
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[A]], align 2
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i64> @llvm.vector.partial.reduce.add.v4i64.v8i64(<4 x i64> [[VEC_PHI]], <8 x i64> [[TMP3]])
+; CHECK-NEXT: [[TMP8]] = add <8 x i64> [[VEC_PHI]], [[TMP3]]
 ; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32>
 ; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i32> [[TMP5]] to <8 x i64>
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[PARTIAL_REDUCE]])
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP8]])
 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP6]], i32 7
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK: [[SCALAR_PH]]:
 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT: br label %[[LOOP:.*]]
 ; CHECK: [[LOOP]]:
 ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -520,9 +520,9 @@ define i64 @partial_reduction_mul_two_users(i64 %n, ptr %a, i16 %b, i32 %c) {
 ; CHECK-NEXT: [[LOAD_EXT:%.*]] = sext i16 [[LOAD]] to i32
 ; CHECK-NEXT: [[LOAD_EXT_EXT]] = sext i32 [[LOAD_EXT]] to i64
 ; CHECK-NEXT: [[EXITCOND740_NOT:%.*]] = icmp eq i64 [[IV]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND740_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND740_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], %[[LOOP]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: ret i64 [[ADD_LCSSA]]
 ;
 entry:
Lines changed: 136 additions & 0 deletions
@@ -0,0 +1,136 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mcpu=grace -passes=loop-vectorize -mtriple=aarch64 < %s | FileCheck %s
+target triple = "aarch64"
+
+; Check that a partial reduction is reverted back to a regular reduction,
+; so that we compare "the VPlan with the best kind of reduction for <range>"
+; vs "the VPlan with the best kind of reduction for <other range>",
+
+; Function Attrs: nofree norecurse nosync nounwind memory(argmem: read) uwtable vscale_range(1,16)
+define dso_local i64 @foo(ptr noundef readonly captures(none) %0, i32 noundef %1) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local i64 @foo(
+; CHECK-SAME: ptr noundef readonly captures(none) [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[TMP3]], label %[[ITER_CHECK:.*]], label %[[BB27:.*]]
+; CHECK: [[ITER_CHECK]]:
+; CHECK-NEXT: [[TMP4:%.*]] = zext nneg i32 [[TMP1]] to i64
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP4]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
+; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP4]], 16
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 16
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP0]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i32 4
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i32 8
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i32 12
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
+; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP6]], align 4
+; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
+; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = sext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-NEXT: [[TMP10:%.*]] = sext <4 x i32> [[WIDE_LOAD5]] to <4 x i64>
+; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i32> [[WIDE_LOAD6]] to <4 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = sext <4 x i32> [[WIDE_LOAD7]] to <4 x i64>
+; CHECK-NEXT: [[TMP13]] = add <4 x i64> [[VEC_PHI]], [[TMP9]]
+; CHECK-NEXT: [[TMP14]] = add <4 x i64> [[VEC_PHI2]], [[TMP10]]
+; CHECK-NEXT: [[TMP15]] = add <4 x i64> [[VEC_PHI3]], [[TMP11]]
+; CHECK-NEXT: [[TMP16]] = add <4 x i64> [[VEC_PHI4]], [[TMP12]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i64> [[TMP14]], [[TMP13]]
+; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <4 x i64> [[TMP15]], [[BIN_RDX]]
+; CHECK-NEXT: [[BIN_RDX9:%.*]] = add <4 x i64> [[TMP16]], [[BIN_RDX8]]
+; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[BIN_RDX9]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[BB25:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
+; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
+; CHECK: [[VEC_EPILOG_PH]]:
+; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP18]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[N_MOD_VF10:%.*]] = urem i64 [[TMP4]], 4
+; CHECK-NEXT: [[N_VEC11:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF10]]
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI13:%.*]] = phi <4 x i64> [ [[TMP19]], %[[VEC_EPILOG_PH]] ], [ [[TMP22:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP0]], i64 [[INDEX12]]
+; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x i32>, ptr [[TMP20]], align 4
+; CHECK-NEXT: [[TMP21:%.*]] = sext <4 x i32> [[WIDE_LOAD14]] to <4 x i64>
+; CHECK-NEXT: [[TMP22]] = add <4 x i64> [[VEC_PHI13]], [[TMP21]]
+; CHECK-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX12]], 4
+; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC11]]
+; CHECK-NEXT: br i1 [[TMP23]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP22]])
+; CHECK-NEXT: [[CMP_N16:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC11]]
+; CHECK-NEXT: br i1 [[CMP_N16]], label %[[BB25]], label %[[VEC_EPILOG_SCALAR_PH]]
+; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX17:%.*]] = phi i64 [ [[TMP24]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP18]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT: br label %[[BB29:.*]]
+; CHECK: [[BB25]]:
+; CHECK-NEXT: [[TMP26:%.*]] = phi i64 [ [[TMP35:%.*]], %[[BB29]] ], [ [[TMP18]], %[[MIDDLE_BLOCK]] ], [ [[TMP24]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br label %[[BB27]]
+; CHECK: [[BB27]]:
+; CHECK-NEXT: [[TMP28:%.*]] = phi i64 [ 0, [[TMP2:%.*]] ], [ [[TMP26]], %[[BB25]] ]
+; CHECK-NEXT: ret i64 [[TMP28]]
+; CHECK: [[BB29]]:
+; CHECK-NEXT: [[TMP30:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[TMP36:%.*]], %[[BB29]] ]
+; CHECK-NEXT: [[TMP31:%.*]] = phi i64 [ [[BC_MERGE_RDX17]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[TMP35]], %[[BB29]] ]
+; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP0]], i64 [[TMP30]]
+; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4
+; CHECK-NEXT: [[TMP34:%.*]] = sext i32 [[TMP33]] to i64
+; CHECK-NEXT: [[TMP35]] = add i64 [[TMP31]], [[TMP34]]
+; CHECK-NEXT: [[TMP36]] = add nuw nsw i64 [[TMP30]], 1
+; CHECK-NEXT: [[TMP37:%.*]] = icmp eq i64 [[TMP36]], [[TMP4]]
+; CHECK-NEXT: br i1 [[TMP37]], label %[[BB25]], label %[[BB29]], !llvm.loop [[LOOP5:![0-9]+]]
+;
+  %3 = icmp sgt i32 %1, 0
+  br i1 %3, label %4, label %8
+
+4:                                                ; preds = %2
+  %5 = zext nneg i32 %1 to i64
+  br label %10
+
+6:                                                ; preds = %10
+  %7 = phi i64 [ %16, %10 ]
+  br label %8
+
+8:                                                ; preds = %6, %2
+  %9 = phi i64 [ 0, %2 ], [ %7, %6 ]
+  ret i64 %9
+
+10:                                               ; preds = %4, %10
+  %11 = phi i64 [ 0, %4 ], [ %17, %10 ]
+  %12 = phi i64 [ 0, %4 ], [ %16, %10 ]
+  %13 = getelementptr inbounds nuw i32, ptr %0, i64 %11
+  %14 = load i32, ptr %13, align 4
+  %15 = sext i32 %14 to i64
+  %16 = add i64 %12, %15
+  %17 = add nuw nsw i64 %11, 1
+  %18 = icmp eq i64 %17, %5
+  br i1 %18, label %6, label %10
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[PROF3]] = !{!"branch_weights", i32 4, i32 12}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+;.
