Skip to content

Commit df8da2f

Browse files
authored
[VPlan] Support VPWidenPointerInductionRecipes with EVL tail folding (#152110)
Now that VPWidenPointerInductionRecipes are modelled in VPlan (since #148274), we can support them in EVL tail folding. We need to replace their VFxUF operand with EVL, because the increment is not guaranteed to be VF on the penultimate iteration, and UF is always 1 with EVL tail folding. We also need to move the creation of the backedge value to the latch so that EVL dominates it. With this, converting a VPlan to EVL tail folding can no longer fail, so tryAddExplicitVectorLength is renamed to addExplicitVectorLength and no longer returns a success flag. With tail folding we now vectorize 99.4% of the loops on SPEC CPU 2017 that are vectorized without tail folding. The test in only-compute-cost-for-vplan-vfs.ll previously relied on widened pointer inductions with EVL tail folding to end up in a scenario with no vector VPlans, so this also replaces it with an unvectorizable fixed-order recurrence test from first-order-recurrence-multiply-recurrences.ll, whose vector VPlans also get discarded.
1 parent a196281 commit df8da2f

File tree

11 files changed

+108
-62
lines changed

11 files changed

+108
-62
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8451,11 +8451,9 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
84518451
*Plan, CM.getMinimalBitwidths());
84528452
VPlanTransforms::runPass(VPlanTransforms::optimize, *Plan);
84538453
// TODO: try to put it close to addActiveLaneMask().
8454-
// Discard the plan if it is not EVL-compatible
8455-
if (CM.foldTailWithEVL() && !HasScalarVF &&
8456-
!VPlanTransforms::runPass(VPlanTransforms::tryAddExplicitVectorLength,
8457-
*Plan, CM.getMaxSafeElements()))
8458-
break;
8454+
if (CM.foldTailWithEVL() && !HasScalarVF)
8455+
VPlanTransforms::runPass(VPlanTransforms::addExplicitVectorLength,
8456+
*Plan, CM.getMaxSafeElements());
84598457
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
84608458
VPlans.push_back(std::move(Plan));
84618459
}

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2185,6 +2185,21 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
21852185
"User of VF that we can't transform to EVL.");
21862186
Plan.getVF().replaceAllUsesWith(&EVL);
21872187

2188+
assert(all_of(Plan.getVFxUF().users(),
2189+
[&Plan](VPUser *U) {
2190+
return match(U, m_c_Binary<Instruction::Add>(
2191+
m_Specific(Plan.getCanonicalIV()),
2192+
m_Specific(&Plan.getVFxUF()))) ||
2193+
isa<VPWidenPointerInductionRecipe>(U);
2194+
}) &&
2195+
"Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
2196+
"increment of the canonical induction.");
2197+
Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2198+
// Only replace uses in VPWidenPointerInductionRecipe; the increment of the
2199+
// canonical induction must not be updated.
2200+
return isa<VPWidenPointerInductionRecipe>(U);
2201+
});
2202+
21882203
// Defer erasing recipes till the end so that we don't invalidate the
21892204
// VPTypeAnalysis cache.
21902205
SmallVector<VPRecipeBase *> ToErase;
@@ -2320,16 +2335,9 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
23202335
/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
23212336
/// ...
23222337
///
2323-
bool VPlanTransforms::tryAddExplicitVectorLength(
2338+
void VPlanTransforms::addExplicitVectorLength(
23242339
VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
23252340
VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
2326-
// The transform updates all users of inductions to work based on EVL, instead
2327-
// of the VF directly. At the moment, widened pointer inductions cannot be
2328-
// updated, so bail out if the plan contains any.
2329-
bool ContainsWidenPointerInductions =
2330-
any_of(Header->phis(), IsaPred<VPWidenPointerInductionRecipe>);
2331-
if (ContainsWidenPointerInductions)
2332-
return false;
23332341

23342342
auto *CanonicalIVPHI = Plan.getCanonicalIV();
23352343
auto *CanIVTy = CanonicalIVPHI->getScalarType();
@@ -2384,7 +2392,6 @@ bool VPlanTransforms::tryAddExplicitVectorLength(
23842392
CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
23852393
// TODO: support unroll factor > 1.
23862394
Plan.setUF(1);
2387-
return true;
23882395
}
23892396

23902397
void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
@@ -2808,13 +2815,12 @@ static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R,
28082815
R->replaceAllUsesWith(PtrAdd);
28092816

28102817
// Create the backedge value for the scalar pointer phi.
2811-
Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
2818+
VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
2819+
Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
28122820
VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
28132821
DL);
28142822
VPValue *Inc = Builder.createNaryOp(Instruction::Mul, {Step, VF});
28152823

2816-
VPBasicBlock *ExitingBB = Plan->getVectorLoopRegion()->getExitingBasicBlock();
2817-
Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
28182824
VPValue *InductionGEP =
28192825
Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
28202826
ScalarPtrPhi->addOperand(InductionGEP);

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -177,10 +177,9 @@ struct VPlanTransforms {
177177
/// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe.
178178
/// VPCanonicalIVPHIRecipe is only used to control the loop after
179179
/// this transformation.
180-
/// \returns true if the transformation succeeds, or false if it doesn't.
181-
static bool
182-
tryAddExplicitVectorLength(VPlan &Plan,
183-
const std::optional<unsigned> &MaxEVLSafeElements);
180+
static void
181+
addExplicitVectorLength(VPlan &Plan,
182+
const std::optional<unsigned> &MaxEVLSafeElements);
184183

185184
// For each Interleave Group in \p InterleaveGroups replace the Recipes
186185
// widening its memory instructions with a single VPInterleaveRecipe at its

llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
157157
return VerifyEVLUse(*S, S->getNumOperands() - 1);
158158
})
159159
.Case<VPWidenStoreEVLRecipe, VPReductionEVLRecipe,
160-
VPWidenIntOrFpInductionRecipe>(
160+
VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe>(
161161
[&](const VPRecipeBase *S) { return VerifyEVLUse(*S, 2); })
162162
.Case<VPScalarIVStepsRecipe>([&](auto *R) {
163163
if (R->getNumOperands() != 3) {

llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@ define void @pointer_induction_used_as_vector(ptr noalias %start.1, ptr noalias
3535
; CHECK: vector.body:
3636
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
3737
; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START_2]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
38-
; CHECK-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP6]]
3938
; CHECK-NEXT: [[TMP13:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
4039
; CHECK-NEXT: [[TMP15:%.*]] = mul <vscale x 2 x i64> [[TMP13]], splat (i64 1)
4140
; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 2 x i64> [[TMP15]]
@@ -48,6 +47,7 @@ define void @pointer_induction_used_as_vector(ptr noalias %start.1, ptr noalias
4847
; CHECK-NEXT: [[TMP20:%.*]] = add <vscale x 2 x i8> [[WIDE_LOAD]], splat (i8 1)
4948
; CHECK-NEXT: store <vscale x 2 x i8> [[TMP20]], ptr [[TMP18]], align 1
5049
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
50+
; CHECK-NEXT: [[TMP11:%.*]] = mul i64 1, [[TMP6]]
5151
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP11]]
5252
; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
5353
; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -119,7 +119,6 @@ define void @pointer_induction(ptr noalias %start, i64 %N) {
119119
; CHECK: vector.body:
120120
; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
121121
; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
122-
; CHECK-NEXT: [[TMP10:%.*]] = mul i64 1, [[TMP6]]
123122
; CHECK-NEXT: [[TMP12:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
124123
; CHECK-NEXT: [[TMP14:%.*]] = mul <vscale x 2 x i64> [[TMP12]], splat (i64 1)
125124
; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 2 x i64> [[TMP14]]
@@ -128,6 +127,7 @@ define void @pointer_induction(ptr noalias %start, i64 %N) {
128127
; CHECK-NEXT: [[TMP17:%.*]] = add <vscale x 2 x i8> [[WIDE_LOAD]], splat (i8 1)
129128
; CHECK-NEXT: store <vscale x 2 x i8> [[TMP17]], ptr [[TMP15]], align 1
130129
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX2]], [[TMP6]]
130+
; CHECK-NEXT: [[TMP10:%.*]] = mul i64 1, [[TMP6]]
131131
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP10]]
132132
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
133133
; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]

llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -239,7 +239,6 @@ define i32 @pointer_iv_mixed(ptr noalias %a, ptr noalias %b, i64 %n) #0 {
239239
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
240240
; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[A]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
241241
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
242-
; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP5]], 3
243242
; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
244243
; CHECK-NEXT: [[TMP10:%.*]] = shl <vscale x 2 x i64> [[TMP9]], splat (i64 2)
245244
; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 2 x i64> [[TMP10]]
@@ -250,6 +249,7 @@ define i32 @pointer_iv_mixed(ptr noalias %a, ptr noalias %b, i64 %n) #0 {
250249
; CHECK-NEXT: [[TMP12]] = add <vscale x 2 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
251250
; CHECK-NEXT: store <vscale x 2 x ptr> [[VECTOR_GEP]], ptr [[NEXT_GEP]], align 8
252251
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
252+
; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP5]], 3
253253
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP8]]
254254
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
255255
; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
@@ -313,14 +313,14 @@ define void @phi_used_in_vector_compare_and_scalar_indvar_update_and_store(ptr %
313313
; CHECK: vector.body:
314314
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
315315
; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[PTR:%.*]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
316-
; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP0]], 2
317316
; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
318317
; CHECK-NEXT: [[TMP5:%.*]] = shl <vscale x 2 x i64> [[TMP4]], splat (i64 1)
319318
; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 2 x i64> [[TMP5]]
320319
; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <vscale x 2 x ptr> [[VECTOR_GEP]], zeroinitializer
321320
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <vscale x 2 x ptr> [[VECTOR_GEP]], i64 0
322321
; CHECK-NEXT: call void @llvm.masked.store.nxv2i16.p0(<vscale x 2 x i16> zeroinitializer, ptr [[TMP7]], i32 2, <vscale x 2 x i1> [[TMP6]])
323322
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
323+
; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP0]], 2
324324
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP3]]
325325
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
326326
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]

llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll

Lines changed: 42 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -74,16 +74,50 @@ define void @test_wide_ptr_induction(ptr noalias %a, ptr noalias %b, i64 %N) {
7474
; CHECK-LABEL: define void @test_wide_ptr_induction(
7575
; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
7676
; CHECK-NEXT: entry:
77+
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
78+
; CHECK: vector.ph:
79+
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
80+
; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2
81+
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
82+
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP2]]
83+
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
84+
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
85+
; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
86+
; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2
7787
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
88+
; CHECK: vector.body:
89+
; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
90+
; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[B]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
91+
; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ]
92+
; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
93+
; CHECK-NEXT: [[TMP6:%.*]] = mul <vscale x 2 x i64> [[TMP5]], splat (i64 8)
94+
; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 2 x i64> [[TMP6]]
95+
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
96+
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[EVL_BASED_IV]]
97+
; CHECK-NEXT: call void @llvm.vp.store.nxv2p0.p0(<vscale x 2 x ptr> [[VECTOR_GEP]], ptr align 8 [[TMP8]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP7]])
98+
; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
99+
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[EVL_BASED_IV]]
100+
; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]]
101+
; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP7]] to i64
102+
; CHECK-NEXT: [[TMP11:%.*]] = mul i64 8, [[TMP10]]
103+
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP11]]
104+
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
105+
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
106+
; CHECK: middle.block:
107+
; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
108+
; CHECK: scalar.ph:
109+
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ]
110+
; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[B]], [[ENTRY]] ]
111+
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
78112
; CHECK: for.body:
79-
; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
80-
; CHECK-NEXT: [[ADDR:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[VECTOR_BODY]] ], [ [[B]], [[VECTOR_PH]] ]
113+
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
114+
; CHECK-NEXT: [[ADDR:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
81115
; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[ADDR]], i64 8
82-
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[EVL_BASED_IV]]
116+
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
83117
; CHECK-NEXT: store ptr [[ADDR]], ptr [[ARRAYIDX]], align 8
84-
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw nsw i64 [[EVL_BASED_IV]], 1
85-
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
86-
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
118+
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
119+
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
120+
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
87121
; CHECK: for.cond.cleanup:
88122
; CHECK-NEXT: ret void
89123
;
@@ -109,4 +143,6 @@ for.cond.cleanup:
109143
; CHECK: [[META2]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"}
110144
; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
111145
; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META3]], [[META1]]}
146+
; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]], [[META3]]}
147+
; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META3]], [[META1]]}
112148
;.
Lines changed: 25 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,36 @@
1-
; RUN: opt -passes=loop-vectorize \
2-
; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
3-
; RUN: -mtriple=riscv64 -mattr=+v -S -debug %s 2>&1 | FileCheck %s
1+
; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v -S -debug %s 2>&1 | FileCheck %s
42

53
; REQUIRES: asserts
64

7-
; Make sure we do not vectorize a loop with a widened pointer induction.
8-
define void @test_wide_pointer_induction(ptr noalias %a, i64 %N) {
5+
; For %for.1, we are fine initially, because the previous value %for.1.next dominates the
6+
; user of %for.1. But for %for.2, we have to sink the user (%for.1.next) past the previous
7+
; value %for.2.next. This however breaks the condition we have for %for.1. We cannot fix
8+
; both first order recurrences and cannot vectorize the loop.
9+
;
10+
; Make sure we don't compute costs if there are no vector VPlans.
11+
912
; CHECK-NOT: LV: Vector loop of width {{.+}} costs:
1013
;
11-
; CHECK: define void @test_wide_pointer_induction(
14+
; CHECK: define i32 @test(
1215
; CHECK-NOT: vector.body
1316
;
17+
define i32 @test(i32 %N) {
1418
entry:
15-
br label %loop
19+
br label %for.body
1620

17-
loop:
18-
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
19-
%iv.ptr = phi ptr [ %a, %entry ], [ %iv.ptr.next, %loop ]
20-
%arrayidx = getelementptr inbounds i64, ptr %a, i64 %iv
21-
store ptr %iv.ptr, ptr %arrayidx, align 8
22-
%iv.next = add nuw nsw i64 %iv, 1
23-
%iv.ptr.next = getelementptr i64, ptr %iv.ptr, i32 1
24-
%exitcond.not = icmp eq i64 %iv.next, %N
25-
br i1 %exitcond.not, label %exit, label %loop
21+
for.body: ; preds = %for.body.preheader, %for.body
22+
%iv = phi i32 [ %inc, %for.body ], [ 10, %entry ]
23+
%for.1 = phi i32 [ %for.1.next, %for.body ], [ 20, %entry ]
24+
%for.2 = phi i32 [ %for.2.next, %for.body ], [ 11, %entry ]
25+
%for.1.next = add nsw i32 %for.2, 1
26+
%for.2.next = shl i32 %for.1, 24
27+
%inc = add nsw i32 %iv, 1
28+
%exitcond = icmp eq i32 %inc, %N
29+
br i1 %exitcond, label %for.cond1.for.end_crit_edge, label %for.body
2630

27-
exit:
28-
ret void
31+
for.cond1.for.end_crit_edge: ; preds = %for.body
32+
%add.lcssa = phi i32 [ %for.1.next, %for.body ]
33+
%sext.lcssa = phi i32 [ %for.2.next, %for.body ]
34+
%res = add i32 %add.lcssa, %sext.lcssa
35+
ret i32 %res
2936
}

0 commit comments

Comments
 (0)