Skip to content

Commit 5c81432

Browse files
committed
[VPlan] Convert EVL loops to variable-length stepping after dissolution
Loop regions require fixed-length steps and rounded-up trip counts, but once dissolution has created explicit control flow, EVL loops can use variable-length stepping with the original trip count. This patch adds a post-dissolution transform pass that converts EVL loops from fixed-length to variable-length stepping.
1 parent d7a38a9 commit 5c81432

28 files changed

+290
-464
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7286,6 +7286,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
72867286
// Regions are dissolved after optimizing for VF and UF, which completely
72877287
// removes unneeded loop regions first.
72887288
VPlanTransforms::dissolveLoopRegions(BestVPlan);
7289+
// Enable variable-length stepping for EVL loops after regions are dissolved
7290+
VPlanTransforms::simplifyEVLIVs(BestVPlan);
72897291
// Perform the actual loop transformation.
72907292
VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
72917293
OrigLoop->getParentLoop(),

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 52 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2365,6 +2365,58 @@ bool VPlanTransforms::tryAddExplicitVectorLength(
23652365
return true;
23662366
}
23672367

2368+
/// Convert EVL loops in \p Plan to variable-length stepping.
///
/// Every VPBasicBlock reached by a shallow depth-first walk from the plan
/// entry is scanned for a VPEVLBasedIVPHIRecipe. For the first one found in a
/// block, that block is treated as the loop entry and:
///  * the EVL-based IV phi is replaced by a concrete scalar phi, and
///  * if the terminator of the block branching back to the entry is
///    (branch-on-count CanonicalIVInc, VectorTripCount), it is rewritten to
///    (branch-on-count EVLIVInc, TripCount), and the recipes defining
///    CanonicalIVInc and its first operand are erased.
void VPlanTransforms::simplifyEVLIVs(VPlan &Plan) {
  auto ConvertEVLPhi = [](VPlan &Plan, VPBasicBlock *Entry,
                          VPEVLBasedIVPHIRecipe *EVLPhi) {
    using namespace llvm::VPlanPatternMatch;
    VPValue *EVLIncrement = EVLPhi->getBackedgeValue();

    // Convert EVLPhi to a concrete scalar phi recipe. This happens
    // unconditionally, even if the latch rewrite below bails out.
    auto *ScalarR = VPBuilder(EVLPhi).createScalarPhi(
        {EVLPhi->getStartValue(), EVLIncrement}, EVLPhi->getDebugLoc(),
        "evl.based.iv");
    EVLPhi->replaceAllUsesWith(ScalarR);
    EVLPhi->eraseFromParent();

    // Find the latch-exiting block (the block that has Entry as a successor)
    // and convert it to variable-length stepping.
    // Before: (branch-on-count CanonicalIVInc, VectorTripCount)
    // After:  (branch-on-count EVLIVInc, TripCount)
    auto FindLatchExiting = [](VPBasicBlock *Entry) {
      auto Range =
          VPBlockUtils::blocksOnly<VPBasicBlock>(vp_depth_first_shallow(Entry));
      auto It = find_if(Range, [&](VPBasicBlock *VPBB) {
        return any_of(VPBB->successors(),
                      [&](VPBlockBase *Succ) { return Succ == Entry; });
      });
      return It != Range.end() ? *It : nullptr;
    };
    VPBasicBlock *LatchExiting = FindLatchExiting(Entry);
    assert(LatchExiting && "LatchExiting is not found");

    // Use dyn_cast_or_null rather than cast: cast<> asserts on a null or
    // non-VPInstruction terminator, which would make the null check below
    // unreachable instead of bailing out gracefully.
    auto *LatchExitingBr =
        dyn_cast_or_null<VPInstruction>(LatchExiting->getTerminator());
    VPValue *ScalarIVInc;
    if (!LatchExitingBr ||
        !match(LatchExitingBr,
               m_BranchOnCount(m_VPValue(ScalarIVInc),
                               m_Specific(&Plan.getVectorTripCount()))))
      return;

    // Compare the EVL-based increment against the original trip count.
    LatchExitingBr->setOperand(1, Plan.getTripCount());
    ScalarIVInc->replaceAllUsesWith(EVLIncrement);

    // The old increment and the recipe defining its first operand are erased;
    // both must be defined by recipes for the erasure below to be valid.
    VPRecipeBase *IVIncR = ScalarIVInc->getDefiningRecipe();
    assert(IVIncR && "IV increment must be defined by a recipe");
    VPRecipeBase *ScalarIV = IVIncR->getOperand(0)->getDefiningRecipe();
    assert(ScalarIV && "IV must be defined by a recipe");
    IVIncR->eraseFromParent();
    ScalarIV->eraseFromParent();
  };

  // Find EVL loop entries by locating VPEVLBasedIVPHIRecipe.
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           vp_depth_first_shallow(Plan.getEntry())))
    for (VPRecipeBase &R : VPBB->phis())
      if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
        ConvertEVLPhi(Plan, VPBB, PhiR);
        // ConvertEVLPhi erases phis of VPBB, so stop iterating this block's
        // phi range; the outer walk continues with the next block.
        break;
      }
}
2419+
23682420
void VPlanTransforms::dropPoisonGeneratingRecipes(
23692421
VPlan &Plan,
23702422
const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
@@ -2696,15 +2748,6 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
26962748
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
26972749
vp_depth_first_deep(Plan.getEntry()))) {
26982750
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2699-
if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
2700-
auto *ScalarR = VPBuilder(PhiR).createScalarPhi(
2701-
{PhiR->getStartValue(), PhiR->getBackedgeValue()},
2702-
PhiR->getDebugLoc(), "evl.based.iv");
2703-
PhiR->replaceAllUsesWith(ScalarR);
2704-
ToRemove.push_back(PhiR);
2705-
continue;
2706-
}
2707-
27082751
if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
27092752
expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
27102753
ToRemove.push_back(WidenIVR);

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,17 @@ struct VPlanTransforms {
209209
/// Replace loop regions with explicit CFG.
210210
static void dissolveLoopRegions(VPlan &Plan);
211211

212+
/// Transform EVL loops to use variable-length stepping after region
213+
/// dissolution.
214+
///
215+
/// Once loop regions are replaced with explicit CFG, EVL loops can step with
216+
/// variable vector lengths instead of fixed lengths. This transformation:
217+
/// * Replaces each EVL-based IV phi with a concrete scalar phi recipe.
218+
/// * Replaces fixed-length stepping (branch-on-count CanonicalIVInc,
219+
/// VectorTripCount) with variable-length stepping (branch-on-count
220+
/// EVLIVInc, TripCount).
221+
static void simplifyEVLIVs(VPlan &Plan);
222+
212223
/// Lower abstract recipes to concrete ones, that can be codegen'd. Use \p
213224
/// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis.
214225
static void convertToConcreteRecipes(VPlan &Plan, Type &CanonicalIVTy);

llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,9 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
192192
errs() << "EVL used by unexpected VPInstruction\n";
193193
return false;
194194
}
195-
if (I->getNumUsers() != 1) {
195+
// EVLIVIncrement is only used by EVLIV & BranchOnCount.
196+
// More than two is unexpected.
197+
if (I->getNumUsers() > 2) {
196198
errs() << "EVL is used in VPInstruction with multiple users\n";
197199
return false;
198200
}

llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ define void @test_wide_integer_induction(ptr noalias %a, i64 %N) {
2323
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP10]]
2424
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
2525
; CHECK: vector.body:
26-
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
2726
; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDEX_EVL_NEXT:%.*]], [[FOR_BODY]] ]
2827
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_BODY]] ]
2928
; CHECK-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
@@ -37,9 +36,8 @@ define void @test_wide_integer_induction(ptr noalias %a, i64 %N) {
3736
; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VEC_IND]], ptr align 8 [[TMP15]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
3837
; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP11]] to i64
3938
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]]
40-
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], [[TMP8]]
4139
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
42-
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
40+
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
4341
; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
4442
; CHECK: middle.block:
4543
; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]]

llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,6 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
132132
; IF-EVL-OUTLOOP-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 4
133133
; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
134134
; IF-EVL-OUTLOOP: vector.body:
135-
; IF-EVL-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
136135
; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
137136
; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
138137
; IF-EVL-OUTLOOP-NEXT: [[AVL:%.*]] = sub i32 [[N]], [[EVL_BASED_IV]]
@@ -144,8 +143,7 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
144143
; IF-EVL-OUTLOOP-NEXT: [[VP_OP:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP9]]
145144
; IF-EVL-OUTLOOP-NEXT: [[TMP10]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[VP_OP]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP5]])
146145
; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP5]], [[EVL_BASED_IV]]
147-
; IF-EVL-OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]]
148-
; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
146+
; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_EVL_NEXT]], [[N]]
149147
; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
150148
; IF-EVL-OUTLOOP: middle.block:
151149
; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP10]])
@@ -188,7 +186,6 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
188186
; IF-EVL-INLOOP-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 8
189187
; IF-EVL-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
190188
; IF-EVL-INLOOP: vector.body:
191-
; IF-EVL-INLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
192189
; IF-EVL-INLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
193190
; IF-EVL-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
194191
; IF-EVL-INLOOP-NEXT: [[TMP5:%.*]] = sub i32 [[N]], [[EVL_BASED_IV]]
@@ -200,8 +197,7 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
200197
; IF-EVL-INLOOP-NEXT: [[TMP10:%.*]] = call i32 @llvm.vp.reduce.add.nxv8i32(i32 0, <vscale x 8 x i32> [[TMP14]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP6]])
201198
; IF-EVL-INLOOP-NEXT: [[TMP11]] = add i32 [[TMP10]], [[VEC_PHI]]
202199
; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP6]], [[EVL_BASED_IV]]
203-
; IF-EVL-INLOOP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]]
204-
; IF-EVL-INLOOP-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
200+
; IF-EVL-INLOOP-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_EVL_NEXT]], [[N]]
205201
; IF-EVL-INLOOP-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
206202
; IF-EVL-INLOOP: middle.block:
207203
; IF-EVL-INLOOP-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
@@ -358,7 +354,6 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
358354
; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
359355
; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
360356
; IF-EVL-OUTLOOP: vector.body:
361-
; IF-EVL-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
362357
; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
363358
; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[BROADCAST_SPLAT]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
364359
; IF-EVL-OUTLOOP-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
@@ -371,8 +366,7 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
371366
; IF-EVL-OUTLOOP-NEXT: [[TMP15]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP14]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP9]])
372367
; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = zext i32 [[TMP9]] to i64
373368
; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]]
374-
; IF-EVL-OUTLOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
375-
; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
369+
; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
376370
; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
377371
; IF-EVL-OUTLOOP: middle.block:
378372
; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> [[TMP15]])
@@ -409,7 +403,6 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
409403
; IF-EVL-INLOOP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
410404
; IF-EVL-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
411405
; IF-EVL-INLOOP: vector.body:
412-
; IF-EVL-INLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
413406
; IF-EVL-INLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
414407
; IF-EVL-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ]
415408
; IF-EVL-INLOOP-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
@@ -421,9 +414,8 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
421414
; IF-EVL-INLOOP-NEXT: [[RDX_MINMAX]] = call i32 @llvm.smin.i32(i32 [[TMP13]], i32 [[VEC_PHI]])
422415
; IF-EVL-INLOOP-NEXT: [[TMP14:%.*]] = zext i32 [[TMP9]] to i64
423416
; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]]
424-
; IF-EVL-INLOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
425-
; IF-EVL-INLOOP-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
426-
; IF-EVL-INLOOP-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
417+
; IF-EVL-INLOOP-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
418+
; IF-EVL-INLOOP-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
427419
; IF-EVL-INLOOP: middle.block:
428420
; IF-EVL-INLOOP-NEXT: br label [[FOR_END:%.*]]
429421
; IF-EVL-INLOOP: scalar.ph:

0 commit comments

Comments
 (0)