Skip to content

Commit cc8c941

Browse files
authored
[VPlan] Convert EVL loops to variable-length stepping after dissolution (#147222)
Loop regions require fixed-length steps and rounded-up trip counts, but after dissolution creates explicit control flow, EVL loops can leverage variable-length stepping with original trip counts. This patch adds a post-dissolution transform pass to convert EVL loops from fixed-length to variable-length stepping.
1 parent 33e978f commit cc8c941

33 files changed

+397
-527
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7315,6 +7315,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
73157315
// Regions are dissolved after optimizing for VF and UF, which completely
73167316
// removes unneeded loop regions first.
73177317
VPlanTransforms::dissolveLoopRegions(BestVPlan);
7318+
// Canonicalize EVL loops after regions are dissolved.
7319+
VPlanTransforms::canonicalizeEVLLoops(BestVPlan);
73187320
// Perform the actual loop transformation.
73197321
VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
73207322
OrigLoop->getParentLoop(),

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 60 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2390,6 +2390,66 @@ bool VPlanTransforms::tryAddExplicitVectorLength(
23902390
return true;
23912391
}
23922392

2393+
void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
2394+
using namespace llvm::VPlanPatternMatch;
2395+
// Find EVL loop entries by locating VPEVLBasedIVPHIRecipe.
2396+
// There should be only one EVL PHI in the entire plan.
2397+
VPEVLBasedIVPHIRecipe *EVLPhi = nullptr;
2398+
2399+
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
2400+
vp_depth_first_shallow(Plan.getEntry())))
2401+
for (VPRecipeBase &R : VPBB->phis())
2402+
if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
2403+
assert(!EVLPhi && "Found multiple EVL PHIs. Only one expected");
2404+
EVLPhi = PhiR;
2405+
}
2406+
2407+
// Early return if no EVL PHI is found.
2408+
if (!EVLPhi)
2409+
return;
2410+
2411+
VPBasicBlock *HeaderVPBB = EVLPhi->getParent();
2412+
VPValue *EVLIncrement = EVLPhi->getBackedgeValue();
2413+
2414+
// Convert EVLPhi to concrete recipe.
2415+
auto *ScalarR =
2416+
VPBuilder(EVLPhi).createScalarPhi({EVLPhi->getStartValue(), EVLIncrement},
2417+
EVLPhi->getDebugLoc(), "evl.based.iv");
2418+
EVLPhi->replaceAllUsesWith(ScalarR);
2419+
EVLPhi->eraseFromParent();
2420+
2421+
// Replace CanonicalIVInc with EVL-PHI increment.
2422+
auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
2423+
VPValue *Backedge = CanonicalIV->getIncomingValue(1);
2424+
assert(match(Backedge,
2425+
m_c_Binary<Instruction::Add>(m_Specific(CanonicalIV),
2426+
m_Specific(&Plan.getVFxUF()))) &&
2427+
"Unexpected canonical iv");
2428+
Backedge->replaceAllUsesWith(EVLIncrement);
2429+
2430+
// Remove unused phi and increment.
2431+
VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
2432+
CanonicalIVIncrement->eraseFromParent();
2433+
CanonicalIV->eraseFromParent();
2434+
2435+
// Replace the use of VectorTripCount in the latch-exiting block.
2436+
// Before: (branch-on-count EVLIVInc, VectorTripCount)
2437+
// After: (branch-on-count EVLIVInc, TripCount)
2438+
2439+
VPBasicBlock *LatchExiting =
2440+
HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock();
2441+
auto *LatchExitingBr = cast<VPInstruction>(LatchExiting->getTerminator());
2442+
// Skip single-iteration loop region.
2443+
if (match(LatchExitingBr, m_BranchOnCond(m_True())))
2444+
return;
2445+
assert(LatchExitingBr &&
2446+
match(LatchExitingBr,
2447+
m_BranchOnCount(m_VPValue(EVLIncrement),
2448+
m_Specific(&Plan.getVectorTripCount()))) &&
2449+
"Unexpected terminator in EVL loop");
2450+
LatchExitingBr->setOperand(1, Plan.getTripCount());
2451+
}
2452+
23932453
void VPlanTransforms::dropPoisonGeneratingRecipes(
23942454
VPlan &Plan,
23952455
const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
@@ -2721,15 +2781,6 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
27212781
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
27222782
vp_depth_first_deep(Plan.getEntry()))) {
27232783
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2724-
if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
2725-
auto *ScalarR = VPBuilder(PhiR).createScalarPhi(
2726-
{PhiR->getStartValue(), PhiR->getBackedgeValue()},
2727-
PhiR->getDebugLoc(), "evl.based.iv");
2728-
PhiR->replaceAllUsesWith(ScalarR);
2729-
ToRemove.push_back(PhiR);
2730-
continue;
2731-
}
2732-
27332784
if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
27342785
expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
27352786
ToRemove.push_back(WidenIVR);

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,18 @@ struct VPlanTransforms {
209209
/// Replace loop regions with explicit CFG.
210210
static void dissolveLoopRegions(VPlan &Plan);
211211

212+
/// Transform EVL loops to use variable-length stepping after region
213+
/// dissolution.
214+
///
215+
/// Once loop regions are replaced with explicit CFG, EVL loops can step with
216+
/// variable vector lengths instead of fixed lengths. This transformation:
217+
/// * Makes EVL-Phi concrete.
218+
/// * Removes CanonicalIV and increment.
219+
/// * Replaces fixed-length stepping (branch-on-count CanonicalIVInc,
220+
/// VectorTripCount) with variable-length stepping (branch-on-count
221+
/// EVLIVInc, TripCount).
222+
static void canonicalizeEVLLoops(VPlan &Plan);
223+
212224
/// Lower abstract recipes to concrete ones, that can be codegen'd. Use \p
213225
/// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis.
214226
static void convertToConcreteRecipes(VPlan &Plan, Type &CanonicalIVTy);

llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include "VPlanCFG.h"
1818
#include "VPlanDominatorTree.h"
1919
#include "VPlanHelpers.h"
20+
#include "VPlanPatternMatch.h"
2021
#include "llvm/ADT/SmallPtrSet.h"
2122
#include "llvm/ADT/TypeSwitch.h"
2223

@@ -193,7 +194,13 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
193194
errs() << "EVL used by unexpected VPInstruction\n";
194195
return false;
195196
}
196-
if (I->getNumUsers() != 1) {
197+
// EVLIVIncrement is only used by EVLIV & BranchOnCount.
198+
// Having more than two users is unexpected.
199+
if ((I->getNumUsers() != 1) &&
200+
(I->getNumUsers() != 2 || none_of(I->users(), [&I](VPUser *U) {
201+
using namespace llvm::VPlanPatternMatch;
202+
return match(U, m_BranchOnCount(m_Specific(I), m_VPValue()));
203+
}))) {
197204
errs() << "EVL is used in VPInstruction with multiple users\n";
198205
return false;
199206
}

llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ define void @test_wide_integer_induction(ptr noalias %a, i64 %N) {
2323
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP10]]
2424
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
2525
; CHECK: vector.body:
26-
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
2726
; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDEX_EVL_NEXT:%.*]], [[FOR_BODY]] ]
2827
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_BODY]] ]
2928
; CHECK-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
@@ -36,9 +35,8 @@ define void @test_wide_integer_induction(ptr noalias %a, i64 %N) {
3635
; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VEC_IND]], ptr align 8 [[TMP14]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
3736
; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP11]] to i64
3837
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]]
39-
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], [[TMP8]]
4038
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
41-
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
39+
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
4240
; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
4341
; CHECK: middle.block:
4442
; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]]

llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,6 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
130130
; IF-EVL-OUTLOOP-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 4
131131
; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
132132
; IF-EVL-OUTLOOP: vector.body:
133-
; IF-EVL-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
134133
; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
135134
; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
136135
; IF-EVL-OUTLOOP-NEXT: [[AVL:%.*]] = sub i32 [[N]], [[EVL_BASED_IV]]
@@ -141,8 +140,7 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
141140
; IF-EVL-OUTLOOP-NEXT: [[VP_OP:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP9]]
142141
; IF-EVL-OUTLOOP-NEXT: [[TMP10]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[VP_OP]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP5]])
143142
; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP5]], [[EVL_BASED_IV]]
144-
; IF-EVL-OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]]
145-
; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
143+
; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_EVL_NEXT]], [[N]]
146144
; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
147145
; IF-EVL-OUTLOOP: middle.block:
148146
; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP10]])
@@ -185,7 +183,6 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
185183
; IF-EVL-INLOOP-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 8
186184
; IF-EVL-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
187185
; IF-EVL-INLOOP: vector.body:
188-
; IF-EVL-INLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
189186
; IF-EVL-INLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
190187
; IF-EVL-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
191188
; IF-EVL-INLOOP-NEXT: [[TMP5:%.*]] = sub i32 [[N]], [[EVL_BASED_IV]]
@@ -196,8 +193,7 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
196193
; IF-EVL-INLOOP-NEXT: [[TMP10:%.*]] = call i32 @llvm.vp.reduce.add.nxv8i32(i32 0, <vscale x 8 x i32> [[TMP14]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP6]])
197194
; IF-EVL-INLOOP-NEXT: [[TMP11]] = add i32 [[TMP10]], [[VEC_PHI]]
198195
; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP6]], [[EVL_BASED_IV]]
199-
; IF-EVL-INLOOP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]]
200-
; IF-EVL-INLOOP-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
196+
; IF-EVL-INLOOP-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_EVL_NEXT]], [[N]]
201197
; IF-EVL-INLOOP-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
202198
; IF-EVL-INLOOP: middle.block:
203199
; IF-EVL-INLOOP-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
@@ -352,7 +348,6 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
352348
; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
353349
; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
354350
; IF-EVL-OUTLOOP: vector.body:
355-
; IF-EVL-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
356351
; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
357352
; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[BROADCAST_SPLAT]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
358353
; IF-EVL-OUTLOOP-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
@@ -364,8 +359,7 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
364359
; IF-EVL-OUTLOOP-NEXT: [[TMP15]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP14]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP9]])
365360
; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = zext i32 [[TMP9]] to i64
366361
; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]]
367-
; IF-EVL-OUTLOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
368-
; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
362+
; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
369363
; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
370364
; IF-EVL-OUTLOOP: middle.block:
371365
; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> [[TMP15]])
@@ -402,7 +396,6 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
402396
; IF-EVL-INLOOP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
403397
; IF-EVL-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
404398
; IF-EVL-INLOOP: vector.body:
405-
; IF-EVL-INLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
406399
; IF-EVL-INLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
407400
; IF-EVL-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ]
408401
; IF-EVL-INLOOP-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
@@ -413,9 +406,8 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
413406
; IF-EVL-INLOOP-NEXT: [[RDX_MINMAX]] = call i32 @llvm.smin.i32(i32 [[TMP13]], i32 [[VEC_PHI]])
414407
; IF-EVL-INLOOP-NEXT: [[TMP14:%.*]] = zext i32 [[TMP9]] to i64
415408
; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]]
416-
; IF-EVL-INLOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
417-
; IF-EVL-INLOOP-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
418-
; IF-EVL-INLOOP-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
409+
; IF-EVL-INLOOP-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
410+
; IF-EVL-INLOOP-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
419411
; IF-EVL-INLOOP: middle.block:
420412
; IF-EVL-INLOOP-NEXT: br label [[FOR_END:%.*]]
421413
; IF-EVL-INLOOP: scalar.ph:

0 commit comments

Comments
 (0)