Skip to content

Commit c950a72

Browse files
committed
[VPlan] Support scalar VF for ExtractLane and FirstActiveLane.
Extend ExtractLane and FirstActiveLane to support scalar VFs. This allows correct handling when interleaving with VF = 1. Alive2 proofs: - Fixed codegen with this patch: https://alive2.llvm.org/ce/z/8Y5_Vc (verifies as correct) - Original codegen: https://alive2.llvm.org/ce/z/twdg3X (doesn't verify) Fixes llvm#154967.
1 parent 3cbbc07 commit c950a72

File tree

8 files changed

+83
-30
lines changed

8 files changed

+83
-30
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8634,8 +8634,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
86348634
return !CM.requiresScalarEpilogue(VF.isVector());
86358635
},
86368636
Range);
8637-
VPlanTransforms::handleEarlyExits(*Plan, Legal->hasUncountableEarlyExit(),
8638-
Range);
8637+
VPlanTransforms::handleEarlyExits(*Plan, Legal->hasUncountableEarlyExit());
86398638
VPlanTransforms::addMiddleCheck(*Plan, RequiresScalarEpilogueCheck,
86408639
CM.foldTailByMasking());
86418640

@@ -8926,7 +8925,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
89268925
OrigLoop, *LI, Legal->getWidestInductionType(),
89278926
getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE);
89288927
VPlanTransforms::handleEarlyExits(*Plan,
8929-
/*HasUncountableExit*/ false, Range);
8928+
/*HasUncountableExit*/ false);
89308929
VPlanTransforms::addMiddleCheck(*Plan, /*RequiresScalarEpilogue*/ true,
89318930
/*TailFolded*/ false);
89328931

llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -538,8 +538,7 @@ VPlanTransforms::buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy,
538538
}
539539

540540
void VPlanTransforms::handleEarlyExits(VPlan &Plan,
541-
bool HasUncountableEarlyExit,
542-
VFRange &Range) {
541+
bool HasUncountableEarlyExit) {
543542
auto *MiddleVPBB = cast<VPBasicBlock>(
544543
Plan.getScalarHeader()->getSinglePredecessor()->getPredecessors()[0]);
545544
auto *LatchVPBB = cast<VPBasicBlock>(MiddleVPBB->getSinglePredecessor());
@@ -559,8 +558,7 @@ void VPlanTransforms::handleEarlyExits(VPlan &Plan,
559558
assert(!HandledUncountableEarlyExit &&
560559
"can handle exactly one uncountable early exit");
561560
handleUncountableEarlyExit(cast<VPBasicBlock>(Pred), EB, Plan,
562-
cast<VPBasicBlock>(HeaderVPB), LatchVPBB,
563-
Range);
561+
cast<VPBasicBlock>(HeaderVPB), LatchVPBB);
564562
HandledUncountableEarlyExit = true;
565563
} else {
566564
for (VPRecipeBase &R : EB->phis())

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -919,8 +919,15 @@ Value *VPInstruction::generate(VPTransformState &State) {
919919
unsigned LastOpIdx = getNumOperands() - 1;
920920
Value *Res = nullptr;
921921
for (int Idx = LastOpIdx; Idx >= 0; --Idx) {
922-
Value *TrailingZeros = Builder.CreateCountTrailingZeroElems(
923-
Builder.getInt64Ty(), State.get(getOperand(Idx)), true, Name);
922+
Value *TrailingZeros =
923+
State.VF.isScalar()
924+
? Builder.CreateZExt(
925+
Builder.CreateICmpEQ(State.get(getOperand(Idx)),
926+
Builder.getFalse()),
927+
Builder.getInt64Ty())
928+
: Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(),
929+
State.get(getOperand(Idx)),
930+
true, Name);
924931
Value *Current = Builder.CreateAdd(
925932
Builder.CreateMul(RuntimeVF, Builder.getInt64(Idx)), TrailingZeros);
926933
if (Res) {
@@ -1029,6 +1036,12 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
10291036
switch (getOpcode()) {
10301037
case Instruction::ExtractElement:
10311038
case VPInstruction::ExtractLane: {
1039+
if (VF.isScalar()) {
1040+
// ExtractLane with VF=1 takes care of handling extracting across multiple
1041+
// parts.
1042+
return 0;
1043+
}
1044+
10321045
// Add on the cost of extracting the element.
10331046
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
10341047
return Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
@@ -1040,8 +1053,13 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
10401053
Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
10411054
}
10421055
case VPInstruction::FirstActiveLane: {
1056+
Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
1057+
if (VF.isScalar())
1058+
return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
1059+
CmpInst::makeCmpResultType(ScalarTy),
1060+
CmpInst::ICMP_EQ, Ctx.CostKind);
10431061
// Calculate the cost of determining the lane index.
1044-
auto *PredTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
1062+
auto *PredTy = toVectorTy(ScalarTy, VF);
10451063
IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
10461064
Type::getInt64Ty(Ctx.LLVMCtx),
10471065
{PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2983,9 +2983,11 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
29832983
R->eraseFromParent();
29842984
}
29852985

2986-
void VPlanTransforms::handleUncountableEarlyExit(
2987-
VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan,
2988-
VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB, VFRange &Range) {
2986+
void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
2987+
VPBasicBlock *EarlyExitVPBB,
2988+
VPlan &Plan,
2989+
VPBasicBlock *HeaderVPBB,
2990+
VPBasicBlock *LatchVPBB) {
29892991
VPBlockBase *MiddleVPBB = LatchVPBB->getSuccessors()[0];
29902992
if (!EarlyExitVPBB->getSinglePredecessor() &&
29912993
EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
@@ -3038,13 +3040,7 @@ void VPlanTransforms::handleUncountableEarlyExit(
30383040
}
30393041

30403042
VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
3041-
auto IsVector = [](ElementCount VF) { return VF.isVector(); };
3042-
// When the VFs are vectors, need to add `extract` to get the incoming value
3043-
// from early exit. When the range contains scalar VF, limit the range to
3044-
// scalar VF to prevent mis-compilation for the range containing both scalar
3045-
// and vector VFs.
3046-
if (!IncomingFromEarlyExit->isLiveIn() &&
3047-
LoopVectorizationPlanner::getDecisionAndClampRange(IsVector, Range)) {
3043+
if (!IncomingFromEarlyExit->isLiveIn()) {
30483044
// Update the incoming value from the early exit.
30493045
VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
30503046
VPInstruction::FirstActiveLane, {CondToEarlyExit}, nullptr,

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -70,8 +70,8 @@ struct VPlanTransforms {
7070
PredicatedScalarEvolution &PSE);
7171

7272
/// Update \p Plan to account for all early exits.
73-
LLVM_ABI_FOR_TEST static void
74-
handleEarlyExits(VPlan &Plan, bool HasUncountableExit, VFRange &Range);
73+
LLVM_ABI_FOR_TEST static void handleEarlyExits(VPlan &Plan,
74+
bool HasUncountableExit);
7575

7676
/// If a check is needed to guard executing the scalar epilogue loop, it will
7777
/// be added to the middle block.
@@ -207,8 +207,7 @@ struct VPlanTransforms {
207207
static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
208208
VPBasicBlock *EarlyExitVPBB,
209209
VPlan &Plan, VPBasicBlock *HeaderVPBB,
210-
VPBasicBlock *LatchVPBB,
211-
VFRange &Range);
210+
VPBasicBlock *LatchVPBB);
212211

213212
/// Replace loop regions with explicit CFG.
214213
static void dissolveLoopRegions(VPlan &Plan);

llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,8 @@ define i64 @vectorization_not_profitable_due_to_trunc(ptr dereferenceable(800) %
9494
; CHECK-LABEL: LV: Checking a loop in 'vectorization_not_profitable_due_to_trunc'
9595
; CHECK: LV: Selecting VF: 1.
9696
; CHECK-NEXT: Calculating cost of work in exit block vector.early.exit:
97+
; CHECK-NEXT: Cost of 1 for VF 1: EMIT vp<%first.active.lane> = first-active-lane ir<%t>
98+
; CHECK-NEXT: Cost of 0 for VF 1: EMIT vp<%early.exit.value> = extract-lane vp<%first.active.lane>, ir<%l>
9799
; CHECK-NEXT: LV: Vectorization is possible but not beneficial.
98100
entry:
99101
br label %loop.header

llvm/test/Transforms/LoopVectorize/single-early-exit-interleave-only.ll

Lines changed: 46 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,28 @@ define i8 @iv_used_in_exit_with_math(i8 noundef %g) {
3333
; CHECK: [[MIDDLE_BLOCK]]:
3434
; CHECK-NEXT: br label %[[RETURN:.*]]
3535
; CHECK: [[VECTOR_EARLY_EXIT]]:
36+
; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i1 [[TMP8]], false
37+
; CHECK-NEXT: [[TMP33:%.*]] = zext i1 [[TMP32]] to i64
38+
; CHECK-NEXT: [[TMP12:%.*]] = add i64 1, [[TMP33]]
39+
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i1 [[TMP7]], false
40+
; CHECK-NEXT: [[TMP14:%.*]] = zext i1 [[TMP13]] to i64
41+
; CHECK-NEXT: [[TMP15:%.*]] = add i64 0, [[TMP14]]
42+
; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP14]], 1
43+
; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i64 [[TMP15]], i64 [[TMP12]]
44+
; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
45+
; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[INDEX]], [[TMP18]]
46+
; CHECK-NEXT: [[TMP20:%.*]] = trunc i32 [[TMP19]] to i8
47+
; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i1 [[TMP8]], false
48+
; CHECK-NEXT: [[TMP22:%.*]] = zext i1 [[TMP21]] to i64
49+
; CHECK-NEXT: [[TMP23:%.*]] = add i64 1, [[TMP22]]
50+
; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i1 [[TMP7]], false
51+
; CHECK-NEXT: [[TMP25:%.*]] = zext i1 [[TMP24]] to i64
52+
; CHECK-NEXT: [[TMP26:%.*]] = add i64 0, [[TMP25]]
53+
; CHECK-NEXT: [[TMP27:%.*]] = icmp ne i64 [[TMP25]], 1
54+
; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i64 [[TMP26]], i64 [[TMP23]]
55+
; CHECK-NEXT: [[TMP29:%.*]] = trunc i64 [[TMP28]] to i32
56+
; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[INDEX]], [[TMP29]]
57+
; CHECK-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i8
3658
; CHECK-NEXT: br label %[[RETURN]]
3759
; CHECK: [[SCALAR_PH]]:
3860
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
@@ -47,8 +69,8 @@ define i8 @iv_used_in_exit_with_math(i8 noundef %g) {
4769
; CHECK-NEXT: [[EC:%.*]] = icmp eq i8 [[IV_NEXT]], 4
4870
; CHECK-NEXT: br i1 [[EC]], label %[[RETURN]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
4971
; CHECK: [[RETURN]]:
50-
; CHECK-NEXT: [[RES_IV1:%.*]] = phi i8 [ 32, %[[LOOP_LATCH]] ], [ [[IV]], %[[LOOP_HEADER]] ], [ 32, %[[MIDDLE_BLOCK]] ], [ [[OFFSET_IDX]], %[[VECTOR_EARLY_EXIT]] ]
51-
; CHECK-NEXT: [[RES_IV2:%.*]] = phi i8 [ 0, %[[LOOP_LATCH]] ], [ [[IV]], %[[LOOP_HEADER]] ], [ 0, %[[MIDDLE_BLOCK]] ], [ [[OFFSET_IDX]], %[[VECTOR_EARLY_EXIT]] ]
72+
; CHECK-NEXT: [[RES_IV1:%.*]] = phi i8 [ 32, %[[LOOP_LATCH]] ], [ [[IV]], %[[LOOP_HEADER]] ], [ 32, %[[MIDDLE_BLOCK]] ], [ [[TMP20]], %[[VECTOR_EARLY_EXIT]] ]
73+
; CHECK-NEXT: [[RES_IV2:%.*]] = phi i8 [ 0, %[[LOOP_LATCH]] ], [ [[IV]], %[[LOOP_HEADER]] ], [ 0, %[[MIDDLE_BLOCK]] ], [ [[TMP31]], %[[VECTOR_EARLY_EXIT]] ]
5274
; CHECK-NEXT: [[RES:%.*]] = add i8 [[RES_IV1]], [[RES_IV2]]
5375
; CHECK-NEXT: ret i8 [[RES]]
5476
;
@@ -102,6 +124,26 @@ define i32 @iv_used_in_exit_with_loads(ptr align 4 dereferenceable(128) %src) {
102124
; CHECK: [[MIDDLE_BLOCK]]:
103125
; CHECK-NEXT: br label %[[RETURN:.*]]
104126
; CHECK: [[VECTOR_EARLY_EXIT]]:
127+
; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i1 [[TMP8]], false
128+
; CHECK-NEXT: [[TMP31:%.*]] = zext i1 [[TMP30]] to i64
129+
; CHECK-NEXT: [[TMP12:%.*]] = add i64 1, [[TMP31]]
130+
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i1 [[TMP7]], false
131+
; CHECK-NEXT: [[TMP14:%.*]] = zext i1 [[TMP13]] to i64
132+
; CHECK-NEXT: [[TMP15:%.*]] = add i64 0, [[TMP14]]
133+
; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i64 [[TMP14]], 1
134+
; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i64 [[TMP15]], i64 [[TMP12]]
135+
; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
136+
; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[INDEX]], [[TMP18]]
137+
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i1 [[TMP8]], false
138+
; CHECK-NEXT: [[TMP21:%.*]] = zext i1 [[TMP20]] to i64
139+
; CHECK-NEXT: [[TMP22:%.*]] = add i64 1, [[TMP21]]
140+
; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i1 [[TMP7]], false
141+
; CHECK-NEXT: [[TMP24:%.*]] = zext i1 [[TMP23]] to i64
142+
; CHECK-NEXT: [[TMP25:%.*]] = add i64 0, [[TMP24]]
143+
; CHECK-NEXT: [[TMP26:%.*]] = icmp ne i64 [[TMP24]], 1
144+
; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[TMP25]], i64 [[TMP22]]
145+
; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
146+
; CHECK-NEXT: [[TMP29:%.*]] = add i32 [[INDEX]], [[TMP28]]
105147
; CHECK-NEXT: br label %[[RETURN]]
106148
; CHECK: [[SCALAR_PH]]:
107149
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
@@ -116,8 +158,8 @@ define i32 @iv_used_in_exit_with_loads(ptr align 4 dereferenceable(128) %src) {
116158
; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 32
117159
; CHECK-NEXT: br i1 [[EC]], label %[[RETURN]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
118160
; CHECK: [[RETURN]]:
119-
; CHECK-NEXT: [[RES_IV1:%.*]] = phi i32 [ 32, %[[LOOP_LATCH]] ], [ [[IV]], %[[LOOP_HEADER]] ], [ 32, %[[MIDDLE_BLOCK]] ], [ [[INDEX]], %[[VECTOR_EARLY_EXIT]] ]
120-
; CHECK-NEXT: [[RES_IV2:%.*]] = phi i32 [ 0, %[[LOOP_LATCH]] ], [ [[IV]], %[[LOOP_HEADER]] ], [ 0, %[[MIDDLE_BLOCK]] ], [ [[INDEX]], %[[VECTOR_EARLY_EXIT]] ]
161+
; CHECK-NEXT: [[RES_IV1:%.*]] = phi i32 [ 32, %[[LOOP_LATCH]] ], [ [[IV]], %[[LOOP_HEADER]] ], [ 32, %[[MIDDLE_BLOCK]] ], [ [[TMP19]], %[[VECTOR_EARLY_EXIT]] ]
162+
; CHECK-NEXT: [[RES_IV2:%.*]] = phi i32 [ 0, %[[LOOP_LATCH]] ], [ [[IV]], %[[LOOP_HEADER]] ], [ 0, %[[MIDDLE_BLOCK]] ], [ [[TMP29]], %[[VECTOR_EARLY_EXIT]] ]
121163
; CHECK-NEXT: [[RES:%.*]] = add i32 [[RES_IV1]], [[RES_IV2]]
122164
; CHECK-NEXT: ret i32 [[RES]]
123165
;

llvm/unittests/Transforms/Vectorize/VPlanTestBase.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,7 @@ class VPlanTestIRBase : public testing::Test {
7575
auto Plan = VPlanTransforms::buildVPlan0(L, *LI, IntegerType::get(*Ctx, 64),
7676
{}, PSE);
7777

78-
VFRange R(ElementCount::getFixed(1), ElementCount::getFixed(2));
79-
VPlanTransforms::handleEarlyExits(*Plan, false, R);
78+
VPlanTransforms::handleEarlyExits(*Plan, false);
8079
VPlanTransforms::addMiddleCheck(*Plan, true, false);
8180

8281
VPlanTransforms::createLoopRegions(*Plan);

0 commit comments

Comments
 (0)