Skip to content

Commit cb4efe1

Browse files
committed
[VPlan] Don't trigger VF assertion if VPlan has extra simplifications.
There are cases where VPlans contain some simplifications that are very hard to accurately account for up-front in the legacy cost model. Those cases are caused by un-simplified inputs, which trigger the assert ensuring both the legacy and VPlan-based cost models agree on the chosen VF.

To avoid such false positives in general, only trigger the assert if the chosen VPlan does not contain any additional simplifications.

Fixes llvm#104714. Fixes llvm#105713.
1 parent a2d8743 commit cb4efe1

File tree

2 files changed

+186
-1
lines changed

2 files changed

+186
-1
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 64 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7237,6 +7237,56 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
72377237
return Cost;
72387238
}
72397239

7240+
/// Return true if the original loop \ TheLoop contains any instructions that do
7241+
/// not have corresponding recipes in \p Plan and are not marked to be ignored
7242+
/// in \p CostCtx. This means the VPlan contains simplification that the legacy
7243+
/// cost-model did not account for.
7244+
static bool
7245+
planContainsAdditionalSimplifications(VPlan &Plan, ElementCount VF,
7246+
VPCostContext &CostCtx, Loop *TheLoop,
7247+
LoopVectorizationCostModel &CM) {
7248+
// First collect all instructions for the recipes in Plan.
7249+
auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction q {
7250+
if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
7251+
return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
7252+
if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
7253+
return &WidenMem->getIngredient();
7254+
return nullptr;
7255+
};
7256+
7257+
DenseSet<Instruction *> SeenInstrs;
7258+
auto Iter = vp_depth_first_deep(Plan.getEntry());
7259+
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
7260+
for (VPRecipeBase &R : *VPBB) {
7261+
if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
7262+
auto *IG = IR->getInterleaveGroup();
7263+
unsigned NumMembers = IG->getNumMembers();
7264+
for (unsigned I = 0; I != NumMembers; ++I) {
7265+
if (Instruction *M = IG->getMember(I))
7266+
SeenInstrs.insert(M);
7267+
}
7268+
continue;
7269+
}
7270+
if (Instruction *UI = GetInstructionForCost(&R))
7271+
SeenInstrs.insert(UI);
7272+
}
7273+
}
7274+
7275+
// Return true if the loop contains any instructions that are not also part of
7276+
// the VPlan or are skipped for VPlan-based cost computations. This indicates
7277+
// that the VPlan contains extra simplifications.
7278+
return any_of(
7279+
TheLoop->blocks(), [&SeenInstrs, VF, &CostCtx, &CM](BasicBlock *BB) {
7280+
return any_of(*BB, [&SeenInstrs, VF, &CostCtx, &CM](Instruction &I) {
7281+
if (isa<PHINode>(&I))
7282+
return false;
7283+
return !SeenInstrs.contains(&I) &&
7284+
!CostCtx.skipCostComputation(&I, true) &&
7285+
!CM.canTruncateToMinimalBitwidth(&I, VF);
7286+
});
7287+
});
7288+
}
7289+
72407290
VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
72417291
if (VPlans.empty())
72427292
return VectorizationFactor::Disabled();
@@ -7292,7 +7342,20 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
72927342
// cost-model and will be retired once the VPlan-based cost-model is
72937343
// stabilized.
72947344
VectorizationFactor LegacyVF = selectVectorizationFactor();
7295-
assert(BestFactor.Width == LegacyVF.Width &&
7345+
VPlan &BestPlan = getPlanFor(BestFactor.Width);
7346+
7347+
// Pre-compute the cost and use it to check if BestPlan contains any
7348+
// simplifications not accounted for in the legacy cost model. If that's the
7349+
// case, don't trigger the assertion, as the extra simplifications may cause a
7350+
// different VF to be picked by the VPlan-based cost model.
7351+
LLVMContext &LLVMCtx = OrigLoop->getHeader()->getContext();
7352+
VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(),
7353+
LLVMCtx, CM);
7354+
precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
7355+
assert((BestFactor.Width == LegacyVF.Width ||
7356+
planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
7357+
BestFactor.Width, CostCtx,
7358+
OrigLoop, CM)) &&
72967359
" VPlan cost model and legacy cost model disagreed");
72977360
assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
72987361
"when vectorizing, the scalar cost must be computed.");

llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -827,6 +827,120 @@ exit:
827827
ret void
828828
}
829829

830+
; Test case for https://github.com/llvm/llvm-project/issues/100591.
define void @dead_load_in_block(ptr %dst, ptr %src, i8 %N, i64 %x) #0 {
; CHECK-LABEL: define void @dead_load_in_block(
; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i8 [[N:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[N_EXT:%.*]] = zext i8 [[N]] to i64
; CHECK-NEXT: [[UMIN7:%.*]] = call i64 @llvm.umin.i64(i64 [[N_EXT]], i64 1)
; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[N_EXT]], [[UMIN7]]
; CHECK-NEXT: [[TMP1:%.*]] = udiv i64 [[TMP0]], 3
; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[UMIN7]], [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], 1
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.umax.i64(i64 40, i64 [[TMP5]])
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP6]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; CHECK: [[VECTOR_MEMCHECK]]:
; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N_EXT]], i64 1)
; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[N_EXT]], [[UMIN]]
; CHECK-NEXT: [[TMP8:%.*]] = udiv i64 [[TMP7]], 3
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[UMIN]], [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 12
; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 4
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]]
; CHECK-NEXT: [[TMP12:%.*]] = shl i64 [[X]], 2
; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP12]]
; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 4
; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]]
; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[SRC]], i64 4
; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]]
; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP1]], [[SCEVGEP]]
; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; CHECK-NEXT: [[BOUND04:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP3]]
; CHECK-NEXT: [[BOUND15:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]]
; CHECK-NEXT: [[FOUND_CONFLICT6:%.*]] = and i1 [[BOUND04]], [[BOUND15]]
; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT6]]
; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], [[TMP15]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 3
; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 2
; CHECK-NEXT: [[TMP18:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
; CHECK-NEXT: [[TMP19:%.*]] = add <vscale x 2 x i64> [[TMP18]], zeroinitializer
; CHECK-NEXT: [[TMP20:%.*]] = mul <vscale x 2 x i64> [[TMP19]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 3, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP20]]
; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 2
; CHECK-NEXT: [[TMP23:%.*]] = mul i64 3, [[TMP22]]
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP23]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[DST]], <vscale x 2 x i64> [[VEC_IND]]
; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i32.nxv2p0(<vscale x 2 x i32> zeroinitializer, <vscale x 2 x ptr> [[TMP24]], i32 4, <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)), !alias.scope [[META18:![0-9]+]], !noalias [[META21:![0-9]+]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP17]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
; CHECK-NEXT: [[L_0:%.*]] = load i32, ptr [[SRC]], align 4
; CHECK-NEXT: [[C_0:%.*]] = icmp eq i32 [[L_0]], 0
; CHECK-NEXT: br i1 [[C_0]], label %[[LOOP_LATCH]], label %[[THEN:.*]]
; CHECK: [[THEN]]:
; CHECK-NEXT: [[GEP_SRC_X:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[X]]
; CHECK-NEXT: [[L_DEAD:%.*]] = load i32, ptr [[GEP_SRC_X]], align 4
; CHECK-NEXT: br label %[[LOOP_LATCH]]
; CHECK: [[LOOP_LATCH]]:
; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr i32, ptr [[DST]], i64 [[IV]]
; CHECK-NEXT: store i32 0, ptr [[GEP_DST]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 3
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV]], [[N_EXT]]
; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP25:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
entry:
  %N.ext = zext i8 %N to i64
  br label %loop.header

loop.header:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
  %l.0 = load i32, ptr %src, align 4
  %c.0 = icmp eq i32 %l.0, 0
  br i1 %c.0, label %loop.latch , label %then

then:
  %gep.src.x = getelementptr i32, ptr %src, i64 %x
  %l.dead = load i32, ptr %gep.src.x, align 4
  br label %loop.latch

loop.latch:
  %gep.dst = getelementptr i32, ptr %dst, i64 %iv
  store i32 0, ptr %gep.dst, align 4
  %iv.next = add i64 %iv, 3
  %cmp = icmp ult i64 %iv, %N.ext
  br i1 %cmp, label %loop.header, label %exit

exit:
  ret void
}
943+
830944
attributes #0 = { "target-features"="+64bit,+v" }
831945
;.
832946
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
@@ -847,4 +961,12 @@ attributes #0 = { "target-features"="+64bit,+v" }
847961
; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]}
848962
; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]}
849963
; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]}
964+
; CHECK: [[META18]] = !{[[META19:![0-9]+]]}
965+
; CHECK: [[META19]] = distinct !{[[META19]], [[META20:![0-9]+]]}
966+
; CHECK: [[META20]] = distinct !{[[META20]], !"LVerDomain"}
967+
; CHECK: [[META21]] = !{[[META22:![0-9]+]], [[META23:![0-9]+]]}
968+
; CHECK: [[META22]] = distinct !{[[META22]], [[META20]]}
969+
; CHECK: [[META23]] = distinct !{[[META23]], [[META20]]}
970+
; CHECK: [[LOOP24]] = distinct !{[[LOOP24]], [[META1]], [[META2]]}
971+
; CHECK: [[LOOP25]] = distinct !{[[LOOP25]], [[META1]]}
850972
;.

0 commit comments

Comments
 (0)