Skip to content

Commit 1de3dc7

Browse files
committed
[LV] Bail out early if BTC+1 wraps.
Currently we fail to detect the case where BTC + 1 wraps, i.e. the vector trip count is 0. In those cases, the minimum iteration count check will fail, and the vector code will never be executed. Explicitly check for this condition in computeMaxVF and avoid trying to vectorize altogether. Note that a number of tests needed to be updated, because the vector loop would never be executed given the input IR. Fixes #122558.
1 parent 25f28dd commit 1de3dc7

27 files changed

+3247
-501
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4052,7 +4052,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
40524052
return FixedScalableVFPair::getNone();
40534053
}
40544054

4055-
unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4055+
ScalarEvolution *SE = PSE.getSE();
4056+
unsigned TC = SE->getSmallConstantTripCount(TheLoop);
40564057
unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
40574058
LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
40584059
if (TC != MaxTC)
@@ -4064,6 +4065,22 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
40644065
return FixedScalableVFPair::getNone();
40654066
}
40664067

4068+
// If BTC matches the widest induction type and is -1 then the trip count
4069+
// computation will wrap to 0 and the vector trip count will be 0. Do not try
4070+
// to vectorize.
4071+
const SCEV *BTC = SE->getBackedgeTakenCount(TheLoop);
4072+
if (!isa<SCEVCouldNotCompute>(BTC) &&
4073+
BTC->getType()->getScalarSizeInBits() >=
4074+
Legal->getWidestInductionType()->getScalarSizeInBits() &&
4075+
SE->isKnownPredicate(CmpInst::ICMP_EQ, BTC,
4076+
SE->getMinusOne(BTC->getType()))) {
4077+
reportVectorizationFailure(
4078+
"Trip count computation wrapped",
4079+
"backedge-taken count is -1, loop trip count wrapped to 0",
4080+
"TripCountWrapped", ORE, TheLoop);
4081+
return FixedScalableVFPair::getNone();
4082+
}
4083+
40674084
switch (ScalarEpilogueStatus) {
40684085
case CM_ScalarEpilogueAllowed:
40694086
return computeFeasibleMaxVF(MaxTC, UserVF, false);

llvm/test/Transforms/LoopVectorize/AArch64/loopvectorize_pr33804_double.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ for.body14.i.i: ; preds = %for.body14.i.i, %en
2626
%next19.i.i = getelementptr inbounds %struct.CvNode1D, ptr %dst, i32 %i.1424.i.i, i32 1
2727
store ptr %dst, ptr %next19.i.i, align 4
2828
%inc21.i.i = add nuw nsw i32 %i.1424.i.i, 1
29-
%exitcond438.i.i = icmp eq i32 %inc21.i.i, 0
29+
%exitcond438.i.i = icmp eq i32 %inc21.i.i, 1000
3030
br i1 %exitcond438.i.i, label %for.end22.i.i, label %for.body14.i.i
3131

3232
for.end22.i.i: ; preds = %for.body14.i.i
@@ -52,7 +52,7 @@ for.body14.i.i: ; preds = %for.body14.i.i, %en
5252
%val.i.i = getelementptr inbounds %struct.CvNode1D2, ptr %arrayidx15.i.i1427, i32 0, i32 1
5353
store double 0xC415AF1D80000000, ptr %val.i.i, align 4
5454
%inc21.i.i = add nuw nsw i32 %i.1424.i.i, 1
55-
%exitcond438.i.i = icmp eq i32 %inc21.i.i, 0
55+
%exitcond438.i.i = icmp eq i32 %inc21.i.i, 1000
5656
br i1 %exitcond438.i.i, label %for.end22.i.i, label %for.body14.i.i
5757

5858
for.end22.i.i: ; preds = %for.body14.i.i
@@ -79,7 +79,7 @@ for.body14.i.i: ; preds = %for.body14.i.i, %en
7979
store double %load_d, ptr %dst.ptr, align 4
8080
store ptr %load_p, ptr %dst.ptr.1, align 4
8181
%inc21.i.i = add nuw nsw i32 %i.1424.i.i, 1
82-
%exitcond438.i.i = icmp eq i32 %inc21.i.i, 0
82+
%exitcond438.i.i = icmp eq i32 %inc21.i.i, 1000
8383
br i1 %exitcond438.i.i, label %for.end22.i.i, label %for.body14.i.i
8484

8585
for.end22.i.i: ; preds = %for.body14.i.i
@@ -107,7 +107,7 @@ for.body14.i.i: ; preds = %for.body14.i.i, %en
107107
store double %load_d, ptr %dst.ptr, align 4
108108
store ptr %load_p, ptr %dst.ptr.1, align 4
109109
%inc21.i.i = add nuw nsw i32 %i.1424.i.i, 1
110-
%exitcond438.i.i = icmp eq i32 %inc21.i.i, 0
110+
%exitcond438.i.i = icmp eq i32 %inc21.i.i, 1000
111111
br i1 %exitcond438.i.i, label %for.end22.i.i, label %for.body14.i.i
112112

113113
for.end22.i.i: ; preds = %for.body14.i.i

llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll

Lines changed: 39 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,10 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
1010
; CHECK-NEXT: iter.check:
1111
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
1212
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
13-
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
13+
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
1414
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
1515
; CHECK: vector.main.loop.iter.check:
16-
; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
16+
; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
1717
; CHECK: vector.ph:
1818
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
1919
; CHECK: vector.body:
@@ -31,7 +31,7 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
3131
; CHECK-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP8]], [[TMP5]]
3232
; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP9]])
3333
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
34-
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
34+
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
3535
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
3636
; CHECK: middle.block:
3737
; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
@@ -42,12 +42,12 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
4242
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP13]]
4343
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
4444
; CHECK: vec.epilog.ph:
45-
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
45+
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
4646
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
4747
; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
4848
; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4
49-
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP15]]
50-
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
49+
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP15]]
50+
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
5151
; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
5252
; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
5353
; CHECK-NEXT: [[TMP18:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
@@ -71,8 +71,29 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
7171
; CHECK-NEXT: br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
7272
; CHECK: vec.epilog.middle.block:
7373
; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP27]])
74-
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]]
74+
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
7575
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
76+
; CHECK: vec.epilog.scalar.ph:
77+
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK:%.*]] ], [ 1024, [[VEC_EPILOG_ITER_CHECK]] ]
78+
; CHECK-NEXT: [[BC_MERGE_RDX7:%.*]] = phi i32 [ [[TMP29]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[ITER_CHECK]] ], [ [[TMP11]], [[VEC_EPILOG_ITER_CHECK]] ]
79+
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
80+
; CHECK: for.body:
81+
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
82+
; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX7]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
83+
; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]]
84+
; CHECK-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1
85+
; CHECK-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32
86+
; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]]
87+
; CHECK-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1
88+
; CHECK-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32
89+
; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]]
90+
; CHECK-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]]
91+
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
92+
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
93+
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
94+
; CHECK: for.exit:
95+
; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ [[TMP29]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
96+
; CHECK-NEXT: ret i32 [[ADD_LCSSA]]
7697
;
7798
entry:
7899
br label %for.body
@@ -89,7 +110,7 @@ for.body: ; preds = %for.body, %entry
89110
%mul = mul i32 %ext.b, %ext.a
90111
%add = add i32 %mul, %accum
91112
%iv.next = add i64 %iv, 1
92-
%exitcond.not = icmp eq i64 %iv.next, 0
113+
%exitcond.not = icmp eq i64 %iv.next, 1024
93114
br i1 %exitcond.not, label %for.exit, label %for.body
94115

95116
for.exit: ; preds = %for.body
@@ -211,3 +232,13 @@ while.end.loopexit: ; preds = %while.body
211232

212233
attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
213234
attributes #1 = { "target-cpu"="apple-m1" }
235+
;.
236+
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
237+
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
238+
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
239+
; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
240+
; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]}
241+
; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
242+
; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
243+
; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
244+
;.

0 commit comments

Comments
 (0)