Skip to content

Commit 9050b71

Browse files
committed
[NFC][LoopVectorize] Cache result of requiresScalarEpilogue
Caching the decision returned by requiresScalarEpilogue means that we can avoid printing the same debug message many times, and it also avoids repeating the same calculation. This function will get more complex when we start to reason about more early-exit loops, such as in PR #88385. The only drawback is that we sometimes have to invalidate the previous result when the scalar epilogue status or the interleave groups change.
1 parent 24b5941 commit 9050b71

File tree

2 files changed

+61
-33
lines changed

2 files changed

+61
-33
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 57 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1352,27 +1352,46 @@ class LoopVectorizationCostModel {
13521352
return InterleaveInfo.getInterleaveGroup(Instr);
13531353
}
13541354

1355+
/// Calculate in advance whether a scalar epilogue is required when
1356+
/// vectorizing and not vectorizing. If \p Invalidate is true then
1357+
/// invalidate a previous decision.
1358+
void collectScalarEpilogueRequirements(bool Invalidate) {
1359+
auto NeedsScalarEpilogue = [&](bool IsVectorizing) -> bool {
1360+
if (!isScalarEpilogueAllowed()) {
1361+
LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue");
1362+
return false;
1363+
}
1364+
// If we might exit from anywhere but the latch, must run the exiting
1365+
// iteration in scalar form.
1366+
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
1367+
LLVM_DEBUG(
1368+
dbgs() << "LV: Loop requires scalar epilogue: multiple exits");
1369+
return true;
1370+
}
1371+
if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1372+
LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1373+
"interleaved group requires scalar epilogue");
1374+
return true;
1375+
}
1376+
LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue");
1377+
return false;
1378+
};
1379+
1380+
assert((Invalidate || !RequiresScalarEpilogue) &&
1381+
"Already determined scalar epilogue requirements!");
1382+
std::pair<bool, bool> Result;
1383+
Result.first = NeedsScalarEpilogue(true);
1384+
LLVM_DEBUG(dbgs() << ", when vectorizing\n");
1385+
Result.second = NeedsScalarEpilogue(false);
1386+
LLVM_DEBUG(dbgs() << ", when not vectorizing\n");
1387+
RequiresScalarEpilogue = Result;
1388+
}
1389+
13551390
/// Returns true if we're required to use a scalar epilogue for at least
13561391
/// the final iteration of the original loop.
13571392
bool requiresScalarEpilogue(bool IsVectorizing) const {
1358-
if (!isScalarEpilogueAllowed()) {
1359-
LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1360-
return false;
1361-
}
1362-
// If we might exit from anywhere but the latch, must run the exiting
1363-
// iteration in scalar form.
1364-
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
1365-
LLVM_DEBUG(
1366-
dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
1367-
return true;
1368-
}
1369-
if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1370-
LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1371-
"interleaved group requires scalar epilogue\n");
1372-
return true;
1373-
}
1374-
LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1375-
return false;
1393+
auto &CachedResult = *RequiresScalarEpilogue;
1394+
return IsVectorizing ? CachedResult.first : CachedResult.second;
13761395
}
13771396

13781397
/// Returns true if we're required to use a scalar epilogue for at least
@@ -1396,6 +1415,15 @@ class LoopVectorizationCostModel {
13961415
return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
13971416
}
13981417

1418+
/// Update the ScalarEpilogueStatus to a new value, potentially triggering a
1419+
/// recalculation of the scalar epilogue requirements.
1420+
void setScalarEpilogueStatus(ScalarEpilogueLowering Status) {
1421+
bool Changed = ScalarEpilogueStatus != Status;
1422+
ScalarEpilogueStatus = Status;
1423+
if (Changed)
1424+
collectScalarEpilogueRequirements(/*Invalidate=*/true);
1425+
}
1426+
13991427
/// Returns the TailFoldingStyle that is best for the current loop.
14001428
TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
14011429
if (!ChosenTailFoldingStyle)
@@ -1748,6 +1776,9 @@ class LoopVectorizationCostModel {
17481776

17491777
/// All element types found in the loop.
17501778
SmallPtrSet<Type *, 16> ElementTypesInLoop;
1779+
1780+
/// Cached scalar epilogue requirement: first is the answer when vectorizing, second when not vectorizing. Unset until computed.
1781+
std::optional<std::pair<bool, bool>> RequiresScalarEpilogue;
17511782
};
17521783
} // end namespace llvm
17531784

@@ -4011,7 +4042,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
40114042
if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
40124043
LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
40134044
"scalar epilogue instead.\n");
4014-
ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4045+
setScalarEpilogueStatus(CM_ScalarEpilogueAllowed);
40154046
return computeFeasibleMaxVF(MaxTC, UserVF, false);
40164047
}
40174048
return FixedScalableVFPair::getNone();
@@ -4027,6 +4058,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
40274058
// Note: There is no need to invalidate any cost modeling decisions here, as
40284059
// none were taken so far.
40294060
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4061+
collectScalarEpilogueRequirements(/*Invalidate=*/true);
40304062
}
40314063

40324064
FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
@@ -4098,7 +4130,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
40984130
if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
40994131
LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
41004132
"scalar epilogue instead.\n");
4101-
ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4133+
setScalarEpilogueStatus(CM_ScalarEpilogueAllowed);
41024134
return MaxFactors;
41034135
}
41044136

@@ -7006,6 +7038,7 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
70067038
if (!OrigLoop->isInnermost()) {
70077039
// If the user doesn't provide a vectorization factor, determine a
70087040
// reasonable one.
7041+
CM.collectScalarEpilogueRequirements(/*Invalidate=*/false);
70097042
if (UserVF.isZero()) {
70107043
VF = determineVPlanVF(TTI, CM);
70117044
LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
@@ -7050,6 +7083,7 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
70507083

70517084
void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
70527085
assert(OrigLoop->isInnermost() && "Inner loop expected.");
7086+
CM.collectScalarEpilogueRequirements(/*Invalidate=*/false);
70537087
CM.collectValuesToIgnore();
70547088
CM.collectElementTypesForWidening();
70557089

@@ -7064,11 +7098,13 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
70647098
dbgs()
70657099
<< "LV: Invalidate all interleaved groups due to fold-tail by masking "
70667100
"which requires masked-interleaved support.\n");
7067-
if (CM.InterleaveInfo.invalidateGroups())
7101+
if (CM.InterleaveInfo.invalidateGroups()) {
70687102
// Invalidating interleave groups also requires invalidating all decisions
70697103
// based on them, which includes widening decisions and uniform and scalar
70707104
// values.
70717105
CM.invalidateCostModelingDecisions();
7106+
CM.collectScalarEpilogueRequirements(/*Invalidate=*/true);
7107+
}
70727108
}
70737109

70747110
if (CM.foldTailByMasking())

llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
1818
; CHECK-NEXT: LV: Found an induction variable.
1919
; CHECK-NEXT: LV: Did not find one integer induction var.
2020
; CHECK-NEXT: LV: We can vectorize this loop (with a runtime bound check)!
21-
; CHECK-NEXT: LV: Loop does not require scalar epilogue
21+
; CHECK-NEXT: LV: Loop does not require scalar epilogue, when vectorizing
22+
; CHECK-NEXT: LV: Loop does not require scalar epilogue, when not vectorizing
2223
; CHECK-NEXT: LV: Found trip count: 0
2324
; CHECK-NEXT: LV: Found maximum trip count: 4294967295
2425
; CHECK-NEXT: LV: Scalable vectorization is available
@@ -46,7 +47,6 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
4647
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
4748
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
4849
; CHECK-NEXT: LV: Using user VF vscale x 4.
49-
; CHECK-NEXT: LV: Loop does not require scalar epilogue
5050
; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
5151
; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64
5252
; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
@@ -128,7 +128,6 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
128128
; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
129129
; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class
130130
; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class
131-
; CHECK-NEXT: LV: Loop does not require scalar epilogue
132131
; CHECK-NEXT: LV: Loop cost is 32
133132
; CHECK-NEXT: LV: IC is 1
134133
; CHECK-NEXT: LV: VF is vscale x 4
@@ -181,10 +180,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
181180
; CHECK-NEXT: scalar.ph:
182181
; CHECK-NEXT: No successors
183182
; CHECK-NEXT: }
184-
; CHECK-NEXT: LV: Loop does not require scalar epilogue
185-
; CHECK-NEXT: LV: Loop does not require scalar epilogue
186183
; CHECK-NEXT: LV: Interleaving disabled by the pass manager
187-
; CHECK-NEXT: LV: Loop does not require scalar epilogue
188184
; CHECK-NEXT: LV: Vectorizing: innermost loop.
189185
; CHECK-EMPTY:
190186
;
@@ -223,7 +219,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
223219
; CHECK-NEXT: LV: Found FP op with unsafe algebra.
224220
; CHECK-NEXT: LV: Did not find one integer induction var.
225221
; CHECK-NEXT: LV: We can vectorize this loop (with a runtime bound check)!
226-
; CHECK-NEXT: LV: Loop does not require scalar epilogue
222+
; CHECK-NEXT: LV: Loop does not require scalar epilogue, when vectorizing
223+
; CHECK-NEXT: LV: Loop does not require scalar epilogue, when not vectorizing
227224
; CHECK-NEXT: LV: Found trip count: 0
228225
; CHECK-NEXT: LV: Found maximum trip count: 4294967295
229226
; CHECK-NEXT: LV: Scalable vectorization is available
@@ -251,7 +248,6 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
251248
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
252249
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
253250
; CHECK-NEXT: LV: Using user VF vscale x 4.
254-
; CHECK-NEXT: LV: Loop does not require scalar epilogue
255251
; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
256252
; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64
257253
; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
@@ -333,7 +329,6 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
333329
; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
334330
; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class
335331
; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class
336-
; CHECK-NEXT: LV: Loop does not require scalar epilogue
337332
; CHECK-NEXT: LV: Loop cost is 34
338333
; CHECK-NEXT: LV: IC is 1
339334
; CHECK-NEXT: LV: VF is vscale x 4
@@ -386,10 +381,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
386381
; CHECK-NEXT: scalar.ph:
387382
; CHECK-NEXT: No successors
388383
; CHECK-NEXT: }
389-
; CHECK-NEXT: LV: Loop does not require scalar epilogue
390-
; CHECK-NEXT: LV: Loop does not require scalar epilogue
391384
; CHECK-NEXT: LV: Interleaving disabled by the pass manager
392-
; CHECK-NEXT: LV: Loop does not require scalar epilogue
393385
; CHECK-NEXT: LV: Vectorizing: innermost loop.
394386
;
395387
entry:

0 commit comments

Comments
 (0)