Skip to content

Commit bf04a43

Browse files
committed
[NFC][LoopVectorize] Cache result of requiresScalarEpilogue
Caching the decision returned by requiresScalarEpilogue means that we can avoid printing the same debug message many times, and it also avoids repeating the same calculation. This function will get more complex when we start to reason about more early-exit loops, such as in PR #88385. The only drawback is that we sometimes have to invalidate the previous result when the scalar epilogue status or the interleave groups change.
1 parent 2d70747 commit bf04a43

File tree

2 files changed

+60
-33
lines changed

2 files changed

+60
-33
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 56 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1371,27 +1371,45 @@ class LoopVectorizationCostModel {
13711371
return InterleaveInfo.getInterleaveGroup(Instr);
13721372
}
13731373

1374+
/// Calculate in advance whether a scalar epilogue is required when
1375+
/// vectorizing and not vectorizing. If \p Invalidate is true then
1376+
/// invalidate a previous decision.
1377+
void collectScalarEpilogueRequirements(bool Invalidate) {
1378+
auto NeedsScalarEpilogue = [&](bool IsVectorizing) -> bool {
1379+
if (!isScalarEpilogueAllowed()) {
1380+
LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue");
1381+
return false;
1382+
}
1383+
// If we might exit from anywhere but the latch, must run the exiting
1384+
// iteration in scalar form.
1385+
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
1386+
LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: multiple exits");
1387+
return true;
1388+
}
1389+
if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1390+
LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1391+
"interleaved group requires scalar epilogue");
1392+
return true;
1393+
}
1394+
LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue");
1395+
return false;
1396+
};
1397+
1398+
assert((Invalidate || !RequiresScalarEpilogue) &&
1399+
"Already determined scalar epilogue requirements!");
1400+
std::pair<bool,bool> Result;
1401+
Result.first = NeedsScalarEpilogue(true);
1402+
LLVM_DEBUG(dbgs() << ", when vectorizing\n");
1403+
Result.second = NeedsScalarEpilogue(false);
1404+
LLVM_DEBUG(dbgs() << ", when not vectorizing\n");
1405+
RequiresScalarEpilogue = Result;
1406+
}
1407+
13741408
/// Returns true if we're required to use a scalar epilogue for at least
13751409
/// the final iteration of the original loop.
13761410
bool requiresScalarEpilogue(bool IsVectorizing) const {
1377-
if (!isScalarEpilogueAllowed()) {
1378-
LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1379-
return false;
1380-
}
1381-
// If we might exit from anywhere but the latch, must run the exiting
1382-
// iteration in scalar form.
1383-
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
1384-
LLVM_DEBUG(
1385-
dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
1386-
return true;
1387-
}
1388-
if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1389-
LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1390-
"interleaved group requires scalar epilogue\n");
1391-
return true;
1392-
}
1393-
LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1394-
return false;
1411+
auto &CachedResult = *RequiresScalarEpilogue;
1412+
return IsVectorizing ? CachedResult.first : CachedResult.second;
13951413
}
13961414

13971415
/// Returns true if we're required to use a scalar epilogue for at least
@@ -1415,6 +1433,15 @@ class LoopVectorizationCostModel {
14151433
return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
14161434
}
14171435

1436+
/// Update the ScalarEpilogueStatus to a new value, potentially triggering a
1437+
/// recalculation of the scalar epilogue requirements.
1438+
void setScalarEpilogueStatus(ScalarEpilogueLowering Status) {
1439+
bool Changed = ScalarEpilogueStatus != Status;
1440+
ScalarEpilogueStatus = Status;
1441+
if (Changed)
1442+
collectScalarEpilogueRequirements(/*Invalidate=*/true);
1443+
}
1444+
14181445
/// Returns the TailFoldingStyle that is best for the current loop.
14191446
TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
14201447
if (!ChosenTailFoldingStyle)
@@ -1767,6 +1794,9 @@ class LoopVectorizationCostModel {
17671794

17681795
/// All element types found in the loop.
17691796
SmallPtrSet<Type *, 16> ElementTypesInLoop;
1797+
1798+
/// Keeps track of whether we require a scalar epilogue: the first element is the answer when vectorizing, the second when not vectorizing.
1799+
std::optional<std::pair<bool,bool>> RequiresScalarEpilogue;
17701800
};
17711801
} // end namespace llvm
17721802

@@ -4034,7 +4064,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
40344064
if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
40354065
LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
40364066
"scalar epilogue instead.\n");
4037-
ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4067+
setScalarEpilogueStatus(CM_ScalarEpilogueAllowed);
40384068
return computeFeasibleMaxVF(MaxTC, UserVF, false);
40394069
}
40404070
return FixedScalableVFPair::getNone();
@@ -4050,6 +4080,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
40504080
// Note: There is no need to invalidate any cost modeling decisions here, as
40514081
// none were taken so far.
40524082
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4083+
collectScalarEpilogueRequirements(/*Invalidate=*/true);
40534084
}
40544085

40554086
FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
@@ -4115,7 +4146,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
41154146
if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
41164147
LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
41174148
"scalar epilogue instead.\n");
4118-
ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4149+
setScalarEpilogueStatus(CM_ScalarEpilogueAllowed);
41194150
return MaxFactors;
41204151
}
41214152

@@ -6957,6 +6988,7 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
69576988
if (!OrigLoop->isInnermost()) {
69586989
// If the user doesn't provide a vectorization factor, determine a
69596990
// reasonable one.
6991+
CM.collectScalarEpilogueRequirements(/*Invalidate=*/false);
69606992
if (UserVF.isZero()) {
69616993
VF = determineVPlanVF(TTI, CM);
69626994
LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
@@ -7001,6 +7033,7 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
70017033

70027034
void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
70037035
assert(OrigLoop->isInnermost() && "Inner loop expected.");
7036+
CM.collectScalarEpilogueRequirements(/*Invalidate=*/false);
70047037
CM.collectValuesToIgnore();
70057038
CM.collectElementTypesForWidening();
70067039

@@ -7015,11 +7048,13 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
70157048
dbgs()
70167049
<< "LV: Invalidate all interleaved groups due to fold-tail by masking "
70177050
"which requires masked-interleaved support.\n");
7018-
if (CM.InterleaveInfo.invalidateGroups())
7051+
if (CM.InterleaveInfo.invalidateGroups()) {
70197052
// Invalidating interleave groups also requires invalidating all decisions
70207053
// based on them, which includes widening decisions and uniform and scalar
70217054
// values.
70227055
CM.invalidateCostModelingDecisions();
7056+
CM.collectScalarEpilogueRequirements(/*Invalidate=*/true);
7057+
}
70237058
}
70247059

70257060
if (CM.foldTailByMasking())

llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
1818
; CHECK-NEXT: LV: Found an induction variable.
1919
; CHECK-NEXT: LV: Did not find one integer induction var.
2020
; CHECK-NEXT: LV: We can vectorize this loop (with a runtime bound check)!
21-
; CHECK-NEXT: LV: Loop does not require scalar epilogue
21+
; CHECK-NEXT: LV: Loop does not require scalar epilogue, when vectorizing
22+
; CHECK-NEXT: LV: Loop does not require scalar epilogue, when not vectorizing
2223
; CHECK-NEXT: LV: Found trip count: 0
2324
; CHECK-NEXT: LV: Scalable vectorization is available
2425
; CHECK-NEXT: LV: The max safe fixed VF is: 67108864.
@@ -45,7 +46,6 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
4546
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
4647
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
4748
; CHECK-NEXT: LV: Using user VF vscale x 4.
48-
; CHECK-NEXT: LV: Loop does not require scalar epilogue
4949
; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
5050
; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64
5151
; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
@@ -127,7 +127,6 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
127127
; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
128128
; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class
129129
; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class
130-
; CHECK-NEXT: LV: Loop does not require scalar epilogue
131130
; CHECK-NEXT: LV: Loop cost is 32
132131
; CHECK-NEXT: LV: IC is 1
133132
; CHECK-NEXT: LV: VF is vscale x 4
@@ -180,10 +179,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
180179
; CHECK-NEXT: scalar.ph:
181180
; CHECK-NEXT: No successors
182181
; CHECK-NEXT: }
183-
; CHECK-NEXT: LV: Loop does not require scalar epilogue
184-
; CHECK-NEXT: LV: Loop does not require scalar epilogue
185182
; CHECK-NEXT: LV: Interleaving disabled by the pass manager
186-
; CHECK-NEXT: LV: Loop does not require scalar epilogue
187183
; CHECK-NEXT: LV: Vectorizing: innermost loop.
188184
; CHECK-EMPTY:
189185
;
@@ -222,7 +218,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
222218
; CHECK-NEXT: LV: Found FP op with unsafe algebra.
223219
; CHECK-NEXT: LV: Did not find one integer induction var.
224220
; CHECK-NEXT: LV: We can vectorize this loop (with a runtime bound check)!
225-
; CHECK-NEXT: LV: Loop does not require scalar epilogue
221+
; CHECK-NEXT: LV: Loop does not require scalar epilogue, when vectorizing
222+
; CHECK-NEXT: LV: Loop does not require scalar epilogue, when not vectorizing
226223
; CHECK-NEXT: LV: Found trip count: 0
227224
; CHECK-NEXT: LV: Scalable vectorization is available
228225
; CHECK-NEXT: LV: The max safe fixed VF is: 67108864.
@@ -249,7 +246,6 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
249246
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
250247
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
251248
; CHECK-NEXT: LV: Using user VF vscale x 4.
252-
; CHECK-NEXT: LV: Loop does not require scalar epilogue
253249
; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
254250
; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64
255251
; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
@@ -331,7 +327,6 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
331327
; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
332328
; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class
333329
; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class
334-
; CHECK-NEXT: LV: Loop does not require scalar epilogue
335330
; CHECK-NEXT: LV: Loop cost is 34
336331
; CHECK-NEXT: LV: IC is 1
337332
; CHECK-NEXT: LV: VF is vscale x 4
@@ -384,10 +379,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
384379
; CHECK-NEXT: scalar.ph:
385380
; CHECK-NEXT: No successors
386381
; CHECK-NEXT: }
387-
; CHECK-NEXT: LV: Loop does not require scalar epilogue
388-
; CHECK-NEXT: LV: Loop does not require scalar epilogue
389382
; CHECK-NEXT: LV: Interleaving disabled by the pass manager
390-
; CHECK-NEXT: LV: Loop does not require scalar epilogue
391383
; CHECK-NEXT: LV: Vectorizing: innermost loop.
392384
;
393385
entry:

0 commit comments

Comments
 (0)