Skip to content

Commit 3a1f06c

Browse files
committed
[LoopVectorize] Further improve cost model for early exit loops
Following on from #125058, this patch takes into account the work done in the vector early exit block when assessing the profitability of vectorising the loop. I have renamed areRuntimeChecksProfitable to isOutsideLoopWorkProfitable and we now pass in the early exit costs. As part of this, I have added the ExtractFirstActive opcode to VPInstruction::computeCost. It's worth pointing out that when we assess the profitability of the loop, we calculate a minimum trip count and compare that against the *maximum* trip count. However, since the loop has an early exit, the runtime trip count can still end up being less than the minimum. Alternatively, we may never take the early exit at all at runtime, and so we have the opposite problem of over-estimating the cost of the loop. The loop vectoriser cannot simultaneously take two contradictory positions, and so I feel the only sensible thing to do is to be conservative and assume the loop will be more expensive than loops without early exits. We may find in future that we need to adjust the cost according to the probability of taking the early exit. This will become even more important once we support multiple early exits. However, we have to start somewhere and we can always revisit this later.
1 parent 976e413 commit 3a1f06c

File tree

4 files changed

+105
-12
lines changed

4 files changed

+105
-12
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 44 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10171,19 +10171,46 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
1017110171
}
1017210172
}
1017310173

10174-
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
10175-
VectorizationFactor &VF, Loop *L,
10176-
PredicatedScalarEvolution &PSE,
10177-
ScalarEpilogueLowering SEL,
10178-
std::optional<unsigned> VScale) {
10174+
/// Sum the cost, at vectorization factor \p VF, of every recipe found in the
/// predecessors of \p Plan's exit blocks, excluding the middle block.
/// Judging by the debug message below, these remaining predecessors are the
/// vector early exit block(s).
///
/// \param CM   Cost model supplying TTI, TLI, legality info and cost kind used
///             to build the VPCostContext.
/// \param Plan The VPlan whose exit blocks are inspected.
/// \param VF   The vectorization factor each recipe is costed at.
/// \returns The accumulated cost; zero if no such predecessor block exists.
static InstructionCost calculateEarlyExitCost(LoopVectorizationCostModel &CM,
10175+
VPlan &Plan, ElementCount VF) {
10176+
InstructionCost Cost = 0;
10177+
// Build a cost context from the cost model's own analyses so recipe costs
// are computed with the same TTI/TLI and cost kind.
VPCostContext CostCtx(CM.TTI, *CM.TLI, CM.Legal->getWidestInductionType(), CM,
10178+
CM.CostKind);
10179+
LLVM_DEBUG(
10180+
dbgs() << "Calculating cost of work in vector early exit block:\n");
10181+
for (auto *ExitVPBB : Plan.getExitBlocks()) {
10182+
for (auto *PredVPBB : ExitVPBB->getPredecessors())
10183+
// The middle block is deliberately skipped; only the other predecessors
// of the exit block contribute to the returned cost.
if (PredVPBB != Plan.getMiddleBlock())
10184+
// NOTE(review): cast<> assumes every non-middle predecessor is a
// VPBasicBlock — confirm no region can appear here.
for (auto &R : *(cast<VPBasicBlock>(PredVPBB)))
10185+
Cost += R.cost(VF, CostCtx);
10186+
}
10187+
return Cost;
10188+
}
10189+
10190+
static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
10191+
VectorizationFactor &VF, Loop *L,
10192+
const TargetTransformInfo &TTI,
10193+
PredicatedScalarEvolution &PSE,
10194+
ScalarEpilogueLowering SEL,
10195+
std::optional<unsigned> VScale,
10196+
InstructionCost EarlyExitCost) {
1017910197
InstructionCost CheckCost = Checks.getCost();
10180-
if (!CheckCost.isValid())
10198+
if (!CheckCost.isValid() && !EarlyExitCost.isValid())
1018110199
return false;
1018210200

10201+
InstructionCost TotalCost = 0;
10202+
if (CheckCost.isValid())
10203+
TotalCost += CheckCost;
10204+
10205+
// Add on the cost of work required in the vector early exit block, if one
10206+
// exists.
10207+
if (EarlyExitCost.isValid())
10208+
TotalCost += EarlyExitCost;
10209+
1018310210
// When interleaving only scalar and vector cost will be equal, which in turn
1018410211
// would lead to a divide by 0. Fall back to hard threshold.
1018510212
if (VF.Width.isScalar()) {
10186-
if (CheckCost > VectorizeMemoryCheckThreshold) {
10213+
if (TotalCost > VectorizeMemoryCheckThreshold) {
1018710214
LLVM_DEBUG(
1018810215
dbgs()
1018910216
<< "LV: Interleaving only is not profitable due to runtime checks\n");
@@ -10227,7 +10254,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
1022710254
// the computations are performed on doubles, not integers and the result
1022810255
// is rounded up, hence we get an upper estimate of the TC.
1022910256
unsigned IntVF = getEstimatedRuntimeVF(VF.Width, VScale);
10230-
uint64_t RtC = *CheckCost.getValue();
10257+
uint64_t RtC = *TotalCost.getValue();
1023110258
uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
1023210259
uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
1023310260

@@ -10555,8 +10582,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1055510582
// iteration count is low. However, setting the epilogue policy to
1055610583
// `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
1055710584
// with runtime checks. It's more effective to let
10558-
// `areRuntimeChecksProfitable` determine if vectorization is beneficial
10559-
// for the loop.
10585+
// `isOutsideLoopWorkProfitable` determine if vectorization is
10586+
// beneficial for the loop.
1056010587
if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
1056110588
SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
1056210589
} else {
@@ -10651,12 +10678,17 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1065110678
if (VF.Width.isVector() || SelectedIC > 1)
1065210679
Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
1065310680

10681+
InstructionCost EarlyExitCost = InstructionCost::getInvalid();
10682+
if (VF.Width.isVector() && LVL.hasUncountableEarlyExit())
10683+
EarlyExitCost =
10684+
calculateEarlyExitCost(CM, LVP.getPlanFor(VF.Width), VF.Width);
10685+
1065410686
// Check if it is profitable to vectorize with runtime checks.
1065510687
bool ForceVectorization =
1065610688
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
1065710689
if (!ForceVectorization &&
10658-
!areRuntimeChecksProfitable(Checks, VF, L, PSE, SEL,
10659-
CM.getVScaleForTuning())) {
10690+
!isOutsideLoopWorkProfitable(Checks, VF, L, *TTI, PSE, SEL,
10691+
CM.getVScaleForTuning(), EarlyExitCost)) {
1066010692
ORE->emit([&]() {
1066110693
return OptimizationRemarkAnalysisAliasing(
1066210694
DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -742,6 +742,19 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
742742
return Ctx.TTI.getArithmeticReductionCost(
743743
Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
744744
}
745+
case VPInstruction::ExtractFirstActive: {
746+
// Calculate the cost of determining the lane index.
747+
auto *PredTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(1)), VF);
748+
IntrinsicCostAttributes Attrs(
749+
Intrinsic::experimental_cttz_elts, Type::getInt64Ty(Ctx.LLVMCtx),
750+
{PoisonValue::get(PredTy), ConstantInt::getTrue(Ctx.LLVMCtx)});
751+
InstructionCost Cost = Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
752+
// Add on the cost of extracting the element.
753+
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
754+
Cost += Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
755+
Ctx.CostKind);
756+
return Cost;
757+
}
745758
default:
746759
// TODO: Compute cost other VPInstructions once the legacy cost model has
747760
// been retired.
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
2+
; REQUIRES: asserts
3+
; RUN: opt -S < %s -p loop-vectorize -enable-early-exit-vectorization -disable-output \
4+
; RUN: -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefixes=CHECK
5+
6+
target triple = "aarch64-unknown-linux-gnu"
7+
8+
declare void @init_mem(ptr, i64);
9+
10+
; Test loop with two exits that both branch to %loop.end: a data-dependent
; exit taken when the bytes loaded from %p1 and %p2 differ, and a counted exit
; taken once %index.next reaches 67. The FileCheck lines below match the
; loop-vectorize debug output for the early exit block costs.
define i64 @same_exit_block_pre_inc_use1() #1 {
11+
; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1'
12+
; CHECK: LV: Selecting VF: vscale x 16
13+
; CHECK: Calculating cost of work in vector early exit block:
14+
; CHECK-NEXT: Cost of 6 for VF vscale x 16: EMIT vp<{{.*}}> = extract-first-active
15+
; CHECK-NEXT: Cost of 6 for VF vscale x 16: EMIT vp<{{.*}}> = extract-first-active
16+
; CHECK: LV: Minimum required TC for runtime checks to be profitable:32
17+
entry:
18+
%p1 = alloca [1024 x i8]
19+
%p2 = alloca [1024 x i8]
20+
call void @init_mem(ptr %p1, i64 1024)
21+
call void @init_mem(ptr %p2, i64 1024)
22+
br label %loop
23+
24+
; Two induction variables: %index starts at 3 and steps by 1, %index2 starts
; at 15 and steps by 2. Both are used in %loop.end.
loop:
25+
%index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
26+
%index2 = phi i64 [ %index2.next, %loop.inc ], [ 15, %entry ]
27+
%arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
28+
%ld1 = load i8, ptr %arrayidx, align 1
29+
%arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
30+
%ld2 = load i8, ptr %arrayidx1, align 1
31+
%cmp3 = icmp eq i8 %ld1, %ld2
32+
; Early exit: leave the loop as soon as the two loaded bytes differ.
br i1 %cmp3, label %loop.inc, label %loop.end
33+
34+
loop.inc:
35+
%index.next = add i64 %index, 1
36+
%index2.next = add i64 %index2, 2
37+
%exitcond = icmp ne i64 %index.next, 67
38+
br i1 %exitcond, label %loop, label %loop.end
39+
40+
; On the early exit the phis take the current induction values; on the
; counted exit they take the constants 67 and 98.
loop.end:
41+
%val1 = phi i64 [ %index, %loop ], [ 67, %loop.inc ]
42+
%val2 = phi i64 [ %index2, %loop ], [ 98, %loop.inc ]
43+
%retval = add i64 %val1, %val2
44+
ret i64 %retval
45+
}
46+
47+
attributes #1 = { "target-features"="+sve" vscale_range(1,16) }

llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,7 @@ define i64 @loop_contains_safe_div() #1 {
274274
; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
275275
; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
276276
; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4
277+
; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.umax.i64(i64 12, i64 [[TMP12]])
277278
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
278279
; CHECK: vector.ph:
279280
; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()

0 commit comments

Comments
 (0)