Skip to content
Open
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
994ac69
Precommit tests
lukel97 Sep 8, 2025
319b4e7
[VPlan] Use BlockFrequencyInfo in getPredBlockCostDivisor
lukel97 Sep 18, 2025
0b4a76e
Add comment to test
lukel97 Sep 23, 2025
bf4d92f
Merge branch 'main' of github.com:llvm/llvm-project into loop-vectori…
lukel97 Nov 10, 2025
7718fe6
Reuse HeaderFreq + BBFreq
lukel97 Nov 10, 2025
6ee702b
Update comments
lukel97 Nov 18, 2025
0c92696
Merge branch 'main' of github.com:llvm/llvm-project into loop-vectori…
lukel97 Nov 18, 2025
fb50e0f
Update x86 phase ordering test after merge
lukel97 Nov 18, 2025
134bff3
Regenerate tests with UTC --version 6 + add comments
lukel97 Nov 18, 2025
7723bd9
Merge branch 'main' of github.com:llvm/llvm-project into loop-vectori…
lukel97 Nov 19, 2025
f2b5fce
Lazily fetch BFI to avoid compile time regressions
lukel97 Nov 19, 2025
32fbff4
Memoize GetBFI
lukel97 Nov 19, 2025
16c4b21
Restore new PM tests
lukel97 Nov 19, 2025
e09d1b3
Merge branch 'main' into loop-vectorize/bfi
lukel97 Nov 25, 2025
602fdfa
Remove duplicated VF 1 check lines
lukel97 Nov 25, 2025
9c00e81
Merge branch 'main' of github.com:llvm/llvm-project into loop-vectori…
lukel97 Dec 1, 2025
bb66d78
Use branch_weights to remove diff in replicating-load-store-costs.ll
lukel97 Dec 1, 2025
95e55ef
Remove diff in struct-return
lukel97 Dec 1, 2025
0b5f3c5
Merge branch 'main' of github.com:llvm/llvm-project into loop-vectori…
lukel97 Dec 1, 2025
1022c7f
Move CostKind check up
lukel97 Dec 1, 2025
296797f
Make GetBFI return a reference to match how other passes handle it
lukel97 Dec 1, 2025
7a29373
Memoize AM.getResult
lukel97 Dec 2, 2025
09340fe
Revert "Memoize AM.getResult"
lukel97 Dec 2, 2025
55cb35b
Add LoopVectorizeCostModel::getBFI
lukel97 Dec 2, 2025
42e3c38
Merge branch 'main' of github.com:llvm/llvm-project into loop-vectori…
lukel97 Dec 2, 2025
70298e4
Round divisor
lukel97 Dec 2, 2025
e0b6b94
Remove most of the test diffs
lukel97 Dec 2, 2025
c4b3104
Include <cmath> for std::round
lukel97 Dec 4, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,7 @@ class LoopVectorizationLegality {

/// Return true if the block BB needs to be predicated in order for the loop
/// to be vectorized.
bool blockNeedsPredication(BasicBlock *BB) const;
bool blockNeedsPredication(const BasicBlock *BB) const;

/// Check if this pointer is consecutive when vectorizing. This happens
/// when the last index of the GEP is the induction variable, or that the
Expand Down
2 changes: 1 addition & 1 deletion llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ struct LoopVectorizePass : public PassInfoMixin<LoopVectorizePass> {
LoopInfo *LI;
TargetTransformInfo *TTI;
DominatorTree *DT;
BlockFrequencyInfo *BFI;
std::function<BlockFrequencyInfo *()> GetBFI;
TargetLibraryInfo *TLI;
DemandedBits *DB;
AssumptionCache *AC;
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1423,7 +1423,8 @@ bool LoopVectorizationLegality::isFixedOrderRecurrence(
return FixedOrderRecurrences.count(Phi);
}

bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const {
bool LoopVectorizationLegality::blockNeedsPredication(
const BasicBlock *BB) const {
// When vectorizing early exits, create predicates for the latch block only.
// The early exiting block must be a direct predecessor of the latch at the
// moment.
Expand Down
72 changes: 44 additions & 28 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -873,12 +873,14 @@ class LoopVectorizationCostModel {
const TargetTransformInfo &TTI,
const TargetLibraryInfo *TLI, DemandedBits *DB,
AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, const Function *F,
const LoopVectorizeHints *Hints,
OptimizationRemarkEmitter *ORE,
std::function<BlockFrequencyInfo *()> GetBFI,
const Function *F, const LoopVectorizeHints *Hints,
InterleavedAccessInfo &IAI, bool OptForSize)
: ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
Hints(Hints), InterleaveInfo(IAI), OptForSize(OptForSize) {
TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), GetBFI(GetBFI),
TheFunction(F), Hints(Hints), InterleaveInfo(IAI),
OptForSize(OptForSize) {
if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors)
initializeVScaleForTuning();
CostKind = F->hasMinSize() ? TTI::TCK_CodeSize : TTI::TCK_RecipThroughput;
Expand Down Expand Up @@ -1234,21 +1236,12 @@ class LoopVectorizationCostModel {
/// optimizing for code size it will just be 1 as code size costs don't depend
/// on execution probabilities.
///
/// TODO: We should use actual block probability here, if available.
/// Currently, we always assume predicated blocks have a 50% chance of
/// executing, apart from blocks that are only predicated due to tail folding.
/// Note that if a block wasn't originally predicated but was predicated due
/// to tail folding, the divisor will still be 1 because it will execute for
/// every iteration of the loop header.
inline unsigned
getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind,
BasicBlock *BB) const {
// If a block wasn't originally predicated but was predicated due to
// e.g. tail folding, don't divide the cost. Tail folded loops may still be
// predicated in the final vector loop iteration, but for most loops that
// don't have low trip counts we can expect their probability to be close to
// zero.
if (!Legal->blockNeedsPredication(BB))
return 1;
return CostKind == TTI::TCK_CodeSize ? 1 : 2;
}
const BasicBlock *BB) const;

/// Return the costs for our two available strategies for lowering a
/// div/rem operation which requires speculating at least one lane.
Expand Down Expand Up @@ -1729,6 +1722,11 @@ class LoopVectorizationCostModel {
/// Interface to emit optimization remarks.
OptimizationRemarkEmitter *ORE;

/// A function to lazily fetch BlockFrequencyInfo. This avoids computing it
/// unless necessary, e.g. when the loop isn't legal to vectorize or when
/// there is no predication.
std::function<BlockFrequencyInfo *()> GetBFI;

const Function *TheFunction;

/// Loop Vectorize Hint.
Expand Down Expand Up @@ -2886,6 +2884,23 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
}
}

unsigned LoopVectorizationCostModel::getPredBlockCostDivisor(
    TargetTransformInfo::TargetCostKind CostKind, const BasicBlock *BB) const {
  // If the block wasn't originally predicated then return early to avoid
  // computing BlockFrequencyInfo unnecessarily. A block that is only
  // predicated due to e.g. tail folding still executes on (almost) every
  // iteration of the loop header, so its cost is not discounted.
  if (!Legal->blockNeedsPredication(BB))
    return 1;
  // Code-size costs don't depend on execution probabilities, so no discount.
  if (CostKind == TTI::TCK_CodeSize)
    return 1;

  // The header dominates every block in the loop, so every path through the
  // loop passes through it; the ratio HeaderFreq / BBFreq is therefore the
  // reciprocal of BB's per-iteration execution probability.
  BlockFrequencyInfo *BFI = GetBFI();
  uint64_t HeaderFreq = BFI->getBlockFreq(TheLoop->getHeader()).getFrequency();
  uint64_t BBFreq = BFI->getBlockFreq(BB).getFrequency();
  assert(HeaderFreq >= BBFreq &&
         "Header has smaller block freq than dominated BB?");
  // Guard against a zero block frequency; a frequency of zero would otherwise
  // cause a division by zero below, and no meaningful discount can be
  // computed for a block that is never expected to execute.
  if (BBFreq == 0)
    return 1;
  // Round to the nearest integer rather than truncating toward zero: with
  // truncation a block with e.g. a 2/3 execution probability (HeaderFreq 30,
  // BBFreq 20) would get a divisor of 1 and be costed as if it always
  // executed, making predication look unprofitable.
  return std::max(
      1u, static_cast<unsigned>(
              std::round(static_cast<double>(HeaderFreq) / BBFreq)));
}

std::pair<InstructionCost, InstructionCost>
LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
ElementCount VF) const {
Expand Down Expand Up @@ -9166,8 +9181,9 @@ static bool processLoopInVPlanNativePath(
Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, bool OptForSize, LoopVectorizeHints &Hints,
LoopVectorizationRequirements &Requirements) {
OptimizationRemarkEmitter *ORE,
std::function<BlockFrequencyInfo *()> GetBFI, bool OptForSize,
LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements) {

if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
Expand All @@ -9180,8 +9196,8 @@ static bool processLoopInVPlanNativePath(
ScalarEpilogueLowering SEL =
getScalarEpilogueLowering(F, L, Hints, OptForSize, TTI, TLI, *LVL, &IAI);

LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
&Hints, IAI, OptForSize);
LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE,
GetBFI, F, &Hints, IAI, OptForSize);
// Use the planner for outer loop vectorization.
// TODO: CM is not used at this point inside the planner. Turn CM into an
// optional argument if we don't need it in the future.
Expand Down Expand Up @@ -9881,8 +9897,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {

// Query this against the original loop and save it here because the profile
// of the original loop header may change as the transformation happens.
bool OptForSize = llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
PGSOQueryType::IRPass);
bool OptForSize = llvm::shouldOptimizeForSize(
L->getHeader(), PSI, PSI && PSI->hasProfileSummary() ? GetBFI() : nullptr,
PGSOQueryType::IRPass);

// Check if it is legal to vectorize the loop.
LoopVectorizationRequirements Requirements;
Expand Down Expand Up @@ -9916,7 +9933,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// pipeline.
if (!L->isInnermost())
return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
ORE, OptForSize, Hints, Requirements);
ORE, GetBFI, OptForSize, Hints,
Requirements);

assert(L->isInnermost() && "Inner loop expected.");

Expand Down Expand Up @@ -10019,7 +10037,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {

// Use the cost model.
LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
F, &Hints, IAI, OptForSize);
GetBFI, F, &Hints, IAI, OptForSize);
// Use the planner for vectorization.
LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
ORE);
Expand Down Expand Up @@ -10337,9 +10355,7 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,

auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
BFI = nullptr;
if (PSI && PSI->hasProfileSummary())
BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
GetBFI = [&AM, &F]() { return &AM.getResult<BlockFrequencyAnalysis>(F); };
LoopVectorizeResult Result = runImpl(F);
if (!Result.MadeAnyChange)
return PreservedAnalyses::all();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[TMP17:%.*]], [[PRED_UDIV_CONTINUE2]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i64> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i64> [[WIDE_LOAD]], splat (i64 1)
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i64 0
; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
; CHECK: pred.udiv.if:
Expand Down Expand Up @@ -65,7 +65,7 @@ for.body:
%r = phi i64 [ 0, %entry ], [ %var6, %for.inc ]
%var0 = getelementptr inbounds i64, ptr %a, i64 %i
%var2 = load i64, ptr %var0, align 4
%cond0 = icmp sgt i64 %var2, 0
%cond0 = icmp sgt i64 %var2, 1
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This constant was changed to keep the old branch probability the same and to keep the block scalarized, since with BFI `icmp sgt %x, 0` is predicted to be taken slightly more than 50% of the time.

br i1 %cond0, label %if.then, label %for.inc

if.then:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1208,8 +1208,7 @@ define void @pred_udiv_select_cost(ptr %A, ptr %B, ptr %C, i64 %n, i8 %y) #1 {
; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
; DEFAULT-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; DEFAULT-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2
; DEFAULT-NEXT: [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP2]], i64 8)
; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; DEFAULT: [[VECTOR_MEMCHECK]]:
; DEFAULT-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ define i64 @same_exit_block_pre_inc_use1_nosve() {
; CHECK-NEXT: Cost of 48 for VF 16: EMIT vp<{{.*}}> = first-active-lane ir<%cmp3>
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.*}}> = add
; CHECK-NEXT: Cost of 0 for VF 16: vp<{{.*}}> = DERIVED-IV
; CHECK: LV: Minimum required TC for runtime checks to be profitable:160
; CHECK-NEXT: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (64 < 160)
; CHECK: LV: Minimum required TC for runtime checks to be profitable:128
; CHECK-NEXT: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (64 < 128)
; CHECK-NEXT: LV: Too many memory checks needed.
entry:
%p1 = alloca [1024 x i8]
Expand Down Expand Up @@ -105,7 +105,7 @@ loop.header:
%gep.src = getelementptr inbounds i64, ptr %src, i64 %iv
%l = load i64, ptr %gep.src, align 1
%t = trunc i64 %l to i1
br i1 %t, label %exit.0, label %loop.latch
br i1 %t, label %exit.0, label %loop.latch, !prof !0
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added to retain the old branch probability. With BFI, the probability of exit.0 being taken is computed as 3% (why that is, I'm not sure) and the IV increment isn't discounted in the VF=1 plan.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That is weird. I'd expect trunc i64 to i1 to give a probability of 50% given it's essentially asking for the likelihood of loading an odd-numbered value!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, it looks like it's because this branch goes to the loop exit, and as a heuristic BPI scales down the weight of the exiting branch by the trip count, which I guess makes sense:

if (isLoopExitingEdge(Edge) &&
// Avoid adjustment of ZERO weight since it should remain unchanged.
Weight != static_cast<uint32_t>(BlockExecWeight::ZERO)) {
// Scale down loop exiting weight by trip count.
Weight = std::max(
static_cast<uint32_t>(BlockExecWeight::LOWEST_NON_ZERO),
Weight.value_or(static_cast<uint32_t>(BlockExecWeight::DEFAULT)) /
TC);
}


loop.latch:
%iv.next = add i64 %iv, 1
Expand All @@ -120,4 +120,6 @@ exit.1:
ret i64 0
}

!0 = !{!"branch_weights", i32 1, i32 1}

attributes #1 = { "target-features"="+sve" vscale_range(1,16) }
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ for.body: ; preds = %for.body.preheader,
%i.07 = phi i64 [ %inc, %for.inc ], [ 0, %for.body.preheader ]
%arrayidx = getelementptr inbounds i32, ptr %b, i64 %i.07
%0 = load i32, ptr %arrayidx, align 4
%tobool.not = icmp eq i32 %0, 0
%tobool.not = icmp eq i32 %0, 1
br i1 %tobool.not, label %for.inc, label %if.then

if.then: ; preds = %for.body
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -386,7 +386,7 @@ define i32 @diff_exit_block_needs_scev_check(i32 %end) {
; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[END]] to i10
; CHECK-NEXT: [[TMP1:%.*]] = zext i10 [[TMP0]] to i64
; CHECK-NEXT: [[UMAX1:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 1)
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX1]], 12
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX1]], 8
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
; CHECK: vector.scevcheck:
; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[END_CLAMPED]], i32 1)
Expand Down
Loading