28 commits
994ac69  Precommit tests (lukel97, Sep 8, 2025)
319b4e7  [VPlan] Use BlockFrequencyInfo in getPredBlockCostDivisor (lukel97, Sep 18, 2025)
0b4a76e  Add comment to test (lukel97, Sep 23, 2025)
bf4d92f  Merge branch 'main' of github.com:llvm/llvm-project into loop-vectori… (lukel97, Nov 10, 2025)
7718fe6  Reuse HeaderFreq + BBFreq (lukel97, Nov 10, 2025)
6ee702b  Update comments (lukel97, Nov 18, 2025)
0c92696  Merge branch 'main' of github.com:llvm/llvm-project into loop-vectori… (lukel97, Nov 18, 2025)
fb50e0f  Update x86 phase ordering test after merge (lukel97, Nov 18, 2025)
134bff3  Regenerate tests with UTC --version 6 + add comments (lukel97, Nov 18, 2025)
7723bd9  Merge branch 'main' of github.com:llvm/llvm-project into loop-vectori… (lukel97, Nov 19, 2025)
f2b5fce  Lazily fetch BFI to avoid compile time regressions (lukel97, Nov 19, 2025)
32fbff4  Memoize GetBFI (lukel97, Nov 19, 2025)
16c4b21  Restore new PM tests (lukel97, Nov 19, 2025)
e09d1b3  Merge branch 'main' into loop-vectorize/bfi (lukel97, Nov 25, 2025)
602fdfa  Remove duplicated VF 1 check lines (lukel97, Nov 25, 2025)
9c00e81  Merge branch 'main' of github.com:llvm/llvm-project into loop-vectori… (lukel97, Dec 1, 2025)
bb66d78  Use branch_weights to remove diff in replicating-load-store-costs.ll (lukel97, Dec 1, 2025)
95e55ef  Remove diff in struct-return (lukel97, Dec 1, 2025)
0b5f3c5  Merge branch 'main' of github.com:llvm/llvm-project into loop-vectori… (lukel97, Dec 1, 2025)
1022c7f  Move CostKind check up (lukel97, Dec 1, 2025)
296797f  Make GetBFI return a reference to match how other passes handle it (lukel97, Dec 1, 2025)
7a29373  Memoize AM.getResult (lukel97, Dec 2, 2025)
09340fe  Revert "Memoize AM.getResult" (lukel97, Dec 2, 2025)
55cb35b  Add LoopVectorizeCostModel::getBFI (lukel97, Dec 2, 2025)
42e3c38  Merge branch 'main' of github.com:llvm/llvm-project into loop-vectori… (lukel97, Dec 2, 2025)
70298e4  Round divisor (lukel97, Dec 2, 2025)
e0b6b94  Remove most of the test diffs (lukel97, Dec 2, 2025)
c4b3104  Include <cmath> for std::round (lukel97, Dec 4, 2025)
@@ -364,7 +364,7 @@ class LoopVectorizationLegality {

/// Return true if the block BB needs to be predicated in order for the loop
/// to be vectorized.
bool blockNeedsPredication(BasicBlock *BB) const;
bool blockNeedsPredication(const BasicBlock *BB) const;

/// Check if this pointer is consecutive when vectorizing. This happens
/// when the last index of the GEP is the induction variable, or that the
2 changes: 1 addition & 1 deletion llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
@@ -145,7 +145,7 @@ struct LoopVectorizePass : public PassInfoMixin<LoopVectorizePass> {
LoopInfo *LI;
TargetTransformInfo *TTI;
DominatorTree *DT;
BlockFrequencyInfo *BFI;
std::function<BlockFrequencyInfo &()> GetBFI;
TargetLibraryInfo *TLI;
DemandedBits *DB;
AssumptionCache *AC;
3 changes: 2 additions & 1 deletion llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1443,7 +1443,8 @@ bool LoopVectorizationLegality::isFixedOrderRecurrence(
return FixedOrderRecurrences.count(Phi);
}

bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) const {
bool LoopVectorizationLegality::blockNeedsPredication(
const BasicBlock *BB) const {
// When vectorizing early exits, create predicates for the latch block only.
// The early exiting block must be a direct predecessor of the latch at the
// moment.
93 changes: 59 additions & 34 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -146,6 +146,7 @@
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <functional>
#include <iterator>
@@ -873,12 +874,14 @@ class LoopVectorizationCostModel {
const TargetTransformInfo &TTI,
const TargetLibraryInfo *TLI, DemandedBits *DB,
AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, const Function *F,
const LoopVectorizeHints *Hints,
OptimizationRemarkEmitter *ORE,
std::function<BlockFrequencyInfo &()> GetBFI,
const Function *F, const LoopVectorizeHints *Hints,
InterleavedAccessInfo &IAI, bool OptForSize)
: ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
Hints(Hints), InterleaveInfo(IAI), OptForSize(OptForSize) {
TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), GetBFI(GetBFI),
TheFunction(F), Hints(Hints), InterleaveInfo(IAI),
OptForSize(OptForSize) {
if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors)
initializeVScaleForTuning();
CostKind = F->hasMinSize() ? TTI::TCK_CodeSize : TTI::TCK_RecipThroughput;
@@ -1219,7 +1222,7 @@ class LoopVectorizationCostModel {
/// for which our chosen predication strategy is scalarization (i.e. we
/// don't have an alternate strategy such as masking available).
/// \p VF is the vectorization factor that will be used to vectorize \p I.
bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
bool isScalarWithPredication(Instruction *I, ElementCount VF);

/// Returns true if \p I is an instruction that needs to be predicated
/// at runtime. The result is independent of the predication mechanism.
@@ -1234,29 +1237,19 @@
/// optimizing for code size it will just be 1 as code size costs don't depend
/// on execution probabilities.
///
/// TODO: We should use actual block probability here, if available.
/// Currently, we always assume predicated blocks have a 50% chance of
/// executing, apart from blocks that are only predicated due to tail folding.
/// Note that if a block wasn't originally predicated but was predicated due
/// to tail folding, the divisor will still be 1 because it will execute for
/// every iteration of the loop header.
inline unsigned
getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind,
BasicBlock *BB) const {
// If a block wasn't originally predicated but was predicated due to
// e.g. tail folding, don't divide the cost. Tail folded loops may still be
// predicated in the final vector loop iteration, but for most loops that
// don't have low trip counts we can expect their probability to be close to
// zero.
if (!Legal->blockNeedsPredication(BB))
return 1;
return CostKind == TTI::TCK_CodeSize ? 1 : 2;
}
const BasicBlock *BB);

/// Return the costs for our two available strategies for lowering a
/// div/rem operation which requires speculating at least one lane.
/// First result is for scalarization (will be invalid for scalable
/// vectors); second is for the safe-divisor strategy.
std::pair<InstructionCost, InstructionCost>
getDivRemSpeculationCost(Instruction *I,
ElementCount VF) const;
getDivRemSpeculationCost(Instruction *I, ElementCount VF);

/// Returns true if \p I is a memory instruction with consecutive memory
/// access that can be widened.
@@ -1729,6 +1722,17 @@ class LoopVectorizationCostModel {
/// Interface to emit optimization remarks.
OptimizationRemarkEmitter *ORE;

/// A function to lazily fetch BlockFrequencyInfo. This avoids computing it
/// unless necessary, e.g. when the loop isn't legal to vectorize or when
/// there is no predication.
std::function<BlockFrequencyInfo &()> GetBFI;
BlockFrequencyInfo *BFI = nullptr;
BlockFrequencyInfo &getBFI() {
if (!BFI)
BFI = &GetBFI();
return *BFI;
}

const Function *TheFunction;

/// Loop Vectorize Hint.
@@ -2792,8 +2796,8 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
Scalars[VF].insert_range(Worklist);
}

bool LoopVectorizationCostModel::isScalarWithPredication(
Instruction *I, ElementCount VF) const {
bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
ElementCount VF) {
if (!isPredicatedInst(I))
return false;

@@ -2886,9 +2890,26 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
}
}

unsigned LoopVectorizationCostModel::getPredBlockCostDivisor(
TargetTransformInfo::TargetCostKind CostKind, const BasicBlock *BB) {
if (CostKind == TTI::TCK_CodeSize)
return 1;
// If the block wasn't originally predicated then return early to avoid
// computing BlockFrequencyInfo unnecessarily.
if (!Legal->blockNeedsPredication(BB))
return 1;

uint64_t HeaderFreq =
getBFI().getBlockFreq(TheLoop->getHeader()).getFrequency();
uint64_t BBFreq = getBFI().getBlockFreq(BB).getFrequency();
assert(HeaderFreq >= BBFreq &&
"Header has smaller block freq than dominated BB?");
return std::round((double)HeaderFreq / BBFreq);
}

std::pair<InstructionCost, InstructionCost>
LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
ElementCount VF) const {
ElementCount VF) {
assert(I->getOpcode() == Instruction::UDiv ||
I->getOpcode() == Instruction::SDiv ||
I->getOpcode() == Instruction::SRem ||
Expand Down Expand Up @@ -9167,8 +9188,9 @@ static bool processLoopInVPlanNativePath(
Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, bool OptForSize, LoopVectorizeHints &Hints,
LoopVectorizationRequirements &Requirements) {
OptimizationRemarkEmitter *ORE,
std::function<BlockFrequencyInfo &()> GetBFI, bool OptForSize,
LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements) {

if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
@@ -9181,8 +9203,8 @@
ScalarEpilogueLowering SEL =
getScalarEpilogueLowering(F, L, Hints, OptForSize, TTI, TLI, *LVL, &IAI);

LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
&Hints, IAI, OptForSize);
LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE,
GetBFI, F, &Hints, IAI, OptForSize);
// Use the planner for outer loop vectorization.
// TODO: CM is not used at this point inside the planner. Turn CM into an
// optional argument if we don't need it in the future.
@@ -9882,8 +9904,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {

// Query this against the original loop and save it here because the profile
// of the original loop header may change as the transformation happens.
bool OptForSize = llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
PGSOQueryType::IRPass);
bool OptForSize = llvm::shouldOptimizeForSize(
L->getHeader(), PSI,
PSI && PSI->hasProfileSummary() ? &GetBFI() : nullptr,
PGSOQueryType::IRPass);

// Check if it is legal to vectorize the loop.
LoopVectorizationRequirements Requirements;
Expand Down Expand Up @@ -9917,7 +9941,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// pipeline.
if (!L->isInnermost())
return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
ORE, OptForSize, Hints, Requirements);
ORE, GetBFI, OptForSize, Hints,
Requirements);

assert(L->isInnermost() && "Inner loop expected.");

@@ -10020,7 +10045,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {

// Use the cost model.
LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
F, &Hints, IAI, OptForSize);
GetBFI, F, &Hints, IAI, OptForSize);
// Use the planner for vectorization.
LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
ORE);
@@ -10338,9 +10363,9 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,

auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
BFI = nullptr;
if (PSI && PSI->hasProfileSummary())
BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
GetBFI = [&AM, &F]() -> BlockFrequencyInfo & {
return AM.getResult<BlockFrequencyAnalysis>(F);
};
LoopVectorizeResult Result = runImpl(F);
if (!Result.MadeAnyChange)
return PreservedAnalyses::all();
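
The GetBFI plumbing in this file boils down to a lazy fetch-and-memoize pattern: the std::function defers computing BlockFrequencyInfo until a predicated block actually asks for a cost divisor, and the cost model caches the result so the callback runs at most once. Below is a minimal standalone sketch of that shape, with a placeholder Analysis type standing in for BlockFrequencyInfo and a lambda standing in for the analysis-manager lookup (illustration only, not code from the patch):

#include <functional>
#include <iostream>
#include <optional>

// Stand-in for BlockFrequencyInfo: expensive to build, so only build it when
// something actually asks for it.
struct Analysis {
  Analysis() { std::cout << "computing analysis (expensive)\n"; }
  int Value = 42;
};

// Stand-in for the cost model: holds a lazy accessor plus a memoized pointer.
struct CostModel {
  std::function<Analysis &()> GetAnalysis; // like GetBFI
  Analysis *Cached = nullptr;              // like the cached BFI member

  Analysis &get() {
    if (!Cached)               // the first call pays for the analysis
      Cached = &GetAnalysis();
    return *Cached;            // later calls reuse the same result
  }
};

int main() {
  std::optional<Analysis> Storage; // owned elsewhere (the analysis manager in LLVM)
  CostModel CM;
  CM.GetAnalysis = [&]() -> Analysis & {
    if (!Storage)
      Storage.emplace();
    return *Storage;
  };
  std::cout << CM.get().Value << "\n"; // triggers the one-time computation
  std::cout << CM.get().Value << "\n"; // no recomputation
  return 0;
}

Running this prints the "computing analysis (expensive)" line once even though get() is called twice, which is the behaviour the patch relies on to avoid compile-time regressions when BFI is never needed.
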
@@ -57,8 +57,8 @@ define i64 @same_exit_block_pre_inc_use1_nosve() {
; CHECK-NEXT: Cost of 48 for VF 16: EMIT vp<{{.*}}> = first-active-lane ir<%cmp3>
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.*}}> = add
; CHECK-NEXT: Cost of 0 for VF 16: vp<{{.*}}> = DERIVED-IV
; CHECK: LV: Minimum required TC for runtime checks to be profitable:160
; CHECK-NEXT: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (64 < 160)
; CHECK: LV: Minimum required TC for runtime checks to be profitable:128
; CHECK-NEXT: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (64 < 128)
; CHECK-NEXT: LV: Too many memory checks needed.
entry:
%p1 = alloca [1024 x i8]
@@ -105,7 +105,7 @@ loop.header:
%gep.src = getelementptr inbounds i64, ptr %src, i64 %iv
%l = load i64, ptr %gep.src, align 1
%t = trunc i64 %l to i1
br i1 %t, label %exit.0, label %loop.latch
br i1 %t, label %exit.0, label %loop.latch, !prof !0
lukel97 (Contributor, Author):
Added to retain the old branch probability. With BFI, the probability of exit.0 being taken is computed as 3% (why that is, I'm not sure) and the IV increment isn't discounted in the VF=1 plan.

Contributor:
That is weird. I'd expect trunc i64 to i1 to give a probability of 50% given it's essentially asking for the likelihood of loading an odd-numbered value!

lukel97 (Contributor, Author):
Oh, it looks like it's because this is branching to the loop exit, and BPI scales down the weight of the exiting branch by the trip count as a heuristic, which I guess makes sense:

  if (isLoopExitingEdge(Edge) &&
      // Avoid adjustment of ZERO weight since it should remain unchanged.
      Weight != static_cast<uint32_t>(BlockExecWeight::ZERO)) {
    // Scale down loop exiting weight by trip count.
    Weight = std::max(
        static_cast<uint32_t>(BlockExecWeight::LOWEST_NON_ZERO),
        Weight.value_or(static_cast<uint32_t>(BlockExecWeight::DEFAULT)) /
            TC);
  }
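
For reference, plugging those numbers into the new getPredBlockCostDivisor gives a quick sanity check (rough figures from this thread, with the header frequency normalized to 1; not output from the pass). With the heuristic ~3% exit probability the latch runs on roughly 97% of header iterations, so

    divisor = round(HeaderFreq / LatchFreq) = round(1 / 0.97) = 1

and the latch cost, including the IV increment, is not discounted. With the 1:1 branch_weights the latch drops back to half the header frequency, so round(1 / 0.5) = 2, matching the old fixed divisor for predicated blocks.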


loop.latch:
%iv.next = add i64 %iv, 1
@@ -120,4 +120,6 @@
ret i64 0
}

!0 = !{!"branch_weights", i32 1, i32 1}

attributes #1 = { "target-features"="+sve" vscale_range(1,16) }
53 changes: 53 additions & 0 deletions llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll
@@ -385,6 +385,59 @@ attributes #1 = { "target-cpu"="neoverse-v2" }
!1 = !{!"llvm.loop.mustprogress"}
!2 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
!3 = !{!"llvm.loop.vectorize.enable", i1 true}

; BFI computes that the "if" block is taken 20 times and the loop 32 times. Make sure
; we round the divisor up to 2 so that we don't vectorize the loop unprofitably.
define void @round_scalar_pred_divisor(ptr %dst, double %x) {
; CHECK-LABEL: define void @round_scalar_pred_divisor(
; CHECK-SAME: ptr [[DST:%.*]], double [[X:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ]
; CHECK-NEXT: [[C:%.*]] = fcmp une double [[X]], 0.000000e+00
; CHECK-NEXT: br i1 [[C]], label %[[IF:.*]], label %[[LATCH]]
; CHECK: [[IF]]:
; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[IV]] to i32
; CHECK-NEXT: [[UITOFP:%.*]] = uitofp i32 [[TRUNC]] to double
; CHECK-NEXT: [[SIN:%.*]] = tail call double @llvm.sin.f64(double [[UITOFP]])
; CHECK-NEXT: [[FPTRUNC:%.*]] = fptrunc double [[SIN]] to float
; CHECK-NEXT: br label %[[LATCH]]
; CHECK: [[LATCH]]:
; CHECK-NEXT: [[PHI:%.*]] = phi float [ [[FPTRUNC]], %[[IF]] ], [ 0.000000e+00, %[[LOOP]] ]
; CHECK-NEXT: store float [[PHI]], ptr [[DST]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 1024
; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
entry:
br label %loop

loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
%c = fcmp une double %x, 0.0
br i1 %c, label %if, label %latch

if:
%trunc = trunc i64 %iv to i32
%uitofp = uitofp i32 %trunc to double
%sin = tail call double @llvm.sin(double %uitofp)
%fptrunc = fptrunc double %sin to float
br label %latch

latch:
%phi = phi float [ %fptrunc, %if ], [ 0.0, %loop ]
store float %phi, ptr %dst
%iv.next = add i64 %iv, 1
%ec = icmp eq i64 %iv, 1024
br i1 %ec, label %exit, label %loop

exit:
ret void
}

;.
; CHECK: [[META0]] = !{[[META1:![0-9]+]]}
; CHECK: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]}
@@ -386,7 +386,7 @@ define i32 @diff_exit_block_needs_scev_check(i32 %end) {
; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[END]] to i10
; CHECK-NEXT: [[TMP1:%.*]] = zext i10 [[TMP0]] to i64
; CHECK-NEXT: [[UMAX1:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 1)
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX1]], 12
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX1]], 8
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
; CHECK: vector.scevcheck:
; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[END_CLAMPED]], i32 1)