Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
994ac69
Precommit tests
lukel97 Sep 8, 2025
319b4e7
[VPlan] Use BlockFrequencyInfo in getPredBlockCostDivisor
lukel97 Sep 18, 2025
0b4a76e
Add comment to test
lukel97 Sep 23, 2025
bf4d92f
Merge branch 'main' of github.com:llvm/llvm-project into loop-vectori…
lukel97 Nov 10, 2025
7718fe6
Reuse HeaderFreq + BBFreq
lukel97 Nov 10, 2025
6ee702b
Update comments
lukel97 Nov 18, 2025
0c92696
Merge branch 'main' of github.com:llvm/llvm-project into loop-vectori…
lukel97 Nov 18, 2025
fb50e0f
Update x86 phase ordering test after merge
lukel97 Nov 18, 2025
134bff3
Regenerate tests with UTC --version 6 + add comments
lukel97 Nov 18, 2025
7723bd9
Merge branch 'main' of github.com:llvm/llvm-project into loop-vectori…
lukel97 Nov 19, 2025
f2b5fce
Lazily fetch BFI to avoid compile time regressions
lukel97 Nov 19, 2025
32fbff4
Memoize GetBFI
lukel97 Nov 19, 2025
16c4b21
Restore new PM tests
lukel97 Nov 19, 2025
e09d1b3
Merge branch 'main' into loop-vectorize/bfi
lukel97 Nov 25, 2025
602fdfa
Remove duplicated VF 1 check lines
lukel97 Nov 25, 2025
9c00e81
Merge branch 'main' of github.com:llvm/llvm-project into loop-vectori…
lukel97 Dec 1, 2025
bb66d78
Use branch_weights to remove diff in replicating-load-store-costs.ll
lukel97 Dec 1, 2025
95e55ef
Remove diff in struct-return
lukel97 Dec 1, 2025
0b5f3c5
Merge branch 'main' of github.com:llvm/llvm-project into loop-vectori…
lukel97 Dec 1, 2025
1022c7f
Move CostKind check up
lukel97 Dec 1, 2025
296797f
Make GetBFI return a reference to match how other passes handle it
lukel97 Dec 1, 2025
7a29373
Memoize AM.getResult
lukel97 Dec 2, 2025
09340fe
Revert "Memoize AM.getResult"
lukel97 Dec 2, 2025
55cb35b
Add LoopVectorizeCostModel::getBFI
lukel97 Dec 2, 2025
42e3c38
Merge branch 'main' of github.com:llvm/llvm-project into loop-vectori…
lukel97 Dec 2, 2025
70298e4
Round divisor
lukel97 Dec 2, 2025
e0b6b94
Remove most of the test diffs
lukel97 Dec 2, 2025
c4b3104
Include <cmath> for std::round
lukel97 Dec 4, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 41 additions & 9 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -578,8 +578,10 @@ class InnerLoopVectorizer {
/// The profitability analysis.
LoopVectorizationCostModel *Cost;

/// BFI and PSI are used to check for profile guided size optimizations.
/// Used to calculate the probability of predicated blocks in
/// getPredBlockCostDivisor.
BlockFrequencyInfo *BFI;
/// Used to check for profile guided size optimizations.
ProfileSummaryInfo *PSI;

/// Structure to hold information about generated runtime checks, responsible
Expand Down Expand Up @@ -900,7 +902,7 @@ class LoopVectorizationCostModel {
InterleavedAccessInfo &IAI,
ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
: ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), BFI(BFI), TheFunction(F),
Hints(Hints), InterleaveInfo(IAI) {
if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors)
initializeVScaleForTuning();
Expand Down Expand Up @@ -1249,6 +1251,17 @@ class LoopVectorizationCostModel {
/// Superset of instructions that return true for isScalarWithPredication.
bool isPredicatedInst(Instruction *I) const;

/// A helper function that returns how much we should divide the cost of a
/// predicated block by. Typically this is the reciprocal of the block
/// probability, i.e. if we return X we are assuming the predicated block will
/// execute once for every X iterations of the loop header so the block should
/// only contribute 1/X of its cost to the total cost calculation, but when
/// optimizing for code size it will just be 1 as code size costs don't depend
/// on execution probabilities.
inline unsigned
getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind,
const BasicBlock *BB) const;

/// Return the costs for our two available strategies for lowering a
/// div/rem operation which requires speculating at least one lane.
/// First result is for scalarization (will be invalid for scalable
Expand Down Expand Up @@ -1711,6 +1724,8 @@ class LoopVectorizationCostModel {
/// Interface to emit optimization remarks.
OptimizationRemarkEmitter *ORE;

const BlockFrequencyInfo *BFI;

const Function *TheFunction;

/// Loop Vectorize Hint.
Expand Down Expand Up @@ -2866,6 +2881,19 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
}
}

/// Returns the divisor to apply to a predicated block's cost, i.e. the
/// reciprocal of the block's execution probability relative to the loop
/// header, derived from BlockFrequencyInfo. For code-size costs the divisor
/// is always 1, since size does not depend on execution probabilities.
unsigned LoopVectorizationCostModel::getPredBlockCostDivisor(
    TargetTransformInfo::TargetCostKind CostKind, const BasicBlock *BB) const {
  // Code-size costs are not scaled by execution probability.
  if (CostKind == TTI::TCK_CodeSize)
    return 1;

  uint64_t HeaderFreq = BFI->getBlockFreq(TheLoop->getHeader()).getFrequency();
  uint64_t BBFreq = BFI->getBlockFreq(BB).getFrequency();
  assert(HeaderFreq >= BBFreq &&
         "Header has smaller block freq than dominated BB?");
  // Guard against a zero frequency for BB (BFI may consider a block
  // never-executed); unsigned division by zero is UB. Fall back to no
  // discount in that case.
  if (BBFreq == 0)
    return 1;
  // Reuse the frequencies computed above rather than querying BFI again.
  return HeaderFreq / BBFreq;
}

std::pair<InstructionCost, InstructionCost>
LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
ElementCount VF) const {
Expand Down Expand Up @@ -2902,7 +2930,8 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
// Scale the cost by the probability of executing the predicated blocks.
// This assumes the predicated block for each vector lane is equally
// likely.
ScalarizationCost = ScalarizationCost / getPredBlockCostDivisor(CostKind);
ScalarizationCost =
ScalarizationCost / getPredBlockCostDivisor(CostKind, I->getParent());
}

InstructionCost SafeDivisorCost = 0;
Expand Down Expand Up @@ -5035,7 +5064,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
}

// Scale the total scalar cost by block probability.
ScalarCost /= getPredBlockCostDivisor(CostKind);
ScalarCost /= getPredBlockCostDivisor(CostKind, PredInst->getParent());

// Compute the discount. A non-negative discount means the vector version
// of the instruction costs more, and scalarizing would be beneficial.
Expand Down Expand Up @@ -5088,7 +5117,7 @@ InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
// cost by the probability of executing it. blockNeedsPredication from
// Legal is used so as to not include all blocks in tail folded loops.
if (VF.isScalar() && Legal->blockNeedsPredication(BB))
BlockCost /= getPredBlockCostDivisor(CostKind);
BlockCost /= getPredBlockCostDivisor(CostKind, BB);

Cost += BlockCost;
}
Expand Down Expand Up @@ -5167,7 +5196,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
// conditional branches, but may not be executed for each vector lane. Scale
// the cost by the probability of executing the predicated block.
if (isPredicatedInst(I)) {
Cost /= getPredBlockCostDivisor(CostKind);
Cost /= getPredBlockCostDivisor(CostKind, I->getParent());

// Add the cost of an i1 extract and a branch
auto *VecI1Ty =
Expand Down Expand Up @@ -6727,6 +6756,11 @@ bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
SkipCostComputation.contains(UI);
}

/// Forward to the legacy cost model, which owns the BlockFrequencyInfo and
/// loop context needed to compute the predicated-block cost divisor for \p BB
/// under cost kind \p CostKind.
unsigned VPCostContext::getPredBlockCostDivisor(
    TargetTransformInfo::TargetCostKind CostKind, const BasicBlock *BB) const {
  return CM.getPredBlockCostDivisor(CostKind, BB);
}

InstructionCost
LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
VPCostContext &CostCtx) const {
Expand Down Expand Up @@ -10310,9 +10344,7 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,

auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
BFI = nullptr;
if (PSI && PSI->hasProfileSummary())
BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
LoopVectorizeResult Result = runImpl(F);
if (!Result.MadeAnyChange)
return PreservedAnalyses::all();
Expand Down
4 changes: 3 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPlan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -855,7 +855,9 @@ InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) {
// For the scalar case, we may not always execute the original predicated
// block. Thus, scale the block's cost by the probability of executing it.
if (VF.isScalar())
return ThenCost / getPredBlockCostDivisor(Ctx.CostKind);
if (auto *VPIRBB = dyn_cast<VPIRBasicBlock>(Then))
return ThenCost / Ctx.getPredBlockCostDivisor(Ctx.CostKind,
VPIRBB->getIRBasicBlock());

return ThenCost;
}
Expand Down
18 changes: 3 additions & 15 deletions llvm/lib/Transforms/Vectorize/VPlanHelpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,21 +50,6 @@ Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF);
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
int64_t Step);

/// A helper function that returns how much we should divide the cost of a
/// predicated block by. Typically this is the reciprocal of the block
/// probability, i.e. if we return X we are assuming the predicated block will
/// execute once for every X iterations of the loop header so the block should
/// only contribute 1/X of its cost to the total cost calculation, but when
/// optimizing for code size it will just be 1 as code size costs don't depend
/// on execution probabilities.
///
/// TODO: We should use actual block probability here, if available. Currently,
/// we always assume predicated blocks have a 50% chance of executing.
inline unsigned
getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind) {
return CostKind == TTI::TCK_CodeSize ? 1 : 2;
}

/// A range of powers-of-2 vectorization factors with fixed start and
/// adjustable end. The range includes start and excludes end, e.g.,:
/// [1, 16) = {1, 2, 4, 8}
Expand Down Expand Up @@ -378,6 +363,9 @@ struct VPCostContext {
InstructionCost getScalarizationOverhead(Type *ResultTy,
ArrayRef<const VPValue *> Operands,
ElementCount VF);

unsigned getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind,
const BasicBlock *BB) const;
};

/// This class can be used to assign names to VPValues. For VPValues without
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3170,7 +3170,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
// Scale the cost by the probability of executing the predicated blocks.
// This assumes the predicated block for each vector lane is equally
// likely.
ScalarCost /= getPredBlockCostDivisor(Ctx.CostKind);
ScalarCost /= Ctx.getPredBlockCostDivisor(Ctx.CostKind, UI->getParent());
return ScalarCost;
}
case Instruction::Load:
Expand Down
2 changes: 2 additions & 0 deletions llvm/test/Other/new-pm-defaults.ll
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,8 @@
; CHECK-O-NEXT: Running analysis: LoopAccessAnalysis on foo
; CHECK-O-NEXT: Running pass: InjectTLIMappings
; CHECK-O-NEXT: Running pass: LoopVectorizePass
; CHECK-O-NEXT: Running analysis: BlockFrequencyAnalysis on foo
; CHECK-O-NEXT: Running analysis: BranchProbabilityAnalysis on foo
; CHECK-O-NEXT: Running pass: InferAlignmentPass
; CHECK-O-NEXT: Running pass: LoopLoadEliminationPass
; CHECK-O-NEXT: Running pass: InstCombinePass
Expand Down
2 changes: 2 additions & 0 deletions llvm/test/Other/new-pm-lto-defaults.ll
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,8 @@
; CHECK-O23SZ-NEXT: Running analysis: LoopAccessAnalysis on foo
; CHECK-O23SZ-NEXT: Running pass: LoopVectorizePass on foo
; CHECK-O23SZ-NEXT: Running analysis: DemandedBitsAnalysis on foo
; CHECK-O23SZ-NEXT: Running analysis: BlockFrequencyAnalysis on foo
; CHECK-O23SZ-NEXT: Running analysis: BranchProbabilityAnalysis on foo
; CHECK-O23SZ-NEXT: Running pass: InferAlignmentPass on foo
; CHECK-O23SZ-NEXT: Running pass: LoopUnrollPass on foo
; CHECK-O23SZ-NEXT: WarnMissedTransformationsPass on foo
Expand Down
2 changes: 2 additions & 0 deletions llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,8 @@
; CHECK-POSTLINK-O-NEXT: Running analysis: LoopAccessAnalysis on foo
; CHECK-POSTLINK-O-NEXT: Running pass: InjectTLIMappings
; CHECK-POSTLINK-O-NEXT: Running pass: LoopVectorizePass
; CHECK-POSTLINK-O-NEXT: Running analysis: BlockFrequencyAnalysis on foo
; CHECK-POSTLINK-O-NEXT: Running analysis: BranchProbabilityAnalysis on foo
; CHECK-POSTLINK-O-NEXT: Running pass: InferAlignmentPass
; CHECK-POSTLINK-O-NEXT: Running pass: LoopLoadEliminationPass
; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) {
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[TMP17:%.*]], [[PRED_UDIV_CONTINUE2]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i64> [[WIDE_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i64> [[WIDE_LOAD]], splat (i64 1)
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i64 0
; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
; CHECK: pred.udiv.if:
Expand Down Expand Up @@ -65,7 +65,7 @@ for.body:
%r = phi i64 [ 0, %entry ], [ %var6, %for.inc ]
%var0 = getelementptr inbounds i64, ptr %a, i64 %i
%var2 = load i64, ptr %var0, align 4
%cond0 = icmp sgt i64 %var2, 0
%cond0 = icmp sgt i64 %var2, 1
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This constant was changed to keep the old branch probability the same and keep the block scalarized, since with BFI icmp sgt %x, 0 is predicted to be slightly > 50%.

br i1 %cond0, label %if.then, label %for.inc

if.then:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -612,63 +612,18 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
;
; COMMON-LABEL: define void @low_trip_count_fold_tail_scalarized_store(
; COMMON-SAME: ptr [[DST:%.*]]) {
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: br label %[[VECTOR_PH:.*]]
; COMMON: [[VECTOR_PH]]:
; COMMON-NEXT: br label %[[VECTOR_BODY:.*]]
; COMMON: [[VECTOR_BODY]]:
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
; COMMON: [[PRED_STORE_IF]]:
; COMMON-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[DST]], i64 0
; COMMON-NEXT: store i8 0, ptr [[TMP0]], align 1
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE]]
; COMMON: [[PRED_STORE_CONTINUE]]:
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
; COMMON: [[PRED_STORE_IF1]]:
; COMMON-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[DST]], i64 1
; COMMON-NEXT: store i8 1, ptr [[TMP1]], align 1
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE2]]
; COMMON: [[PRED_STORE_CONTINUE2]]:
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
; COMMON: [[PRED_STORE_IF3]]:
; COMMON-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i64 2
; COMMON-NEXT: store i8 2, ptr [[TMP2]], align 1
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE4]]
; COMMON: [[PRED_STORE_CONTINUE4]]:
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
; COMMON: [[PRED_STORE_IF5]]:
; COMMON-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 3
; COMMON-NEXT: store i8 3, ptr [[TMP3]], align 1
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE6]]
; COMMON: [[PRED_STORE_CONTINUE6]]:
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
; COMMON: [[PRED_STORE_IF7]]:
; COMMON-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[DST]], i64 4
; COMMON-NEXT: store i8 4, ptr [[TMP4]], align 1
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE8]]
; COMMON: [[PRED_STORE_CONTINUE8]]:
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
; COMMON: [[PRED_STORE_IF9]]:
; COMMON-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DST]], i64 5
; COMMON-NEXT: store i8 5, ptr [[TMP5]], align 1
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE10]]
; COMMON: [[PRED_STORE_CONTINUE10]]:
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
; COMMON: [[PRED_STORE_IF11]]:
; COMMON-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[DST]], i64 6
; COMMON-NEXT: store i8 6, ptr [[TMP6]], align 1
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE12]]
; COMMON: [[PRED_STORE_CONTINUE12]]:
; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT:.*]]
; COMMON: [[PRED_STORE_IF13]]:
; COMMON-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 7
; COMMON-NEXT: store i8 7, ptr [[TMP7]], align 1
; COMMON-NEXT: br label %[[EXIT]]
; COMMON: [[EXIT]]:
; COMMON-NEXT: br label %[[SCALAR_PH:.*]]
; COMMON: [[SCALAR_PH]]:
; COMMON-NEXT: br [[EXIT1:label %.*]]
; COMMON: [[SCALAR_PH1:.*:]]
; COMMON-NEXT: [[ENTRY:.*]]:
; COMMON-NEXT: br label %[[EXIT1:.*]]
; COMMON: [[EXIT1]]:
; COMMON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[EXIT1]] ]
; COMMON-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i8
; COMMON-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
; COMMON-NEXT: store i8 [[IV_TRUNC]], ptr [[GEP]], align 1
; COMMON-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; COMMON-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 7
; COMMON-NEXT: br i1 [[EC]], label %[[SCALAR_PH1:.*]], label %[[EXIT1]]
; COMMON: [[SCALAR_PH1]]:
; COMMON-NEXT: ret void
;
entry:
br label %loop
Expand Down Expand Up @@ -1241,8 +1196,7 @@ define void @pred_udiv_select_cost(ptr %A, ptr %B, ptr %C, i64 %n, i8 %y) #1 {
; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
; DEFAULT-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; DEFAULT-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2
; DEFAULT-NEXT: [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP2]], i64 8)
; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; DEFAULT: [[VECTOR_MEMCHECK]]:
; DEFAULT-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ define i64 @same_exit_block_pre_inc_use1_nosve() {
; CHECK-NEXT: Cost of 48 for VF 16: EMIT vp<{{.*}}> = first-active-lane ir<%cmp3>
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.*}}> = add
; CHECK-NEXT: Cost of 0 for VF 16: vp<{{.*}}> = DERIVED-IV
; CHECK: LV: Minimum required TC for runtime checks to be profitable:160
; CHECK-NEXT: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (64 < 160)
; CHECK: LV: Minimum required TC for runtime checks to be profitable:128
; CHECK-NEXT: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (64 < 128)
; CHECK-NEXT: LV: Too many memory checks needed.
entry:
%p1 = alloca [1024 x i8]
Expand Down Expand Up @@ -105,7 +105,7 @@ loop.header:
%gep.src = getelementptr inbounds i64, ptr %src, i64 %iv
%l = load i64, ptr %gep.src, align 1
%t = trunc i64 %l to i1
br i1 %t, label %exit.0, label %loop.latch
br i1 %t, label %exit.0, label %loop.latch, !prof !0
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added to retain the old branch probability. With BFI, the probability of exit.0 being taken is computed as 3% (why that is, I'm not sure) and the IV increment isn't discounted in the VF=1 plan.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That is weird. I'd expect trunc i64 to i1 to give a probability of 50% given it's essentially asking for the likelihood of loading an odd-numbered value!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh it looks like it's because this is branching to the loop exit, and BPI scales the weight down of the exiting branch by the trip count as a heuristic, which I guess makes sense:

if (isLoopExitingEdge(Edge) &&
// Avoid adjustment of ZERO weight since it should remain unchanged.
Weight != static_cast<uint32_t>(BlockExecWeight::ZERO)) {
// Scale down loop exiting weight by trip count.
Weight = std::max(
static_cast<uint32_t>(BlockExecWeight::LOWEST_NON_ZERO),
Weight.value_or(static_cast<uint32_t>(BlockExecWeight::DEFAULT)) /
TC);
}


loop.latch:
%iv.next = add i64 %iv, 1
Expand All @@ -120,4 +120,6 @@ exit.1:
ret i64 0
}

!0 = !{!"branch_weights", i32 1, i32 1}

attributes #1 = { "target-features"="+sve" vscale_range(1,16) }
Loading
Loading