-
Notifications
You must be signed in to change notification settings - Fork 15.4k
[VPlan] Use BlockFrequencyInfo in getPredBlockCostDivisor #158690
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 22 commits
994ac69
319b4e7
0b4a76e
bf4d92f
7718fe6
6ee702b
0c92696
fb50e0f
134bff3
7723bd9
f2b5fce
32fbff4
16c4b21
e09d1b3
602fdfa
9c00e81
bb66d78
95e55ef
0b5f3c5
1022c7f
296797f
7a29373
09340fe
55cb35b
42e3c38
70298e4
e0b6b94
c4b3104
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -873,12 +873,14 @@ class LoopVectorizationCostModel { | |
| const TargetTransformInfo &TTI, | ||
| const TargetLibraryInfo *TLI, DemandedBits *DB, | ||
| AssumptionCache *AC, | ||
| OptimizationRemarkEmitter *ORE, const Function *F, | ||
| const LoopVectorizeHints *Hints, | ||
| OptimizationRemarkEmitter *ORE, | ||
| std::function<BlockFrequencyInfo &()> GetBFI, | ||
| const Function *F, const LoopVectorizeHints *Hints, | ||
| InterleavedAccessInfo &IAI, bool OptForSize) | ||
| : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), | ||
| TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), | ||
| Hints(Hints), InterleaveInfo(IAI), OptForSize(OptForSize) { | ||
| TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), GetBFI(GetBFI), | ||
| TheFunction(F), Hints(Hints), InterleaveInfo(IAI), | ||
| OptForSize(OptForSize) { | ||
| if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors) | ||
| initializeVScaleForTuning(); | ||
| CostKind = F->hasMinSize() ? TTI::TCK_CodeSize : TTI::TCK_RecipThroughput; | ||
|
|
@@ -1234,21 +1236,12 @@ class LoopVectorizationCostModel { | |
| /// optimizing for code size it will just be 1 as code size costs don't depend | ||
| /// on execution probabilities. | ||
| /// | ||
| /// TODO: We should use actual block probability here, if available. | ||
| /// Currently, we always assume predicated blocks have a 50% chance of | ||
| /// executing, apart from blocks that are only predicated due to tail folding. | ||
| /// Note that if a block wasn't originally predicated but was predicated due | ||
| /// to tail folding, the divisor will still be 1 because it will execute for | ||
| /// every iteration of the loop header. | ||
| inline unsigned | ||
| getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind, | ||
| BasicBlock *BB) const { | ||
| // If a block wasn't originally predicated but was predicated due to | ||
| // e.g. tail folding, don't divide the cost. Tail folded loops may still be | ||
| // predicated in the final vector loop iteration, but for most loops that | ||
| // don't have low trip counts we can expect their probability to be close to | ||
| // zero. | ||
| if (!Legal->blockNeedsPredication(BB)) | ||
| return 1; | ||
| return CostKind == TTI::TCK_CodeSize ? 1 : 2; | ||
| } | ||
| const BasicBlock *BB) const; | ||
|
|
||
| /// Return the costs for our two available strategies for lowering a | ||
| /// div/rem operation which requires speculating at least one lane. | ||
|
|
@@ -1729,6 +1722,11 @@ class LoopVectorizationCostModel { | |
| /// Interface to emit optimization remarks. | ||
| OptimizationRemarkEmitter *ORE; | ||
|
|
||
| /// A function to lazily fetch BlockFrequencyInfo. This avoids computing it | ||
| /// unless necessary, e.g. when the loop isn't legal to vectorize or when | ||
| /// there is no predication. | ||
| std::function<BlockFrequencyInfo &()> GetBFI; | ||
|
|
||
| const Function *TheFunction; | ||
|
|
||
| /// Loop Vectorize Hint. | ||
|
|
@@ -2886,6 +2884,23 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const { | |
| } | ||
| } | ||
|
|
||
| unsigned LoopVectorizationCostModel::getPredBlockCostDivisor( | ||
| TargetTransformInfo::TargetCostKind CostKind, const BasicBlock *BB) const { | ||
| if (CostKind == TTI::TCK_CodeSize) | ||
lukel97 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| return 1; | ||
| // If the block wasn't originally predicated then return early to avoid | ||
| // computing BlockFrequencyInfo unnecessarily. | ||
| if (!Legal->blockNeedsPredication(BB)) | ||
| return 1; | ||
|
|
||
| BlockFrequencyInfo &BFI = GetBFI(); | ||
| uint64_t HeaderFreq = BFI.getBlockFreq(TheLoop->getHeader()).getFrequency(); | ||
| uint64_t BBFreq = BFI.getBlockFreq(BB).getFrequency(); | ||
| assert(HeaderFreq >= BBFreq && | ||
| "Header has smaller block freq than dominated BB?"); | ||
| return HeaderFreq / BBFreq; | ||
|
||
| } | ||
|
|
||
| std::pair<InstructionCost, InstructionCost> | ||
| LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, | ||
| ElementCount VF) const { | ||
|
|
@@ -9166,8 +9181,9 @@ static bool processLoopInVPlanNativePath( | |
| Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, | ||
| LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, | ||
| TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, | ||
| OptimizationRemarkEmitter *ORE, bool OptForSize, LoopVectorizeHints &Hints, | ||
| LoopVectorizationRequirements &Requirements) { | ||
| OptimizationRemarkEmitter *ORE, | ||
| std::function<BlockFrequencyInfo &()> GetBFI, bool OptForSize, | ||
| LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements) { | ||
|
|
||
| if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) { | ||
| LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n"); | ||
|
|
@@ -9180,8 +9196,8 @@ static bool processLoopInVPlanNativePath( | |
| ScalarEpilogueLowering SEL = | ||
| getScalarEpilogueLowering(F, L, Hints, OptForSize, TTI, TLI, *LVL, &IAI); | ||
|
|
||
| LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, | ||
| &Hints, IAI, OptForSize); | ||
| LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, | ||
| GetBFI, F, &Hints, IAI, OptForSize); | ||
| // Use the planner for outer loop vectorization. | ||
| // TODO: CM is not used at this point inside the planner. Turn CM into an | ||
| // optional argument if we don't need it in the future. | ||
|
|
@@ -9881,8 +9897,10 @@ bool LoopVectorizePass::processLoop(Loop *L) { | |
|
|
||
| // Query this against the original loop and save it here because the profile | ||
| // of the original loop header may change as the transformation happens. | ||
| bool OptForSize = llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, | ||
| PGSOQueryType::IRPass); | ||
| bool OptForSize = llvm::shouldOptimizeForSize( | ||
| L->getHeader(), PSI, | ||
| PSI && PSI->hasProfileSummary() ? &GetBFI() : nullptr, | ||
| PGSOQueryType::IRPass); | ||
|
|
||
| // Check if it is legal to vectorize the loop. | ||
| LoopVectorizationRequirements Requirements; | ||
|
|
@@ -9916,7 +9934,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { | |
| // pipeline. | ||
| if (!L->isInnermost()) | ||
| return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC, | ||
| ORE, OptForSize, Hints, Requirements); | ||
| ORE, GetBFI, OptForSize, Hints, | ||
| Requirements); | ||
|
|
||
| assert(L->isInnermost() && "Inner loop expected."); | ||
|
|
||
|
|
@@ -10019,7 +10038,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { | |
|
|
||
| // Use the cost model. | ||
| LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, | ||
| F, &Hints, IAI, OptForSize); | ||
| GetBFI, F, &Hints, IAI, OptForSize); | ||
| // Use the planner for vectorization. | ||
| LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints, | ||
| ORE); | ||
|
|
@@ -10337,9 +10356,12 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, | |
|
|
||
| auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); | ||
| PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); | ||
| BFI = nullptr; | ||
| if (PSI && PSI->hasProfileSummary()) | ||
| BFI = &AM.getResult<BlockFrequencyAnalysis>(F); | ||
| BlockFrequencyInfo *BFI = nullptr; | ||
| GetBFI = [&AM, &F, &BFI]() -> BlockFrequencyInfo & { | ||
| if (!BFI) | ||
| BFI = &AM.getResult<BlockFrequencyAnalysis>(F); | ||
| return *BFI; | ||
|
||
| }; | ||
| LoopVectorizeResult Result = runImpl(F); | ||
| if (!Result.MadeAnyChange) | ||
| return PreservedAnalyses::all(); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -25,7 +25,7 @@ define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) { | |
| ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[TMP17:%.*]], [[PRED_UDIV_CONTINUE2]] ] | ||
| ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] | ||
| ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 4 | ||
| ; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i64> [[WIDE_LOAD]], zeroinitializer | ||
| ; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i64> [[WIDE_LOAD]], splat (i64 1) | ||
| ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i64 0 | ||
| ; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] | ||
| ; CHECK: pred.udiv.if: | ||
|
|
@@ -65,7 +65,7 @@ for.body: | |
| %r = phi i64 [ 0, %entry ], [ %var6, %for.inc ] | ||
| %var0 = getelementptr inbounds i64, ptr %a, i64 %i | ||
| %var2 = load i64, ptr %var0, align 4 | ||
| %cond0 = icmp sgt i64 %var2, 0 | ||
| %cond0 = icmp sgt i64 %var2, 1 | ||
|
||
| br i1 %cond0, label %if.then, label %for.inc | ||
|
|
||
| if.then: | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -57,8 +57,8 @@ define i64 @same_exit_block_pre_inc_use1_nosve() { | |||||||||||||||||||
| ; CHECK-NEXT: Cost of 48 for VF 16: EMIT vp<{{.*}}> = first-active-lane ir<%cmp3> | ||||||||||||||||||||
| ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.*}}> = add | ||||||||||||||||||||
| ; CHECK-NEXT: Cost of 0 for VF 16: vp<{{.*}}> = DERIVED-IV | ||||||||||||||||||||
| ; CHECK: LV: Minimum required TC for runtime checks to be profitable:160 | ||||||||||||||||||||
| ; CHECK-NEXT: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (64 < 160) | ||||||||||||||||||||
| ; CHECK: LV: Minimum required TC for runtime checks to be profitable:128 | ||||||||||||||||||||
| ; CHECK-NEXT: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (64 < 128) | ||||||||||||||||||||
| ; CHECK-NEXT: LV: Too many memory checks needed. | ||||||||||||||||||||
| entry: | ||||||||||||||||||||
| %p1 = alloca [1024 x i8] | ||||||||||||||||||||
|
|
@@ -105,7 +105,7 @@ loop.header: | |||||||||||||||||||
| %gep.src = getelementptr inbounds i64, ptr %src, i64 %iv | ||||||||||||||||||||
| %l = load i64, ptr %gep.src, align 1 | ||||||||||||||||||||
| %t = trunc i64 %l to i1 | ||||||||||||||||||||
| br i1 %t, label %exit.0, label %loop.latch | ||||||||||||||||||||
| br i1 %t, label %exit.0, label %loop.latch, !prof !0 | ||||||||||||||||||||
|
Contributor
Author
Added to retain the old branch probability. With BFI, the probability of `exit.0` being taken is computed as 3% (I'm not sure why), and the IV increment isn't discounted in the VF=1 plan.
Contributor
That is weird. I'd expect `trunc i64` to `i1` to give a probability of 50%, given it's essentially asking for the likelihood of loading an odd-numbered value!
Contributor
Author
Oh, it looks like it's because this is branching to the loop exit, and BPI scales down the weight of the exiting branch by the trip count as a heuristic, which I guess makes sense. See llvm-project/llvm/lib/Analysis/BranchProbabilityInfo.cpp, lines 895 to 903 (at commit 07ad928).
|
||||||||||||||||||||
|
|
||||||||||||||||||||
| loop.latch: | ||||||||||||||||||||
| %iv.next = add i64 %iv, 1 | ||||||||||||||||||||
|
|
@@ -120,4 +120,6 @@ exit.1: | |||||||||||||||||||
| ret i64 0 | ||||||||||||||||||||
| } | ||||||||||||||||||||
|
|
||||||||||||||||||||
| !0 = !{!"branch_weights", i32 1, i32 1} | ||||||||||||||||||||
|
|
||||||||||||||||||||
| attributes #1 = { "target-features"="+sve" vscale_range(1,16) } | ||||||||||||||||||||
Uh oh!
There was an error while loading. Please reload this page.