-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[VPlan] Use BlockFrequencyInfo in getPredBlockCostDivisor #158690
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -578,8 +578,10 @@ class InnerLoopVectorizer { | |
/// The profitablity analysis. | ||
LoopVectorizationCostModel *Cost; | ||
|
||
/// BFI and PSI are used to check for profile guided size optimizations. | ||
/// Used to calculate the probability of predicated blocks in | ||
/// getPredBlockCostDivisor. | ||
BlockFrequencyInfo *BFI; | ||
/// Used to check for profile guided size optimizations. | ||
ProfileSummaryInfo *PSI; | ||
|
||
/// Structure to hold information about generated runtime checks, responsible | ||
|
@@ -900,7 +902,7 @@ class LoopVectorizationCostModel { | |
InterleavedAccessInfo &IAI, | ||
ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) | ||
: ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), | ||
TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), | ||
TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), BFI(BFI), TheFunction(F), | ||
Hints(Hints), InterleaveInfo(IAI) { | ||
if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors) | ||
initializeVScaleForTuning(); | ||
|
@@ -1249,6 +1251,17 @@ class LoopVectorizationCostModel { | |
/// Superset of instructions that return true for isScalarWithPredication. | ||
bool isPredicatedInst(Instruction *I) const; | ||
|
||
/// A helper function that returns how much we should divide the cost of a | ||
/// predicated block by. Typically this is the reciprocal of the block | ||
/// probability, i.e. if we return X we are assuming the predicated block will | ||
/// execute once for every X iterations of the loop header so the block should | ||
/// only contribute 1/X of its cost to the total cost calculation, but when | ||
/// optimizing for code size it will just be 1 as code size costs don't depend | ||
/// on execution probabilities. | ||
inline unsigned | ||
getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind, | ||
const BasicBlock *BB) const; | ||
|
||
/// Return the costs for our two available strategies for lowering a | ||
/// div/rem operation which requires speculating at least one lane. | ||
/// First result is for scalarization (will be invalid for scalable | ||
|
@@ -1711,6 +1724,8 @@ class LoopVectorizationCostModel { | |
/// Interface to emit optimization remarks. | ||
OptimizationRemarkEmitter *ORE; | ||
|
||
const BlockFrequencyInfo *BFI; | ||
|
||
const Function *TheFunction; | ||
|
||
/// Loop Vectorize Hint. | ||
|
@@ -2866,6 +2881,19 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const { | |
} | ||
} | ||
|
||
unsigned LoopVectorizationCostModel::getPredBlockCostDivisor( | ||
TargetTransformInfo::TargetCostKind CostKind, const BasicBlock *BB) const { | ||
if (CostKind == TTI::TCK_CodeSize) | ||
return 1; | ||
|
||
uint64_t HeaderFreq = BFI->getBlockFreq(TheLoop->getHeader()).getFrequency(); | ||
uint64_t BBFreq = BFI->getBlockFreq(BB).getFrequency(); | ||
assert(HeaderFreq >= BBFreq && | ||
"Header has smaller block freq than dominated BB?"); | ||
return BFI->getBlockFreq(TheLoop->getHeader()).getFrequency() / | ||
BFI->getBlockFreq(BB).getFrequency(); | ||
} | ||
|
||
std::pair<InstructionCost, InstructionCost> | ||
LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, | ||
ElementCount VF) const { | ||
|
@@ -2902,7 +2930,8 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, | |
// Scale the cost by the probability of executing the predicated blocks. | ||
// This assumes the predicated block for each vector lane is equally | ||
// likely. | ||
ScalarizationCost = ScalarizationCost / getPredBlockCostDivisor(CostKind); | ||
ScalarizationCost = | ||
ScalarizationCost / getPredBlockCostDivisor(CostKind, I->getParent()); | ||
} | ||
|
||
InstructionCost SafeDivisorCost = 0; | ||
|
@@ -5035,7 +5064,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount( | |
} | ||
|
||
// Scale the total scalar cost by block probability. | ||
ScalarCost /= getPredBlockCostDivisor(CostKind); | ||
ScalarCost /= getPredBlockCostDivisor(CostKind, PredInst->getParent()); | ||
|
||
// Compute the discount. A non-negative discount means the vector version | ||
// of the instruction costs more, and scalarizing would be beneficial. | ||
|
@@ -5088,7 +5117,7 @@ InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) { | |
// cost by the probability of executing it. blockNeedsPredication from | ||
// Legal is used so as to not include all blocks in tail folded loops. | ||
if (VF.isScalar() && Legal->blockNeedsPredication(BB)) | ||
BlockCost /= getPredBlockCostDivisor(CostKind); | ||
BlockCost /= getPredBlockCostDivisor(CostKind, BB); | ||
|
||
Cost += BlockCost; | ||
} | ||
|
@@ -5167,7 +5196,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, | |
// conditional branches, but may not be executed for each vector lane. Scale | ||
// the cost by the probability of executing the predicated block. | ||
if (isPredicatedInst(I)) { | ||
Cost /= getPredBlockCostDivisor(CostKind); | ||
Cost /= getPredBlockCostDivisor(CostKind, I->getParent()); | ||
|
||
// Add the cost of an i1 extract and a branch | ||
auto *VecI1Ty = | ||
|
@@ -6727,6 +6756,11 @@ bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const { | |
SkipCostComputation.contains(UI); | ||
} | ||
|
||
unsigned VPCostContext::getPredBlockCostDivisor( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this needed when the caller could just use |
||
TargetTransformInfo::TargetCostKind CostKind, const BasicBlock *BB) const { | ||
return CM.getPredBlockCostDivisor(CostKind, BB); | ||
} | ||
|
||
InstructionCost | ||
LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, | ||
VPCostContext &CostCtx) const { | ||
|
@@ -10310,9 +10344,7 @@ PreservedAnalyses LoopVectorizePass::run(Function &F, | |
|
||
auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F); | ||
PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent()); | ||
BFI = nullptr; | ||
if (PSI && PSI->hasProfileSummary()) | ||
BFI = &AM.getResult<BlockFrequencyAnalysis>(F); | ||
BFI = &AM.getResult<BlockFrequencyAnalysis>(F); | ||
LoopVectorizeResult Result = runImpl(F); | ||
if (!Result.MadeAnyChange) | ||
return PreservedAnalyses::all(); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,7 +25,7 @@ define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) { | |
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[TMP17:%.*]], [[PRED_UDIV_CONTINUE2]] ] | ||
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] | ||
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 4 | ||
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i64> [[WIDE_LOAD]], zeroinitializer | ||
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i64> [[WIDE_LOAD]], splat (i64 1) | ||
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i64 0 | ||
; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] | ||
; CHECK: pred.udiv.if: | ||
|
@@ -65,7 +65,7 @@ for.body: | |
%r = phi i64 [ 0, %entry ], [ %var6, %for.inc ] | ||
%var0 = getelementptr inbounds i64, ptr %a, i64 %i | ||
%var2 = load i64, ptr %var0, align 4 | ||
%cond0 = icmp sgt i64 %var2, 0 | ||
%cond0 = icmp sgt i64 %var2, 1 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This constant was changed to keep the old branch probability the same and keep the block scalarized, since with BFI |
||
br i1 %cond0, label %if.then, label %for.inc | ||
|
||
if.then: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -612,63 +612,18 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) { | |
; | ||
; COMMON-LABEL: define void @low_trip_count_fold_tail_scalarized_store( | ||
; COMMON-SAME: ptr [[DST:%.*]]) { | ||
; COMMON-NEXT: [[ENTRY:.*:]] | ||
; COMMON-NEXT: br label %[[VECTOR_PH:.*]] | ||
; COMMON: [[VECTOR_PH]]: | ||
; COMMON-NEXT: br label %[[VECTOR_BODY:.*]] | ||
; COMMON: [[VECTOR_BODY]]: | ||
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] | ||
; COMMON: [[PRED_STORE_IF]]: | ||
; COMMON-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[DST]], i64 0 | ||
; COMMON-NEXT: store i8 0, ptr [[TMP0]], align 1 | ||
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE]] | ||
; COMMON: [[PRED_STORE_CONTINUE]]: | ||
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]] | ||
; COMMON: [[PRED_STORE_IF1]]: | ||
; COMMON-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[DST]], i64 1 | ||
; COMMON-NEXT: store i8 1, ptr [[TMP1]], align 1 | ||
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE2]] | ||
; COMMON: [[PRED_STORE_CONTINUE2]]: | ||
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]] | ||
; COMMON: [[PRED_STORE_IF3]]: | ||
; COMMON-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i64 2 | ||
; COMMON-NEXT: store i8 2, ptr [[TMP2]], align 1 | ||
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE4]] | ||
; COMMON: [[PRED_STORE_CONTINUE4]]: | ||
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]] | ||
; COMMON: [[PRED_STORE_IF5]]: | ||
; COMMON-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 3 | ||
; COMMON-NEXT: store i8 3, ptr [[TMP3]], align 1 | ||
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE6]] | ||
; COMMON: [[PRED_STORE_CONTINUE6]]: | ||
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] | ||
; COMMON: [[PRED_STORE_IF7]]: | ||
; COMMON-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[DST]], i64 4 | ||
; COMMON-NEXT: store i8 4, ptr [[TMP4]], align 1 | ||
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE8]] | ||
; COMMON: [[PRED_STORE_CONTINUE8]]: | ||
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] | ||
; COMMON: [[PRED_STORE_IF9]]: | ||
; COMMON-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DST]], i64 5 | ||
; COMMON-NEXT: store i8 5, ptr [[TMP5]], align 1 | ||
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE10]] | ||
; COMMON: [[PRED_STORE_CONTINUE10]]: | ||
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]] | ||
; COMMON: [[PRED_STORE_IF11]]: | ||
; COMMON-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[DST]], i64 6 | ||
; COMMON-NEXT: store i8 6, ptr [[TMP6]], align 1 | ||
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE12]] | ||
; COMMON: [[PRED_STORE_CONTINUE12]]: | ||
; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT:.*]] | ||
; COMMON: [[PRED_STORE_IF13]]: | ||
; COMMON-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 7 | ||
; COMMON-NEXT: store i8 7, ptr [[TMP7]], align 1 | ||
; COMMON-NEXT: br label %[[EXIT]] | ||
; COMMON: [[EXIT]]: | ||
; COMMON-NEXT: br label %[[SCALAR_PH:.*]] | ||
; COMMON: [[SCALAR_PH]]: | ||
; COMMON-NEXT: br [[EXIT1:label %.*]] | ||
; COMMON: [[SCALAR_PH1:.*:]] | ||
; COMMON-NEXT: [[ENTRY:.*]]: | ||
; COMMON-NEXT: br label %[[EXIT1:.*]] | ||
; COMMON: [[EXIT1]]: | ||
; COMMON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[EXIT1]] ] | ||
; COMMON-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i8 | ||
; COMMON-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]] | ||
; COMMON-NEXT: store i8 [[IV_TRUNC]], ptr [[GEP]], align 1 | ||
; COMMON-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 | ||
; COMMON-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 7 | ||
; COMMON-NEXT: br i1 [[EC]], label %[[SCALAR_PH1:.*]], label %[[EXIT1]] | ||
; COMMON: [[SCALAR_PH1]]: | ||
; COMMON-NEXT: ret void | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Previously all the predicated scalar stores were discounted by x0.5 in computePredInstDiscount. BFI now correctly returns that this block is always executed so the VF=1 plan is no longer discounted. |
||
; | ||
entry: | ||
br label %loop | ||
|
@@ -1241,8 +1196,7 @@ define void @pred_udiv_select_cost(ptr %A, ptr %B, ptr %C, i64 %n, i8 %y) #1 { | |
; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 | ||
; DEFAULT-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() | ||
; DEFAULT-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2 | ||
; DEFAULT-NEXT: [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP2]], i64 8) | ||
; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]] | ||
; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] | ||
; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] | ||
; DEFAULT: [[VECTOR_MEMCHECK]]: | ||
; DEFAULT-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -57,8 +57,8 @@ define i64 @same_exit_block_pre_inc_use1_nosve() { | |
; CHECK-NEXT: Cost of 48 for VF 16: EMIT vp<{{.*}}> = first-active-lane ir<%cmp3> | ||
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.*}}> = add | ||
; CHECK-NEXT: Cost of 0 for VF 16: vp<{{.*}}> = DERIVED-IV | ||
; CHECK: LV: Minimum required TC for runtime checks to be profitable:160 | ||
; CHECK-NEXT: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (64 < 160) | ||
; CHECK: LV: Minimum required TC for runtime checks to be profitable:128 | ||
; CHECK-NEXT: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (64 < 128) | ||
; CHECK-NEXT: LV: Too many memory checks needed. | ||
entry: | ||
%p1 = alloca [1024 x i8] | ||
|
@@ -105,7 +105,7 @@ loop.header: | |
%gep.src = getelementptr inbounds i64, ptr %src, i64 %iv | ||
%l = load i64, ptr %gep.src, align 1 | ||
%t = trunc i64 %l to i1 | ||
br i1 %t, label %exit.0, label %loop.latch | ||
br i1 %t, label %exit.0, label %loop.latch, !prof !0 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added to retain the old branch probability. With BFI, the probability of exit.0 being taken is computed as 3% (why that is, I'm not sure) and the IV increment isn't discounted in the VF=1 plan. |
||
|
||
loop.latch: | ||
%iv.next = add i64 %iv, 1 | ||
|
@@ -120,4 +120,6 @@ exit.1: | |
ret i64 0 | ||
} | ||
|
||
!0 = !{!"branch_weights", i32 1, i32 1} | ||
|
||
attributes #1 = { "target-features"="+sve" vscale_range(1,16) } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You already have `HeaderFreq` and `BBFreq`, so I don't think you need to call `getBlockFreq` again when returning.