146146#include " llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
147147#include < algorithm>
148148#include < cassert>
149- #include < cmath>
150149#include < cstdint>
151150#include < functional>
152151#include < iterator>
@@ -874,14 +873,12 @@ class LoopVectorizationCostModel {
874873 const TargetTransformInfo &TTI,
875874 const TargetLibraryInfo *TLI, DemandedBits *DB,
876875 AssumptionCache *AC,
877- OptimizationRemarkEmitter *ORE,
878- std::function<BlockFrequencyInfo &()> GetBFI,
879- const Function *F, const LoopVectorizeHints *Hints,
876+ OptimizationRemarkEmitter *ORE, const Function *F,
877+ const LoopVectorizeHints *Hints,
880878 InterleavedAccessInfo &IAI, bool OptForSize)
881879 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
882- TTI (TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), GetBFI(GetBFI),
883- TheFunction(F), Hints(Hints), InterleaveInfo(IAI),
884- OptForSize(OptForSize) {
880+ TTI (TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
881+ Hints(Hints), InterleaveInfo(IAI), OptForSize(OptForSize) {
885882 if (TTI.supportsScalableVectors () || ForceTargetSupportsScalableVectors)
886883 initializeVScaleForTuning ();
887884 CostKind = F->hasMinSize () ? TTI::TCK_CodeSize : TTI::TCK_RecipThroughput;
@@ -1222,7 +1219,7 @@ class LoopVectorizationCostModel {
12221219 // / for which our chosen predication strategy is scalarization (i.e. we
12231220 // / don't have an alternate strategy such as masking available).
12241221 // / \p VF is the vectorization factor that will be used to vectorize \p I.
1225- bool isScalarWithPredication (Instruction *I, ElementCount VF);
1222+ bool isScalarWithPredication (Instruction *I, ElementCount VF) const ;
12261223
12271224 // / Returns true if \p I is an instruction that needs to be predicated
12281225 // / at runtime. The result is independent of the predication mechanism.
@@ -1237,19 +1234,29 @@ class LoopVectorizationCostModel {
12371234 // / optimizing for code size it will just be 1 as code size costs don't depend
12381235 // / on execution probabilities.
12391236 // /
1240- // / Note that if a block wasn't originally predicated but was predicated due
1241- // / to tail folding, the divisor will still be 1 because it will execute for
1242- // / every iteration of the loop header .
1237+ // / TODO: We should use actual block probability here, if available.
1238+ // / Currently, we always assume predicated blocks have a 50% chance of
1239+ // / executing, apart from blocks that are only predicated due to tail folding .
12431240 inline unsigned
12441241 getPredBlockCostDivisor (TargetTransformInfo::TargetCostKind CostKind,
1245- const BasicBlock *BB);
1242+ BasicBlock *BB) const {
1243+ // If a block wasn't originally predicated but was predicated due to
1244+ // e.g. tail folding, don't divide the cost. Tail folded loops may still be
1245+ // predicated in the final vector loop iteration, but for most loops that
1246+ // don't have low trip counts we can expect their probability to be close to
1247+ // zero.
1248+ if (!Legal->blockNeedsPredication (BB))
1249+ return 1 ; // Divisor of 1: the block's cost is left unscaled.
1250+ return CostKind == TTI::TCK_CodeSize ? 1 : 2 ; // 2 reflects the assumed 50% execution chance; code-size costs are never scaled.
1251+ }
12461252
12471253 // / Return the costs for our two available strategies for lowering a
12481254 // / div/rem operation which requires speculating at least one lane.
12491255 // / First result is for scalarization (will be invalid for scalable
12501256 // / vectors); second is for the safe-divisor strategy.
12511257 std::pair<InstructionCost, InstructionCost>
1252- getDivRemSpeculationCost (Instruction *I, ElementCount VF);
1258+ getDivRemSpeculationCost (Instruction *I,
1259+ ElementCount VF) const ;
12531260
12541261 // / Returns true if \p I is a memory instruction with consecutive memory
12551262 // / access that can be widened.
@@ -1722,20 +1729,6 @@ class LoopVectorizationCostModel {
17221729 // / Interface to emit optimization remarks.
17231730 OptimizationRemarkEmitter *ORE;
17241731
1725- // / A function to lazily fetch BlockFrequencyInfo. This avoids computing it
1726- // / unless necessary, e.g. when the loop isn't legal to vectorize or when
1727- // / there is no predication.
1728- std::function<BlockFrequencyInfo &()> GetBFI;
1729- // / The BlockFrequencyInfo returned from GetBFI.
1730- BlockFrequencyInfo *BFI = nullptr ;
1731- // / Returns the BlockFrequencyInfo for the function if cached, otherwise
1732- // / fetches it via GetBFI. Avoids an indirect call to the std::function.
1733- BlockFrequencyInfo &getBFI () {
1734- if (!BFI)
1735- BFI = &GetBFI ();
1736- return *BFI;
1737- }
1738-
17391732 const Function *TheFunction;
17401733
17411734 // / Loop Vectorize Hint.
@@ -2799,8 +2792,8 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
27992792 Scalars[VF].insert_range (Worklist);
28002793}
28012794
2802- bool LoopVectorizationCostModel::isScalarWithPredication (Instruction *I,
2803- ElementCount VF) {
2795+ bool LoopVectorizationCostModel::isScalarWithPredication (
2796+ Instruction *I, ElementCount VF) const {
28042797 if (!isPredicatedInst (I))
28052798 return false ;
28062799
@@ -2893,26 +2886,9 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
28932886 }
28942887}
28952888
2896- unsigned LoopVectorizationCostModel::getPredBlockCostDivisor (
2897- TargetTransformInfo::TargetCostKind CostKind, const BasicBlock *BB) {
2898- if (CostKind == TTI::TCK_CodeSize)
2899- return 1 ;
2900- // If the block wasn't originally predicated then return early to avoid
2901- // computing BlockFrequencyInfo unnecessarily.
2902- if (!Legal->blockNeedsPredication (BB))
2903- return 1 ;
2904-
2905- uint64_t HeaderFreq =
2906- getBFI ().getBlockFreq (TheLoop->getHeader ()).getFrequency ();
2907- uint64_t BBFreq = getBFI ().getBlockFreq (BB).getFrequency ();
2908- assert (HeaderFreq >= BBFreq &&
2909- " Header has smaller block freq than dominated BB?" );
2910- return std::round ((double )HeaderFreq / BBFreq);
2911- }
2912-
29132889std::pair<InstructionCost, InstructionCost>
29142890LoopVectorizationCostModel::getDivRemSpeculationCost (Instruction *I,
2915- ElementCount VF) {
2891+ ElementCount VF) const {
29162892 assert (I->getOpcode () == Instruction::UDiv ||
29172893 I->getOpcode () == Instruction::SDiv ||
29182894 I->getOpcode () == Instruction::SRem ||
@@ -9206,9 +9182,8 @@ static bool processLoopInVPlanNativePath(
92069182 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
92079183 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
92089184 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9209- OptimizationRemarkEmitter *ORE,
9210- std::function<BlockFrequencyInfo &()> GetBFI, bool OptForSize,
9211- LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements) {
9185+ OptimizationRemarkEmitter *ORE, bool OptForSize, LoopVectorizeHints &Hints,
9186+ LoopVectorizationRequirements &Requirements) {
92129187
92139188 if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount ())) {
92149189 LLVM_DEBUG (dbgs () << " LV: cannot compute the outer-loop trip count\n " );
@@ -9221,8 +9196,8 @@ static bool processLoopInVPlanNativePath(
92219196 ScalarEpilogueLowering SEL =
92229197 getScalarEpilogueLowering (F, L, Hints, OptForSize, TTI, TLI, *LVL, &IAI);
92239198
9224- LoopVectorizationCostModel CM (SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE,
9225- GetBFI, F, &Hints, IAI, OptForSize);
9199+ LoopVectorizationCostModel CM (SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9200+ &Hints, IAI, OptForSize);
92269201 // Use the planner for outer loop vectorization.
92279202 // TODO: CM is not used at this point inside the planner. Turn CM into an
92289203 // optional argument if we don't need it in the future.
@@ -9922,10 +9897,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
99229897
99239898 // Query this against the original loop and save it here because the profile
99249899 // of the original loop header may change as the transformation happens.
9925- bool OptForSize = llvm::shouldOptimizeForSize (
9926- L->getHeader (), PSI,
9927- PSI && PSI->hasProfileSummary () ? &GetBFI () : nullptr ,
9928- PGSOQueryType::IRPass);
9900+ bool OptForSize = llvm::shouldOptimizeForSize (L->getHeader (), PSI, BFI,
9901+ PGSOQueryType::IRPass);
99299902
99309903 // Check if it is legal to vectorize the loop.
99319904 LoopVectorizationRequirements Requirements;
@@ -9959,8 +9932,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
99599932 // pipeline.
99609933 if (!L->isInnermost ())
99619934 return processLoopInVPlanNativePath (L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9962- ORE, GetBFI, OptForSize, Hints,
9963- Requirements);
9935+ ORE, OptForSize, Hints, Requirements);
99649936
99659937 assert (L->isInnermost () && " Inner loop expected." );
99669938
@@ -10063,7 +10035,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1006310035
1006410036 // Use the cost model.
1006510037 LoopVectorizationCostModel CM (SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10066- GetBFI, F, &Hints, IAI, OptForSize);
10038+ F, &Hints, IAI, OptForSize);
1006710039 // Use the planner for vectorization.
1006810040 LoopVectorizationPlanner LVP (L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
1006910041 ORE);
@@ -10381,9 +10353,9 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
1038110353
1038210354 auto &MAMProxy = AM.getResult <ModuleAnalysisManagerFunctionProxy>(F);
1038310355 PSI = MAMProxy.getCachedResult <ProfileSummaryAnalysis>(*F.getParent ());
10384- GetBFI = [&AM, &F]() -> BlockFrequencyInfo & {
10385- return AM. getResult <BlockFrequencyAnalysis>(F);
10386- } ;
10356+ BFI = nullptr ;
10357+ if (PSI && PSI-> hasProfileSummary ())
10358+ BFI = &AM. getResult <BlockFrequencyAnalysis>(F) ;
1038710359 LoopVectorizeResult Result = runImpl (F);
1038810360 if (!Result.MadeAnyChange )
1038910361 return PreservedAnalyses::all ();
0 commit comments