@@ -290,7 +290,7 @@ static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
290290 cl::desc(" A flag that overrides the target's max interleave factor for "
291291 " vectorized loops." ));
292292
293- cl::opt<unsigned > ForceTargetInstructionCost (
293+ static cl::opt<unsigned > ForceTargetInstructionCost (
294294 " force-target-instruction-cost" , cl::init(0 ), cl::Hidden,
295295 cl::desc(" A flag that overrides the target's expected cost for "
296296 " an instruction to a single constant value. Mostly "
@@ -412,6 +412,14 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
412412 return DL.getTypeAllocSizeInBits (Ty) != DL.getTypeSizeInBits (Ty);
413413}
414414
415+ // / A helper function that returns the reciprocal of the block probability of
416+ // / predicated blocks, i.e. 1 / P(block executes). If we return X, we are assuming the predicated block
417+ // / will execute once for every X iterations of the loop header. The value is a fixed constant and does not depend on the loop being analyzed.
418+ // /
419+ // / TODO: We should use actual block probability here, if available. Currently,
420+ // / we always assume predicated blocks have a 50% chance of executing.
421+ static unsigned getReciprocalPredBlockProb () { return 2 ; } // 2 == 1 / 0.5, i.e. the assumed 50% execution probability
422+
415423// / Returns "best known" trip count for the specified loop \p L as defined by
416424// / the following procedure:
417425// / 1) Returns exact trip count if it is known.
@@ -1613,16 +1621,6 @@ class LoopVectorizationCostModel {
16131621 // / \p VF is the vectorization factor chosen for the original loop.
16141622 bool isEpilogueVectorizationProfitable (const ElementCount VF) const ;
16151623
1616- // / Return the cost of instructions in an inloop reduction pattern, if I is
1617- // / part of that pattern.
1618- std::optional<InstructionCost>
1619- getReductionPatternCost (Instruction *I, ElementCount VF, Type *VectorTy,
1620- TTI::TargetCostKind CostKind) const ;
1621-
1622- // / Returns the execution time cost of an instruction for a given vector
1623- // / width. Vector width of one means scalar.
1624- VectorizationCostTy getInstructionCost (Instruction *I, ElementCount VF);
1625-
16261624private:
16271625 unsigned NumPredStores = 0 ;
16281626
@@ -1648,11 +1646,21 @@ class LoopVectorizationCostModel {
16481646 // / of elements.
16491647 ElementCount getMaxLegalScalableVF (unsigned MaxSafeElements);
16501648
1649+ // / Returns the execution time cost of instruction \p I for the vector
1650+ // / width \p VF. Vector width of one means scalar. NOTE(review): VectorizationCostTy is declared outside this hunk; presumably a pair whose first member is the InstructionCost -- confirm what the remaining member carries.
1651+ VectorizationCostTy getInstructionCost (Instruction *I, ElementCount VF);
1652+
16511653 // / The cost-computation logic from getInstructionCost which provides
16521654 // / the vector type as an output parameter.
16531655 InstructionCost getInstructionCost (Instruction *I, ElementCount VF,
16541656 Type *&VectorTy);
16551657
1658+ // / Return the cost of instructions in an inloop reduction pattern for vectorization factor \p VF and cost kind \p CostKind, if \p I is
1659+ // / part of that pattern. NOTE(review): presumably an empty optional is returned when \p I is not part of such a pattern, and \p VectorTy is the vectorized type of \p I -- confirm against the definition.
1660+ std::optional<InstructionCost>
1661+ getReductionPatternCost (Instruction *I, ElementCount VF, Type *VectorTy,
1662+ TTI::TargetCostKind CostKind) const ;
1663+
16561664 // / Calculate vectorization cost of memory instruction \p I.
16571665 InstructionCost getMemoryInstructionCost (Instruction *I, ElementCount VF);
16581666
@@ -7289,10 +7297,7 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
72897297 if (!MaxFactors.hasVector ())
72907298 return VectorizationFactor::Disabled ();
72917299
7292- // Select the optimal vectorization factor according to the legacy cost-model.
7293- // This is now only used to verify the decisions by the new VPlan-based
7294- // cost-model and will be retired once the VPlan-based cost-model is
7295- // stabilized.
7300+ // Select the optimal vectorization factor.
72967301 VectorizationFactor VF = selectVectorizationFactor (VFCandidates);
72977302 assert ((VF.Width .isScalar () || VF.ScalarCost > 0 ) && " when vectorizing, the scalar cost must be non-zero." );
72987303 if (!hasPlanWithVF (VF.Width )) {
@@ -7303,189 +7308,6 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
73037308 return VF;
73047309}
73057310
7306- InstructionCost VPCostContext::getLegacyCost (Instruction *UI,
7307- ElementCount VF) const {
7308- return CM.getInstructionCost (UI, VF).first ;
7309- }
7310-
7311- bool VPCostContext::skipCostComputation (Instruction *UI, bool IsVector) const {
7312- return (IsVector && CM.VecValuesToIgnore .contains (UI)) ||
7313- SkipCostComputation.contains (UI);
7314- }
7315-
7316- InstructionCost LoopVectorizationPlanner::cost (VPlan &Plan,
7317- ElementCount VF) const {
7318- InstructionCost Cost = 0 ;
7319- LLVMContext &LLVMCtx = OrigLoop->getHeader ()->getContext ();
7320- VPCostContext CostCtx (CM.TTI , Legal->getWidestInductionType (), LLVMCtx, CM);
7321-
7322- // Cost modeling for inductions is inaccurate in the legacy cost model
7323- // compared to the recipes that are generated. To match here initially during
7324- // VPlan cost model bring up directly use the induction costs from the legacy
7325- // cost model. Note that we do this as pre-processing; the VPlan may not have
7326- // any recipes associated with the original induction increment instruction
7327- // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
7328- // the cost of induction phis and increments (both that are represented by
7329- // recipes and those that are not), to avoid distinguishing between them here,
7330- // and skip all recipes that represent induction phis and increments (the
7331- // former case) later on, if they exist, to avoid counting them twice.
7332- // Similarly we pre-compute the cost of any optimized truncates.
7333- // TODO: Switch to more accurate costing based on VPlan.
7334- for (const auto &[IV, IndDesc] : Legal->getInductionVars ()) {
7335- Instruction *IVInc = cast<Instruction>(
7336- IV->getIncomingValueForBlock (OrigLoop->getLoopLatch ()));
7337- SmallVector<Instruction *> IVInsts = {IV, IVInc};
7338- for (User *U : IV->users ()) {
7339- auto *CI = cast<Instruction>(U);
7340- if (!CostCtx.CM .isOptimizableIVTruncate (CI, VF))
7341- continue ;
7342- IVInsts.push_back (CI);
7343- }
7344- for (Instruction *IVInst : IVInsts) {
7345- if (!CostCtx.SkipCostComputation .insert (IVInst).second )
7346- continue ;
7347- InstructionCost InductionCost = CostCtx.getLegacyCost (IVInst, VF);
7348- LLVM_DEBUG ({
7349- dbgs () << " Cost of " << InductionCost << " for VF " << VF
7350- << " : induction instruction " << *IVInst << " \n " ;
7351- });
7352- Cost += InductionCost;
7353- }
7354- }
7355-
7356- // / Compute the cost of all exiting conditions of the loop using the legacy
7357- // / cost model. This is to match the legacy behavior, which adds the cost of
7358- // / all exit conditions. Note that this over-estimates the cost, as there will
7359- // / be a single condition to control the vector loop.
7360- SmallVector<BasicBlock *> Exiting;
7361- CM.TheLoop ->getExitingBlocks (Exiting);
7362- SetVector<Instruction *> ExitInstrs;
7363- // Collect all exit conditions.
7364- for (BasicBlock *EB : Exiting) {
7365- auto *Term = dyn_cast<BranchInst>(EB->getTerminator ());
7366- if (!Term)
7367- continue ;
7368- if (auto *CondI = dyn_cast<Instruction>(Term->getOperand (0 ))) {
7369- ExitInstrs.insert (CondI);
7370- }
7371- }
7372- // Compute the cost of all instructions only feeding the exit conditions.
7373- for (unsigned I = 0 ; I != ExitInstrs.size (); ++I) {
7374- Instruction *CondI = ExitInstrs[I];
7375- if (!OrigLoop->contains (CondI) ||
7376- !CostCtx.SkipCostComputation .insert (CondI).second )
7377- continue ;
7378- Cost += CostCtx.getLegacyCost (CondI, VF);
7379- for (Value *Op : CondI->operands ()) {
7380- auto *OpI = dyn_cast<Instruction>(Op);
7381- if (!OpI || any_of (OpI->users (), [&ExitInstrs](User *U) {
7382- return !ExitInstrs.contains (cast<Instruction>(U));
7383- }))
7384- continue ;
7385- ExitInstrs.insert (OpI);
7386- }
7387- }
7388-
7389- // The legacy cost model has special logic to compute the cost of in-loop
7390- // reductions, which may be smaller than the sum of all instructions involved
7391- // in the reduction. For AnyOf reductions, VPlan codegen may remove the select
7392- // which the legacy cost model uses to assign cost. Pre-compute their costs
7393- // for now.
7394- // TODO: Switch to costing based on VPlan once the logic has been ported.
7395- for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars ()) {
7396- if (!CM.isInLoopReduction (RedPhi) &&
7397- !RecurrenceDescriptor::isAnyOfRecurrenceKind (
7398- RdxDesc.getRecurrenceKind ()))
7399- continue ;
7400-
7401- // AnyOf reduction codegen may remove the select. To match the legacy cost
7402- // model, pre-compute the cost for AnyOf reductions here.
7403- if (RecurrenceDescriptor::isAnyOfRecurrenceKind (
7404- RdxDesc.getRecurrenceKind ())) {
7405- auto *Select = cast<SelectInst>(*find_if (
7406- RedPhi->users (), [](User *U) { return isa<SelectInst>(U); }));
7407- assert (!CostCtx.SkipCostComputation .contains (Select) &&
7408- " reduction op visited multiple times" );
7409- CostCtx.SkipCostComputation .insert (Select);
7410- auto ReductionCost = CostCtx.getLegacyCost (Select, VF);
7411- LLVM_DEBUG (dbgs () << " Cost of " << ReductionCost << " for VF " << VF
7412- << " :\n any-of reduction " << *Select << " \n " );
7413- Cost += ReductionCost;
7414- continue ;
7415- }
7416-
7417- const auto &ChainOps = RdxDesc.getReductionOpChain (RedPhi, OrigLoop);
7418- SetVector<Instruction *> ChainOpsAndOperands (ChainOps.begin (),
7419- ChainOps.end ());
7420- // Also include the operands of instructions in the chain, as the cost-model
7421- // may mark extends as free.
7422- for (auto *ChainOp : ChainOps) {
7423- for (Value *Op : ChainOp->operands ()) {
7424- if (auto *I = dyn_cast<Instruction>(Op))
7425- ChainOpsAndOperands.insert (I);
7426- }
7427- }
7428-
7429- // Pre-compute the cost for I, if it has a reduction pattern cost.
7430- for (Instruction *I : ChainOpsAndOperands) {
7431- auto ReductionCost = CM.getReductionPatternCost (
7432- I, VF, ToVectorTy (I->getType (), VF), TTI::TCK_RecipThroughput);
7433- if (!ReductionCost)
7434- continue ;
7435-
7436- assert (!CostCtx.SkipCostComputation .contains (I) &&
7437- " reduction op visited multiple times" );
7438- CostCtx.SkipCostComputation .insert (I);
7439- LLVM_DEBUG (dbgs () << " Cost of " << ReductionCost << " for VF " << VF
7440- << " :\n in-loop reduction " << *I << " \n " );
7441- Cost += *ReductionCost;
7442- }
7443- }
7444-
7445- // Now compute and add the VPlan-based cost.
7446- Cost += Plan.cost (VF, CostCtx);
7447- LLVM_DEBUG (dbgs () << " Cost for VF " << VF << " : " << Cost << " \n " );
7448- return Cost;
7449- }
7450-
7451- VPlan &LoopVectorizationPlanner::getBestPlan () const {
7452- // If there is a single VPlan with a single VF, return it directly.
7453- VPlan &FirstPlan = *VPlans[0 ];
7454- if (VPlans.size () == 1 && size (FirstPlan.vectorFactors ()) == 1 )
7455- return FirstPlan;
7456-
7457- VPlan *BestPlan = &FirstPlan;
7458- ElementCount ScalarVF = ElementCount::getFixed (1 );
7459- assert (hasPlanWithVF (ScalarVF) &&
7460- " More than a single plan/VF w/o any plan having scalar VF" );
7461-
7462- InstructionCost ScalarCost = cost (getBestPlanFor (ScalarVF), ScalarVF);
7463- VectorizationFactor BestFactor (ScalarVF, ScalarCost, ScalarCost);
7464-
7465- bool ForceVectorization = Hints.getForce () == LoopVectorizeHints::FK_Enabled;
7466- if (ForceVectorization) {
7467- // Ignore scalar width, because the user explicitly wants vectorization.
7468- // Initialize cost to max so that VF = 2 is, at least, chosen during cost
7469- // evaluation.
7470- BestFactor.Cost = InstructionCost::getMax ();
7471- }
7472-
7473- for (auto &P : VPlans) {
7474- for (ElementCount VF : P->vectorFactors ()) {
7475- if (VF.isScalar ())
7476- continue ;
7477- InstructionCost Cost = cost (*P, VF);
7478- VectorizationFactor CurrentFactor (VF, Cost, ScalarCost);
7479- if (isMoreProfitable (CurrentFactor, BestFactor)) {
7480- BestFactor = CurrentFactor;
7481- BestPlan = &*P;
7482- }
7483- }
7484- }
7485- BestPlan->setVF (BestFactor.Width );
7486- return *BestPlan;
7487- }
7488-
74897311VPlan &LoopVectorizationPlanner::getBestPlanFor (ElementCount VF) const {
74907312 assert (count_if (VPlans,
74917313 [VF](const VPlanPtr &Plan) { return Plan->hasVF (VF); }) ==
@@ -10344,15 +10166,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1034410166 VF.MinProfitableTripCount , IC, &LVL, &CM, BFI,
1034510167 PSI, Checks);
1034610168
10347- VPlan &BestPlan = LVP.getBestPlan ();
10348- assert (size (BestPlan.vectorFactors ()) == 1 &&
10349- " Plan should have a single VF" );
10350- ElementCount Width = *BestPlan.vectorFactors ().begin ();
10351- LLVM_DEBUG (dbgs () << " VF picked by VPlan cost model: " << Width
10352- << " \n " );
10353- assert (VF.Width == Width &&
10354- " VPlan cost model and legacy cost model disagreed" );
10355- LVP.executePlan (Width, IC, BestPlan, LB, DT, false );
10169+ VPlan &BestPlan = LVP.getBestPlanFor (VF.Width );
10170+ LVP.executePlan (VF.Width , IC, BestPlan, LB, DT, false );
1035610171 ++LoopsVectorized;
1035710172
1035810173 // Add metadata to disable runtime unrolling a scalar loop when there
0 commit comments