@@ -995,7 +995,8 @@ class LoopVectorizationCostModel {
995995 /// If interleave count has been specified by metadata it will be returned.
996996 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
997997 /// are the selected vectorization factor and the cost of the selected VF.
998- unsigned selectInterleaveCount (ElementCount VF, InstructionCost LoopCost);
998+ unsigned selectInterleaveCount (VPlan &Plan, ElementCount VF,
999+ InstructionCost LoopCost);
9991000
10001001 /// A memory access instruction may be vectorized in more than one way.
10011002 /// The form of the instruction after vectorization depends on cost.
@@ -4850,8 +4851,233 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
48504851 }
48514852}
48524853
4854+ /// Estimate the register usage for \p Plan and each vectorization factor in \p VFs
4855+ /// by calculating the highest number of values that are live at a single
4856+ /// location as a rough estimate. \p TTI supplies the register class and register count per type; single-def recipes whose underlying value is in \p ValuesToIgnore are skipped.
4857+ /// Returns one RegisterUsage per entry of \p VFs, in the same order; its MaxLocalUsers and LoopInvariantRegs maps are keyed by register class ID.
4858+ static SmallVector<LoopVectorizationCostModel::RegisterUsage, 8 >
4859+ calculateRegisterUsage (VPlan &Plan, ArrayRef<ElementCount> VFs,
4860+ const TargetTransformInfo &TTI,
4861+ const SmallPtrSetImpl<const Value *> &ValuesToIgnore) {
4862+ // Each 'key' in the map opens a new interval. The values
4863+ // of the map are the index of the 'last seen' usage of the
4864+ // recipe that is the key.
4865+ using IntervalMap = SmallDenseMap<VPRecipeBase *, unsigned , 16 >;
4866+
4867+ // Maps each index (position in the traversal below) to its recipe.
4868+ SmallVector<VPRecipeBase *, 64 > Idx2Recipe;
4869+ // Marks the end of each interval.
4870+ IntervalMap EndPoint;
4871+ // Saves the set of recipes that have at least one recorded use in the loop.
4872+ SmallPtrSet<VPRecipeBase *, 8 > Ends;
4873+ // Saves the list of values that are used in the loop but are defined outside
4874+ // the loop (not including non-recipe values such as arguments and
4875+ // constants).
4876+ SmallSetVector<VPValue *, 8 > LoopInvariants;
4877+ LoopInvariants.insert (&Plan.getVectorTripCount ()); // Always counted as a loop invariant.
4878+
4879+ // We scan the loop in topological order and assign a number to
4880+ // each recipe. We use RPO to ensure that defs are met before their users. We
4881+ // assume that each recipe that has in-loop users starts an interval. We
4882+ // record every time that an in-loop value is used, so we have a list of the
4883+ // first and last occurrences of each recipe.
4884+ ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT (
4885+ Plan.getVectorLoopRegion ());
4886+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
4887+ if (!VPBB->getParent ()) // Stop at the first block that has no parent.
4888+ break ;
4889+ for (VPRecipeBase &R : *VPBB) {
4890+ Idx2Recipe.push_back (&R);
4891+
4892+ // Save the end location of each USE.
4893+ for (VPValue *U : R.operands ()) {
4894+ auto *DefR = U->getDefiningRecipe ();
4895+
4896+ // Ignore non-recipe values such as arguments, constants, etc.
4897+ // FIXME: Might need some motivation why these values are ignored. If
4898+ // for example an argument is used inside the loop it will increase the
4899+ // register pressure (so shouldn't we add it to LoopInvariants).
4900+ if (!DefR && (!U->getLiveInIRValue () ||
4901+ !isa<Instruction>(U->getLiveInIRValue ())))
4902+ continue ;
4903+
4904+ // No defining recipe, but the live-in IR value is an Instruction: treat it as defined outside the loop, record it and continue.
4905+ if (!DefR) {
4906+ LoopInvariants.insert (U);
4907+ continue ;
4908+ }
4909+
4910+ // Overwrite previous end points.
4911+ EndPoint[DefR] = Idx2Recipe.size (); // One past the user's index, so DefR is still open while the user itself is processed below.
4912+ Ends.insert (DefR);
4913+ }
4914+ }
4915+ if (VPBB == Plan.getVectorLoopRegion ()->getExiting ()) {
4916+ // VPWidenIntOrFpInductionRecipes are used implicitly at the end of the
4917+ // exiting block, where their increment will get materialized eventually.
4918+ for (auto &R : Plan.getVectorLoopRegion ()->getEntryBasicBlock ()->phis ()) {
4919+ if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
4920+ EndPoint[&R] = Idx2Recipe.size ();
4921+ Ends.insert (&R);
4922+ }
4923+ }
4924+ }
4925+ }
4926+
4927+ // Saves the list of intervals that end with the index in 'key'.
4928+ using RecipeList = SmallVector<VPRecipeBase *, 2 >;
4929+ SmallDenseMap<unsigned , RecipeList, 16 > TransposeEnds;
4930+
4931+ // Next, we transpose the EndPoints into a multi map that holds the list of
4932+ // intervals that *end* at a specific location.
4933+ for (auto &Interval : EndPoint)
4934+ TransposeEnds[Interval.second ].push_back (Interval.first );
4935+
4936+ SmallPtrSet<VPRecipeBase *, 8 > OpenIntervals;
4937+ SmallVector<LoopVectorizationCostModel::RegisterUsage, 8 > RUs (VFs.size ());
4938+ SmallVector<SmallMapVector<unsigned , unsigned , 4 >, 8 > MaxUsages (VFs.size ());
4939+
4940+ LLVM_DEBUG (dbgs () << " LV(REG): Calculating max register usage:\n " );
4941+
4942+ VPTypeAnalysis TypeInfo (Plan.getCanonicalIV ()->getScalarType ());
4943+
4944+ const auto &TTICapture = TTI; // Alias of TTI captured by the lambda below.
4945+ auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { // Registers TTI reports for a vector of VF x Ty; 0 for types that cannot form such a vector.
4946+ if (Ty->isTokenTy () || !VectorType::isValidElementType (Ty) ||
4947+ (VF.isScalable () &&
4948+ !TTICapture.isElementTypeLegalForScalableVector (Ty)))
4949+ return 0 ;
4950+ return TTICapture.getRegUsageForType (VectorType::get (Ty, VF));
4951+ };
4952+
4953+ // We scan the instructions linearly and record each time that a new interval
4954+ // starts, by placing it in a set. If we find this value in TransposeEnds then
4955+ // we remove it from the set. The max register usage is the maximum register
4956+ // usage of the recipes of the set.
4957+ for (unsigned int Idx = 0 , Sz = Idx2Recipe.size (); Idx < Sz; ++Idx) {
4958+ VPRecipeBase *R = Idx2Recipe[Idx];
4959+
4960+ // Remove all of the recipes that end at this location.
4961+ RecipeList &List = TransposeEnds[Idx]; // operator[] inserts an empty list when no interval ends at Idx.
4962+ for (VPRecipeBase *ToRemove : List)
4963+ OpenIntervals.erase (ToRemove);
4964+
4965+ // Ignore recipes that are never used within the loop and do not have side
4966+ // effects.
4967+ if (!Ends.count (R) && !R->mayHaveSideEffects ())
4968+ continue ;
4969+
4970+ // Skip recipes for ignored values.
4971+ // TODO: Should mark recipes for ephemeral values that cannot be removed
4972+ // explicitly in VPlan.
4973+ if (isa<VPSingleDefRecipe>(R) &&
4974+ ValuesToIgnore.contains (
4975+ cast<VPSingleDefRecipe>(R)->getUnderlyingValue ()))
4976+ continue ;
4977+
4978+ // For each VF find the maximum usage of registers.
4979+ for (unsigned J = 0 , E = VFs.size (); J < E; ++J) {
4980+ // Count the number of registers used, per register class, given all open
4981+ // intervals.
4982+ // Note that elements in this SmallMapVector will be default constructed
4983+ // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
4984+ // there is no previous entry for ClassID.
4985+ SmallMapVector<unsigned , unsigned , 4 > RegUsage;
4986+
4987+ for (auto *R : OpenIntervals) { // NOTE: this R shadows the outer R (the recipe at Idx).
4988+ // Skip recipes that weren't present in the original loop.
4989+ // TODO: Remove after removing the legacy
4990+ // LoopVectorizationCostModel::calculateRegisterUsage
4991+ if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe,
4992+ VPBranchOnMaskRecipe>(R))
4993+ continue ;
4994+
4995+ if (VFs[J].isScalar () ||
4996+ isa<VPCanonicalIVPHIRecipe, VPReplicateRecipe, VPDerivedIVRecipe,
4997+ VPScalarIVStepsRecipe>(R) ||
4998+ (isa<VPInstruction>(R) &&
4999+ all_of (cast<VPSingleDefRecipe>(R)->users (), [&](VPUser *U) {
5000+ return cast<VPRecipeBase>(U)->usesScalars (R->getVPSingleValue ());
5001+ }))) { // Scalar VF, one of the listed recipe kinds, or a VPInstruction whose users all use scalars: count one register.
5002+ unsigned ClassID = TTI.getRegisterClassForType (
5003+ false , TypeInfo.inferScalarType (R->getVPSingleValue ()));
5004+ // FIXME: The target might use more than one register for the type
5005+ // even in the scalar case.
5006+ RegUsage[ClassID] += 1 ;
5007+ } else { // Otherwise add the vector register usage of every value R defines.
5008+ for (VPValue *DefV : R->definedValues ()) {
5009+ Type *ScalarTy = TypeInfo.inferScalarType (DefV);
5010+ unsigned ClassID = TTI.getRegisterClassForType (true , ScalarTy);
5011+ RegUsage[ClassID] += GetRegUsage (ScalarTy, VFs[J]);
5012+ }
5013+ }
5014+ }
5015+
5016+ for (const auto &Pair : RegUsage) { // Keep the per-class maximum seen so far for this VF.
5017+ auto &Entry = MaxUsages[J][Pair.first ];
5018+ Entry = std::max (Entry, Pair.second );
5019+ }
5020+ }
5021+
5022+ LLVM_DEBUG (dbgs () << " LV(REG): At #" << Idx << " Interval # "
5023+ << OpenIntervals.size () << ' \n ' );
5024+
5025+ // Add the current recipe to the set of open intervals (after counting, so it is not included in the usage recorded at its own index).
5026+ OpenIntervals.insert (R);
5027+ }
5028+
5029+ // We also search for instructions that are defined outside the loop, but are
5030+ // used inside the loop. We need this number separately from the max-interval
5031+ // usage number because when we unroll, loop-invariant values do not take
5032+ // more registers.
5033+ LoopVectorizationCostModel::RegisterUsage RU; // Reused across VFs; both fields assigned below are overwritten on every iteration.
5034+ for (unsigned Idx = 0 , End = VFs.size (); Idx < End; ++Idx) {
5035+ // Note that elements in this SmallMapVector will be default constructed
5036+ // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5037+ // there is no previous entry for ClassID.
5038+ SmallMapVector<unsigned , unsigned , 4 > Invariant;
5039+
5040+ for (auto *In : LoopInvariants) {
5041+ // FIXME: The target might use more than one register for the type
5042+ // even in the scalar case.
5043+ bool IsScalar = all_of (In->users (), [&](VPUser *U) {
5044+ return cast<VPRecipeBase>(U)->usesScalars (In);
5045+ });
5046+
5047+ ElementCount VF = IsScalar ? ElementCount::getFixed (1 ) : VFs[Idx]; // Invariants whose users all use scalars are costed at a fixed VF of 1.
5048+ unsigned ClassID = TTI.getRegisterClassForType (
5049+ VF.isVector (), TypeInfo.inferScalarType (In));
5050+ Invariant[ClassID] += GetRegUsage (TypeInfo.inferScalarType (In), VF);
5051+ }
5052+
5053+ LLVM_DEBUG ({
5054+ dbgs () << " LV(REG): VF = " << VFs[Idx] << ' \n ' ;
5055+ dbgs () << " LV(REG): Found max usage: " << MaxUsages[Idx].size ()
5056+ << " item\n " ;
5057+ for (const auto &pair : MaxUsages[Idx]) {
5058+ dbgs () << " LV(REG): RegisterClass: "
5059+ << TTI.getRegisterClassName (pair.first ) << " , " << pair.second
5060+ << " registers\n " ;
5061+ }
5062+ dbgs () << " LV(REG): Found invariant usage: " << Invariant.size ()
5063+ << " item\n " ;
5064+ for (const auto &pair : Invariant) {
5065+ dbgs () << " LV(REG): RegisterClass: "
5066+ << TTI.getRegisterClassName (pair.first ) << " , " << pair.second
5067+ << " registers\n " ;
5068+ }
5069+ });
5070+
5071+ RU.LoopInvariantRegs = Invariant;
5072+ RU.MaxLocalUsers = MaxUsages[Idx];
5073+ RUs[Idx] = RU;
5074+ }
5075+
5076+ return RUs;
5077+ }
5078+
48535079unsigned
4854- LoopVectorizationCostModel::selectInterleaveCount (ElementCount VF,
5080+ LoopVectorizationCostModel::selectInterleaveCount (VPlan &Plan, ElementCount VF,
48555081 InstructionCost LoopCost) {
48565082 // -- The interleave heuristics --
48575083 // We interleave the loop in order to expose ILP and reduce the loop overhead.
@@ -4901,7 +5127,8 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
49015127 return 1 ;
49025128 }
49035129
4904- RegisterUsage R = calculateRegisterUsage ({VF})[0 ];
5130+ RegisterUsage R =
5131+ ::calculateRegisterUsage (Plan, {VF}, TTI, ValuesToIgnore)[0];
49055132 // We divide by these constants so assume that we have at least one
49065133 // instruction that uses at least one register.
49075134 for (auto &Pair : R.MaxLocalUsers ) {
@@ -5152,7 +5379,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
51525379 // We also search for instructions that are defined outside the loop, but are
51535380 // used inside the loop. We need this number separately from the max-interval
51545381 // usage number because when we unroll, loop-invariant values do not take
5155- // more register .
5382+ // more registers .
51565383 LoopBlocksDFS DFS (TheLoop);
51575384 DFS.perform (LI);
51585385
@@ -10657,7 +10884,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1065710884 AddBranchWeights, CM.CostKind );
1065810885 if (LVP.hasPlanWithVF (VF.Width )) {
1065910886 // Select the interleave count.
10660- IC = CM.selectInterleaveCount (VF.Width , VF.Cost );
10887+ IC = CM.selectInterleaveCount (LVP. getPlanFor (VF. Width ), VF.Width , VF.Cost );
1066110888
1066210889 unsigned SelectedIC = std::max (IC, UserIC);
1066310890 // Optimistically generate runtime checks if they are needed. Drop them if
0 commit comments