@@ -992,7 +992,8 @@ class LoopVectorizationCostModel {
992992 // / If interleave count has been specified by metadata it will be returned.
993993 // / Otherwise, the interleave count is computed and returned. VF and LoopCost
994994 // / are the selected vectorization factor and the cost of the selected VF.
995- unsigned selectInterleaveCount (ElementCount VF, InstructionCost LoopCost);
995+ unsigned selectInterleaveCount (VPlan &Plan, ElementCount VF,
996+ InstructionCost LoopCost);
996997
997998 // / Memory access instruction may be vectorized in more than one way.
998999 // / Form of instruction after vectorization depends on cost.
@@ -4873,8 +4874,233 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
48734874 }
48744875}
48754876
4877+ // / Estimate the register usage for \p Plan and vectorization factors in \p VFs
4878+ // / by calculating the highest number of values that are live at a single
4879+ // / location as a rough estimate. Returns the register usage for each VF in \p
4880+ // / VFs. \p TTI is queried for register classes and per-type register usage; single-def recipes whose underlying value is in \p ValuesToIgnore are skipped.
4881+ static SmallVector<LoopVectorizationCostModel::RegisterUsage, 8 >
4882+ calculateRegisterUsage (VPlan &Plan, ArrayRef<ElementCount> VFs,
4883+ const TargetTransformInfo &TTI,
4884+ const SmallPtrSetImpl<const Value *> &ValuesToIgnore) {
4885+ // Each 'key' in the map opens a new interval. The values
4886+ // of the map are the index of the 'last seen' usage of the
4887+ // recipe that is the key.
4888+ using IntervalMap = SmallDenseMap<VPRecipeBase *, unsigned , 16 >;
4889+
4890+ // Maps indices to recipes.
4891+ SmallVector<VPRecipeBase *, 64 > Idx2Recipe;
4892+ // Marks the end of each interval.
4893+ IntervalMap EndPoint;
4894+ // Saves the set of recipes that have a recorded use in the loop.
4895+ SmallPtrSet<VPRecipeBase *, 8 > Ends;
4896+ // Saves the list of values that are used in the loop but are defined outside
4897+ // the loop (not including non-recipe values such as arguments and
4898+ // constants).
4899+ SmallSetVector<VPValue *, 8 > LoopInvariants;
4900+ LoopInvariants.insert (&Plan.getVectorTripCount ()); // The vector trip count is always counted as loop-invariant.
4901+
4902+ // We scan the loop in topological order and assign a number to
4903+ // each recipe. We use RPO to ensure that defs are met before their users. We
4904+ // assume that each recipe that has in-loop users starts an interval. We
4905+ // record every time that an in-loop value is used, so we have a list of the
4906+ // first and last occurrences of each recipe.
4907+ ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT (
4908+ Plan.getVectorLoopRegion ());
4909+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
4910+ if (!VPBB->getParent ())
4911+ break ; // Stop numbering at the first block that has no parent.
4912+ for (VPRecipeBase &R : *VPBB) {
4913+ Idx2Recipe.push_back (&R);
4914+
4915+ // Save the end location of each USE.
4916+ for (VPValue *U : R.operands ()) {
4917+ auto *DefR = U->getDefiningRecipe ();
4918+
4919+ // Ignore non-recipe values that are not backed by an IR instruction,
4920+ // such as arguments, constants, etc.
4921+ // FIXME: Might need some motivation why these values are ignored. If
4922+ // for example an argument is used inside the loop it will increase the
4923+ // register pressure (so shouldn't we add it to LoopInvariants).
4924+ if (!DefR && (!U->getLiveInIRValue () ||
4925+ !isa<Instruction>(U->getLiveInIRValue ())))
4926+ continue ;
4927+
4928+ // If this value has no defining recipe (a live-in backed by an IR
4929+ // instruction), record it as loop-invariant and continue.
4930+ if (!DefR) {
4931+ LoopInvariants.insert (U);
4932+ continue ;
4933+ }
4934+
4935+ // Overwrite previous end points. R was already appended to Idx2Recipe, so the end point is one past R's index.
4936+ EndPoint[DefR] = Idx2Recipe.size ();
4937+ Ends.insert (DefR);
4938+ }
4939+ }
4940+ if (VPBB == Plan.getVectorLoopRegion ()->getExiting ()) {
4941+ // VPWidenIntOrFpInductionRecipes are used implicitly at the end of the
4942+ // exiting block, where their increment will get materialized eventually.
4943+ for (auto &R : Plan.getVectorLoopRegion ()->getEntryBasicBlock ()->phis ()) {
4944+ if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
4945+ EndPoint[&R] = Idx2Recipe.size ();
4946+ Ends.insert (&R);
4947+ }
4948+ }
4949+ }
4950+ }
4951+
4952+ // Saves the list of intervals that end with the index in 'key'.
4953+ using RecipeList = SmallVector<VPRecipeBase *, 2 >;
4954+ SmallDenseMap<unsigned , RecipeList, 16 > TransposeEnds;
4955+
4956+ // Next, we transpose the EndPoints into a multi map that holds the list of
4957+ // intervals that *end* at a specific location.
4958+ for (auto &Interval : EndPoint)
4959+ TransposeEnds[Interval.second ].push_back (Interval.first );
4960+
4961+ SmallPtrSet<VPRecipeBase *, 8 > OpenIntervals;
4962+ SmallVector<LoopVectorizationCostModel::RegisterUsage, 8 > RUs (VFs.size ());
4963+ SmallVector<SmallMapVector<unsigned , unsigned , 4 >, 8 > MaxUsages (VFs.size ());
4964+
4965+ LLVM_DEBUG (dbgs () << " LV(REG): Calculating max register usage:\n " );
4966+
4967+ VPTypeAnalysis TypeInfo (Plan.getCanonicalIV ()->getScalarType ());
4968+ // GetRegUsage returns 0 for token types, invalid vector element types, and element types that are illegal for a scalable VF; otherwise it asks TTI for the usage of a vector of Ty with VF elements.
4969+ const auto &TTICapture = TTI;
4970+ auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
4971+ if (Ty->isTokenTy () || !VectorType::isValidElementType (Ty) ||
4972+ (VF.isScalable () &&
4973+ !TTICapture.isElementTypeLegalForScalableVector (Ty)))
4974+ return 0 ;
4975+ return TTICapture.getRegUsageForType (VectorType::get (Ty, VF));
4976+ };
4977+
4978+ // We scan the instructions linearly and record each time that a new interval
4979+ // starts, by placing it in a set. If we find this value in TransposeEnds then
4980+ // we remove it from the set. The max register usage is the maximum register
4981+ // usage of the recipes of the set.
4982+ for (unsigned int Idx = 0 , Sz = Idx2Recipe.size (); Idx < Sz; ++Idx) {
4983+ VPRecipeBase *R = Idx2Recipe[Idx];
4984+
4985+ // Remove all of the recipes that end at this location.
4986+ RecipeList &List = TransposeEnds[Idx];
4987+ for (VPRecipeBase *ToRemove : List)
4988+ OpenIntervals.erase (ToRemove);
4989+
4990+ // Ignore recipes that are never used within the loop and do not have side
4991+ // effects.
4992+ if (!Ends.count (R) && !R->mayHaveSideEffects ())
4993+ continue ;
4994+
4995+ // Skip recipes for ignored values.
4996+ // TODO: Should mark recipes for ephemeral values that cannot be removed
4997+ // explicitly in VPlan.
4998+ if (isa<VPSingleDefRecipe>(R) &&
4999+ ValuesToIgnore.contains (
5000+ cast<VPSingleDefRecipe>(R)->getUnderlyingValue ()))
5001+ continue ;
5002+
5003+ // For each VF find the maximum usage of registers.
5004+ for (unsigned J = 0 , E = VFs.size (); J < E; ++J) {
5005+ // Count the number of registers used, per register class, given all open
5006+ // intervals.
5007+ // Note that elements in this SmallMapVector will be default constructed
5008+ // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5009+ // there is no previous entry for ClassID.
5010+ SmallMapVector<unsigned , unsigned , 4 > RegUsage;
5011+
5012+ for (auto *R : OpenIntervals) { // NOTE: this R shadows the outer R (the recipe at Idx).
5013+ // Skip recipes that weren't present in the original loop.
5014+ // TODO: Remove after removing the legacy
5015+ // LoopVectorizationCostModel::calculateRegisterUsage
5016+ if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe,
5017+ VPBranchOnMaskRecipe>(R))
5018+ continue ;
5019+ // Count one scalar register when the VF is scalar, when R is one of the recipe kinds listed below, or when R is a VPInstruction all of whose users only use scalars; otherwise count vector usage per defined value.
5020+ if (VFs[J].isScalar () ||
5021+ isa<VPCanonicalIVPHIRecipe, VPReplicateRecipe, VPDerivedIVRecipe,
5022+ VPScalarIVStepsRecipe>(R) ||
5023+ (isa<VPInstruction>(R) &&
5024+ all_of (cast<VPSingleDefRecipe>(R)->users (), [&](VPUser *U) {
5025+ return cast<VPRecipeBase>(U)->usesScalars (R->getVPSingleValue ());
5026+ }))) {
5027+ unsigned ClassID = TTI.getRegisterClassForType (
5028+ false , TypeInfo.inferScalarType (R->getVPSingleValue ()));
5029+ // FIXME: The target might use more than one register for the type
5030+ // even in the scalar case.
5031+ RegUsage[ClassID] += 1 ;
5032+ } else {
5033+ for (VPValue *DefV : R->definedValues ()) {
5034+ Type *ScalarTy = TypeInfo.inferScalarType (DefV);
5035+ unsigned ClassID = TTI.getRegisterClassForType (true , ScalarTy);
5036+ RegUsage[ClassID] += GetRegUsage (ScalarTy, VFs[J]);
5037+ }
5038+ }
5039+ }
5040+ // Fold this location's per-class usage into the running per-class maximum for VFs[J].
5041+ for (const auto &Pair : RegUsage) {
5042+ auto &Entry = MaxUsages[J][Pair.first ];
5043+ Entry = std::max (Entry, Pair.second );
5044+ }
5045+ }
5046+
5047+ LLVM_DEBUG (dbgs () << " LV(REG): At #" << Idx << " Interval # "
5048+ << OpenIntervals.size () << ' \n ' );
5049+
5050+ // Add the current recipe to the list of open intervals.
5051+ OpenIntervals.insert (R);
5052+ }
5053+
5054+ // We also search for instructions that are defined outside the loop, but are
5055+ // used inside the loop. We need this number separately from the max-interval
5056+ // usage number because when we unroll, loop-invariant values do not take
5057+ // more registers.
5058+ LoopVectorizationCostModel::RegisterUsage RU;
5059+ for (unsigned Idx = 0 , End = VFs.size (); Idx < End; ++Idx) {
5060+ // Note that elements in this SmallMapVector will be default constructed
5061+ // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5062+ // there is no previous entry for ClassID.
5063+ SmallMapVector<unsigned , unsigned , 4 > Invariant;
5064+
5065+ for (auto *In : LoopInvariants) {
5066+ // FIXME: The target might use more than one register for the type
5067+ // even in the scalar case.
5068+ bool IsScalar = all_of (In->users (), [&](VPUser *U) {
5069+ return cast<VPRecipeBase>(U)->usesScalars (In);
5070+ });
5071+ // Invariants whose users all use scalars are costed with a fixed VF of 1 instead of VFs[Idx].
5072+ ElementCount VF = IsScalar ? ElementCount::getFixed (1 ) : VFs[Idx];
5073+ unsigned ClassID = TTI.getRegisterClassForType (
5074+ VF.isVector (), TypeInfo.inferScalarType (In));
5075+ Invariant[ClassID] += GetRegUsage (TypeInfo.inferScalarType (In), VF);
5076+ }
5077+
5078+ LLVM_DEBUG ({
5079+ dbgs () << " LV(REG): VF = " << VFs[Idx] << ' \n ' ;
5080+ dbgs () << " LV(REG): Found max usage: " << MaxUsages[Idx].size ()
5081+ << " item\n " ;
5082+ for (const auto &pair : MaxUsages[Idx]) {
5083+ dbgs () << " LV(REG): RegisterClass: "
5084+ << TTI.getRegisterClassName (pair.first ) << " , " << pair.second
5085+ << " registers\n " ;
5086+ }
5087+ dbgs () << " LV(REG): Found invariant usage: " << Invariant.size ()
5088+ << " item\n " ;
5089+ for (const auto &pair : Invariant) {
5090+ dbgs () << " LV(REG): RegisterClass: "
5091+ << TTI.getRegisterClassName (pair.first ) << " , " << pair.second
5092+ << " registers\n " ;
5093+ }
5094+ });
5095+ // Publish the result for VFs[Idx]; both fields of RU are overwritten on every iteration.
5096+ RU.LoopInvariantRegs = Invariant;
5097+ RU.MaxLocalUsers = MaxUsages[Idx];
5098+ RUs[Idx] = RU;
5099+ }
5100+
5101+ return RUs;
5102+ }
5101+
48765102unsigned
4877- LoopVectorizationCostModel::selectInterleaveCount (ElementCount VF,
5103+ LoopVectorizationCostModel::selectInterleaveCount (VPlan &Plan, ElementCount VF,
48785104 InstructionCost LoopCost) {
48795105 // -- The interleave heuristics --
48805106 // We interleave the loop in order to expose ILP and reduce the loop overhead.
@@ -4924,7 +5150,8 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
49245150 return 1 ;
49255151 }
49265152
4927- RegisterUsage R = calculateRegisterUsage ({VF})[0 ];
5153+ RegisterUsage R =
5154+ ::calculateRegisterUsage (Plan, {VF}, TTI, ValuesToIgnore)[0];
49285155 // We divide by these constants so assume that we have at least one
49295156 // instruction that uses at least one register.
49305157 for (auto &Pair : R.MaxLocalUsers ) {
@@ -5175,7 +5402,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
51755402 // We also search for instructions that are defined outside the loop, but are
51765403 // used inside the loop. We need this number separately from the max-interval
51775404 // usage number because when we unroll, loop-invariant values do not take
5178- // more register .
5405+ // more registers .
51795406 LoopBlocksDFS DFS (TheLoop);
51805407 DFS.perform (LI);
51815408
@@ -10755,7 +10982,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1075510982 AddBranchWeights, CM.CostKind );
1075610983 if (LVP.hasPlanWithVF (VF.Width )) {
1075710984 // Select the interleave count.
10758- IC = CM.selectInterleaveCount (VF.Width , VF.Cost );
10985+ IC = CM.selectInterleaveCount (LVP. getPlanFor (VF. Width ), VF.Width , VF.Cost );
1075910986
1076010987 unsigned SelectedIC = std::max (IC, UserIC);
1076110988 // Optimistically generate runtime checks if they are needed. Drop them if
0 commit comments