@@ -1018,7 +1018,8 @@ class LoopVectorizationCostModel {
10181018 // / If interleave count has been specified by metadata it will be returned.
10191019 // / Otherwise, the interleave count is computed and returned. VF and LoopCost
10201020 // / are the selected vectorization factor and the cost of the selected VF.
1021- unsigned selectInterleaveCount (ElementCount VF, InstructionCost LoopCost);
1021+ unsigned selectInterleaveCount (VPlan &Plan, ElementCount VF,
1022+ InstructionCost LoopCost);
10221023
10231024 // / Memory access instruction may be vectorized in more than one way.
10241025 // / Form of instruction after vectorization depends on cost.
@@ -4885,8 +4886,232 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
48854886 }
48864887}
48874888
4889+ // / Estimate the register usage for \p Plan and vectorization factors in \p VFs.
4890+ // / Returns the register usage for each VF in \p VFs.
4891+ static SmallVector<LoopVectorizationCostModel::RegisterUsage, 8 >
4892+ calculateRegisterUsage (VPlan &Plan, ArrayRef<ElementCount> VFs,
4893+ const TargetTransformInfo &TTI) {
4894+ // This function calculates the register usage by measuring the highest number
4895+ // of values that are alive at a single location. Obviously, this is a very
4896+ // rough estimation. We scan the loop in a topological order in order and
4897+ // assign a number to each recipe. We use RPO to ensure that defs are
4898+ // met before their users. We assume that each recipe that has in-loop
4899+ // users starts an interval. We record every time that an in-loop value is
4900+ // used, so we have a list of the first and last occurrences of each
4901+ // recipe. Next, we transpose this data structure into a multi map that
4902+ // holds the list of intervals that *end* at a specific location. This multi
4903+ // map allows us to perform a linear search. We scan the instructions linearly
4904+ // and record each time that a new interval starts, by placing it in a set.
4905+ // If we find this value in the multi-map then we remove it from the set.
4906+ // The max register usage is the maximum size of the set.
4907+ // We also search for instructions that are defined outside the loop, but are
4908+ // used inside the loop. We need this number separately from the max-interval
4909+ // usage number because when we unroll, loop-invariant values do not take
4910+ // more register.
4911+ LoopVectorizationCostModel::RegisterUsage RU;
4912+
4913+ // Each 'key' in the map opens a new interval. The values
4914+ // of the map are the index of the 'last seen' usage of the
4915+ // recipe that is the key.
4916+ using IntervalMap = SmallDenseMap<VPRecipeBase *, unsigned , 16 >;
4917+
4918+ // Maps recipe to its index.
4919+ SmallVector<VPRecipeBase *, 64 > IdxToRecipe;
4920+ // Marks the end of each interval.
4921+ IntervalMap EndPoint;
4922+ // Saves the list of recipe indices that are used in the loop.
4923+ SmallPtrSet<VPRecipeBase *, 8 > Ends;
4924+ // Saves the list of values that are used in the loop but are defined outside
4925+ // the loop (not including non-recipe values such as arguments and
4926+ // constants).
4927+ SmallSetVector<VPValue *, 8 > LoopInvariants;
4928+ LoopInvariants.insert (&Plan.getVectorTripCount ());
4929+
4930+ ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT (
4931+ Plan.getVectorLoopRegion ());
4932+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
4933+ if (!VPBB->getParent ())
4934+ break ;
4935+ for (VPRecipeBase &R : *VPBB) {
4936+ IdxToRecipe.push_back (&R);
4937+
4938+ // Save the end location of each USE.
4939+ for (VPValue *U : R.operands ()) {
4940+ auto *DefR = U->getDefiningRecipe ();
4941+
4942+ // Ignore non-recipe values such as arguments, constants, etc.
4943+ // FIXME: Might need some motivation why these values are ignored. If
4944+ // for example an argument is used inside the loop it will increase the
4945+ // register pressure (so shouldn't we add it to LoopInvariants).
4946+ if (!DefR && (!U->getLiveInIRValue () ||
4947+ !isa<Instruction>(U->getLiveInIRValue ())))
4948+ continue ;
4949+
4950+ // If this recipe is outside the loop then record it and continue.
4951+ if (!DefR) {
4952+ LoopInvariants.insert (U);
4953+ continue ;
4954+ }
4955+
4956+ // Overwrite previous end points.
4957+ EndPoint[DefR] = IdxToRecipe.size ();
4958+ Ends.insert (DefR);
4959+ }
4960+ }
4961+ if (VPBB == Plan.getVectorLoopRegion ()->getExiting ()) {
4962+ // VPWidenIntOrFpInductionRecipes are used implicitly at the end of the
4963+ // exiting block, where their increment will get materialized eventually.
4964+ for (auto &R : Plan.getVectorLoopRegion ()->getEntryBasicBlock ()->phis ()) {
4965+ if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
4966+ EndPoint[&R] = IdxToRecipe.size ();
4967+ Ends.insert (&R);
4968+ }
4969+ }
4970+ }
4971+ }
4972+
4973+ // Saves the list of intervals that end with the index in 'key'.
4974+ using RecipeList = SmallVector<VPRecipeBase *, 2 >;
4975+ SmallDenseMap<unsigned , RecipeList, 16 > TransposeEnds;
4976+
4977+ // Transpose the EndPoints to a list of values that end at each index.
4978+ for (auto &Interval : EndPoint)
4979+ TransposeEnds[Interval.second ].push_back (Interval.first );
4980+
4981+ SmallPtrSet<VPRecipeBase *, 8 > OpenIntervals;
4982+ SmallVector<LoopVectorizationCostModel::RegisterUsage, 8 > RUs (VFs.size ());
4983+ SmallVector<SmallMapVector<unsigned , unsigned , 4 >, 8 > MaxUsages (VFs.size ());
4984+
4985+ LLVM_DEBUG (dbgs () << " LV(REG): Calculating max register usage:\n " );
4986+
4987+ VPTypeAnalysis TypeInfo (Plan.getCanonicalIV ()->getScalarType ());
4988+
4989+ const auto &TTICapture = TTI;
4990+ auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
4991+ if (Ty->isTokenTy () || !VectorType::isValidElementType (Ty) ||
4992+ (VF.isScalable () &&
4993+ !TTICapture.isElementTypeLegalForScalableVector (Ty)))
4994+ return 0 ;
4995+ return TTICapture.getRegUsageForType (VectorType::get (Ty, VF));
4996+ };
4997+
4998+ for (unsigned int Idx = 0 , Sz = IdxToRecipe.size (); Idx < Sz; ++Idx) {
4999+ VPRecipeBase *R = IdxToRecipe[Idx];
5000+
5001+ // Remove all of the recipes that end at this location.
5002+ RecipeList &List = TransposeEnds[Idx];
5003+ for (VPRecipeBase *ToRemove : List)
5004+ OpenIntervals.erase (ToRemove);
5005+
5006+ // Ignore recipes that are never used within the loop.
5007+ if (!Ends.count (R) && !R->mayHaveSideEffects ())
5008+ continue ;
5009+
5010+ // For each VF find the maximum usage of registers.
5011+ for (unsigned J = 0 , E = VFs.size (); J < E; ++J) {
5012+ // Count the number of registers used, per register class, given all open
5013+ // intervals.
5014+ // Note that elements in this SmallMapVector will be default constructed
5015+ // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5016+ // there is no previous entry for ClassID.
5017+ SmallMapVector<unsigned , unsigned , 4 > RegUsage;
5018+
5019+ if (VFs[J].isScalar ()) {
5020+ for (auto *Inst : OpenIntervals) {
5021+ for (VPValue *DefV : Inst->definedValues ()) {
5022+ unsigned ClassID = TTI.getRegisterClassForType (
5023+ false , TypeInfo.inferScalarType (DefV));
5024+ // FIXME: The target might use more than one register for the type
5025+ // even in the scalar case.
5026+ RegUsage[ClassID] += 1 ;
5027+ }
5028+ }
5029+ } else {
5030+ for (auto *R : OpenIntervals) {
5031+ if (isa<VPVectorPointerRecipe, VPReverseVectorPointerRecipe>(R))
5032+ continue ;
5033+ if (isa<VPCanonicalIVPHIRecipe, VPReplicateRecipe, VPDerivedIVRecipe,
5034+ VPScalarIVStepsRecipe>(R) ||
5035+ (isa<VPInstruction>(R) &&
5036+ all_of (cast<VPSingleDefRecipe>(R)->users (), [&](VPUser *U) {
5037+ return cast<VPRecipeBase>(U)->usesScalars (
5038+ R->getVPSingleValue ());
5039+ }))) {
5040+ unsigned ClassID = TTI.getRegisterClassForType (
5041+ false , TypeInfo.inferScalarType (R->getVPSingleValue ()));
5042+ // FIXME: The target might use more than one register for the type
5043+ // even in the scalar case.
5044+ RegUsage[ClassID] += 1 ;
5045+ } else {
5046+ for (VPValue *DefV : R->definedValues ()) {
5047+ Type *ScalarTy = TypeInfo.inferScalarType (DefV);
5048+ unsigned ClassID = TTI.getRegisterClassForType (true , ScalarTy);
5049+ RegUsage[ClassID] += GetRegUsage (ScalarTy, VFs[J]);
5050+ }
5051+ }
5052+ }
5053+ }
5054+
5055+ for (const auto &Pair : RegUsage) {
5056+ auto &Entry = MaxUsages[J][Pair.first ];
5057+ Entry = std::max (Entry, Pair.second );
5058+ }
5059+ }
5060+
5061+ LLVM_DEBUG (dbgs () << " LV(REG): At #" << Idx << " Interval # "
5062+ << OpenIntervals.size () << ' \n ' );
5063+
5064+ // Add the current recipe to the list of open intervals.
5065+ OpenIntervals.insert (R);
5066+ }
5067+
5068+ for (unsigned Idx = 0 , End = VFs.size (); Idx < End; ++Idx) {
5069+ // Note that elements in this SmallMapVector will be default constructed
5070+ // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5071+ // there is no previous entry for ClassID.
5072+ SmallMapVector<unsigned , unsigned , 4 > Invariant;
5073+
5074+ for (auto *In : LoopInvariants) {
5075+ // FIXME: The target might use more than one register for the type
5076+ // even in the scalar case.
5077+ bool IsScalar = all_of (In->users (), [&](VPUser *U) {
5078+ return cast<VPRecipeBase>(U)->usesScalars (In);
5079+ });
5080+
5081+ ElementCount VF = IsScalar ? ElementCount::getFixed (1 ) : VFs[Idx];
5082+ unsigned ClassID = TTI.getRegisterClassForType (
5083+ VF.isVector (), TypeInfo.inferScalarType (In));
5084+ Invariant[ClassID] += GetRegUsage (TypeInfo.inferScalarType (In), VF);
5085+ }
5086+
5087+ LLVM_DEBUG ({
5088+ dbgs () << " LV(REG): VF = " << VFs[Idx] << ' \n ' ;
5089+ dbgs () << " LV(REG): Found max usage: " << MaxUsages[Idx].size ()
5090+ << " item\n " ;
5091+ for (const auto &pair : MaxUsages[Idx]) {
5092+ dbgs () << " LV(REG): RegisterClass: "
5093+ << TTI.getRegisterClassName (pair.first ) << " , " << pair.second
5094+ << " registers\n " ;
5095+ }
5096+ dbgs () << " LV(REG): Found invariant usage: " << Invariant.size ()
5097+ << " item\n " ;
5098+ for (const auto &pair : Invariant) {
5099+ dbgs () << " LV(REG): RegisterClass: "
5100+ << TTI.getRegisterClassName (pair.first ) << " , " << pair.second
5101+ << " registers\n " ;
5102+ }
5103+ });
5104+
5105+ RU.LoopInvariantRegs = Invariant;
5106+ RU.MaxLocalUsers = MaxUsages[Idx];
5107+ RUs[Idx] = RU;
5108+ }
5109+
5110+ return RUs;
5111+ }
5112+
48885113unsigned
4889- LoopVectorizationCostModel::selectInterleaveCount (ElementCount VF,
5114+ LoopVectorizationCostModel::selectInterleaveCount (VPlan &Plan, ElementCount VF,
48905115 InstructionCost LoopCost) {
48915116 // -- The interleave heuristics --
48925117 // We interleave the loop in order to expose ILP and reduce the loop overhead.
@@ -4936,7 +5161,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
49365161 return 1 ;
49375162 }
49385163
4939- RegisterUsage R = calculateRegisterUsage ({VF})[0 ];
5164+ RegisterUsage R = :: calculateRegisterUsage (Plan, {VF}, TTI )[0 ];
49405165 // We divide by these constants so assume that we have at least one
49415166 // instruction that uses at least one register.
49425167 for (auto &Pair : R.MaxLocalUsers ) {
@@ -10694,7 +10919,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1069410919 AddBranchWeights, CM.CostKind );
1069510920 if (LVP.hasPlanWithVF (VF.Width )) {
1069610921 // Select the interleave count.
10697- IC = CM.selectInterleaveCount (VF.Width , VF.Cost );
10922+ IC = CM.selectInterleaveCount (LVP. getPlanFor (VF. Width ), VF.Width , VF.Cost );
1069810923
1069910924 unsigned SelectedIC = std::max (IC, UserIC);
1070010925 // Optimistically generate runtime checks if they are needed. Drop them if
0 commit comments