@@ -987,25 +987,6 @@ class LoopVectorizationCostModel {
987987 // / decision in a map for use in planning and plan execution.
988988 void setVectorizedCallDecision (ElementCount VF);
989989
990- // / A struct that represents some properties of the register usage
991- // / of a loop.
992- struct RegisterUsage {
993- // / Holds the number of loop invariant values that are used in the loop.
994- // / The key is the ClassID of a target-provided register class.
995- SmallMapVector<unsigned , unsigned , 4 > LoopInvariantRegs;
996- // / Holds the maximum number of concurrent live intervals in the loop.
997- // / The key is the ClassID of a target-provided register class.
998- SmallMapVector<unsigned , unsigned , 4 > MaxLocalUsers;
999-
1000- // / Returns true if any entry of MaxLocalUsers is greater than the number of
1001- // / registers \p TTI reports (getNumberOfRegisters) for that entry's register class.
1002- bool exceedsMaxNumRegs (const TargetTransformInfo &TTI) const {
1003- return any_of (MaxLocalUsers, [&TTI](auto &LU) {
1004- return LU.second > TTI.getNumberOfRegisters (LU.first );
1005- });
1006- }
1007- };
1008-
1009990 // / Collect values we want to ignore in the cost model.
1010991 void collectValuesToIgnore ();
1011992
@@ -4343,15 +4324,6 @@ static bool hasReplicatorRegion(VPlan &Plan) {
43434324}
43444325
43454326#ifndef NDEBUG
4346- // / Forward declaration. Estimate the register usage for \p Plan and vectorization factors in \p VFs
4347- // / by calculating the highest number of values that are live at a single
4348- // / location as a rough estimate. Returns one register usage entry for each VF in \p
4349- // / VFs, in the same order as \p VFs.
4350- static SmallVector<LoopVectorizationCostModel::RegisterUsage, 8 >
4351- calculateRegisterUsage (VPlan &Plan, ArrayRef<ElementCount> VFs,
4352- const TargetTransformInfo &TTI,
4353- const SmallPtrSetImpl<const Value *> &ValuesToIgnore);
4354-
43554327VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor () {
43564328 InstructionCost ExpectedCost = CM.expectedCost (ElementCount::getFixed (1 ));
43574329 LLVM_DEBUG (dbgs () << " LV: Scalar loop costs: " << ExpectedCost << " .\n " );
@@ -4377,7 +4349,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
43774349 for (auto &P : VPlans) {
43784350 ArrayRef<ElementCount> VFs (P->vectorFactors ().begin (),
43794351 P->vectorFactors ().end ());
4380- auto RUs = :: calculateRegisterUsage (*P, VFs, TTI, CM.ValuesToIgnore );
4352+ auto RUs = calculateRegisterUsageForPlan (*P, VFs, TTI, CM.ValuesToIgnore );
43814353 for (auto [VF, RU] : zip_equal (VFs, RUs)) {
43824354 // The cost for scalar VF=1 is already calculated, so ignore it.
43834355 if (VF.isScalar ())
@@ -4704,254 +4676,6 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
47044676 }
47054677}
47064678
4707- // / Get the VF scaling factor applied to the recipe's output, if the recipe has
4708- // / one (VPReductionPHIRecipe or VPPartialReductionRecipe). Returns 1 for any other recipe.
4709- static unsigned getVFScaleFactor (VPRecipeBase *R) {
4710- if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R))
4711- return RR->getVFScaleFactor ();
4712- if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
4713- return RR->getVFScaleFactor ();
4714- return 1 ;
4715- }
4716-
4717- // / Estimate the register usage for \p Plan and vectorization factors in \p VFs
4718- // / by calculating the highest number of values that are live at a single
4719- // / location as a rough estimate. Returns the register usage for each VF in \p
4720- // / VFs, in the same order as \p VFs. Single-def recipes whose underlying value is in \p ValuesToIgnore are skipped.
4721- static SmallVector<LoopVectorizationCostModel::RegisterUsage, 8 >
4722- calculateRegisterUsage (VPlan &Plan, ArrayRef<ElementCount> VFs,
4723- const TargetTransformInfo &TTI,
4724- const SmallPtrSetImpl<const Value *> &ValuesToIgnore) {
4725- // Each 'key' in the map opens a new interval. The values
4726- // of the map are the index of the 'last seen' usage of the
4727- // recipe that is the key.
4728- using IntervalMap = SmallDenseMap<VPRecipeBase *, unsigned , 16 >;
4729-
4730- // Maps indices to recipes.
4731- SmallVector<VPRecipeBase *, 64 > Idx2Recipe;
4732- // Marks the end of each interval.
4733- IntervalMap EndPoint;
4734- // Saves the set of recipes whose results are used in the loop.
4735- SmallPtrSet<VPRecipeBase *, 8 > Ends;
4736- // Saves the list of values that are used in the loop but are defined outside
4737- // the loop (not including non-recipe values such as arguments and
4738- // constants).
4739- SmallSetVector<VPValue *, 8 > LoopInvariants;
4740- LoopInvariants.insert (&Plan.getVectorTripCount ());
4741-
4742- // We scan the loop in topological order and assign a number to
4743- // each recipe. We use RPO to ensure that defs are met before their users. We
4744- // assume that each recipe that has in-loop users starts an interval. We
4745- // record every time that an in-loop value is used, so we have a list of the
4746- // first and last occurrences of each recipe.
4747- ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT (
4748- Plan.getVectorLoopRegion ());
4749- for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
4750- if (!VPBB->getParent ())
4751- break ;
4752- for (VPRecipeBase &R : *VPBB) {
4753- Idx2Recipe.push_back (&R);
4754-
4755- // Save the end location of each USE.
4756- for (VPValue *U : R.operands ()) {
4757- auto *DefR = U->getDefiningRecipe ();
4758-
4759- // Ignore non-recipe values such as arguments, constants, etc.
4760- // FIXME: Might need some motivation why these values are ignored. If
4761- // for example an argument is used inside the loop it will increase the
4762- // register pressure (so shouldn't we add it to LoopInvariants).
4763- if (!DefR && (!U->getLiveInIRValue () ||
4764- !isa<Instruction>(U->getLiveInIRValue ())))
4765- continue ;
4766-
4767- // If this value has no defining recipe then record it as loop invariant and continue.
4768- if (!DefR) {
4769- LoopInvariants.insert (U);
4770- continue ;
4771- }
4772-
4773- // Overwrite previous end points.
4774- EndPoint[DefR] = Idx2Recipe.size ();
4775- Ends.insert (DefR);
4776- }
4777- }
4778- if (VPBB == Plan.getVectorLoopRegion ()->getExiting ()) {
4779- // VPWidenIntOrFpInductionRecipes are used implicitly at the end of the
4780- // exiting block, where their increment will get materialized eventually.
4781- for (auto &R : Plan.getVectorLoopRegion ()->getEntryBasicBlock ()->phis ()) {
4782- if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
4783- EndPoint[&R] = Idx2Recipe.size ();
4784- Ends.insert (&R);
4785- }
4786- }
4787- }
4788- }
4789-
4790- // Saves the list of intervals that end with the index in 'key'.
4791- using RecipeList = SmallVector<VPRecipeBase *, 2 >;
4792- SmallDenseMap<unsigned , RecipeList, 16 > TransposeEnds;
4793-
4794- // Next, we transpose the EndPoints into a multi map that holds the list of
4795- // intervals that *end* at a specific location.
4796- for (auto &Interval : EndPoint)
4797- TransposeEnds[Interval.second ].push_back (Interval.first );
4798-
4799- SmallPtrSet<VPRecipeBase *, 8 > OpenIntervals;
4800- SmallVector<LoopVectorizationCostModel::RegisterUsage, 8 > RUs (VFs.size ());
4801- SmallVector<SmallMapVector<unsigned , unsigned , 4 >, 8 > MaxUsages (VFs.size ());
4802-
4803- LLVM_DEBUG (dbgs () << " LV(REG): Calculating max register usage:\n " );
4804-
4805- VPTypeAnalysis TypeInfo (Plan.getCanonicalIV ()->getScalarType ());
4806-
4807- const auto &TTICapture = TTI;
4808- auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
4809- if (Ty->isTokenTy () || !VectorType::isValidElementType (Ty) ||
4810- (VF.isScalable () &&
4811- !TTICapture.isElementTypeLegalForScalableVector (Ty)))
4812- return 0 ;
4813- return TTICapture.getRegUsageForType (VectorType::get (Ty, VF));
4814- };
4815-
4816- // We scan the instructions linearly and record each time that a new interval
4817- // starts, by placing it in a set. If we find this value in TransposeEnds then
4818- // we remove it from the set. The max register usage is the maximum register
4819- // usage of the recipes of the set.
4820- for (unsigned int Idx = 0 , Sz = Idx2Recipe.size (); Idx < Sz; ++Idx) {
4821- VPRecipeBase *R = Idx2Recipe[Idx];
4822-
4823- // Remove all of the recipes that end at this location.
4824- RecipeList &List = TransposeEnds[Idx];
4825- for (VPRecipeBase *ToRemove : List)
4826- OpenIntervals.erase (ToRemove);
4827-
4828- // Ignore recipes that are never used within the loop and do not have side
4829- // effects.
4830- if (!Ends.count (R) && !R->mayHaveSideEffects ())
4831- continue ;
4832-
4833- // Skip recipes for ignored values.
4834- // TODO: Should mark recipes for ephemeral values that cannot be removed
4835- // explicitly in VPlan.
4836- if (isa<VPSingleDefRecipe>(R) &&
4837- ValuesToIgnore.contains (
4838- cast<VPSingleDefRecipe>(R)->getUnderlyingValue ()))
4839- continue ;
4840-
4841- // For each VF find the maximum usage of registers.
4842- for (unsigned J = 0 , E = VFs.size (); J < E; ++J) {
4843- // Count the number of registers used, per register class, given all open
4844- // intervals.
4845- // Note that elements in this SmallMapVector will be default constructed
4846- // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
4847- // there is no previous entry for ClassID.
4848- SmallMapVector<unsigned , unsigned , 4 > RegUsage;
4849-
4850- for (auto *R : OpenIntervals) {
4851- // Skip recipes that weren't present in the original loop.
4852- // TODO: Remove after removing the legacy
4853- // LoopVectorizationCostModel::calculateRegisterUsage
4854- if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe,
4855- VPBranchOnMaskRecipe>(R))
4856- continue ;
4857-
4858- if (VFs[J].isScalar () ||
4859- isa<VPCanonicalIVPHIRecipe, VPReplicateRecipe, VPDerivedIVRecipe,
4860- VPScalarIVStepsRecipe>(R) ||
4861- (isa<VPInstruction>(R) &&
4862- all_of (cast<VPSingleDefRecipe>(R)->users (),
4863- [&](VPUser *U) {
4864- return cast<VPRecipeBase>(U)->usesScalars (
4865- R->getVPSingleValue ());
4866- })) ||
4867- (isa<VPReductionPHIRecipe>(R) &&
4868- (cast<VPReductionPHIRecipe>(R))->isInLoop ())) {
4869- unsigned ClassID = TTI.getRegisterClassForType (
4870- false , TypeInfo.inferScalarType (R->getVPSingleValue ()));
4871- // FIXME: The target might use more than one register for the type
4872- // even in the scalar case.
4873- RegUsage[ClassID] += 1 ;
4874- } else {
4875- // The output from scaled phis and scaled reductions actually has
4876- // fewer lanes than the VF.
4877- unsigned ScaleFactor = getVFScaleFactor (R);
4878- ElementCount VF = VFs[J].divideCoefficientBy (ScaleFactor);
4879- LLVM_DEBUG (if (VF != VFs[J]) {
4880- dbgs () << " LV(REG): Scaled down VF from " << VFs[J] << " to " << VF
4881- << " for " << *R << " \n " ;
4882- });
4883-
4884- for (VPValue *DefV : R->definedValues ()) {
4885- Type *ScalarTy = TypeInfo.inferScalarType (DefV);
4886- unsigned ClassID = TTI.getRegisterClassForType (true , ScalarTy);
4887- RegUsage[ClassID] += GetRegUsage (ScalarTy, VF);
4888- }
4889- }
4890- }
4891-
4892- for (const auto &Pair : RegUsage) {
4893- auto &Entry = MaxUsages[J][Pair.first ];
4894- Entry = std::max (Entry, Pair.second );
4895- }
4896- }
4897-
4898- LLVM_DEBUG (dbgs () << " LV(REG): At #" << Idx << " Interval # "
4899- << OpenIntervals.size () << ' \n ' );
4900-
4901- // Add the current recipe to the list of open intervals.
4902- OpenIntervals.insert (R);
4903- }
4904-
4905- // We also search for instructions that are defined outside the loop, but are
4906- // used inside the loop. We need this number separately from the max-interval
4907- // usage number because when we unroll, loop-invariant values do not take
4908- // more registers.
4909- LoopVectorizationCostModel::RegisterUsage RU;
4910- for (unsigned Idx = 0 , End = VFs.size (); Idx < End; ++Idx) {
4911- // Note that elements in this SmallMapVector will be default constructed
4912- // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
4913- // there is no previous entry for ClassID.
4914- SmallMapVector<unsigned , unsigned , 4 > Invariant;
4915-
4916- for (auto *In : LoopInvariants) {
4917- // FIXME: The target might use more than one register for the type
4918- // even in the scalar case.
4919- bool IsScalar = all_of (In->users (), [&](VPUser *U) {
4920- return cast<VPRecipeBase>(U)->usesScalars (In);
4921- });
4922-
4923- ElementCount VF = IsScalar ? ElementCount::getFixed (1 ) : VFs[Idx];
4924- unsigned ClassID = TTI.getRegisterClassForType (
4925- VF.isVector (), TypeInfo.inferScalarType (In));
4926- Invariant[ClassID] += GetRegUsage (TypeInfo.inferScalarType (In), VF);
4927- }
4928-
4929- LLVM_DEBUG ({
4930- dbgs () << " LV(REG): VF = " << VFs[Idx] << ' \n ' ;
4931- dbgs () << " LV(REG): Found max usage: " << MaxUsages[Idx].size ()
4932- << " item\n " ;
4933- for (const auto &pair : MaxUsages[Idx]) {
4934- dbgs () << " LV(REG): RegisterClass: "
4935- << TTI.getRegisterClassName (pair.first ) << " , " << pair.second
4936- << " registers\n " ;
4937- }
4938- dbgs () << " LV(REG): Found invariant usage: " << Invariant.size ()
4939- << " item\n " ;
4940- for (const auto &pair : Invariant) {
4941- dbgs () << " LV(REG): RegisterClass: "
4942- << TTI.getRegisterClassName (pair.first ) << " , " << pair.second
4943- << " registers\n " ;
4944- }
4945- });
4946-
4947- RU.LoopInvariantRegs = Invariant;
4948- RU.MaxLocalUsers = MaxUsages[Idx];
4949- RUs[Idx] = RU;
4950- }
4951-
4952- return RUs;
4953- }
4954-
49554679unsigned
49564680LoopVectorizationCostModel::selectInterleaveCount (VPlan &Plan, ElementCount VF,
49574681 InstructionCost LoopCost) {
@@ -5002,8 +4726,8 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
50024726 return 1 ;
50034727 }
50044728
5005- RegisterUsage R =
5006- ::calculateRegisterUsage (Plan, {VF}, TTI, ValuesToIgnore)[0];
4729+ VPRegisterUsage R =
4730+ calculateRegisterUsageForPlan (Plan, {VF}, TTI, ValuesToIgnore)[0 ];
50074731 // We divide by these constants so assume that we have at least one
50084732 // instruction that uses at least one register.
50094733 for (auto &Pair : R.MaxLocalUsers ) {
@@ -7380,7 +7104,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
73807104 for (auto &P : VPlans) {
73817105 ArrayRef<ElementCount> VFs (P->vectorFactors ().begin (),
73827106 P->vectorFactors ().end ());
7383- auto RUs = :: calculateRegisterUsage (*P, VFs, TTI, CM.ValuesToIgnore );
7107+ auto RUs = calculateRegisterUsageForPlan (*P, VFs, TTI, CM.ValuesToIgnore );
73847108 for (auto [VF, RU] : zip_equal (VFs, RUs)) {
73857109 if (VF.isScalar ())
73867110 continue ;
0 commit comments