@@ -4872,31 +4872,14 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
48724872 }
48734873}
48744874
4875- // / Estimate the register usage for \p Plan and vectorization factors in \p VFs.
4876- // / Returns the register usage for each VF in \p VFs.
4875+ /// Estimate the register usage for \p Plan and vectorization factors in \p VFs
4876+ /// by calculating the highest number of values that are live at a single
4877+ /// location as a rough estimate. Returns the register usage for each VF in \p
4878+ /// VFs.
48774879static SmallVector<LoopVectorizationCostModel::RegisterUsage, 8 >
48784880calculateRegisterUsage (VPlan &Plan, ArrayRef<ElementCount> VFs,
48794881 const TargetTransformInfo &TTI,
48804882 const SmallPtrSetImpl<const Value *> &ValuesToIgnore) {
4881- // This function calculates the register usage by measuring the highest number
4882- // of values that are alive at a single location. Obviously, this is a very
4883- // rough estimation. We scan the loop in a topological order in order and
4884- // assign a number to each recipe. We use RPO to ensure that defs are
4885- // met before their users. We assume that each recipe that has in-loop
4886- // users starts an interval. We record every time that an in-loop value is
4887- // used, so we have a list of the first and last occurrences of each
4888- // recipe. Next, we transpose this data structure into a multi map that
4889- // holds the list of intervals that *end* at a specific location. This multi
4890- // map allows us to perform a linear search. We scan the instructions linearly
4891- // and record each time that a new interval starts, by placing it in a set.
4892- // If we find this value in the multi-map then we remove it from the set.
4893- // The max register usage is the maximum size of the set.
4894- // We also search for instructions that are defined outside the loop, but are
4895- // used inside the loop. We need this number separately from the max-interval
4896- // usage number because when we unroll, loop-invariant values do not take
4897- // more register.
4898- LoopVectorizationCostModel::RegisterUsage RU;
4899-
49004883 // Each 'key' in the map opens a new interval. The values
49014884 // of the map are the index of the 'last seen' usage of the
49024885 // recipe that is the key.
@@ -4914,6 +4897,11 @@ calculateRegisterUsage(VPlan &Plan, ArrayRef<ElementCount> VFs,
49144897 SmallSetVector<VPValue *, 8 > LoopInvariants;
49154898 LoopInvariants.insert (&Plan.getVectorTripCount ());
49164899
4900+ // We scan the loop in topological order and assign a number to
4901+ // each recipe. We use RPO to ensure that defs are met before their users. We
4902+ // assume that each recipe that has in-loop users starts an interval. We
4903+ // record every time that an in-loop value is used, so we have a list of the
4904+ // first and last occurrences of each recipe.
49174905 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT (
49184906 Plan.getVectorLoopRegion ());
49194907 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
@@ -4961,7 +4949,8 @@ calculateRegisterUsage(VPlan &Plan, ArrayRef<ElementCount> VFs,
49614949 using RecipeList = SmallVector<VPRecipeBase *, 2 >;
49624950 SmallDenseMap<unsigned , RecipeList, 16 > TransposeEnds;
49634951
4964- // Transpose the EndPoints to a list of values that end at each index.
4952+ // Next, we transpose the EndPoints into a multi map that holds the list of
4953+ // intervals that *end* at a specific location.
49654954 for (auto &Interval : EndPoint)
49664955 TransposeEnds[Interval.second ].push_back (Interval.first );
49674956
@@ -4982,10 +4971,14 @@ calculateRegisterUsage(VPlan &Plan, ArrayRef<ElementCount> VFs,
49824971 return TTICapture.getRegUsageForType (VectorType::get (Ty, VF));
49834972 };
49844973
4974+ // We scan the instructions linearly and record each time that a new interval
4975+ // starts, by placing it in a set. If we find this value in TransposeEnds then
4976+ // we remove it from the set. The max register usage is the maximum register
4977+ // usage of the recipes of the set.
49854978 for (unsigned int Idx = 0 , Sz = Idx2Recipe.size (); Idx < Sz; ++Idx) {
49864979 VPRecipeBase *R = Idx2Recipe[Idx];
49874980
4988- // Remove all of the recipes that end at this location.
4981+ // Remove all of the recipes that end at this location.
49894982 RecipeList &List = TransposeEnds[Idx];
49904983 for (VPRecipeBase *ToRemove : List)
49914984 OpenIntervals.erase (ToRemove);
@@ -5012,38 +5005,31 @@ calculateRegisterUsage(VPlan &Plan, ArrayRef<ElementCount> VFs,
50125005 // there is no previous entry for ClassID.
50135006 SmallMapVector<unsigned , unsigned , 4 > RegUsage;
50145007
5015- if (VFs[J].isScalar ()) {
5016- for (auto *Inst : OpenIntervals) {
5017- for (VPValue *DefV : Inst->definedValues ()) {
5018- unsigned ClassID = TTI.getRegisterClassForType (
5019- false , TypeInfo.inferScalarType (DefV));
5020- // FIXME: The target might use more than one register for the type
5021- // even in the scalar case.
5022- RegUsage[ClassID] += 1 ;
5023- }
5024- }
5025- } else {
5026- for (auto *R : OpenIntervals) {
5027- if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe>(R))
5028- continue ;
5029- if (isa<VPCanonicalIVPHIRecipe, VPReplicateRecipe, VPDerivedIVRecipe,
5030- VPScalarIVStepsRecipe>(R) ||
5031- (isa<VPInstruction>(R) &&
5032- all_of (cast<VPSingleDefRecipe>(R)->users (), [&](VPUser *U) {
5033- return cast<VPRecipeBase>(U)->usesScalars (
5034- R->getVPSingleValue ());
5035- }))) {
5036- unsigned ClassID = TTI.getRegisterClassForType (
5037- false , TypeInfo.inferScalarType (R->getVPSingleValue ()));
5038- // FIXME: The target might use more than one register for the type
5039- // even in the scalar case.
5040- RegUsage[ClassID] += 1 ;
5041- } else {
5042- for (VPValue *DefV : R->definedValues ()) {
5043- Type *ScalarTy = TypeInfo.inferScalarType (DefV);
5044- unsigned ClassID = TTI.getRegisterClassForType (true , ScalarTy);
5045- RegUsage[ClassID] += GetRegUsage (ScalarTy, VFs[J]);
5046- }
5008+ for (auto *R : OpenIntervals) {
5009+ // Skip recipes that weren't present in the original loop.
5010+ // TODO: Remove after removing the legacy
5011+ // LoopVectorizationCostModel::calculateRegisterUsage
5012+ if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe,
5013+ VPBranchOnMaskRecipe>(R))
5014+ continue ;
5015+
5016+ if (VFs[J].isScalar () ||
5017+ isa<VPCanonicalIVPHIRecipe, VPReplicateRecipe, VPDerivedIVRecipe,
5018+ VPScalarIVStepsRecipe>(R) ||
5019+ (isa<VPInstruction>(R) &&
5020+ all_of (cast<VPSingleDefRecipe>(R)->users (), [&](VPUser *U) {
5021+ return cast<VPRecipeBase>(U)->usesScalars (R->getVPSingleValue ());
5022+ }))) {
5023+ unsigned ClassID = TTI.getRegisterClassForType (
5024+ false , TypeInfo.inferScalarType (R->getVPSingleValue ()));
5025+ // FIXME: The target might use more than one register for the type
5026+ // even in the scalar case.
5027+ RegUsage[ClassID] += 1 ;
5028+ } else {
5029+ for (VPValue *DefV : R->definedValues ()) {
5030+ Type *ScalarTy = TypeInfo.inferScalarType (DefV);
5031+ unsigned ClassID = TTI.getRegisterClassForType (true , ScalarTy);
5032+ RegUsage[ClassID] += GetRegUsage (ScalarTy, VFs[J]);
50475033 }
50485034 }
50495035 }
@@ -5061,6 +5047,11 @@ calculateRegisterUsage(VPlan &Plan, ArrayRef<ElementCount> VFs,
50615047 OpenIntervals.insert (R);
50625048 }
50635049
5050+ // We also search for instructions that are defined outside the loop, but are
5051+ // used inside the loop. We need this number separately from the max-interval
5052+ // usage number because when we unroll, loop-invariant values do not take
5053+ // more registers.
5054+ LoopVectorizationCostModel::RegisterUsage RU;
50645055 for (unsigned Idx = 0 , End = VFs.size (); Idx < End; ++Idx) {
50655056 // Note that elements in this SmallMapVector will be default constructed
50665057 // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
0 commit comments