@@ -1022,11 +1022,6 @@ class LoopVectorizationCostModel {
10221022 SmallMapVector<unsigned , unsigned , 4 > MaxLocalUsers;
10231023 };
10241024
1025- // / \return Returns information about the register usages of the loop for the
1026- // / given vectorization factors.
1027- SmallVector<RegisterUsage, 8 >
1028- calculateRegisterUsage (ArrayRef<ElementCount> VFs);
1029-
10301025 // / Collect values we want to ignore in the cost model.
10311026 void collectValuesToIgnore ();
10321027
@@ -4189,27 +4184,12 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
41894184 ComputeScalableMaxVF);
41904185 MaxVectorElementCountMaxBW = MinVF (MaxVectorElementCountMaxBW, MaxSafeVF);
41914186
4192- // Collect all viable vectorization factors larger than the default MaxVF
4193- // (i.e. MaxVectorElementCount).
4194- SmallVector<ElementCount, 8 > VFs;
4187+ // Set the max VF to the largest doubling of MaxVectorElementCount that is
4188+ // known to be less than or equal to MaxVectorElementCountMaxBW.
41954189 for (ElementCount VS = MaxVectorElementCount * 2 ;
41964190 ElementCount::isKnownLE (VS, MaxVectorElementCountMaxBW); VS *= 2 )
4197- VFs.push_back (VS);
4198-
4199- // For each VF calculate its register usage.
4200- auto RUs = calculateRegisterUsage (VFs);
4191+ MaxVF = VS;
42014192
4202- // Select the largest VF which doesn't require more registers than existing
4203- // ones.
4204- for (int I = RUs.size () - 1 ; I >= 0 ; --I) {
4205- const auto &MLU = RUs[I].MaxLocalUsers ;
4206- if (all_of (MLU, [&](decltype (MLU.front ()) &LU) {
4207- return LU.second <= TTI.getNumberOfRegisters (LU.first );
4208- })) {
4209- MaxVF = VFs[I];
4210- break ;
4211- }
4212- }
42134193 if (ElementCount MinVF =
42144194 TTI.getMinimumVF (SmallestType, ComputeScalableMaxVF)) {
42154195 if (ElementCount::isKnownLT (MaxVF, MinVF)) {
@@ -5406,213 +5386,6 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
54065386 return 1 ;
54075387}
54085388
5409- SmallVector<LoopVectorizationCostModel::RegisterUsage, 8 >
5410- LoopVectorizationCostModel::calculateRegisterUsage (ArrayRef<ElementCount> VFs) {
5411- // This function calculates the register usage by measuring the highest number
5412- // of values that are alive at a single location. Obviously, this is a very
5413- // rough estimation. We scan the loop in topological order and
5414- // assign a number to each instruction. We use RPO to ensure that defs are
5415- // met before their users. We assume that each instruction that has in-loop
5416- // users starts an interval. We record every time that an in-loop value is
5417- // used, so we have a list of the first and last occurrences of each
5418- // instruction. Next, we transpose this data structure into a multi map that
5419- // holds the list of intervals that *end* at a specific location. This multi
5420- // map allows us to perform a linear search. We scan the instructions linearly
5421- // and record each time that a new interval starts, by placing it in a set.
5422- // If we find this value in the multi-map then we remove it from the set.
5423- // The max register usage is the maximum size of the set.
5424- // We also search for instructions that are defined outside the loop, but are
5425- // used inside the loop. We need this number separately from the max-interval
5426- // usage number because when we unroll, loop-invariant values do not take
5427- // more registers.
5428- LoopBlocksDFS DFS (TheLoop);
5429- DFS.perform (LI);
5430-
5431- RegisterUsage RU;
5432-
5433- // Each 'key' in the map opens a new interval. The values
5434- // of the map are the index of the 'last seen' usage of the
5435- // instruction that is the key.
5436- using IntervalMap = SmallDenseMap<Instruction *, unsigned , 16 >;
5437-
5438- // Maps each index to its instruction.
5439- SmallVector<Instruction *, 64 > IdxToInstr;
5440- // Marks the end of each interval.
5441- IntervalMap EndPoint;
5442- // Saves the set of in-loop instructions that are used inside the loop.
5443- SmallPtrSet<Instruction *, 8 > Ends;
5444- // Saves the list of values that are used in the loop but are defined outside
5445- // the loop (not including non-instruction values such as arguments and
5446- // constants).
5447- SmallSetVector<Instruction *, 8 > LoopInvariants;
5448-
5449- for (BasicBlock *BB : make_range (DFS.beginRPO (), DFS.endRPO ())) {
5450- for (Instruction &I : BB->instructionsWithoutDebug ()) {
5451- IdxToInstr.push_back (&I);
5452-
5453- // Save the end location of each USE.
5454- for (Value *U : I.operands ()) {
5455- auto *Instr = dyn_cast<Instruction>(U);
5456-
5457- // Ignore non-instruction values such as arguments, constants, etc.
5458- // FIXME: Might need some motivation why these values are ignored. If
5459- // for example an argument is used inside the loop it will increase the
5460- // register pressure (so shouldn't we add it to LoopInvariants?).
5461- if (!Instr)
5462- continue ;
5463-
5464- // If this instruction is outside the loop then record it and continue.
5465- if (!TheLoop->contains (Instr)) {
5466- LoopInvariants.insert (Instr);
5467- continue ;
5468- }
5469-
5470- // Overwrite previous end points.
5471- EndPoint[Instr] = IdxToInstr.size ();
5472- Ends.insert (Instr);
5473- }
5474- }
5475- }
5476-
5477- // Saves the list of intervals that end with the index in 'key'.
5478- using InstrList = SmallVector<Instruction *, 2 >;
5479- SmallDenseMap<unsigned , InstrList, 16 > TransposeEnds;
5480-
5481- // Transpose the EndPoints to a list of values that end at each index.
5482- for (auto &Interval : EndPoint)
5483- TransposeEnds[Interval.second ].push_back (Interval.first );
5484-
5485- SmallPtrSet<Instruction *, 8 > OpenIntervals;
5486- SmallVector<RegisterUsage, 8 > RUs (VFs.size ());
5487- SmallVector<SmallMapVector<unsigned , unsigned , 4 >, 8 > MaxUsages (VFs.size ());
5488-
5489- LLVM_DEBUG (dbgs () << " LV(REG): Calculating max register usage:\n " );
5490-
5491- const auto &TTICapture = TTI;
5492- auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5493- if (Ty->isTokenTy () || !VectorType::isValidElementType (Ty) ||
5494- (VF.isScalable () &&
5495- !TTICapture.isElementTypeLegalForScalableVector (Ty)))
5496- return 0 ;
5497- return TTICapture.getRegUsageForType (VectorType::get (Ty, VF));
5498- };
5499-
5500- collectInLoopReductions ();
5501-
5502- for (unsigned int Idx = 0 , Sz = IdxToInstr.size (); Idx < Sz; ++Idx) {
5503- Instruction *I = IdxToInstr[Idx];
5504-
5505- // Remove all of the instructions that end at this location.
5506- InstrList &List = TransposeEnds[Idx];
5507- for (Instruction *ToRemove : List)
5508- OpenIntervals.erase (ToRemove);
5509-
5510- // Ignore instructions that are never used within the loop and do not have
5511- // side-effects.
5512- if (!Ends.count (I) && !I->mayHaveSideEffects ())
5513- continue ;
5514-
5515- // Skip ignored values.
5516- if (ValuesToIgnore.count (I))
5517- continue ;
5518-
5519- // For each VF find the maximum usage of registers.
5520- for (unsigned J = 0 , E = VFs.size (); J < E; ++J) {
5521- // Count the number of registers used, per register class, given all open
5522- // intervals.
5523- // Note that elements in this SmallMapVector will be default constructed
5524- // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5525- // there is no previous entry for ClassID.
5526- SmallMapVector<unsigned , unsigned , 4 > RegUsage;
5527-
5528- if (VFs[J].isScalar ()) {
5529- for (auto *Inst : OpenIntervals) {
5530- unsigned ClassID =
5531- TTI.getRegisterClassForType (false , Inst->getType ());
5532- // FIXME: The target might use more than one register for the type
5533- // even in the scalar case.
5534- RegUsage[ClassID] += 1 ;
5535- }
5536- } else {
5537- collectNonVectorizedAndSetWideningDecisions (VFs[J]);
5538- for (auto *Inst : OpenIntervals) {
5539- // Skip ignored values for VF > 1.
5540- if (VecValuesToIgnore.count (Inst))
5541- continue ;
5542- if (isScalarAfterVectorization (Inst, VFs[J])) {
5543- unsigned ClassID =
5544- TTI.getRegisterClassForType (false , Inst->getType ());
5545- // FIXME: The target might use more than one register for the type
5546- // even in the scalar case.
5547- RegUsage[ClassID] += 1 ;
5548- } else {
5549- unsigned ClassID =
5550- TTI.getRegisterClassForType (true , Inst->getType ());
5551- RegUsage[ClassID] += GetRegUsage (Inst->getType (), VFs[J]);
5552- }
5553- }
5554- }
5555-
5556- for (const auto &Pair : RegUsage) {
5557- auto &Entry = MaxUsages[J][Pair.first ];
5558- Entry = std::max (Entry, Pair.second );
5559- }
5560- }
5561-
5562- LLVM_DEBUG (dbgs () << " LV(REG): At #" << Idx << " Interval # "
5563- << OpenIntervals.size () << ' \n ' );
5564-
5565- // Add the current instruction to the list of open intervals.
5566- OpenIntervals.insert (I);
5567- }
5568-
5569- for (unsigned Idx = 0 , End = VFs.size (); Idx < End; ++Idx) {
5570- // Note that elements in this SmallMapVector will be default constructed
5571- // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5572- // there is no previous entry for ClassID.
5573- SmallMapVector<unsigned , unsigned , 4 > Invariant;
5574-
5575- for (auto *Inst : LoopInvariants) {
5576- // FIXME: The target might use more than one register for the type
5577- // even in the scalar case.
5578- bool IsScalar = all_of (Inst->users (), [&](User *U) {
5579- auto *I = cast<Instruction>(U);
5580- return TheLoop != LI->getLoopFor (I->getParent ()) ||
5581- isScalarAfterVectorization (I, VFs[Idx]);
5582- });
5583-
5584- ElementCount VF = IsScalar ? ElementCount::getFixed (1 ) : VFs[Idx];
5585- unsigned ClassID =
5586- TTI.getRegisterClassForType (VF.isVector (), Inst->getType ());
5587- Invariant[ClassID] += GetRegUsage (Inst->getType (), VF);
5588- }
5589-
5590- LLVM_DEBUG ({
5591- dbgs () << " LV(REG): VF = " << VFs[Idx] << ' \n ' ;
5592- dbgs () << " LV(REG): Found max usage: " << MaxUsages[Idx].size ()
5593- << " item\n " ;
5594- for (const auto &pair : MaxUsages[Idx]) {
5595- dbgs () << " LV(REG): RegisterClass: "
5596- << TTI.getRegisterClassName (pair.first ) << " , " << pair.second
5597- << " registers\n " ;
5598- }
5599- dbgs () << " LV(REG): Found invariant usage: " << Invariant.size ()
5600- << " item\n " ;
5601- for (const auto &pair : Invariant) {
5602- dbgs () << " LV(REG): RegisterClass: "
5603- << TTI.getRegisterClassName (pair.first ) << " , " << pair.second
5604- << " registers\n " ;
5605- }
5606- });
5607-
5608- RU.LoopInvariantRegs = Invariant;
5609- RU.MaxLocalUsers = MaxUsages[Idx];
5610- RUs[Idx] = RU;
5611- }
5612-
5613- return RUs;
5614- }
5615-
56165389bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack (Instruction *I,
56175390 ElementCount VF) {
56185391 // TODO: Cost model for emulated masked load/store is completely
@@ -7780,7 +7553,10 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
77807553 }
77817554
77827555 for (auto &P : VPlans) {
7783- for (ElementCount VF : P->vectorFactors ()) {
7556+ SmallVector<ElementCount, 1 > VFs (P->vectorFactors ());
7557+ auto RUs = ::calculateRegisterUsage (*P, VFs, TTI, CM.ValuesToIgnore );
7558+ for (unsigned I = 0 ; I < VFs.size (); I++) {
7559+ auto VF = VFs[I];
77847560 if (VF.isScalar ())
77857561 continue ;
77867562 if (!ForceVectorization && !willGenerateVectors (*P, VF, TTI)) {
@@ -7801,12 +7577,23 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
78017577
78027578 InstructionCost Cost = cost (*P, VF);
78037579 VectorizationFactor CurrentFactor (VF, Cost, ScalarCost);
7804- if (isMoreProfitable (CurrentFactor, BestFactor, P->hasScalarTail ()))
7805- BestFactor = CurrentFactor;
7806-
78077580 // If profitable add it to ProfitableVF list.
78087581 if (isMoreProfitable (CurrentFactor, ScalarFactor, P->hasScalarTail ()))
78097582 ProfitableVFs.push_back (CurrentFactor);
7583+
7584+ // Make sure that the VF doesn't use more registers than are available in
7585+ // any register class; such a VF may not become the best factor.
7586+ const auto &MLU = RUs[I].MaxLocalUsers ;
7587+ if (any_of (MLU, [&](decltype (MLU.front ()) &LU) {
7588+ return LU.second > TTI.getNumberOfRegisters (LU.first );
7589+ })) {
7590+ LLVM_DEBUG (dbgs () << " LV(REG): Ignoring VF " << VF
7591+ << " as it uses too many registers\n " );
7592+ continue ;
7593+ }
7594+
7595+ if (isMoreProfitable (CurrentFactor, BestFactor, P->hasScalarTail ()))
7596+ BestFactor = CurrentFactor;
78107597 }
78117598 }
78127599
@@ -7818,6 +7605,30 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
78187605 VectorizationFactor LegacyVF = selectVectorizationFactor ();
78197606 VPlan &BestPlan = getPlanFor (BestFactor.Width );
78207607
7608+ // VPlan calculates register pressure from the plan, so it can come to
7609+ // different conclusions than the legacy cost model.
7610+ bool RegUsageDeterminedVF = false ;
7611+ if (BestFactor.Width != LegacyVF.Width ) {
7612+ SmallVector<ElementCount, 1 > LegacyVFs = {LegacyVF.Width };
7613+ SmallVector<ElementCount, 1 > VFs = {BestFactor.Width };
7614+
7615+ auto LegacyRUs =
7616+ ::calculateRegisterUsage (getPlanFor(LegacyVF.Width), LegacyVFs, TTI, CM.ValuesToIgnore);
7617+ auto RUs = ::calculateRegisterUsage (BestPlan, VFs, TTI, CM.ValuesToIgnore );
7618+
7619+ auto GetMaxUsage = [](
7620+ SmallMapVector<unsigned , unsigned , 4 > MaxLocalUsers) {
7621+ unsigned Max = 0 ;
7622+ for (auto Pair : MaxLocalUsers)
7623+ if (Pair.second > Max)
7624+ Max = Pair.second ;
7625+ return Max;
7626+ };
7627+ unsigned MaxLegacyRegUsage = GetMaxUsage (LegacyRUs[0 ].MaxLocalUsers );
7628+ unsigned MaxRegUsage = GetMaxUsage (RUs[0 ].MaxLocalUsers );
7629+ RegUsageDeterminedVF = MaxRegUsage <= MaxLegacyRegUsage;
7630+ }
7631+
78217632 // Pre-compute the cost and use it to check if BestPlan contains any
78227633 // simplifications not accounted for in the legacy cost model. If that's the
78237634 // case, don't trigger the assertion, as the extra simplifications may cause a
@@ -7829,6 +7640,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
78297640 // with early exits and plans with additional VPlan simplifications. The
78307641 // legacy cost model doesn't properly model costs for such loops.
78317642 assert ((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit () ||
7643+ RegUsageDeterminedVF ||
78327644 planContainsAdditionalSimplifications (getPlanFor (BestFactor.Width ),
78337645 CostCtx, OrigLoop) ||
78347646 planContainsAdditionalSimplifications (getPlanFor (LegacyVF.Width ),
0 commit comments