diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index e9ace195684b3..2118b4c8a645f 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -987,25 +987,6 @@ class LoopVectorizationCostModel { /// decision in a map for use in planning and plan execution. void setVectorizedCallDecision(ElementCount VF); - /// A struct that represents some properties of the register usage - /// of a loop. - struct RegisterUsage { - /// Holds the number of loop invariant values that are used in the loop. - /// The key is ClassID of target-provided register class. - SmallMapVector LoopInvariantRegs; - /// Holds the maximum number of concurrent live intervals in the loop. - /// The key is ClassID of target-provided register class. - SmallMapVector MaxLocalUsers; - - /// Check if any of the tracked live intervals exceeds the number of - /// available registers for the target. - bool exceedsMaxNumRegs(const TargetTransformInfo &TTI) const { - return any_of(MaxLocalUsers, [&TTI](auto &LU) { - return LU.second > TTI.getNumberOfRegisters(LU.first); - }); - } - }; - /// Collect values we want to ignore in the cost model. void collectValuesToIgnore(); @@ -4343,15 +4324,6 @@ static bool hasReplicatorRegion(VPlan &Plan) { } #ifndef NDEBUG -/// Estimate the register usage for \p Plan and vectorization factors in \p VFs -/// by calculating the highest number of values that are live at a single -/// location as a rough estimate. Returns the register usage for each VF in \p -/// VFs. 
-static SmallVector -calculateRegisterUsage(VPlan &Plan, ArrayRef VFs, - const TargetTransformInfo &TTI, - const SmallPtrSetImpl &ValuesToIgnore); - VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1)); LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); @@ -4377,7 +4349,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { for (auto &P : VPlans) { ArrayRef VFs(P->vectorFactors().begin(), P->vectorFactors().end()); - auto RUs = ::calculateRegisterUsage(*P, VFs, TTI, CM.ValuesToIgnore); + auto RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore); for (auto [VF, RU] : zip_equal(VFs, RUs)) { // The cost for scalar VF=1 is already calculated, so ignore it. if (VF.isScalar()) @@ -4704,254 +4676,6 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() { } } -/// Get the VF scaling factor applied to the recipe's output, if the recipe has -/// one. -static unsigned getVFScaleFactor(VPRecipeBase *R) { - if (auto *RR = dyn_cast(R)) - return RR->getVFScaleFactor(); - if (auto *RR = dyn_cast(R)) - return RR->getVFScaleFactor(); - return 1; -} - -/// Estimate the register usage for \p Plan and vectorization factors in \p VFs -/// by calculating the highest number of values that are live at a single -/// location as a rough estimate. Returns the register usage for each VF in \p -/// VFs. -static SmallVector -calculateRegisterUsage(VPlan &Plan, ArrayRef VFs, - const TargetTransformInfo &TTI, - const SmallPtrSetImpl &ValuesToIgnore) { - // Each 'key' in the map opens a new interval. The values - // of the map are the index of the 'last seen' usage of the - // recipe that is the key. - using IntervalMap = SmallDenseMap; - - // Maps indices to recipes. - SmallVector Idx2Recipe; - // Marks the end of each interval. - IntervalMap EndPoint; - // Saves the list of recipe indices that are used in the loop. 
- SmallPtrSet Ends; - // Saves the list of values that are used in the loop but are defined outside - // the loop (not including non-recipe values such as arguments and - // constants). - SmallSetVector LoopInvariants; - LoopInvariants.insert(&Plan.getVectorTripCount()); - - // We scan the loop in a topological order in order and assign a number to - // each recipe. We use RPO to ensure that defs are met before their users. We - // assume that each recipe that has in-loop users starts an interval. We - // record every time that an in-loop value is used, so we have a list of the - // first and last occurrences of each recipe. - ReversePostOrderTraversal> RPOT( - Plan.getVectorLoopRegion()); - for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(RPOT)) { - if (!VPBB->getParent()) - break; - for (VPRecipeBase &R : *VPBB) { - Idx2Recipe.push_back(&R); - - // Save the end location of each USE. - for (VPValue *U : R.operands()) { - auto *DefR = U->getDefiningRecipe(); - - // Ignore non-recipe values such as arguments, constants, etc. - // FIXME: Might need some motivation why these values are ignored. If - // for example an argument is used inside the loop it will increase the - // register pressure (so shouldn't we add it to LoopInvariants). - if (!DefR && (!U->getLiveInIRValue() || - !isa(U->getLiveInIRValue()))) - continue; - - // If this recipe is outside the loop then record it and continue. - if (!DefR) { - LoopInvariants.insert(U); - continue; - } - - // Overwrite previous end points. - EndPoint[DefR] = Idx2Recipe.size(); - Ends.insert(DefR); - } - } - if (VPBB == Plan.getVectorLoopRegion()->getExiting()) { - // VPWidenIntOrFpInductionRecipes are used implicitly at the end of the - // exiting block, where their increment will get materialized eventually. 
- for (auto &R : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { - if (isa(&R)) { - EndPoint[&R] = Idx2Recipe.size(); - Ends.insert(&R); - } - } - } - } - - // Saves the list of intervals that end with the index in 'key'. - using RecipeList = SmallVector; - SmallDenseMap TransposeEnds; - - // Next, we transpose the EndPoints into a multi map that holds the list of - // intervals that *end* at a specific location. - for (auto &Interval : EndPoint) - TransposeEnds[Interval.second].push_back(Interval.first); - - SmallPtrSet OpenIntervals; - SmallVector RUs(VFs.size()); - SmallVector, 8> MaxUsages(VFs.size()); - - LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); - - VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType()); - - const auto &TTICapture = TTI; - auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { - if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty) || - (VF.isScalable() && - !TTICapture.isElementTypeLegalForScalableVector(Ty))) - return 0; - return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); - }; - - // We scan the instructions linearly and record each time that a new interval - // starts, by placing it in a set. If we find this value in TransposEnds then - // we remove it from the set. The max register usage is the maximum register - // usage of the recipes of the set. - for (unsigned int Idx = 0, Sz = Idx2Recipe.size(); Idx < Sz; ++Idx) { - VPRecipeBase *R = Idx2Recipe[Idx]; - - // Remove all of the recipes that end at this location. - RecipeList &List = TransposeEnds[Idx]; - for (VPRecipeBase *ToRemove : List) - OpenIntervals.erase(ToRemove); - - // Ignore recipes that are never used within the loop and do not have side - // effects. - if (!Ends.count(R) && !R->mayHaveSideEffects()) - continue; - - // Skip recipes for ignored values. - // TODO: Should mark recipes for ephemeral values that cannot be removed - // explictly in VPlan. 
- if (isa(R) && - ValuesToIgnore.contains( - cast(R)->getUnderlyingValue())) - continue; - - // For each VF find the maximum usage of registers. - for (unsigned J = 0, E = VFs.size(); J < E; ++J) { - // Count the number of registers used, per register class, given all open - // intervals. - // Note that elements in this SmallMapVector will be default constructed - // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if - // there is no previous entry for ClassID. - SmallMapVector RegUsage; - - for (auto *R : OpenIntervals) { - // Skip recipes that weren't present in the original loop. - // TODO: Remove after removing the legacy - // LoopVectorizationCostModel::calculateRegisterUsage - if (isa(R)) - continue; - - if (VFs[J].isScalar() || - isa(R) || - (isa(R) && - all_of(cast(R)->users(), - [&](VPUser *U) { - return cast(U)->usesScalars( - R->getVPSingleValue()); - })) || - (isa(R) && - (cast(R))->isInLoop())) { - unsigned ClassID = TTI.getRegisterClassForType( - false, TypeInfo.inferScalarType(R->getVPSingleValue())); - // FIXME: The target might use more than one register for the type - // even in the scalar case. - RegUsage[ClassID] += 1; - } else { - // The output from scaled phis and scaled reductions actually has - // fewer lanes than the VF. 
- unsigned ScaleFactor = getVFScaleFactor(R); - ElementCount VF = VFs[J].divideCoefficientBy(ScaleFactor); - LLVM_DEBUG(if (VF != VFs[J]) { - dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF - << " for " << *R << "\n"; - }); - - for (VPValue *DefV : R->definedValues()) { - Type *ScalarTy = TypeInfo.inferScalarType(DefV); - unsigned ClassID = TTI.getRegisterClassForType(true, ScalarTy); - RegUsage[ClassID] += GetRegUsage(ScalarTy, VF); - } - } - } - - for (const auto &Pair : RegUsage) { - auto &Entry = MaxUsages[J][Pair.first]; - Entry = std::max(Entry, Pair.second); - } - } - - LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # " - << OpenIntervals.size() << '\n'); - - // Add the current recipe to the list of open intervals. - OpenIntervals.insert(R); - } - - // We also search for instructions that are defined outside the loop, but are - // used inside the loop. We need this number separately from the max-interval - // usage number because when we unroll, loop-invariant values do not take - // more register. - LoopVectorizationCostModel::RegisterUsage RU; - for (unsigned Idx = 0, End = VFs.size(); Idx < End; ++Idx) { - // Note that elements in this SmallMapVector will be default constructed - // as 0. So we can use "Invariant[ClassID] += n" in the code below even if - // there is no previous entry for ClassID. - SmallMapVector Invariant; - - for (auto *In : LoopInvariants) { - // FIXME: The target might use more than one register for the type - // even in the scalar case. - bool IsScalar = all_of(In->users(), [&](VPUser *U) { - return cast(U)->usesScalars(In); - }); - - ElementCount VF = IsScalar ? 
ElementCount::getFixed(1) : VFs[Idx]; - unsigned ClassID = TTI.getRegisterClassForType( - VF.isVector(), TypeInfo.inferScalarType(In)); - Invariant[ClassID] += GetRegUsage(TypeInfo.inferScalarType(In), VF); - } - - LLVM_DEBUG({ - dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n'; - dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size() - << " item\n"; - for (const auto &pair : MaxUsages[Idx]) { - dbgs() << "LV(REG): RegisterClass: " - << TTI.getRegisterClassName(pair.first) << ", " << pair.second - << " registers\n"; - } - dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() - << " item\n"; - for (const auto &pair : Invariant) { - dbgs() << "LV(REG): RegisterClass: " - << TTI.getRegisterClassName(pair.first) << ", " << pair.second - << " registers\n"; - } - }); - - RU.LoopInvariantRegs = Invariant; - RU.MaxLocalUsers = MaxUsages[Idx]; - RUs[Idx] = RU; - } - - return RUs; -} - unsigned LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF, InstructionCost LoopCost) { @@ -5002,8 +4726,8 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF, return 1; } - RegisterUsage R = - ::calculateRegisterUsage(Plan, {VF}, TTI, ValuesToIgnore)[0]; + VPRegisterUsage R = + calculateRegisterUsageForPlan(Plan, {VF}, TTI, ValuesToIgnore)[0]; // We divide by these constants so assume that we have at least one // instruction that uses at least one register. 
for (auto &Pair : R.MaxLocalUsers) { @@ -7380,7 +7104,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { for (auto &P : VPlans) { ArrayRef VFs(P->vectorFactors().begin(), P->vectorFactors().end()); - auto RUs = ::calculateRegisterUsage(*P, VFs, TTI, CM.ValuesToIgnore); + auto RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore); for (auto [VF, RU] : zip_equal(VFs, RUs)) { if (VF.isScalar()) continue; diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 926490bfad7d0..e028497249f2e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -10,8 +10,10 @@ #include "VPlan.h" #include "VPlanCFG.h" #include "VPlanDominatorTree.h" +#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/GenericDomTreeConstruction.h" @@ -384,3 +386,252 @@ bool VPDominatorTree::properlyDominates(const VPRecipeBase *A, #endif return Base::properlyDominates(ParentA, ParentB); } + +/// Get the VF scaling factor applied to the recipe's output, if the recipe has +/// one. +static unsigned getVFScaleFactor(VPRecipeBase *R) { + if (auto *RR = dyn_cast(R)) + return RR->getVFScaleFactor(); + if (auto *RR = dyn_cast(R)) + return RR->getVFScaleFactor(); + return 1; +} + +bool VPRegisterUsage::exceedsMaxNumRegs(const TargetTransformInfo &TTI) const { + return any_of(MaxLocalUsers, [&TTI](auto &LU) { + return LU.second > TTI.getNumberOfRegisters(LU.first); + }); +} + +SmallVector llvm::calculateRegisterUsageForPlan( + VPlan &Plan, ArrayRef VFs, const TargetTransformInfo &TTI, + const SmallPtrSetImpl &ValuesToIgnore) { + // Each 'key' in the map opens a new interval. 
The values + // of the map are the index of the 'last seen' usage of the + // recipe that is the key. + using IntervalMap = SmallDenseMap; + + // Maps indices to recipes. + SmallVector Idx2Recipe; + // Marks the end of each interval. + IntervalMap EndPoint; + // Saves the list of recipe indices that are used in the loop. + SmallPtrSet Ends; + // Saves the list of values that are used in the loop but are defined outside + // the loop (not including non-recipe values such as arguments and + // constants). + SmallSetVector LoopInvariants; + LoopInvariants.insert(&Plan.getVectorTripCount()); + + // We scan the loop in topological order and assign a number to + // each recipe. We use RPO to ensure that defs are met before their users. We + // assume that each recipe that has in-loop users starts an interval. We + // record every time that an in-loop value is used, so we have a list of the + // first and last occurrences of each recipe. + ReversePostOrderTraversal> RPOT( + Plan.getVectorLoopRegion()); + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(RPOT)) { + if (!VPBB->getParent()) + break; + for (VPRecipeBase &R : *VPBB) { + Idx2Recipe.push_back(&R); + + // Save the end location of each USE. + for (VPValue *U : R.operands()) { + auto *DefR = U->getDefiningRecipe(); + + // Ignore non-recipe values such as arguments, constants, etc. + // FIXME: Might need some motivation why these values are ignored. If + // for example an argument is used inside the loop it will increase the + // register pressure (so shouldn't we add it to LoopInvariants). + if (!DefR && (!U->getLiveInIRValue() || + !isa(U->getLiveInIRValue()))) + continue; + + // If this recipe is outside the loop then record it and continue. + if (!DefR) { + LoopInvariants.insert(U); + continue; + } + + // Overwrite previous end points. 
+ EndPoint[DefR] = Idx2Recipe.size(); + Ends.insert(DefR); + } + } + if (VPBB == Plan.getVectorLoopRegion()->getExiting()) { + // VPWidenIntOrFpInductionRecipes are used implicitly at the end of the + // exiting block, where their increment will get materialized eventually. + for (auto &R : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { + if (isa(&R)) { + EndPoint[&R] = Idx2Recipe.size(); + Ends.insert(&R); + } + } + } + } + + // Saves the list of intervals that end with the index in 'key'. + using RecipeList = SmallVector; + SmallDenseMap TransposeEnds; + + // Next, we transpose the EndPoints into a multi map that holds the list of + // intervals that *end* at a specific location. + for (auto &Interval : EndPoint) + TransposeEnds[Interval.second].push_back(Interval.first); + + SmallPtrSet OpenIntervals; + SmallVector RUs(VFs.size()); + SmallVector, 8> MaxUsages(VFs.size()); + + LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n"); + + VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType()); + + const auto &TTICapture = TTI; + auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned { + if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty) || + (VF.isScalable() && + !TTICapture.isElementTypeLegalForScalableVector(Ty))) + return 0; + return TTICapture.getRegUsageForType(VectorType::get(Ty, VF)); + }; + + // We scan the instructions linearly and record each time that a new interval + // starts, by placing it in a set. If we find this value in TransposeEnds then + // we remove it from the set. The max register usage is the maximum register + // usage of the recipes of the set. + for (unsigned int Idx = 0, Sz = Idx2Recipe.size(); Idx < Sz; ++Idx) { + VPRecipeBase *R = Idx2Recipe[Idx]; + + // Remove all of the recipes that end at this location. 
+ RecipeList &List = TransposeEnds[Idx]; + for (VPRecipeBase *ToRemove : List) + OpenIntervals.erase(ToRemove); + + // Ignore recipes that are never used within the loop and do not have side + // effects. + if (!Ends.count(R) && !R->mayHaveSideEffects()) + continue; + + // Skip recipes for ignored values. + // TODO: Should mark recipes for ephemeral values that cannot be removed + // explicitly in VPlan. + if (isa(R) && + ValuesToIgnore.contains( + cast(R)->getUnderlyingValue())) + continue; + + // For each VF find the maximum usage of registers. + for (unsigned J = 0, E = VFs.size(); J < E; ++J) { + // Count the number of registers used, per register class, given all open + // intervals. + // Note that elements in this SmallMapVector will be default constructed + // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if + // there is no previous entry for ClassID. + SmallMapVector RegUsage; + + for (auto *R : OpenIntervals) { + // Skip recipes that weren't present in the original loop. + // TODO: Remove after removing the legacy + // LoopVectorizationCostModel::calculateRegisterUsage + if (isa(R)) + continue; + + if (VFs[J].isScalar() || + isa(R) || + (isa(R) && + all_of(cast(R)->users(), + [&](VPUser *U) { + return cast(U)->usesScalars( + R->getVPSingleValue()); + })) || + (isa(R) && + (cast(R))->isInLoop())) { + unsigned ClassID = TTI.getRegisterClassForType( + false, TypeInfo.inferScalarType(R->getVPSingleValue())); + // FIXME: The target might use more than one register for the type + // even in the scalar case. + RegUsage[ClassID] += 1; + } else { + // The output from scaled phis and scaled reductions actually has + // fewer lanes than the VF. 
+ unsigned ScaleFactor = getVFScaleFactor(R); + ElementCount VF = VFs[J].divideCoefficientBy(ScaleFactor); + LLVM_DEBUG(if (VF != VFs[J]) { + dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF + << " for " << *R << "\n"; + }); + + for (VPValue *DefV : R->definedValues()) { + Type *ScalarTy = TypeInfo.inferScalarType(DefV); + unsigned ClassID = TTI.getRegisterClassForType(true, ScalarTy); + RegUsage[ClassID] += GetRegUsage(ScalarTy, VF); + } + } + } + + for (const auto &Pair : RegUsage) { + auto &Entry = MaxUsages[J][Pair.first]; + Entry = std::max(Entry, Pair.second); + } + } + + LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # " + << OpenIntervals.size() << '\n'); + + // Add the current recipe to the list of open intervals. + OpenIntervals.insert(R); + } + + // We also search for instructions that are defined outside the loop, but are + // used inside the loop. We need this number separately from the max-interval + // usage number because when we unroll, loop-invariant values do not take + // more registers. + VPRegisterUsage RU; + for (unsigned Idx = 0, End = VFs.size(); Idx < End; ++Idx) { + // Note that elements in this SmallMapVector will be default constructed + // as 0. So we can use "Invariant[ClassID] += n" in the code below even if + // there is no previous entry for ClassID. + SmallMapVector Invariant; + + for (auto *In : LoopInvariants) { + // FIXME: The target might use more than one register for the type + // even in the scalar case. + bool IsScalar = all_of(In->users(), [&](VPUser *U) { + return cast(U)->usesScalars(In); + }); + + ElementCount VF = IsScalar ? 
ElementCount::getFixed(1) : VFs[Idx]; + unsigned ClassID = TTI.getRegisterClassForType( + VF.isVector(), TypeInfo.inferScalarType(In)); + Invariant[ClassID] += GetRegUsage(TypeInfo.inferScalarType(In), VF); + } + + LLVM_DEBUG({ + dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n'; + dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size() + << " item\n"; + for (const auto &pair : MaxUsages[Idx]) { + dbgs() << "LV(REG): RegisterClass: " + << TTI.getRegisterClassName(pair.first) << ", " << pair.second + << " registers\n"; + } + dbgs() << "LV(REG): Found invariant usage: " << Invariant.size() + << " item\n"; + for (const auto &pair : Invariant) { + dbgs() << "LV(REG): RegisterClass: " + << TTI.getRegisterClassName(pair.first) << ", " << pair.second + << " registers\n"; + } + }); + + RU.LoopInvariantRegs = Invariant; + RU.MaxLocalUsers = MaxUsages[Idx]; + RUs[Idx] = RU; + } + + return RUs; +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h index 941e13959c23b..7bcf9dba8c311 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h @@ -11,6 +11,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/MapVector.h" #include "llvm/IR/Type.h" namespace llvm { @@ -27,6 +28,8 @@ struct VPWidenSelectRecipe; class VPReplicateRecipe; class VPRecipeBase; class VPlan; +class Value; +class TargetTransformInfo; class Type; /// An analysis for type-inference for VPValues. @@ -70,6 +73,30 @@ class VPTypeAnalysis { // Collect a VPlan's ephemeral recipes (those used only by an assume). void collectEphemeralRecipesForVPlan(VPlan &Plan, DenseSet &EphRecipes); + +/// A struct that represents some properties of the register usage +/// of a loop. +struct VPRegisterUsage { + /// Holds the number of loop invariant values that are used in the loop. + /// The key is ClassID of target-provided register class. 
+ SmallMapVector LoopInvariantRegs; + /// Holds the maximum number of concurrent live intervals in the loop. + /// The key is ClassID of target-provided register class. + SmallMapVector MaxLocalUsers; + + /// Check if any of the tracked live intervals exceeds the number of + /// available registers for the target. + bool exceedsMaxNumRegs(const TargetTransformInfo &TTI) const; +}; + +/// Estimate the register usage for \p Plan and vectorization factors in \p VFs +/// by calculating the highest number of values that are live at a single +/// location as a rough estimate. Returns the register usage for each VF in \p +/// VFs. +SmallVector calculateRegisterUsageForPlan( + VPlan &Plan, ArrayRef VFs, const TargetTransformInfo &TTI, + const SmallPtrSetImpl &ValuesToIgnore); + } // end namespace llvm #endif // LLVM_TRANSFORMS_VECTORIZE_VPLANANALYSIS_H diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll index 0ec90b75002cd..8c0fc6104e9aa 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll @@ -1,4 +1,4 @@ -; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -disable-output -prefer-predicate-over-epilogue=scalar-epilogue 2>&1 < %s | FileCheck %s +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize,vplan -disable-output -prefer-predicate-over-epilogue=scalar-epilogue 2>&1 < %s | FileCheck %s ; REQUIRES: asserts target triple = "aarch64" diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll index c5b2be33cae85..e51a925040a49 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll @@ -1,6 +1,6 @@ ; REQUIRES: asserts -; RUN: opt -mtriple arm64-linux -passes=loop-vectorize -mattr=+sve -debug-only=loop-vectorize -disable-output <%s 2>&1 | FileCheck %s +; 
RUN: opt -mtriple arm64-linux -passes=loop-vectorize -mattr=+sve -debug-only=loop-vectorize,vplan -disable-output <%s 2>&1 | FileCheck %s ; Invariant register usage calculation should take into account if the ; invariant would be used in widened instructions. Only in such cases, a vector diff --git a/llvm/test/Transforms/LoopVectorize/LoongArch/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/LoongArch/reg-usage.ll index 5baf1e013a50f..de49337c185ac 100644 --- a/llvm/test/Transforms/LoopVectorize/LoongArch/reg-usage.ll +++ b/llvm/test/Transforms/LoopVectorize/LoongArch/reg-usage.ll @@ -1,9 +1,9 @@ ; REQUIRES: asserts ; RUN: opt --passes=loop-vectorize --mtriple loongarch64-linux-gnu \ -; RUN: --mattr=+lsx -debug-only=loop-vectorize --force-vector-width=1 \ +; RUN: --mattr=+lsx -debug-only=loop-vectorize,vplan --force-vector-width=1 \ ; RUN: -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-SCALAR ; RUN: opt --passes=loop-vectorize --mtriple loongarch64-linux-gnu \ -; RUN: --mattr=+lsx -debug-only=loop-vectorize --force-vector-width=4 \ +; RUN: --mattr=+lsx -debug-only=loop-vectorize,vplan --force-vector-width=4 \ ; RUN: -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-VECTOR define void @bar(ptr %A, i32 signext %n) { diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll index 280b3af04a4db..2c2a60ecc47b9 100644 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll @@ -1,5 +1,5 @@ -; RUN: opt < %s -debug-only=loop-vectorize -passes='function(loop-vectorize),default' -vectorizer-maximize-bandwidth -mtriple=powerpc64-unknown-linux -S -mcpu=pwr8 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-PWR8 -; RUN: opt < %s -debug-only=loop-vectorize -passes='function(loop-vectorize),default' -vectorizer-maximize-bandwidth -mtriple=powerpc64le-unknown-linux -S -mcpu=pwr9 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-PWR9 +; RUN: opt < 
%s -debug-only=loop-vectorize,vplan -passes='function(loop-vectorize),default' -vectorizer-maximize-bandwidth -mtriple=powerpc64-unknown-linux -S -mcpu=pwr8 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-PWR8 +; RUN: opt < %s -debug-only=loop-vectorize,vplan -passes='function(loop-vectorize),default' -vectorizer-maximize-bandwidth -mtriple=powerpc64le-unknown-linux -S -mcpu=pwr9 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-PWR9 ; REQUIRES: asserts @a = global [1024 x i8] zeroinitializer, align 16 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll index 8de1beea8e57e..cb071f989dafa 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: opt -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfbfmin -debug-only=loop-vectorize --disable-output -riscv-v-register-bit-width-lmul=1 -S < %s 2>&1 | FileCheck %s +; RUN: opt -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfbfmin -debug-only=loop-vectorize,vplan --disable-output -riscv-v-register-bit-width-lmul=1 -S < %s 2>&1 | FileCheck %s define void @add(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i32 signext %size, ptr noalias nocapture writeonly %result) { ; CHECK-LABEL: add diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll index 2005e82e9f27a..15facfc48137b 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll @@ -1,6 +1,6 @@ ; REQUIRES: asserts -; RUN: opt -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfh -debug-only=loop-vectorize --disable-output -riscv-v-register-bit-width-lmul=1 -S < %s 2>&1 | FileCheck %s --check-prefix=ZVFH -; RUN: opt -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfhmin 
-debug-only=loop-vectorize --disable-output -riscv-v-register-bit-width-lmul=1 -S < %s 2>&1 | FileCheck %s --check-prefix=ZVFHMIN +; RUN: opt -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfh -debug-only=loop-vectorize,vplan --disable-output -riscv-v-register-bit-width-lmul=1 -S < %s 2>&1 | FileCheck %s --check-prefix=ZVFH +; RUN: opt -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfhmin -debug-only=loop-vectorize,vplan --disable-output -riscv-v-register-bit-width-lmul=1 -S < %s 2>&1 | FileCheck %s --check-prefix=ZVFHMIN define void @add(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i32 signext %size, ptr noalias nocapture writeonly %result) { ; CHECK-LABEL: add diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll index 15665fbd9e315..870f52876c5a9 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll @@ -1,22 +1,22 @@ ; REQUIRES: asserts ; RUN: opt -passes=loop-vectorize -mtriple riscv64-linux-gnu \ -; RUN: -mattr=+v,+d -debug-only=loop-vectorize --disable-output \ +; RUN: -mattr=+v,+d -debug-only=loop-vectorize,vplan --disable-output \ ; RUN: -riscv-v-vector-bits-min=128 -force-vector-width=1 \ ; RUN: -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-SCALAR ; RUN: opt -passes=loop-vectorize -mtriple riscv64-linux-gnu \ -; RUN: -mattr=+v,+d -debug-only=loop-vectorize --disable-output \ +; RUN: -mattr=+v,+d -debug-only=loop-vectorize,vplan --disable-output \ ; RUN: -riscv-v-vector-bits-min=128 -riscv-v-register-bit-width-lmul=1 \ ; RUN: -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-LMUL1 ; RUN: opt -passes=loop-vectorize -mtriple riscv64-linux-gnu \ -; RUN: -mattr=+v,+d -debug-only=loop-vectorize --disable-output \ +; RUN: -mattr=+v,+d -debug-only=loop-vectorize,vplan --disable-output \ ; RUN: -riscv-v-vector-bits-min=128 -riscv-v-register-bit-width-lmul=2 \ ; RUN: -S < %s 2>&1 | 
FileCheck %s --check-prefix=CHECK-LMUL2 ; RUN: opt -passes=loop-vectorize -mtriple riscv64-linux-gnu \ -; RUN: -mattr=+v,+d -debug-only=loop-vectorize --disable-output \ +; RUN: -mattr=+v,+d -debug-only=loop-vectorize,vplan --disable-output \ ; RUN: -riscv-v-vector-bits-min=128 -riscv-v-register-bit-width-lmul=4 \ ; RUN: -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-LMUL4 ; RUN: opt -passes=loop-vectorize -mtriple riscv64-linux-gnu \ -; RUN: -mattr=+v,+d -debug-only=loop-vectorize --disable-output \ +; RUN: -mattr=+v,+d -debug-only=loop-vectorize,vplan --disable-output \ ; RUN: -riscv-v-vector-bits-min=128 -riscv-v-register-bit-width-lmul=8 \ ; RUN: -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-LMUL8 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index 2e461ec658e63..fd528bacd0e09 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -7,7 +7,7 @@ ; REQUIRES: asserts ; RUN: opt -passes=loop-vectorize,dce,instcombine -mtriple riscv64-linux-gnu \ -; RUN: -mattr=+v -debug-only=loop-vectorize -scalable-vectorization=on \ +; RUN: -mattr=+v -debug-only=loop-vectorize,vplan -scalable-vectorization=on \ ; RUN: -riscv-v-vector-bits-min=128 -disable-output < %s 2>&1 | FileCheck %s define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocapture noundef readonly %B, i32 noundef signext %n) { @@ -46,7 +46,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 ; CHECK-NEXT: LV: Using user VF vscale x 4. 
-; CHECK-NEXT: LV: Loop does not require scalar epilogue +; CHECK: LV: Loop does not require scalar epilogue ; CHECK: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom @@ -294,7 +294,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 ; CHECK-NEXT: LV: Using user VF vscale x 4. -; CHECK-NEXT: LV: Loop does not require scalar epilogue +; CHECK: LV: Loop does not require scalar epilogue ; CHECK: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom diff --git a/llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll b/llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll index 3445d4ceff5ec..84a48dba6ae4b 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll @@ -1,4 +1,4 @@ -; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 < %s | FileCheck %s +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize,vplan -disable-output 2>&1 < %s | FileCheck %s ; REQUIRES: asserts target triple = "x86_64" diff --git a/llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll b/llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll index 164188db6ccf9..530ff175c81ca 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/reg-usage-debug.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s 
-debug-only=loop-vectorize -passes=loop-vectorize -mtriple=x86_64-unknown-linux -S 2>&1 | FileCheck %s +; RUN: opt < %s -debug-only=loop-vectorize,vplan -passes=loop-vectorize -mtriple=x86_64-unknown-linux -S 2>&1 | FileCheck %s ; REQUIRES: asserts ; Test that the register usage estimation is not affected by the presence of diff --git a/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll index 3cf44947ea462..98995994cc98b 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll @@ -1,5 +1,5 @@ -; RUN: opt < %s -debug-only=loop-vectorize -passes=loop-vectorize -vectorizer-maximize-bandwidth -mtriple=x86_64-unknown-linux -S 2>&1 | FileCheck %s -; RUN: opt < %s -debug-only=loop-vectorize -passes=loop-vectorize -vectorizer-maximize-bandwidth -mtriple=x86_64-unknown-linux -mattr=+avx512f -S 2>&1 | FileCheck %s --check-prefix=AVX512F +; RUN: opt < %s -debug-only=loop-vectorize,vplan -passes=loop-vectorize -vectorizer-maximize-bandwidth -mtriple=x86_64-unknown-linux -S 2>&1 | FileCheck %s +; RUN: opt < %s -debug-only=loop-vectorize,vplan -passes=loop-vectorize -vectorizer-maximize-bandwidth -mtriple=x86_64-unknown-linux -mattr=+avx512f -S 2>&1 | FileCheck %s --check-prefix=AVX512F ; REQUIRES: asserts @a = global [1024 x i8] zeroinitializer, align 16