From 97cb72bbe16a2ee208c65923f2d71f1b5ab69427 Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Tue, 28 Jan 2025 11:36:03 +0000 Subject: [PATCH 1/2] [LoopVectorize][NFC] Cache the result of getVScaleForTuning We currently call getVScaleForTuning in many places, doing a lot of work asking the same question with the same answer. I've refactored the code to cache the value if the max scalable VF != 0 and pull out the cached value from LoopVectorizationCostModel. --- .../Transforms/Vectorize/LoopVectorize.cpp | 80 +++++++++++-------- 1 file changed, 46 insertions(+), 34 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index f592e5557c17d..79cebed1e52e4 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1554,9 +1554,32 @@ class LoopVectorizationCostModel { /// trivially hoistable. bool shouldConsiderInvariant(Value *Op); + /// Return the value of vscale used for tuning the cost model. + std::optional getVScaleForTuning() const { return VScaleForTuning; } + private: unsigned NumPredStores = 0; + std::optional VScaleForTuning; + + /// Initializes the value of vscale used for tuning the cost model. If + /// vscale_range.min == vscale_range.max then return vscale_range.max, else + /// return the value returned by the corresponding TTI method. + void initializeVScaleForTuning() { + const Function *Fn = TheLoop->getHeader()->getParent(); + if (Fn->hasFnAttribute(Attribute::VScaleRange)) { + auto Attr = Fn->getFnAttribute(Attribute::VScaleRange); + auto Min = Attr.getVScaleRangeMin(); + auto Max = Attr.getVScaleRangeMax(); + if (Max && Min == Max) { + VScaleForTuning = Max; + return; + } + } + + VScaleForTuning = TTI.getVScaleForTuning(); + } + /// \return An upper bound for the vectorization factors for both /// fixed and scalable vectorization, where the minimum-known number of /// elements is a power-of-2 larger than zero. 
If scalable vectorization is @@ -3838,6 +3861,11 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( if (!Legal->isSafeForAnyVectorWidth()) this->MaxSafeElements = MaxSafeElements; + if (MaxSafeScalableVF != ElementCount::getScalable(0)) { + // Cache the value of vscale for tuning, since we'll need it. + initializeVScaleForTuning(); + } + LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF << ".\n"); LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF @@ -4231,33 +4259,15 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( return MaxVF; } -/// Convenience function that returns the value of vscale_range iff -/// vscale_range.min == vscale_range.max or otherwise returns the value -/// returned by the corresponding TTI method. -static std::optional -getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) { - const Function *Fn = L->getHeader()->getParent(); - if (Fn->hasFnAttribute(Attribute::VScaleRange)) { - auto Attr = Fn->getFnAttribute(Attribute::VScaleRange); - auto Min = Attr.getVScaleRangeMin(); - auto Max = Attr.getVScaleRangeMax(); - if (Max && Min == Max) - return Max; - } - - return TTI.getVScaleForTuning(); -} - /// This function attempts to return a value that represents the vectorization /// factor at runtime. For fixed-width VFs we know this precisely at compile /// time, but for scalable VFs we calculate it based on an estimate of the /// vscale value. 
-static unsigned getEstimatedRuntimeVF(const Loop *L, - const TargetTransformInfo &TTI, - ElementCount VF) { +static unsigned getEstimatedRuntimeVF(ElementCount VF, + std::optional VScale) { unsigned EstimatedVF = VF.getKnownMinValue(); if (VF.isScalable()) - if (std::optional VScale = getVScaleForTuning(L, TTI)) + if (VScale) EstimatedVF *= *VScale; assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1"); return EstimatedVF; @@ -4272,7 +4282,7 @@ bool LoopVectorizationPlanner::isMoreProfitable( // Improve estimate for the vector width if it is scalable. unsigned EstimatedWidthA = A.Width.getKnownMinValue(); unsigned EstimatedWidthB = B.Width.getKnownMinValue(); - if (std::optional VScale = getVScaleForTuning(OrigLoop, TTI)) { + if (std::optional VScale = CM.getVScaleForTuning()) { if (A.Width.isScalable()) EstimatedWidthA *= *VScale; if (B.Width.isScalable()) @@ -4565,13 +4575,13 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { InstructionCost C = CM.expectedCost(VF); VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost); - unsigned Width = getEstimatedRuntimeVF(OrigLoop, TTI, Candidate.Width); + unsigned Width = + getEstimatedRuntimeVF(Candidate.Width, CM.getVScaleForTuning()); LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF << " costs: " << (Candidate.Cost / Width)); if (VF.isScalable()) LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " - << getVScaleForTuning(OrigLoop, TTI).value_or(1) - << ")"); + << CM.getVScaleForTuning().value_or(1) << ")"); LLVM_DEBUG(dbgs() << ".\n"); if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) { @@ -4660,7 +4670,8 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0 ? 
EpilogueVectorizationMinVF : TTI.getEpilogueVectorizationMinVF(); - return getEstimatedRuntimeVF(TheLoop, TTI, VF * Multiplier) >= MinVFThreshold; + return getEstimatedRuntimeVF(VF * Multiplier, VScaleForTuning) >= + MinVFThreshold; } VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( @@ -4712,8 +4723,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know // the main loop handles 8 lanes per iteration. We could still benefit from // vectorizing the epilogue loop with VF=4. - ElementCount EstimatedRuntimeVF = - ElementCount::getFixed(getEstimatedRuntimeVF(OrigLoop, TTI, MainLoopVF)); + ElementCount EstimatedRuntimeVF = ElementCount::getFixed( + getEstimatedRuntimeVF(MainLoopVF, CM.getVScaleForTuning())); ScalarEvolution &SE = *PSE.getSE(); Type *TCType = Legal->getWidestInductionType(); @@ -4959,7 +4970,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; } - unsigned EstimatedVF = getEstimatedRuntimeVF(TheLoop, TTI, VF); + unsigned EstimatedVF = getEstimatedRuntimeVF(VF, VScaleForTuning); unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop); if (KnownTC > 0) { // At least one iteration must be scalar when this constraint holds. So the @@ -7388,7 +7399,7 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, // Now compute and add the VPlan-based cost. 
Cost += Plan.cost(VF, CostCtx); #ifndef NDEBUG - unsigned EstimatedWidth = getEstimatedRuntimeVF(OrigLoop, CM.TTI, VF); + unsigned EstimatedWidth = getEstimatedRuntimeVF(VF, CM.getVScaleForTuning()); LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << " (Estimated cost per lane: "); if (Cost.isValid()) { @@ -10033,9 +10044,9 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, VectorizationFactor &VF, Loop *L, - const TargetTransformInfo &TTI, PredicatedScalarEvolution &PSE, - ScalarEpilogueLowering SEL) { + ScalarEpilogueLowering SEL, + std::optional VScale) { InstructionCost CheckCost = Checks.getCost(); if (!CheckCost.isValid()) return false; @@ -10085,7 +10096,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that // the computations are performed on doubles, not integers and the result // is rounded up, hence we get an upper estimate of the TC. - unsigned IntVF = getEstimatedRuntimeVF(L, TTI, VF.Width); + unsigned IntVF = getEstimatedRuntimeVF(VF.Width, VScale); uint64_t RtC = *CheckCost.getValue(); uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue(); uint64_t MinTC1 = Div == 0 ? 
0 : divideCeil(RtC * IntVF, Div); @@ -10522,7 +10533,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; if (!ForceVectorization && - !areRuntimeChecksProfitable(Checks, VF, L, *TTI, PSE, SEL)) { + !areRuntimeChecksProfitable(Checks, VF, L, PSE, SEL, + CM.getVScaleForTuning())) { ORE->emit([&]() { return OptimizationRemarkAnalysisAliasing( DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(), From 27f0eb399bbb76864b26c024f216242ab6f21503 Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Thu, 30 Jan 2025 11:50:00 +0000 Subject: [PATCH 2/2] Address review comments --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 79cebed1e52e4..49344bd944da1 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -978,7 +978,10 @@ class LoopVectorizationCostModel { InterleavedAccessInfo &IAI) : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), - Hints(Hints), InterleaveInfo(IAI), CostKind(TTI::TCK_RecipThroughput) {} + Hints(Hints), InterleaveInfo(IAI), CostKind(TTI::TCK_RecipThroughput) { + if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors) + initializeVScaleForTuning(); + } /// \return An upper bound for the vectorization factors (both fixed and /// scalable). If the factors are 0, vectorization and interleaving should be @@ -1560,6 +1563,8 @@ class LoopVectorizationCostModel { private: unsigned NumPredStores = 0; + /// Used to store the value of vscale used for tuning the cost model. It is + /// initialized during object construction. std::optional VScaleForTuning; /// Initializes the value of vscale used for tuning the cost model. 
If @@ -3861,11 +3866,6 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( if (!Legal->isSafeForAnyVectorWidth()) this->MaxSafeElements = MaxSafeElements; - if (MaxSafeScalableVF != ElementCount::getScalable(0)) { - // Cache the value of vscale for tuning, since we'll need it. - initializeVScaleForTuning(); - } - LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF << ".\n"); LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF