Skip to content

Commit 97cb72b

Browse files
committed
[LoopVectorize][NFC] Cache the result of getVScaleForTuning
We currently call getVScaleForTuning in many places, doing a lot of work asking the same question with the same answer. I've refactored the code to cache the value if the max scalable VF != 0 and to read the cached value from LoopVectorizationCostModel.
1 parent 2625510 commit 97cb72b

File tree

1 file changed

+46
-34
lines changed

1 file changed

+46
-34
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 46 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1554,9 +1554,32 @@ class LoopVectorizationCostModel {
15541554
/// trivially hoistable.
15551555
bool shouldConsiderInvariant(Value *Op);
15561556

1557+
/// Return the value of vscale used for tuning the cost model.
1558+
std::optional<unsigned> getVScaleForTuning() const { return VScaleForTuning; }
1559+
15571560
private:
15581561
unsigned NumPredStores = 0;
15591562

1563+
std::optional<unsigned> VScaleForTuning;
1564+
1565+
/// Initializes the value of vscale used for tuning the cost model. If
1566+
/// vscale_range.min == vscale_range.max then cache vscale_range.max, else
1567+
/// cache the value returned by the corresponding TTI method.
1568+
void initializeVScaleForTuning() {
1569+
const Function *Fn = TheLoop->getHeader()->getParent();
1570+
if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
1571+
auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
1572+
auto Min = Attr.getVScaleRangeMin();
1573+
auto Max = Attr.getVScaleRangeMax();
1574+
if (Max && Min == Max) {
1575+
VScaleForTuning = Max;
1576+
return;
1577+
}
1578+
}
1579+
1580+
VScaleForTuning = TTI.getVScaleForTuning();
1581+
}
1582+
15601583
/// \return An upper bound for the vectorization factors for both
15611584
/// fixed and scalable vectorization, where the minimum-known number of
15621585
/// elements is a power-of-2 larger than zero. If scalable vectorization is
@@ -3838,6 +3861,11 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
38383861
if (!Legal->isSafeForAnyVectorWidth())
38393862
this->MaxSafeElements = MaxSafeElements;
38403863

3864+
if (MaxSafeScalableVF != ElementCount::getScalable(0)) {
3865+
// Cache the value of vscale for tuning, since we'll need it.
3866+
initializeVScaleForTuning();
3867+
}
3868+
38413869
LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
38423870
<< ".\n");
38433871
LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
@@ -4231,33 +4259,15 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
42314259
return MaxVF;
42324260
}
42334261

4234-
/// Convenience function that returns the value of vscale_range iff
4235-
/// vscale_range.min == vscale_range.max or otherwise returns the value
4236-
/// returned by the corresponding TTI method.
4237-
static std::optional<unsigned>
4238-
getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
4239-
const Function *Fn = L->getHeader()->getParent();
4240-
if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
4241-
auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
4242-
auto Min = Attr.getVScaleRangeMin();
4243-
auto Max = Attr.getVScaleRangeMax();
4244-
if (Max && Min == Max)
4245-
return Max;
4246-
}
4247-
4248-
return TTI.getVScaleForTuning();
4249-
}
4250-
42514262
/// This function attempts to return a value that represents the vectorization
42524263
/// factor at runtime. For fixed-width VFs we know this precisely at compile
42534264
/// time, but for scalable VFs we calculate it based on an estimate of the
42544265
/// vscale value.
4255-
static unsigned getEstimatedRuntimeVF(const Loop *L,
4256-
const TargetTransformInfo &TTI,
4257-
ElementCount VF) {
4266+
static unsigned getEstimatedRuntimeVF(ElementCount VF,
4267+
std::optional<unsigned> VScale) {
42584268
unsigned EstimatedVF = VF.getKnownMinValue();
42594269
if (VF.isScalable())
4260-
if (std::optional<unsigned> VScale = getVScaleForTuning(L, TTI))
4270+
if (VScale)
42614271
EstimatedVF *= *VScale;
42624272
assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
42634273
return EstimatedVF;
@@ -4272,7 +4282,7 @@ bool LoopVectorizationPlanner::isMoreProfitable(
42724282
// Improve estimate for the vector width if it is scalable.
42734283
unsigned EstimatedWidthA = A.Width.getKnownMinValue();
42744284
unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4275-
if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
4285+
if (std::optional<unsigned> VScale = CM.getVScaleForTuning()) {
42764286
if (A.Width.isScalable())
42774287
EstimatedWidthA *= *VScale;
42784288
if (B.Width.isScalable())
@@ -4565,13 +4575,13 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
45654575
InstructionCost C = CM.expectedCost(VF);
45664576
VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
45674577

4568-
unsigned Width = getEstimatedRuntimeVF(OrigLoop, TTI, Candidate.Width);
4578+
unsigned Width =
4579+
getEstimatedRuntimeVF(Candidate.Width, CM.getVScaleForTuning());
45694580
LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
45704581
<< " costs: " << (Candidate.Cost / Width));
45714582
if (VF.isScalable())
45724583
LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
4573-
<< getVScaleForTuning(OrigLoop, TTI).value_or(1)
4574-
<< ")");
4584+
<< CM.getVScaleForTuning().value_or(1) << ")");
45754585
LLVM_DEBUG(dbgs() << ".\n");
45764586

45774587
if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
@@ -4660,7 +4670,8 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
46604670
unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
46614671
? EpilogueVectorizationMinVF
46624672
: TTI.getEpilogueVectorizationMinVF();
4663-
return getEstimatedRuntimeVF(TheLoop, TTI, VF * Multiplier) >= MinVFThreshold;
4673+
return getEstimatedRuntimeVF(VF * Multiplier, VScaleForTuning) >=
4674+
MinVFThreshold;
46644675
}
46654676

46664677
VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
@@ -4712,8 +4723,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
47124723
// If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
47134724
// the main loop handles 8 lanes per iteration. We could still benefit from
47144725
// vectorizing the epilogue loop with VF=4.
4715-
ElementCount EstimatedRuntimeVF =
4716-
ElementCount::getFixed(getEstimatedRuntimeVF(OrigLoop, TTI, MainLoopVF));
4726+
ElementCount EstimatedRuntimeVF = ElementCount::getFixed(
4727+
getEstimatedRuntimeVF(MainLoopVF, CM.getVScaleForTuning()));
47174728

47184729
ScalarEvolution &SE = *PSE.getSE();
47194730
Type *TCType = Legal->getWidestInductionType();
@@ -4959,7 +4970,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
49594970
MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
49604971
}
49614972

4962-
unsigned EstimatedVF = getEstimatedRuntimeVF(TheLoop, TTI, VF);
4973+
unsigned EstimatedVF = getEstimatedRuntimeVF(VF, VScaleForTuning);
49634974
unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
49644975
if (KnownTC > 0) {
49654976
// At least one iteration must be scalar when this constraint holds. So the
@@ -7388,7 +7399,7 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
73887399
// Now compute and add the VPlan-based cost.
73897400
Cost += Plan.cost(VF, CostCtx);
73907401
#ifndef NDEBUG
7391-
unsigned EstimatedWidth = getEstimatedRuntimeVF(OrigLoop, CM.TTI, VF);
7402+
unsigned EstimatedWidth = getEstimatedRuntimeVF(VF, CM.getVScaleForTuning());
73927403
LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
73937404
<< " (Estimated cost per lane: ");
73947405
if (Cost.isValid()) {
@@ -10033,9 +10044,9 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
1003310044

1003410045
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
1003510046
VectorizationFactor &VF, Loop *L,
10036-
const TargetTransformInfo &TTI,
1003710047
PredicatedScalarEvolution &PSE,
10038-
ScalarEpilogueLowering SEL) {
10048+
ScalarEpilogueLowering SEL,
10049+
std::optional<unsigned> VScale) {
1003910050
InstructionCost CheckCost = Checks.getCost();
1004010051
if (!CheckCost.isValid())
1004110052
return false;
@@ -10085,7 +10096,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
1008510096
// For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
1008610097
// the computations are performed on doubles, not integers and the result
1008710098
// is rounded up, hence we get an upper estimate of the TC.
10088-
unsigned IntVF = getEstimatedRuntimeVF(L, TTI, VF.Width);
10099+
unsigned IntVF = getEstimatedRuntimeVF(VF.Width, VScale);
1008910100
uint64_t RtC = *CheckCost.getValue();
1009010101
uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
1009110102
uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
@@ -10522,7 +10533,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1052210533
bool ForceVectorization =
1052310534
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
1052410535
if (!ForceVectorization &&
10525-
!areRuntimeChecksProfitable(Checks, VF, L, *TTI, PSE, SEL)) {
10536+
!areRuntimeChecksProfitable(Checks, VF, L, PSE, SEL,
10537+
CM.getVScaleForTuning())) {
1052610538
ORE->emit([&]() {
1052710539
return OptimizationRemarkAnalysisAliasing(
1052810540
DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),

0 commit comments

Comments
 (0)