|
93 | 93 | #include "llvm/Analysis/ProfileSummaryInfo.h" |
94 | 94 | #include "llvm/Analysis/ScalarEvolution.h" |
95 | 95 | #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
| 96 | +#include "llvm/Analysis/ScalarEvolutionPatternMatch.h" |
96 | 97 | #include "llvm/Analysis/TargetLibraryInfo.h" |
97 | 98 | #include "llvm/Analysis/TargetTransformInfo.h" |
98 | 99 | #include "llvm/Analysis/ValueTracking.h" |
|
155 | 156 | #include <utility> |
156 | 157 |
|
157 | 158 | using namespace llvm; |
| 159 | +using namespace SCEVPatternMatch; |
158 | 160 |
|
159 | 161 | #define LV_NAME "loop-vectorize" |
160 | 162 | #define DEBUG_TYPE LV_NAME |
@@ -418,7 +420,24 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) { |
418 | 420 | /// ElementCount to include loops whose trip count is a function of vscale. |
419 | 421 | static ElementCount getSmallConstantTripCount(ScalarEvolution *SE, |
420 | 422 | const Loop *L) { |
421 | | - return ElementCount::getFixed(SE->getSmallConstantTripCount(L)); |
| 423 | + if (unsigned ExpectedTC = SE->getSmallConstantTripCount(L)) |
| 424 | + return ElementCount::getFixed(ExpectedTC); |
| 425 | + |
| 426 | + const SCEV *BTC = SE->getBackedgeTakenCount(L); |
| 427 | + if (isa<SCEVCouldNotCompute>(BTC)) |
| 428 | + return ElementCount::getFixed(0); |
| 429 | + |
| 430 | + const SCEV *ExitCount = SE->getTripCountFromExitCount(BTC, BTC->getType(), L); |
| 431 | + if (isa<SCEVVScale>(ExitCount)) |
| 432 | + return ElementCount::getScalable(1); |
| 433 | + |
| 434 | + const APInt *Scale; |
| 435 | + if (match(ExitCount, m_scev_Mul(m_scev_APInt(Scale), m_SCEVVScale()))) |
| 436 | + if (cast<SCEVMulExpr>(ExitCount)->hasNoUnsignedWrap()) |
| 437 | + if (Scale->getActiveBits() <= 32) |
| 438 | + return ElementCount::getScalable(Scale->getZExtValue()); |
| 439 | + |
| 440 | + return ElementCount::getFixed(0); |
422 | 441 | } |
423 | 442 |
|
424 | 443 | /// Returns "best known" trip count, which is either a valid positive trip count |
@@ -2593,12 +2612,12 @@ static void cse(BasicBlock *BB) { |
2593 | 2612 | } |
2594 | 2613 | } |
2595 | 2614 |
|
2596 | | -/// This function attempts to return a value that represents the vectorization |
2597 | | -/// factor at runtime. For fixed-width VFs we know this precisely at compile |
| 2615 | +/// This function attempts to return the number of elements that an ElementCount |
| 2616 | +/// represents at runtime. For fixed-width VFs we know this precisely at compile |
2598 | 2617 | /// time, but for scalable VFs we calculate it based on an estimate of the |
2599 | 2618 | /// vscale value. |
2600 | | -static unsigned getEstimatedRuntimeVF(ElementCount VF, |
2601 | | - std::optional<unsigned> VScale) { |
| 2619 | +static unsigned estimateElementCount(ElementCount VF, |
| 2620 | + std::optional<unsigned> VScale) { |
2602 | 2621 | unsigned EstimatedVF = VF.getKnownMinValue(); |
2603 | 2622 | if (VF.isScalable()) |
2604 | 2623 | if (VScale) |
@@ -2708,7 +2727,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { |
2708 | 2727 | // use the value of vscale used for tuning. |
2709 | 2728 | Loop *VectorLoop = LI->getLoopFor(HeaderBB); |
2710 | 2729 | unsigned EstimatedVFxUF = |
2711 | | - getEstimatedRuntimeVF(VF * UF, Cost->getVScaleForTuning()); |
| 2730 | + estimateElementCount(VF * UF, Cost->getVScaleForTuning()); |
2712 | 2731 | setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop, EstimatedVFxUF); |
2713 | 2732 | } |
2714 | 2733 |
|
@@ -4337,7 +4356,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { |
4337 | 4356 |
|
4338 | 4357 | VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost); |
4339 | 4358 | unsigned Width = |
4340 | | - getEstimatedRuntimeVF(Candidate.Width, CM.getVScaleForTuning()); |
| 4359 | + estimateElementCount(Candidate.Width, CM.getVScaleForTuning()); |
4341 | 4360 | LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF |
4342 | 4361 | << " costs: " << (Candidate.Cost / Width)); |
4343 | 4362 | if (VF.isScalable()) |
@@ -4445,7 +4464,7 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( |
4445 | 4464 | unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0 |
4446 | 4465 | ? EpilogueVectorizationMinVF |
4447 | 4466 | : TTI.getEpilogueVectorizationMinVF(); |
4448 | | - return getEstimatedRuntimeVF(VF * Multiplier, VScaleForTuning) >= |
| 4467 | + return estimateElementCount(VF * Multiplier, VScaleForTuning) >= |
4449 | 4468 | MinVFThreshold; |
4450 | 4469 | } |
4451 | 4470 |
|
@@ -4498,7 +4517,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( |
4498 | 4517 | // the main loop handles 8 lanes per iteration. We could still benefit from |
4499 | 4518 | // vectorizing the epilogue loop with VF=4. |
4500 | 4519 | ElementCount EstimatedRuntimeVF = ElementCount::getFixed( |
4501 | | - getEstimatedRuntimeVF(MainLoopVF, CM.getVScaleForTuning())); |
| 4520 | + estimateElementCount(MainLoopVF, CM.getVScaleForTuning())); |
4502 | 4521 |
|
4503 | 4522 | ScalarEvolution &SE = *PSE.getSE(); |
4504 | 4523 | Type *TCType = Legal->getWidestInductionType(); |
@@ -4745,16 +4764,20 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF, |
4745 | 4764 | MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor; |
4746 | 4765 | } |
4747 | 4766 |
|
4748 | | - unsigned EstimatedVF = getEstimatedRuntimeVF(VF, VScaleForTuning); |
4749 | | - |
4750 | 4767 | // Try to get the exact trip count, or an estimate based on profiling data or |
4751 | 4768 | // ConstantMax from PSE, failing that. |
4752 | | - if (auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop)) { |
| 4769 | + auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop); |
| 4770 | + |
| 4771 | + // For fixed-length VFs, treat a scalable trip count as unknown. |
| 4772 | + if (BestKnownTC && (BestKnownTC->isFixed() || VF.isScalable())) { |
| 4773 | + // Convert both the trip count and the VF to plain integer estimates (using the vscale chosen for tuning) so that they are in the same numerical space. |
| 4774 | + unsigned AvailableTC = estimateElementCount(*BestKnownTC, VScaleForTuning); |
| 4775 | + unsigned EstimatedVF = estimateElementCount(VF, VScaleForTuning); |
| 4776 | + |
4753 | 4777 | // At least one iteration must be scalar when this constraint holds. So the |
4754 | 4778 | // maximum available iterations for interleaving is one less. |
4755 | | - unsigned AvailableTC = requiresScalarEpilogue(VF.isVector()) |
4756 | | - ? BestKnownTC->getFixedValue() - 1 |
4757 | | - : BestKnownTC->getFixedValue(); |
| 4779 | + if (requiresScalarEpilogue(VF.isVector())) |
| 4780 | + --AvailableTC; |
4758 | 4781 |
|
4759 | 4782 | unsigned InterleaveCountLB = bit_floor(std::max( |
4760 | 4783 | 1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount))); |
@@ -6925,7 +6948,7 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, |
6925 | 6948 | // Now compute and add the VPlan-based cost. |
6926 | 6949 | Cost += Plan.cost(VF, CostCtx); |
6927 | 6950 | #ifndef NDEBUG |
6928 | | - unsigned EstimatedWidth = getEstimatedRuntimeVF(VF, CM.getVScaleForTuning()); |
| 6951 | + unsigned EstimatedWidth = estimateElementCount(VF, CM.getVScaleForTuning()); |
6929 | 6952 | LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost |
6930 | 6953 | << " (Estimated cost per lane: "); |
6931 | 6954 | if (Cost.isValid()) { |
@@ -9611,7 +9634,7 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks, |
9611 | 9634 | // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that |
9612 | 9635 | // the computations are performed on doubles, not integers and the result |
9613 | 9636 | // is rounded up, hence we get an upper estimate of the TC. |
9614 | | - unsigned IntVF = getEstimatedRuntimeVF(VF.Width, VScale); |
| 9637 | + unsigned IntVF = estimateElementCount(VF.Width, VScale); |
9615 | 9638 | uint64_t RtC = TotalCost.getValue(); |
9616 | 9639 | uint64_t Div = ScalarC * IntVF - VF.Cost.getValue(); |
9617 | 9640 | uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div); |
|
0 commit comments