Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions llvm/include/llvm/Analysis/ScalarEvolution.h
Original file line number Diff line number Diff line change
Expand Up @@ -2376,6 +2376,10 @@ class PredicatedScalarEvolution {
/// Get the (predicated) symbolic max backedge taken count for the analyzed loop.
const SCEV *getSymbolicMaxBackedgeTakenCount();

/// Returns the upper bound of the loop trip count as a normal unsigned
/// value, or 0 if the trip count is unknown.
unsigned getSmallConstantMaxTripCount();

/// Adds a new predicate.
void addPredicate(const SCEVPredicate &Pred);

Expand Down Expand Up @@ -2447,6 +2451,9 @@ class PredicatedScalarEvolution {

/// The symbolic max backedge taken count.
const SCEV *SymbolicMaxBackedgeCount = nullptr;

/// The constant max trip count for the loop.
std::optional<unsigned> SmallConstantMaxTripCount;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
std::optional<unsigned> SmallConstantMaxTripCount;
const SCEV* SmallConstantMaxTripCount = nullptr;

for consistency with BackedgeCount and SymbolicMaxBackedgeCount above?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problem with that is that the max trip count is measured as an unsigned value. I could potentially cache the intermediate result in the function:

unsigned ScalarEvolution::getSmallConstantMaxTripCount(
    const Loop *L, SmallVectorImpl<const SCEVPredicate *> *Predicates) {

  const auto *MaxExitCount =
      Predicates ? getPredicatedConstantMaxBackedgeTakenCount(L, *Predicates)
                 : getConstantMaxBackedgeTakenCount(L);
  return getConstantTripCount(dyn_cast<SCEVConstant>(MaxExitCount));
}

i.e. MaxExitCount, but then you still have to call getConstantTripCount each time so you see less benefit from caching it. That's why I chose to use std::optional<unsigned> - alternatively I could use a larger type such as uint64_t and treat UINT64_MAX as being equivalent to not cached yet.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah I see! It's not worth the trouble then I think.

};

template <> struct DenseMapInfo<ScalarEvolution::FoldID> {
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Analysis/ScalarEvolution.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15051,6 +15051,16 @@ const SCEV *PredicatedScalarEvolution::getSymbolicMaxBackedgeTakenCount() {
return SymbolicMaxBackedgeCount;
}

unsigned PredicatedScalarEvolution::getSmallConstantMaxTripCount() {
if (!SmallConstantMaxTripCount) {
SmallVector<const SCEVPredicate *, 4> Preds;
SmallConstantMaxTripCount = SE.getSmallConstantMaxTripCount(&L, &Preds);
for (const auto *P : Preds)
addPredicate(*P);
}
return *SmallConstantMaxTripCount;
}

void PredicatedScalarEvolution::addPredicate(const SCEVPredicate &Pred) {
if (Preds->implies(&Pred))
return;
Expand Down
48 changes: 26 additions & 22 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -411,10 +411,10 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
/// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
/// 4) Returns std::nullopt if all of the above failed.
static std::optional<unsigned>
getSmallBestKnownTC(ScalarEvolution &SE, Loop *L,
getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L,
bool CanUseConstantMax = true) {
// Check if exact trip count is known.
if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
if (unsigned ExpectedTC = PSE.getSE()->getSmallConstantTripCount(L))
return ExpectedTC;

// Check if there is an expected trip count available from profile data.
Expand All @@ -426,7 +426,7 @@ getSmallBestKnownTC(ScalarEvolution &SE, Loop *L,
return std::nullopt;

// Check if upper bound estimate is known.
if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
return ExpectedTC;

return std::nullopt;
Expand Down Expand Up @@ -1787,12 +1787,15 @@ class GeneratedRTChecks {

Loop *OuterLoop = nullptr;

PredicatedScalarEvolution &PSE;

public:
GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
TargetTransformInfo *TTI, const DataLayout &DL,
bool AddBranchWeights)
: DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {}
GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
LoopInfo *LI, TargetTransformInfo *TTI,
const DataLayout &DL, bool AddBranchWeights)
: DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"),
MemCheckExp(*PSE.getSE(), DL, "scev.check"),
AddBranchWeights(AddBranchWeights), PSE(PSE) {}

/// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
/// accurately estimate the cost of the runtime checks. The blocks are
Expand Down Expand Up @@ -1939,7 +1942,7 @@ class GeneratedRTChecks {

// Get the best known TC estimate.
if (auto EstimatedTC = getSmallBestKnownTC(
*SE, OuterLoop, /* CanUseConstantMax = */ false))
PSE, OuterLoop, /* CanUseConstantMax = */ false))
BestTripCount = *EstimatedTC;

BestTripCount = std::max(BestTripCount, 1U);
Expand Down Expand Up @@ -2270,8 +2273,7 @@ static bool isIndvarOverflowCheckKnownFalse(
// We know the runtime overflow check is known false iff the (max) trip-count
// is known and (max) trip-count + (VF * UF) does not overflow in the type of
// the vector loop induction variable.
if (unsigned TC =
Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) {
if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
uint64_t MaxVF = VF.getKnownMinValue();
if (VF.isScalable()) {
std::optional<unsigned> MaxVScale =
Expand Down Expand Up @@ -3956,8 +3958,10 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
}

unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
if (TC != MaxTC)
LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
if (TC == 1) {
reportVectorizationFailure("Single iteration (non) loop",
"loop trip count is one, irrelevant for vectorization",
Expand Down Expand Up @@ -4251,7 +4255,7 @@ bool LoopVectorizationPlanner::isMoreProfitable(
InstructionCost CostA = A.Cost;
InstructionCost CostB = B.Cost;

unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);
unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();

// Improve estimate for the vector width if it is scalable.
unsigned EstimatedWidthA = A.Width.getKnownMinValue();
Expand Down Expand Up @@ -4839,7 +4843,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
if (!Legal->isSafeForAnyVectorWidth())
return 1;

auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop);
const bool HasReductions = !Legal->getReductionVars().empty();

// If we did not calculate the cost for VF (because the user selected the VF)
Expand Down Expand Up @@ -9583,8 +9587,8 @@ static bool processLoopInVPlanNativePath(
{
bool AddBranchWeights =
hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
F->getDataLayout(), AddBranchWeights);
GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
AddBranchWeights);
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
Expand Down Expand Up @@ -9648,7 +9652,7 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
VectorizationFactor &VF,
std::optional<unsigned> VScale, Loop *L,
ScalarEvolution &SE,
PredicatedScalarEvolution &PSE,
ScalarEpilogueLowering SEL) {
InstructionCost CheckCost = Checks.getCost();
if (!CheckCost.isValid())
Expand Down Expand Up @@ -9733,7 +9737,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,

// Skip vectorization if the expected trip count is less than the minimum
// required trip count.
if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
VF.MinProfitableTripCount)) {
LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
Expand Down Expand Up @@ -9840,7 +9844,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {

// Check the loop for a trip count threshold: vectorize loops with a tiny trip
// count by optimizing for size, to minimize overheads.
auto ExpectedTC = getSmallBestKnownTC(*SE, L);
auto ExpectedTC = getSmallBestKnownTC(PSE, L);
if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
<< "This loop is worth vectorizing only if no scalar "
Expand Down Expand Up @@ -9938,8 +9942,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {

bool AddBranchWeights =
hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
F->getDataLayout(), AddBranchWeights);
GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
AddBranchWeights);
if (LVP.hasPlanWithVF(VF.Width)) {
// Select the interleave count.
IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
Expand All @@ -9955,7 +9959,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
if (!ForceVectorization &&
!areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
*PSE.getSE(), SEL)) {
PSE, SEL)) {
ORE->emit([&]() {
return OptimizationRemarkAnalysisAliasing(
DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
Expand Down
Loading