diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index 92304edd67a44..52ab38583d5de 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -183,10 +183,12 @@ class MemoryDepChecker {
   MemoryDepChecker(PredicatedScalarEvolution &PSE, AssumptionCache *AC,
                    DominatorTree *DT, const Loop *L,
                    const DenseMap<Value *, const SCEV *> &SymbolicStrides,
-                   unsigned MaxTargetVectorWidthInBits)
+                   unsigned MaxTargetVectorWidthInBits,
+                   std::optional<ScalarEvolution::LoopGuards> &LoopGuards)
       : PSE(PSE), AC(AC), DT(DT), InnermostLoop(L),
         SymbolicStrides(SymbolicStrides),
-        MaxTargetVectorWidthInBits(MaxTargetVectorWidthInBits) {}
+        MaxTargetVectorWidthInBits(MaxTargetVectorWidthInBits),
+        LoopGuards(LoopGuards) {}
 
   /// Register the location (instructions are given increasing numbers)
   /// of a write access.
@@ -373,7 +375,7 @@ class MemoryDepChecker {
       PointerBounds;
 
   /// Cache for the loop guards of InnermostLoop.
-  std::optional<ScalarEvolution::LoopGuards> LoopGuards;
+  std::optional<ScalarEvolution::LoopGuards> &LoopGuards;
 
   /// Check whether there is a plausible dependence between the two
   /// accesses.
@@ -531,8 +533,9 @@ class RuntimePointerChecking {
         AliasSetId(AliasSetId), Expr(Expr), NeedsFreeze(NeedsFreeze) {}
   };
 
-  RuntimePointerChecking(MemoryDepChecker &DC, ScalarEvolution *SE)
-      : DC(DC), SE(SE) {}
+  RuntimePointerChecking(MemoryDepChecker &DC, ScalarEvolution *SE,
+                         std::optional<ScalarEvolution::LoopGuards> &LoopGuards)
+      : DC(DC), SE(SE), LoopGuards(LoopGuards) {}
 
   /// Reset the state of the pointer runtime information.
   void reset() {
@@ -646,6 +649,9 @@ class RuntimePointerChecking {
   /// Holds a pointer to the ScalarEvolution analysis.
   ScalarEvolution *SE;
 
+  /// Cache for the loop guards of the loop.
+  std::optional<ScalarEvolution::LoopGuards> &LoopGuards;
+
   /// Set of run-time checks required to establish independence of
   /// otherwise may-aliasing pointers in the loop.
   SmallVector<RuntimePointerCheck, 4> Checks;
@@ -821,6 +827,9 @@ class LoopAccessInfo {
 
   Loop *TheLoop;
 
+  /// Cache for the loop guards of TheLoop.
+  std::optional<ScalarEvolution::LoopGuards> LoopGuards;
+
   /// Determines whether we should generate partial runtime checks when not all
   /// memory accesses could be analyzed.
   bool AllowPartial;
@@ -938,7 +947,8 @@ LLVM_ABI std::pair<const SCEV *, const SCEV *> getStartAndEndForAccess(
     const SCEV *MaxBTC, ScalarEvolution *SE,
     DenseMap<std::pair<const SCEV *, Type *>,
             std::pair<const SCEV *, const SCEV *>> *PointerBounds,
-    DominatorTree *DT, AssumptionCache *AC);
+    DominatorTree *DT, AssumptionCache *AC,
+    std::optional<ScalarEvolution::LoopGuards> &LoopGuards);
 
 class LoopAccessInfoManager {
   /// The cache.
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 84151584835fc..b6529d9af08a3 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -26,10 +26,6 @@
 
 using namespace llvm;
 
-static cl::opt<bool>
-    UseSymbolicMaxBTCForDerefInLoop("use-symbolic-maxbtc-deref-loop",
-                                    cl::init(false));
-
 static bool isAligned(const Value *Base, Align Alignment,
                       const DataLayout &DL) {
   return Base->getPointerAlignment(DL) >= Alignment;
@@ -335,18 +331,10 @@ bool llvm::isDereferenceableAndAlignedInLoop(
                  : SE.getBackedgeTakenCount(L);
   if (isa<SCEVCouldNotCompute>(MaxBECount))
     return false;
-
-  if (isa<SCEVCouldNotCompute>(BECount) && !UseSymbolicMaxBTCForDerefInLoop) {
-    // TODO: Support symbolic max backedge taken counts for loops without
-    // computable backedge taken counts.
-    MaxBECount =
-        Predicates
-            ? SE.getPredicatedConstantMaxBackedgeTakenCount(L, *Predicates)
-            : SE.getConstantMaxBackedgeTakenCount(L);
-  }
-
-  const auto &[AccessStart, AccessEnd] = getStartAndEndForAccess(
-      L, PtrScev, LI->getType(), BECount, MaxBECount, &SE, nullptr, &DT, AC);
+  std::optional<ScalarEvolution::LoopGuards> LoopGuards;
+  const auto &[AccessStart, AccessEnd] =
+      getStartAndEndForAccess(L, PtrScev, LI->getType(), BECount, MaxBECount,
+                              &SE, nullptr, &DT, AC, LoopGuards);
   if (isa<SCEVCouldNotCompute>(AccessStart) ||
       isa<SCEVCouldNotCompute>(AccessEnd))
     return false;
@@ -355,10 +343,13 @@ bool llvm::isDereferenceableAndAlignedInLoop(
   const SCEV *PtrDiff = SE.getMinusSCEV(AccessEnd, AccessStart);
   if (isa<SCEVCouldNotCompute>(PtrDiff))
     return false;
-  ScalarEvolution::LoopGuards LoopGuards =
-      ScalarEvolution::LoopGuards::collect(AddRec->getLoop(), SE);
+
+  if (!LoopGuards)
+    LoopGuards.emplace(
+        ScalarEvolution::LoopGuards::collect(AddRec->getLoop(), SE));
+
   APInt MaxPtrDiff =
-      SE.getUnsignedRangeMax(SE.applyLoopGuards(PtrDiff, LoopGuards));
+      SE.getUnsignedRangeMax(SE.applyLoopGuards(PtrDiff, *LoopGuards));
 
   Value *Base = nullptr;
   APInt AccessSize;
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 258fa982ed1d0..7b2f797311a62 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -193,9 +193,8 @@ RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup(
 /// Returns \p A + \p B, if it is guaranteed not to unsigned wrap. Otherwise
 /// return nullptr. \p A and \p B must have the same type.
 static const SCEV *addSCEVNoOverflow(const SCEV *A, const SCEV *B,
-                                     ScalarEvolution &SE,
-                                     const Instruction *CtxI) {
-  if (!SE.willNotOverflow(Instruction::Add, /*IsSigned=*/false, A, B, CtxI))
+                                     ScalarEvolution &SE) {
+  if (!SE.willNotOverflow(Instruction::Add, /*IsSigned=*/false, A, B))
     return nullptr;
   return SE.getAddExpr(A, B);
 }
@@ -203,20 +202,19 @@ static const SCEV *addSCEVNoOverflow(const SCEV *A, const SCEV *B,
 /// Returns \p A * \p B, if it is guaranteed not to unsigned wrap. Otherwise
 /// return nullptr. \p A and \p B must have the same type.
 static const SCEV *mulSCEVOverflow(const SCEV *A, const SCEV *B,
-                                   ScalarEvolution &SE,
-                                   const Instruction *CtxI) {
-  if (!SE.willNotOverflow(Instruction::Mul, /*IsSigned=*/false, A, B, CtxI))
+                                   ScalarEvolution &SE) {
+  if (!SE.willNotOverflow(Instruction::Mul, /*IsSigned=*/false, A, B))
     return nullptr;
   return SE.getMulExpr(A, B);
 }
 
 /// Return true, if evaluating \p AR at \p MaxBTC cannot wrap, because \p AR at
 /// \p MaxBTC is guaranteed inbounds of the accessed object.
-static bool
-evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR,
-                                     const SCEV *MaxBTC, const SCEV *EltSize,
-                                     ScalarEvolution &SE, const DataLayout &DL,
-                                     DominatorTree *DT, AssumptionCache *AC) {
+static bool evaluatePtrAddRecAtMaxBTCWillNotWrap(
+    const SCEVAddRecExpr *AR, const SCEV *MaxBTC, const SCEV *EltSize,
+    ScalarEvolution &SE, const DataLayout &DL, DominatorTree *DT,
+    AssumptionCache *AC,
+    std::optional<ScalarEvolution::LoopGuards> &LoopGuards) {
   auto *PointerBase = SE.getPointerBase(AR->getStart());
   auto *StartPtr = dyn_cast<SCEVUnknown>(PointerBase);
   if (!StartPtr)
@@ -234,12 +232,11 @@ evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR,
   Type *WiderTy = SE.getWiderType(MaxBTC->getType(), Step->getType());
   const SCEV *DerefBytesSCEV = SE.getConstant(WiderTy, DerefBytes);
 
-  // Context which dominates the entire loop.
-  auto *CtxI = L->getLoopPredecessor()->getTerminator();
   // Check if we have a suitable dereferencable assumption we can use.
   if (!StartPtrV->canBeFreed()) {
     RetainedKnowledge DerefRK = getKnowledgeValidInContext(
-        StartPtrV, {Attribute::Dereferenceable}, *AC, CtxI, DT);
+        StartPtrV, {Attribute::Dereferenceable}, *AC,
+        L->getLoopPredecessor()->getTerminator(), DT);
     if (DerefRK) {
       DerefBytesSCEV = SE.getUMaxExpr(
           DerefBytesSCEV, SE.getConstant(WiderTy, DerefRK.ArgValue));
@@ -263,12 +260,20 @@ evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR,
       SE.getMinusSCEV(AR->getStart(), StartPtr), WiderTy);
 
   const SCEV *OffsetAtLastIter =
-      mulSCEVOverflow(MaxBTC, SE.getAbsExpr(Step, /*IsNSW=*/false), SE, CtxI);
-  if (!OffsetAtLastIter)
-    return false;
+      mulSCEVOverflow(MaxBTC, SE.getAbsExpr(Step, /*IsNSW=*/false), SE);
+  if (!OffsetAtLastIter) {
+    // Re-try with constant max backedge-taken count if using the symbolic one
+    // failed.
+    MaxBTC = SE.getNoopOrZeroExtend(
+        SE.getConstantMaxBackedgeTakenCount(AR->getLoop()), WiderTy);
+    OffsetAtLastIter =
+        mulSCEVOverflow(MaxBTC, SE.getAbsExpr(Step, /*IsNSW=*/false), SE);
+    if (!OffsetAtLastIter)
+      return false;
+  }
 
   const SCEV *OffsetEndBytes = addSCEVNoOverflow(
-      OffsetAtLastIter, SE.getNoopOrZeroExtend(EltSize, WiderTy), SE, CtxI);
+      OffsetAtLastIter, SE.getNoopOrZeroExtend(EltSize, WiderTy), SE);
   if (!OffsetEndBytes)
     return false;
 
@@ -276,10 +281,15 @@ evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR,
   // For positive steps, check if
   // (AR->getStart() - StartPtr) + (MaxBTC * Step) + EltSize <= DerefBytes,
   // while making sure none of the computations unsigned wrap themselves.
-  const SCEV *EndBytes =
-      addSCEVNoOverflow(StartOffset, OffsetEndBytes, SE, CtxI);
+  const SCEV *EndBytes = addSCEVNoOverflow(StartOffset, OffsetEndBytes, SE);
   if (!EndBytes)
     return false;
+
+  if (!LoopGuards)
+    LoopGuards.emplace(
+        ScalarEvolution::LoopGuards::collect(AR->getLoop(), SE));
+
+  EndBytes = SE.applyLoopGuards(EndBytes, *LoopGuards);
   return SE.isKnownPredicate(CmpInst::ICMP_ULE, EndBytes, DerefBytesSCEV);
 }
 
@@ -296,7 +306,8 @@ std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess(
     const SCEV *MaxBTC, ScalarEvolution *SE,
     DenseMap<std::pair<const SCEV *, Type *>,
             std::pair<const SCEV *, const SCEV *>> *PointerBounds,
-    DominatorTree *DT, AssumptionCache *AC) {
+    DominatorTree *DT, AssumptionCache *AC,
+    std::optional<ScalarEvolution::LoopGuards> &LoopGuards) {
   std::pair<const SCEV *, const SCEV *> *PtrBoundsPair;
   if (PointerBounds) {
     auto [Iter, Ins] = PointerBounds->insert(
@@ -332,7 +343,7 @@ std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess(
     // separately checks that accesses cannot not wrap, so unsigned max
     // represents an upper bound.
     if (evaluatePtrAddRecAtMaxBTCWillNotWrap(AR, MaxBTC, EltSizeSCEV, *SE, DL,
-                                             DT, AC)) {
+                                             DT, AC, LoopGuards)) {
       ScEnd = AR->evaluateAtIteration(MaxBTC, *SE);
     } else {
       ScEnd = SE->getAddExpr(
@@ -381,7 +392,7 @@ void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr,
   const SCEV *BTC = PSE.getBackedgeTakenCount();
   const auto &[ScStart, ScEnd] = getStartAndEndForAccess(
       Lp, PtrExpr, AccessTy, BTC, SymbolicMaxBTC, PSE.getSE(),
-      &DC.getPointerBounds(), DC.getDT(), DC.getAC());
+      &DC.getPointerBounds(), DC.getDT(), DC.getAC(), LoopGuards);
   assert(!isa<SCEVCouldNotCompute>(ScStart) &&
          !isa<SCEVCouldNotCompute>(ScEnd) &&
          "must be able to compute both start and end expressions");
@@ -1987,13 +1998,13 @@ bool MemoryDepChecker::areAccessesCompletelyBeforeOrAfter(const SCEV *Src,
   ScalarEvolution &SE = *PSE.getSE();
   const auto &[SrcStart_, SrcEnd_] =
       getStartAndEndForAccess(InnermostLoop, Src, SrcTy, BTC, SymbolicMaxBTC,
-                              &SE, &PointerBounds, DT, AC);
+                              &SE, &PointerBounds, DT, AC, LoopGuards);
   if (isa<SCEVCouldNotCompute>(SrcStart_) || isa<SCEVCouldNotCompute>(SrcEnd_))
     return false;
 
   const auto &[SinkStart_, SinkEnd_] =
       getStartAndEndForAccess(InnermostLoop, Sink, SinkTy, BTC, SymbolicMaxBTC,
-                              &SE, &PointerBounds, DT, AC);
+                              &SE, &PointerBounds, DT, AC, LoopGuards);
   if (isa<SCEVCouldNotCompute>(SinkStart_) ||
       isa<SCEVCouldNotCompute>(SinkEnd_))
     return false;
@@ -3040,8 +3051,9 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
       TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) * 2;
 
   DepChecker = std::make_unique<MemoryDepChecker>(
-      *PSE, AC, DT, L, SymbolicStrides, MaxTargetVectorWidthInBits);
-  PtrRtChecking = std::make_unique<RuntimePointerChecking>(*DepChecker, SE);
+      *PSE, AC, DT, L, SymbolicStrides, MaxTargetVectorWidthInBits, LoopGuards);
+  PtrRtChecking =
+      std::make_unique<RuntimePointerChecking>(*DepChecker, SE, LoopGuards);
   if (canAnalyzeLoop())
     CanVecMem = analyzeLoop(AA, LI, TLI, DT);
 }
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 27e6e7e8806c3..09b126d35bde0 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -2338,23 +2338,15 @@ bool ScalarEvolution::willNotOverflow(Instruction::BinaryOps BinOp, bool Signed,
   // Can we use context to prove the fact we need?
   if (!CtxI)
     return false;
+  // TODO: Support mul.
+  if (BinOp == Instruction::Mul)
+    return false;
   auto *RHSC = dyn_cast<SCEVConstant>(RHS);
   // TODO: Lift this limitation.
   if (!RHSC)
     return false;
   APInt C = RHSC->getAPInt();
   unsigned NumBits = C.getBitWidth();
-  if (BinOp == Instruction::Mul) {
-    // Multiplying by 0 or 1 never overflows
-    if (C.isZero() || C.isOne())
-      return true;
-    if (Signed)
-      return false;
-    APInt Limit = APInt::getMaxValue(NumBits).udiv(C);
-    // To avoid overflow, we need to make sure that LHS <= MAX / C.
-    return isKnownPredicateAt(ICmpInst::ICMP_ULE, LHS, getConstant(Limit),
-                              CtxI);
-  }
   bool IsSub = (BinOp == Instruction::Sub);
   bool IsNegativeConst = (Signed && C.isNegative());
   // Compute the direction and magnitude by which we need to check overflow.
diff --git a/llvm/test/Transforms/LoopVectorize/vect.stats.ll b/llvm/test/Transforms/LoopVectorize/vect.stats.ll
index f3695e6712952..e3240c8181519 100644
--- a/llvm/test/Transforms/LoopVectorize/vect.stats.ll
+++ b/llvm/test/Transforms/LoopVectorize/vect.stats.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -debug-only=loop-vectorize -enable-early-exit-vectorization -use-symbolic-maxbtc-deref-loop --disable-output -stats -S 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -debug-only=loop-vectorize -enable-early-exit-vectorization --disable-output -stats -S 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
 ; We have 3 loops, two of them are vectorizable (with one being early-exit
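
Note on the pattern threaded through the diff above: LoopAccessInfo now owns a single std::optional<ScalarEvolution::LoopGuards> cache and hands it by reference to MemoryDepChecker, RuntimePointerChecking and getStartAndEndForAccess, each of which calls ScalarEvolution::LoopGuards::collect() only the first time guards are actually needed. What follows is a minimal, self-contained C++ sketch of that lazy-caching idiom under assumed simplifications: Guards, refineWithGuards and Owner are illustrative stand-ins, not the LLVM classes.

#include <iostream>
#include <optional>

// Stand-in for ScalarEvolution::LoopGuards; pretend collect() is expensive.
struct Guards {
  static Guards collect(int LoopId) {
    std::cout << "collect() for loop " << LoopId << "\n";
    return Guards{LoopId};
  }
  int LoopId;
};

// Stand-in for a helper such as evaluatePtrAddRecAtMaxBTCWillNotWrap: it takes
// the caller-owned cache by reference and populates it only on first use.
bool refineWithGuards(int LoopId, std::optional<Guards> &Cache) {
  if (!Cache)
    Cache.emplace(Guards::collect(LoopId));
  return Cache->LoopId == LoopId;
}

// Stand-in for LoopAccessInfo: owns the cache and shares it with its helpers,
// mirroring how MemoryDepChecker and RuntimePointerChecking hold references.
struct Owner {
  int LoopId = 42;
  std::optional<Guards> Cache; // collected lazily, at most once per loop

  void run() {
    refineWithGuards(LoopId, Cache); // first call collects the guards
    refineWithGuards(LoopId, Cache); // second call reuses the cached guards
  }
};

int main() {
  Owner O;
  O.run(); // prints "collect() for loop 42" exactly once
  return 0;
}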