Revert "[LAA,Loads] Use loop guards and max BTC if needed when checking deref. (#155672)"

fhahn · fhahn · commit f0df1e3dd4ec · 2025-09-02T21:44:30.000+01:00
This reverts commit 08001cf. This triggers an assertion in some build configs, e.g. https://lab.llvm.org/buildbot/#/builders/24/builds/12211
diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -183,12 +183,10 @@ class MemoryDepChecker {
   MemoryDepChecker(PredicatedScalarEvolution &PSE, AssumptionCache *AC,
                    DominatorTree *DT, const Loop *L,
                    const DenseMap<Value *, const SCEV *> &SymbolicStrides,
-                   unsigned MaxTargetVectorWidthInBits,
-                   std::optional<ScalarEvolution::LoopGuards> &LoopGuards)
+                   unsigned MaxTargetVectorWidthInBits)
       : PSE(PSE), AC(AC), DT(DT), InnermostLoop(L),
         SymbolicStrides(SymbolicStrides),
-        MaxTargetVectorWidthInBits(MaxTargetVectorWidthInBits),
-        LoopGuards(LoopGuards) {}
+        MaxTargetVectorWidthInBits(MaxTargetVectorWidthInBits) {}
 
   /// Register the location (instructions are given increasing numbers)
   /// of a write access.
@@ -375,7 +373,7 @@ class MemoryDepChecker {
       PointerBounds;
 
   /// Cache for the loop guards of InnermostLoop.
-  std::optional<ScalarEvolution::LoopGuards> &LoopGuards;
+  std::optional<ScalarEvolution::LoopGuards> LoopGuards;
 
   /// Check whether there is a plausible dependence between the two
   /// accesses.
@@ -533,9 +531,8 @@ class RuntimePointerChecking {
           AliasSetId(AliasSetId), Expr(Expr), NeedsFreeze(NeedsFreeze) {}
   };
 
-  RuntimePointerChecking(MemoryDepChecker &DC, ScalarEvolution *SE,
-                         std::optional<ScalarEvolution::LoopGuards> &LoopGuards)
-      : DC(DC), SE(SE), LoopGuards(LoopGuards) {}
+  RuntimePointerChecking(MemoryDepChecker &DC, ScalarEvolution *SE)
+      : DC(DC), SE(SE) {}
 
   /// Reset the state of the pointer runtime information.
   void reset() {
@@ -649,9 +646,6 @@ class RuntimePointerChecking {
   /// Holds a pointer to the ScalarEvolution analysis.
   ScalarEvolution *SE;
 
-  /// Cache for the loop guards of the loop.
-  std::optional<ScalarEvolution::LoopGuards> &LoopGuards;
-
   /// Set of run-time checks required to establish independence of
   /// otherwise may-aliasing pointers in the loop.
   SmallVector<RuntimePointerCheck, 4> Checks;
@@ -827,9 +821,6 @@ class LoopAccessInfo {
 
   Loop *TheLoop;
 
-  /// Cache for the loop guards of TheLoop.
-  std::optional<ScalarEvolution::LoopGuards> LoopGuards;
-
   /// Determines whether we should generate partial runtime checks when not all
   /// memory accesses could be analyzed.
   bool AllowPartial;
@@ -947,8 +938,7 @@ LLVM_ABI std::pair<const SCEV *, const SCEV *> getStartAndEndForAccess(
     const SCEV *MaxBTC, ScalarEvolution *SE,
     DenseMap<std::pair<const SCEV *, Type *>,
              std::pair<const SCEV *, const SCEV *>> *PointerBounds,
-    DominatorTree *DT, AssumptionCache *AC,
-    std::optional<ScalarEvolution::LoopGuards> &LoopGuards);
+    DominatorTree *DT, AssumptionCache *AC);
 
 class LoopAccessInfoManager {
   /// The cache.
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
@@ -26,6 +26,10 @@
 
 using namespace llvm;
 
+static cl::opt<bool>
+    UseSymbolicMaxBTCForDerefInLoop("use-symbolic-maxbtc-deref-loop",
+                                    cl::init(false));
+
 static bool isAligned(const Value *Base, Align Alignment,
                       const DataLayout &DL) {
   return Base->getPointerAlignment(DL) >= Alignment;
@@ -331,10 +335,18 @@ bool llvm::isDereferenceableAndAlignedInLoop(
                             : SE.getBackedgeTakenCount(L);
   if (isa<SCEVCouldNotCompute>(MaxBECount))
     return false;
-  std::optional<ScalarEvolution::LoopGuards> LoopGuards;
-  const auto &[AccessStart, AccessEnd] =
-      getStartAndEndForAccess(L, PtrScev, LI->getType(), BECount, MaxBECount,
-                              &SE, nullptr, &DT, AC, LoopGuards);
+
+  if (isa<SCEVCouldNotCompute>(BECount) && !UseSymbolicMaxBTCForDerefInLoop) {
+    // TODO: Support symbolic max backedge taken counts for loops without
+    // computable backedge taken counts.
+    MaxBECount =
+        Predicates
+            ? SE.getPredicatedConstantMaxBackedgeTakenCount(L, *Predicates)
+            : SE.getConstantMaxBackedgeTakenCount(L);
+  }
+
+  const auto &[AccessStart, AccessEnd] = getStartAndEndForAccess(
+      L, PtrScev, LI->getType(), BECount, MaxBECount, &SE, nullptr, &DT, AC);
   if (isa<SCEVCouldNotCompute>(AccessStart) ||
       isa<SCEVCouldNotCompute>(AccessEnd))
     return false;
@@ -343,13 +355,10 @@ bool llvm::isDereferenceableAndAlignedInLoop(
   const SCEV *PtrDiff = SE.getMinusSCEV(AccessEnd, AccessStart);
   if (isa<SCEVCouldNotCompute>(PtrDiff))
     return false;
-
-  if (!LoopGuards)
-    LoopGuards.emplace(
-        ScalarEvolution::LoopGuards::collect(AddRec->getLoop(), SE));
-
+  ScalarEvolution::LoopGuards LoopGuards =
+      ScalarEvolution::LoopGuards::collect(AddRec->getLoop(), SE);
   APInt MaxPtrDiff =
-      SE.getUnsignedRangeMax(SE.applyLoopGuards(PtrDiff, *LoopGuards));
+      SE.getUnsignedRangeMax(SE.applyLoopGuards(PtrDiff, LoopGuards));
 
   Value *Base = nullptr;
   APInt AccessSize;
@@ -395,7 +404,7 @@ bool llvm::isDereferenceableAndAlignedInLoop(
              [&SE, AccessSizeSCEV, &LoopGuards](const RetainedKnowledge &RK) {
                return SE.isKnownPredicate(
                    CmpInst::ICMP_ULE, AccessSizeSCEV,
-                   SE.applyLoopGuards(SE.getSCEV(RK.IRArgValue), *LoopGuards));
+                   SE.applyLoopGuards(SE.getSCEV(RK.IRArgValue), LoopGuards));
              },
              DL, HeaderFirstNonPHI, AC, &DT) ||
          isDereferenceableAndAlignedPointer(Base, Alignment, AccessSize, DL,
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -193,28 +193,30 @@ RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup(
 /// Returns \p A + \p B, if it is guaranteed not to unsigned wrap. Otherwise
 /// return nullptr. \p A and \p B must have the same type.
 static const SCEV *addSCEVNoOverflow(const SCEV *A, const SCEV *B,
-                                     ScalarEvolution &SE) {
-  if (!SE.willNotOverflow(Instruction::Add, /*IsSigned=*/false, A, B))
+                                     ScalarEvolution &SE,
+                                     const Instruction *CtxI) {
+  if (!SE.willNotOverflow(Instruction::Add, /*IsSigned=*/false, A, B, CtxI))
     return nullptr;
   return SE.getAddExpr(A, B);
 }
 
 /// Returns \p A * \p B, if it is guaranteed not to unsigned wrap. Otherwise
 /// return nullptr. \p A and \p B must have the same type.
 static const SCEV *mulSCEVOverflow(const SCEV *A, const SCEV *B,
-                                   ScalarEvolution &SE) {
-  if (!SE.willNotOverflow(Instruction::Mul, /*IsSigned=*/false, A, B))
+                                   ScalarEvolution &SE,
+                                   const Instruction *CtxI) {
+  if (!SE.willNotOverflow(Instruction::Mul, /*IsSigned=*/false, A, B, CtxI))
     return nullptr;
   return SE.getMulExpr(A, B);
 }
 
 /// Return true, if evaluating \p AR at \p MaxBTC cannot wrap, because \p AR at
 /// \p MaxBTC is guaranteed inbounds of the accessed object.
-static bool evaluatePtrAddRecAtMaxBTCWillNotWrap(
-    const SCEVAddRecExpr *AR, const SCEV *MaxBTC, const SCEV *EltSize,
-    ScalarEvolution &SE, const DataLayout &DL, DominatorTree *DT,
-    AssumptionCache *AC,
-    std::optional<ScalarEvolution::LoopGuards> &LoopGuards) {
+static bool
+evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR,
+                                     const SCEV *MaxBTC, const SCEV *EltSize,
+                                     ScalarEvolution &SE, const DataLayout &DL,
+                                     DominatorTree *DT, AssumptionCache *AC) {
   auto *PointerBase = SE.getPointerBase(AR->getStart());
   auto *StartPtr = dyn_cast<SCEVUnknown>(PointerBase);
   if (!StartPtr)
@@ -232,11 +234,12 @@ static bool evaluatePtrAddRecAtMaxBTCWillNotWrap(
   Type *WiderTy = SE.getWiderType(MaxBTC->getType(), Step->getType());
   const SCEV *DerefBytesSCEV = SE.getConstant(WiderTy, DerefBytes);
 
+  // Context which dominates the entire loop.
+  auto *CtxI = L->getLoopPredecessor()->getTerminator();
   // Check if we have a suitable dereferencable assumption we can use.
   if (!StartPtrV->canBeFreed()) {
     RetainedKnowledge DerefRK = getKnowledgeValidInContext(
-        StartPtrV, {Attribute::Dereferenceable}, *AC,
-        L->getLoopPredecessor()->getTerminator(), DT);
+        StartPtrV, {Attribute::Dereferenceable}, *AC, CtxI, DT);
     if (DerefRK) {
       DerefBytesSCEV = SE.getUMaxExpr(
           DerefBytesSCEV, SE.getConstant(WiderTy, DerefRK.ArgValue));
@@ -260,36 +263,23 @@ static bool evaluatePtrAddRecAtMaxBTCWillNotWrap(
       SE.getMinusSCEV(AR->getStart(), StartPtr), WiderTy);
 
   const SCEV *OffsetAtLastIter =
-      mulSCEVOverflow(MaxBTC, SE.getAbsExpr(Step, /*IsNSW=*/false), SE);
-  if (!OffsetAtLastIter) {
-    // Re-try with constant max backedge-taken count if using the symbolic one
-    // failed.
-    MaxBTC = SE.getNoopOrZeroExtend(
-        SE.getConstantMaxBackedgeTakenCount(AR->getLoop()), WiderTy);
-    OffsetAtLastIter =
-        mulSCEVOverflow(MaxBTC, SE.getAbsExpr(Step, /*IsNSW=*/false), SE);
-    if (!OffsetAtLastIter)
-      return false;
-  }
+      mulSCEVOverflow(MaxBTC, SE.getAbsExpr(Step, /*IsNSW=*/false), SE, CtxI);
+  if (!OffsetAtLastIter)
+    return false;
 
   const SCEV *OffsetEndBytes = addSCEVNoOverflow(
-      OffsetAtLastIter, SE.getNoopOrZeroExtend(EltSize, WiderTy), SE);
+      OffsetAtLastIter, SE.getNoopOrZeroExtend(EltSize, WiderTy), SE, CtxI);
   if (!OffsetEndBytes)
     return false;
 
   if (IsKnownNonNegative) {
     // For positive steps, check if
     //  (AR->getStart() - StartPtr) + (MaxBTC  * Step) + EltSize <= DerefBytes,
     // while making sure none of the computations unsigned wrap themselves.
-    const SCEV *EndBytes = addSCEVNoOverflow(StartOffset, OffsetEndBytes, SE);
+    const SCEV *EndBytes =
+        addSCEVNoOverflow(StartOffset, OffsetEndBytes, SE, CtxI);
     if (!EndBytes)
       return false;
-
-    if (!LoopGuards)
-      LoopGuards.emplace(
-          ScalarEvolution::LoopGuards::collect(AR->getLoop(), SE));
-
-    EndBytes = SE.applyLoopGuards(EndBytes, *LoopGuards);
     return SE.isKnownPredicate(CmpInst::ICMP_ULE, EndBytes, DerefBytesSCEV);
   }
 
@@ -306,8 +296,7 @@ std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess(
     const SCEV *MaxBTC, ScalarEvolution *SE,
     DenseMap<std::pair<const SCEV *, Type *>,
              std::pair<const SCEV *, const SCEV *>> *PointerBounds,
-    DominatorTree *DT, AssumptionCache *AC,
-    std::optional<ScalarEvolution::LoopGuards> &LoopGuards) {
+    DominatorTree *DT, AssumptionCache *AC) {
   std::pair<const SCEV *, const SCEV *> *PtrBoundsPair;
   if (PointerBounds) {
     auto [Iter, Ins] = PointerBounds->insert(
@@ -343,7 +332,7 @@ std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess(
       // separately checks that accesses cannot not wrap, so unsigned max
       // represents an upper bound.
       if (evaluatePtrAddRecAtMaxBTCWillNotWrap(AR, MaxBTC, EltSizeSCEV, *SE, DL,
-                                               DT, AC, LoopGuards)) {
+                                               DT, AC)) {
         ScEnd = AR->evaluateAtIteration(MaxBTC, *SE);
       } else {
         ScEnd = SE->getAddExpr(
@@ -392,7 +381,7 @@ void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr,
   const SCEV *BTC = PSE.getBackedgeTakenCount();
   const auto &[ScStart, ScEnd] = getStartAndEndForAccess(
       Lp, PtrExpr, AccessTy, BTC, SymbolicMaxBTC, PSE.getSE(),
-      &DC.getPointerBounds(), DC.getDT(), DC.getAC(), LoopGuards);
+      &DC.getPointerBounds(), DC.getDT(), DC.getAC());
   assert(!isa<SCEVCouldNotCompute>(ScStart) &&
          !isa<SCEVCouldNotCompute>(ScEnd) &&
          "must be able to compute both start and end expressions");
@@ -1998,13 +1987,13 @@ bool MemoryDepChecker::areAccessesCompletelyBeforeOrAfter(const SCEV *Src,
   ScalarEvolution &SE = *PSE.getSE();
   const auto &[SrcStart_, SrcEnd_] =
       getStartAndEndForAccess(InnermostLoop, Src, SrcTy, BTC, SymbolicMaxBTC,
-                              &SE, &PointerBounds, DT, AC, LoopGuards);
+                              &SE, &PointerBounds, DT, AC);
   if (isa<SCEVCouldNotCompute>(SrcStart_) || isa<SCEVCouldNotCompute>(SrcEnd_))
     return false;
 
   const auto &[SinkStart_, SinkEnd_] =
       getStartAndEndForAccess(InnermostLoop, Sink, SinkTy, BTC, SymbolicMaxBTC,
-                              &SE, &PointerBounds, DT, AC, LoopGuards);
+                              &SE, &PointerBounds, DT, AC);
   if (isa<SCEVCouldNotCompute>(SinkStart_) ||
       isa<SCEVCouldNotCompute>(SinkEnd_))
     return false;
@@ -3051,9 +3040,8 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
         TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) * 2;
 
   DepChecker = std::make_unique<MemoryDepChecker>(
-      *PSE, AC, DT, L, SymbolicStrides, MaxTargetVectorWidthInBits, LoopGuards);
-  PtrRtChecking =
-      std::make_unique<RuntimePointerChecking>(*DepChecker, SE, LoopGuards);
+      *PSE, AC, DT, L, SymbolicStrides, MaxTargetVectorWidthInBits);
+  PtrRtChecking = std::make_unique<RuntimePointerChecking>(*DepChecker, SE);
   if (canAnalyzeLoop())
     CanVecMem = analyzeLoop(AA, LI, TLI, DT);
 }
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -2338,15 +2338,23 @@ bool ScalarEvolution::willNotOverflow(Instruction::BinaryOps BinOp, bool Signed,
   // Can we use context to prove the fact we need?
   if (!CtxI)
     return false;
-  // TODO: Support mul.
-  if (BinOp == Instruction::Mul)
-    return false;
   auto *RHSC = dyn_cast<SCEVConstant>(RHS);
   // TODO: Lift this limitation.
   if (!RHSC)
     return false;
   APInt C = RHSC->getAPInt();
   unsigned NumBits = C.getBitWidth();
+  if (BinOp == Instruction::Mul) {
+    // Multiplying by 0 or 1 never overflows
+    if (C.isZero() || C.isOne())
+      return true;
+    if (Signed)
+      return false;
+    APInt Limit = APInt::getMaxValue(NumBits).udiv(C);
+    // To avoid overflow, we need to make sure that LHS <= MAX / C.
+    return isKnownPredicateAt(ICmpInst::ICMP_ULE, LHS, getConstant(Limit),
+                              CtxI);
+  }
   bool IsSub = (BinOp == Instruction::Sub);
   bool IsNegativeConst = (Signed && C.isNegative());
   // Compute the direction and magnitude by which we need to check overflow.
diff --git a/llvm/test/Transforms/LoopVectorize/vect.stats.ll b/llvm/test/Transforms/LoopVectorize/vect.stats.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -debug-only=loop-vectorize -enable-early-exit-vectorization --disable-output -stats -S 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -debug-only=loop-vectorize -enable-early-exit-vectorization -use-symbolic-maxbtc-deref-loop --disable-output -stats -S 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
 ; We have 3 loops, two of them are vectorizable (with one being early-exit

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -debug-only=loop-vectorize -enable-early-exit-vectorization --disable-output -stats -S 2>&1 \| FileCheck %s`
	`1`	`+; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -debug-only=loop-vectorize -enable-early-exit-vectorization -use-symbolic-maxbtc-deref-loop --disable-output -stats -S 2>&1 \| FileCheck %s`
`2`	`2`	`; REQUIRES: asserts`
`3`	`3`
`4`	`4`	`; We have 3 loops, two of them are vectorizable (with one being early-exit`