[analyzer] Suppress out of bounds reports after weak loop assumptions

NagyDonat · NagyDonat · commit 23b27377e556 · 2024-09-24T16:12:48.000+02:00
The checker alpha.security.ArrayBoundV2 produced lots of false positives
in situations where loop modeling of the engine fed it with unfounded
assumptions.

This commit introduces a heuristic that discards ArrayBoundV2 reports
when the execution path introduces an assumption that is questionable.
More precisely, two kinds of assumptions are categorized as "weak":
(1) When the analyzer assumes that the first evaluation of the loop
    condition returns false and the loop body is completely skipped.
(2) When the analyzer assumes that the loop condition is true in a
    situation where it already executed (at least) two iterations.
For examples and more explanation, see the new tests.

The actual implementation uses some approximations (it uses the
BlockCount instead of the iteration count) because that seems to be
"good enough" for this heuristical suppression.

Note that I used minor state updates instead of bug reporter visitors
because the number of finished iterations is not visible in the visitor
which "walks backwards in time".

As a very minor unrelated change, this commit removes the "Bin" part
from the method name "evalEagerlyAssumeBinOpBifurcation" because this
method is also used for the unary logical not operator.
diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h
@@ -121,6 +121,25 @@ struct EvalCallOptions {
   EvalCallOptions() {}
 };
 
+/// Simple control flow statements like `if` only produce a single state split,
+/// so the fact that they are included in the source code implies that both
+/// branches are possible (at least under some conditions) and the analyzer can
+/// freely assume either of them. (This is not entirely true, because there may
+/// be unmarked logical correlations between `if` statements, but is a good
+/// enough heuristic and the analyzer strongly relies on it.)
+/// On the other hand, in a loop the state may be split repeatedly at each
+/// evaluation of the loop condition, and this can lead to following "weak"
+/// assumptions even though the code does not imply that they're valid and the
+/// programmer intended to cover them.
+/// This function is called to mark the `State` when the engine makes an
+/// assumption which is weak. Checkers may use this heuristical mark to discard
+/// result and reduce the amount of false positives.
+ProgramStateRef recordWeakLoopAssumption(ProgramStateRef State);
+
+/// Returns true if `recordWeakLoopAssumption()` was called on the execution
+/// path which produced `State`.
+bool seenWeakLoopAssumption(ProgramStateRef State);
+
 class ExprEngine {
   void anchor();
 
@@ -323,12 +342,13 @@ class ExprEngine {
 
   /// ProcessBranch - Called by CoreEngine.  Used to generate successor
   ///  nodes by processing the 'effects' of a branch condition.
-  void processBranch(const Stmt *Condition,
-                     NodeBuilderContext& BuilderCtx,
-                     ExplodedNode *Pred,
-                     ExplodedNodeSet &Dst,
-                     const CFGBlock *DstT,
-                     const CFGBlock *DstF);
+  /// If the branch condition is a loop condition, IterationsFinishedInLoop is
+  /// the number of already finished iterations (0, 1, 2...); otherwise it's
+  /// std::nullopt.
+  void processBranch(const Stmt *Condition, NodeBuilderContext &BuilderCtx,
+                     ExplodedNode *Pred, ExplodedNodeSet &Dst,
+                     const CFGBlock *DstT, const CFGBlock *DstF,
+                     std::optional<unsigned> IterationsFinishedInLoop);
 
   /// Called by CoreEngine.
   /// Used to generate successor nodes for temporary destructors depending
@@ -583,11 +603,12 @@ class ExprEngine {
                                 ExplodedNode *Pred,
                                 ExplodedNodeSet &Dst);
 
-  /// evalEagerlyAssumeBinOpBifurcation - Given the nodes in 'Src', eagerly assume symbolic
-  ///  expressions of the form 'x != 0' and generate new nodes (stored in Dst)
-  ///  with those assumptions.
-  void evalEagerlyAssumeBinOpBifurcation(ExplodedNodeSet &Dst, ExplodedNodeSet &Src,
-                         const Expr *Ex);
+  /// evalEagerlyAssumeOpBifurcation - Given the nodes in 'Src', eagerly assume
+  /// symbolic
+  ///  expressions of the form 'x != 0' or '!x' and generate new nodes (stored
+  ///  in Dst) with those assumptions.
+  void evalEagerlyAssumeOpBifurcation(ExplodedNodeSet &Dst,
+                                      ExplodedNodeSet &Src, const Expr *Ex);
 
   static std::pair<const ProgramPointTag *, const ProgramPointTag *>
     geteagerlyAssumeBinOpBifurcationTags();
diff --git a/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp b/clang/lib/StaticAnalyzer/Checkers/ArrayBoundCheckerV2.cpp
@@ -697,6 +697,11 @@ void ArrayBoundCheckerV2::reportOOB(CheckerContext &C,
                                     ProgramStateRef ErrorState, Messages Msgs,
                                     NonLoc Offset, std::optional<NonLoc> Extent,
                                     bool IsTaintBug /*=false*/) const {
+  // Suppress results found through execution paths where in some loop the
+  // analyzer arbitrarily assumed either that the loop is skipped (0 iterations)
+  // or that 3 or more iterations are executed.
+  if (seenWeakLoopAssumption(ErrorState))
+    return;
 
   ExplodedNode *ErrorNode = C.generateErrorNode(ErrorState);
   if (!ErrorNode)
diff --git a/clang/lib/StaticAnalyzer/Core/CoreEngine.cpp b/clang/lib/StaticAnalyzer/Core/CoreEngine.cpp
@@ -441,10 +441,33 @@ void CoreEngine::HandleCallEnter(const CallEnter &CE, ExplodedNode *Pred) {
 void CoreEngine::HandleBranch(const Stmt *Cond, const Stmt *Term,
                                 const CFGBlock * B, ExplodedNode *Pred) {
   assert(B->succ_size() == 2);
+
+  const LocationContext *LC = Pred->getLocationContext();
+  BlockCounter Counter = WList->getBlockCounter();
+  unsigned BlockCount =
+      Counter.getNumVisited(LC->getStackFrame(), B->getBlockID());
+  std::optional<unsigned> IterationsFinishedInLoop = std::nullopt;
+  if (isa<ForStmt, WhileStmt, CXXForRangeStmt>(Term)) {
+    // FIXME: This code approximates the number of finished iteration based on
+    // the block count, i.e. the number of evaluations of the terminator block
+    // on the current execution path (which includes the current evaluation, so
+    // is always at least 1). This is probably acceptable for the
+    // checker-specific false positive suppression that currently uses this
+    // value, but it would be better to calcuate an accurate count of
+    // iterations.
+    assert(BlockCount >= 1);
+    IterationsFinishedInLoop = BlockCount - 1;
+  } else if (isa<DoStmt>(Term)) {
+    // FIXME: The fixme note in the previous branch also applies here.
+    // In a do-while loop one iteration happens before the first evaluation of
+    // the loop condition, so we don't subtract one from the block count.
+    IterationsFinishedInLoop = BlockCount;
+  }
+
   NodeBuilderContext Ctx(*this, B, Pred);
   ExplodedNodeSet Dst;
   ExprEng.processBranch(Cond, Ctx, Pred, Dst, *(B->succ_begin()),
-                       *(B->succ_begin() + 1));
+                        *(B->succ_begin() + 1), IterationsFinishedInLoop);
   // Enqueue the new frontier onto the worklist.
   enqueue(Dst);
 }
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
@@ -212,6 +212,20 @@ typedef llvm::ImmutableMap<const LocationContext *, unsigned>
 REGISTER_TRAIT_WITH_PROGRAMSTATE(PendingArrayDestruction,
                                  PendingArrayDestructionMap)
 
+// This trait is used to heuristically filter out results produced from
+// execution paths that took "weak" assumptions within a loop.
+REGISTER_TRAIT_WITH_PROGRAMSTATE(SeenWeakLoopAssumption, bool)
+
+ProgramStateRef clang::ento::recordWeakLoopAssumption(ProgramStateRef State) {
+  return State->set<SeenWeakLoopAssumption>(true);
+}
+
+bool clang::ento::seenWeakLoopAssumption(ProgramStateRef State) {
+  return State->get<SeenWeakLoopAssumption>();
+}
+
+REGISTER_TRAIT_WITH_PROGRAMSTATE(LastEagerlyAssumeAssumptionAt, const Expr *)
+
 //===----------------------------------------------------------------------===//
 // Engine construction and deletion.
 //===----------------------------------------------------------------------===//
@@ -2128,7 +2142,7 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred,
           (B->isRelationalOp() || B->isEqualityOp())) {
         ExplodedNodeSet Tmp;
         VisitBinaryOperator(cast<BinaryOperator>(S), Pred, Tmp);
-        evalEagerlyAssumeBinOpBifurcation(Dst, Tmp, cast<Expr>(S));
+        evalEagerlyAssumeOpBifurcation(Dst, Tmp, cast<Expr>(S));
       }
       else
         VisitBinaryOperator(cast<BinaryOperator>(S), Pred, Dst);
@@ -2401,7 +2415,7 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred,
       if (AMgr.options.ShouldEagerlyAssume && (U->getOpcode() == UO_LNot)) {
         ExplodedNodeSet Tmp;
         VisitUnaryOperator(U, Pred, Tmp);
-        evalEagerlyAssumeBinOpBifurcation(Dst, Tmp, U);
+        evalEagerlyAssumeOpBifurcation(Dst, Tmp, U);
       }
       else
         VisitUnaryOperator(U, Pred, Dst);
@@ -2761,12 +2775,10 @@ assumeCondition(const Stmt *Condition, ExplodedNode *N) {
   return State->assume(V);
 }
 
-void ExprEngine::processBranch(const Stmt *Condition,
-                               NodeBuilderContext& BldCtx,
-                               ExplodedNode *Pred,
-                               ExplodedNodeSet &Dst,
-                               const CFGBlock *DstT,
-                               const CFGBlock *DstF) {
+void ExprEngine::processBranch(
+    const Stmt *Condition, NodeBuilderContext &BldCtx, ExplodedNode *Pred,
+    ExplodedNodeSet &Dst, const CFGBlock *DstT, const CFGBlock *DstF,
+    std::optional<unsigned> IterationsFinishedInLoop) {
   assert((!Condition || !isa<CXXBindTemporaryExpr>(Condition)) &&
          "CXXBindTemporaryExprs are handled by processBindTemporary.");
   const LocationContext *LCtx = Pred->getLocationContext();
@@ -2808,27 +2820,63 @@ void ExprEngine::processBranch(const Stmt *Condition,
       std::tie(StTrue, StFalse) = *KnownCondValueAssumption;
     else {
       assert(!isa<ObjCForCollectionStmt>(Condition));
+      // TODO: instead of this shortcut perhaps it would be better to "rejoin"
+      // the common execution path with
+      // StTrue = StFalse = PrevState;
       builder.generateNode(PrevState, true, PredN);
       builder.generateNode(PrevState, false, PredN);
       continue;
     }
     if (StTrue && StFalse)
       assert(!isa<ObjCForCollectionStmt>(Condition));
 
+    const Expr *EagerlyAssumeExpr =
+        PrevState->get<LastEagerlyAssumeAssumptionAt>();
+    const Expr *ConditionExpr = dyn_cast<Expr>(Condition);
+    if (ConditionExpr)
+      ConditionExpr = ConditionExpr->IgnoreParenCasts();
+    bool DidEagerlyAssume = EagerlyAssumeExpr == ConditionExpr;
+    bool BothFeasible = (DidEagerlyAssume || (StTrue && StFalse)) &&
+                        builder.isFeasible(true) && builder.isFeasible(false);
+
     // Process the true branch.
     if (builder.isFeasible(true)) {
-      if (StTrue)
+      if (StTrue) {
+        if (BothFeasible && IterationsFinishedInLoop &&
+            *IterationsFinishedInLoop >= 2) {
+          // When programmers write a loop, they imply that at least two
+          // iterations are possible (otherwise they would just write an `if`),
+          // but the third iteration is not implied: there are situations where
+          // the programmer knows that there won't be a third iteration (e.g.
+          // they iterate over a structure that has <= 2 elements) but this is
+          // not marked in the source code.
+          // Checkers may use this heuristic mark to discard results found on
+          // branches that contain this "weak" assumption.
+          StTrue = recordWeakLoopAssumption(StTrue);
+        }
         builder.generateNode(StTrue, true, PredN);
-      else
+      } else {
         builder.markInfeasible(true);
+      }
     }
 
     // Process the false branch.
     if (builder.isFeasible(false)) {
-      if (StFalse)
+      if (StFalse) {
+        if (BothFeasible && IterationsFinishedInLoop &&
+            *IterationsFinishedInLoop == 0) {
+          // There are many situations where the programmers know that there
+          // will be at least one iteration in a loop (e.g. a structure is not
+          // empty) but the analyzer cannot deduce this and reports false
+          // positives after skipping the loop.
+          // Checkers may use this heuristic mark to discard results found on
+          // branches that contain this "weak" assumption.
+          StFalse = recordWeakLoopAssumption(StFalse);
+        }
         builder.generateNode(StFalse, false, PredN);
-      else
+      } else {
         builder.markInfeasible(false);
+      }
     }
   }
   currBldrCtx = nullptr;
@@ -3752,9 +3800,9 @@ ExprEngine::geteagerlyAssumeBinOpBifurcationTags() {
                         &eagerlyAssumeBinOpBifurcationFalse);
 }
 
-void ExprEngine::evalEagerlyAssumeBinOpBifurcation(ExplodedNodeSet &Dst,
-                                                   ExplodedNodeSet &Src,
-                                                   const Expr *Ex) {
+void ExprEngine::evalEagerlyAssumeOpBifurcation(ExplodedNodeSet &Dst,
+                                                ExplodedNodeSet &Src,
+                                                const Expr *Ex) {
   StmtNodeBuilder Bldr(Src, Dst, *currBldrCtx);
 
   for (const auto Pred : Src) {
@@ -3776,6 +3824,11 @@ void ExprEngine::evalEagerlyAssumeBinOpBifurcation(ExplodedNodeSet &Dst,
       ProgramStateRef StateTrue, StateFalse;
       std::tie(StateTrue, StateFalse) = state->assume(*SEV);
 
+      if (StateTrue && StateFalse) {
+        StateTrue = StateTrue->set<LastEagerlyAssumeAssumptionAt>(Ex);
+        StateFalse = StateFalse->set<LastEagerlyAssumeAssumptionAt>(Ex);
+      }
+
       // First assume that the condition is true.
       if (StateTrue) {
         SVal Val = svalBuilder.makeIntVal(1U, Ex->getType());
diff --git a/clang/test/Analysis/loop-unrolling.cpp b/clang/test/Analysis/loop-unrolling.cpp
@@ -349,7 +349,7 @@ int simple_unknown_bound_loop() {
 #ifdef DFS
     clang_analyzer_numTimesReached(); // expected-warning {{16}}
 #else
-    clang_analyzer_numTimesReached(); // expected-warning {{8}}
+    clang_analyzer_numTimesReached(); // expected-warning {{10}}
 #endif
   }
   return 0;
@@ -369,9 +369,9 @@ int nested_inlined_no_unroll1() {
   int k;
   for (int i = 0; i < 9; i++) {
 #ifdef DFS
-    clang_analyzer_numTimesReached(); // expected-warning {{18}}
+    clang_analyzer_numTimesReached(); // expected-warning {{20}}
 #else
-    clang_analyzer_numTimesReached(); // expected-warning {{14}}
+    clang_analyzer_numTimesReached(); // expected-warning {{18}}
 #endif
     k = simple_unknown_bound_loop();  // reevaluation without inlining, splits the state as well
   }
diff --git a/clang/test/Analysis/out-of-bounds.c b/clang/test/Analysis/out-of-bounds.c
@@ -1,4 +1,9 @@
 // RUN: %clang_analyze_cc1 -Wno-array-bounds -analyzer-checker=core,alpha.security.ArrayBoundV2,debug.ExprInspection -verify %s
+// RUN: %clang_analyze_cc1 -Wno-array-bounds -analyzer-checker=core,alpha.security.ArrayBoundV2,debug.ExprInspection -analyzer-config eagerly-assume=false -verify %s
+
+// Note that eagerly-assume=false is tested separately because the
+// WeakLoopAssumption suppression heuristic uses different code paths to
+// achieve the same result with and without eagerly-assume.
 
 void clang_analyzer_eval(int);
 
@@ -194,3 +199,99 @@ char test_comparison_with_extent_symbol(struct incomplete *p) {
   return ((char *)p)[-1]; // no-warning
 }
 
+// WeakLoopAssumption suppression
+///////////////////////////////////////////////////////////////////////
+
+int GlobalArray[100];
+int loop_suppress_after_zero_iterations(unsigned len) {
+  for (unsigned i = 0; i < len; i++)
+    if (GlobalArray[i] > 0)
+      return GlobalArray[i];
+  // Previously this would have produced an overflow warning because splitting
+  // the state on the loop condition introduced an execution path where the
+  // analyzer thinks that len == 0.
+  // There are very many situations where the programmer knows that an argument
+  // is positive, but this is not indicated in the source code, so we must
+  // avoid reporting errors (especially out of bounds errors) on these
+  // branches, because otherwise we'd get prohibitively many false positives.
+  return GlobalArray[len - 1]; // no-warning
+}
+
+void loop_report_in_second_iteration(int len) {
+  int buf[1] = {0};
+  for (int i = 0; i < len; i++) {
+    // When a programmer writes a loop, we may assume that they intended at
+    // least two iterations.
+    buf[i] = 1; // expected-warning{{Out of bound access to memory}}
+  }
+}
+
+void loop_suppress_in_third_iteration(int len) {
+  int buf[2] = {0};
+  for (int i = 0; i < len; i++) {
+    // We should suppress array bounds errors on the third and later iterations
+    // of loops, because sometimes programmers write a loop in sitiuations
+    // where they know that there will be at most two iterations.
+    buf[i] = 1; // no-warning
+  }
+}
+
+void loop_suppress_in_third_iteration_cast(int len) {
+  int buf[2] = {0};
+  for (int i = 0; (unsigned)(i < len); i++) {
+    // Check that a (somewhat arbitrary) cast does not hinder the recognition
+    // of the condition expression.
+    buf[i] = 1; // no-warning
+  }
+}
+
+void loop_suppress_in_third_iteration_logical_and(int len, int flag) {
+  int buf[2] = {0};
+  for (int i = 0; i < len && flag; i++) {
+    // FIXME: In this case the checker should suppress the warning the same way
+    // as it's suppressed in loop_suppress_in_third_iteration, but the
+    // suppression is not activated because the terminator statement associated
+    // with the loop is just the expression 'flag', while 'i < len' is a
+    // separate terminator statement that's associated with the
+    // short-circuiting operator '&&'.
+    // I have seen a real-world FP that looks like this, but it is much rarer
+    // than the basic setup.
+    buf[i] = 1; // expected-warning{{Out of bound access to memory}}
+  }
+}
+
+void loop_suppress_in_third_iteration_logical_and_2(int len, int flag) {
+  int buf[2] = {0};
+  for (int i = 0; flag && i < len; i++) {
+    // If the two operands of '&&' are flipped, the suppression works.
+    buf[i] = 1; // no-warning
+  }
+}
+
+int coinflip(void);
+int do_while_report_after_one_iteration(void) {
+  int i = 0;
+  do {
+    i++;
+  } while (coinflip());
+  // Unlike `loop_suppress_after_zero_iterations`, running just one iteration
+  // in a do-while is not a corner case that would produce too many false
+  // positives, so don't suppress bounds errors in these situations.
+  return GlobalArray[i-2]; // expected-warning{{Out of bound access to memory}}
+}
+
+void do_while_report_in_second_iteration(int len) {
+  int buf[1] = {0};
+  int i = 0;
+  do {
+    buf[i] = 1; // expected-warning{{Out of bound access to memory}}
+  } while (i++ < len);
+}
+
+void do_while_suppress_in_third_iteration(int len) {
+  int buf[2] = {0};
+  int i = 0;
+  do {
+    buf[i] = 1; // no-warning
+  } while (i++ < len);
+}