From a4b9f80e85d0fdd0e0e6390eb4548c88aa3fd35e Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra@codasip.com>
Date: Tue, 16 Jul 2024 19:54:11 +0100
Subject: [PATCH 1/6] ConstraintElim: add dry-run routine to fail early

Add a dry-run routine that computes a conservative estimate of the
number of rows and columns that the transform will require, and fail
early if the estimates exceed the upper bounds. This patch has a small
overhead, but improves compile-time on one benchmark significantly. The
overhead will be compensated for in a follow-up patch, where
ConstraintSystem is ported to use a Matrix data structure, performing
the full allocation ahead-of-time using these estimates.
---
 .../Scalar/ConstraintElimination.cpp          | 175 +++++++++++++++++-
 ...x-row-limit.ll => max-row-column-limit.ll} |  17 +-
 2 files changed, 174 insertions(+), 18 deletions(-)
 rename llvm/test/Transforms/ConstraintElimination/{max-row-limit.ll => max-row-column-limit.ll} (81%)
diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index c31173879af1e..369a6daa4d970 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -40,7 +40,6 @@
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 
-#include <cmath>
 #include <optional>
 #include <string>
 
@@ -57,6 +56,10 @@ static cl::opt<unsigned>
     MaxRows("constraint-elimination-max-rows", cl::init(500), cl::Hidden,
             cl::desc("Maximum number of rows to keep in constraint system"));
 
+static cl::opt<unsigned> MaxColumns(
+    "constraint-elimination-max-cols", cl::init(50), cl::Hidden,
+    cl::desc("Maximum number of columns to keep in constraint system"));
+
 static cl::opt<bool> DumpReproducers(
     "constraint-elimination-dump-reproducers", cl::init(false), cl::Hidden,
     cl::desc("Dump IR to reproduce successful transformations."));
@@ -303,6 +306,7 @@ class ConstraintInfo {
   void popLastNVariables(bool Signed, unsigned N) {
     getCS(Signed).popLastNVariables(N);
   }
+  const DataLayout &getDataLayout() const { return DL; }
 
   bool doesHold(CmpInst::Predicate Pred, Value *A, Value *B) const;
 
@@ -1491,7 +1495,7 @@ removeEntryFromStack(const StackEntry &E, ConstraintInfo &Info,
 /// Check if either the first condition of an AND or OR is implied by the
 /// (negated in case of OR) second condition or vice versa.
 static bool checkOrAndOpImpliedByOther(
-    FactOrCheck &CB, ConstraintInfo &Info, Module *ReproducerModule,
+    const FactOrCheck &CB, ConstraintInfo &Info, Module *ReproducerModule,
     SmallVectorImpl<ReproducerEntry> &ReproducerCondStack,
     SmallVectorImpl<StackEntry> &DFSInStack) {
 
@@ -1671,18 +1675,91 @@ tryToSimplifyOverflowMath(IntrinsicInst *II, ConstraintInfo &Info,
   return Changed;
 }
 
-static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI,
-                                 ScalarEvolution &SE,
-                                 OptimizationRemarkEmitter &ORE) {
-  bool Changed = false;
+/// Performs a dry run of AddFact, computing a conservative estimate of the
+/// number of new variables introduced.
+static void dryRunAddFact(CmpInst::Predicate Pred, Value *A, Value *B,
+                          const ConstraintInfo &Info, unsigned &EstimatedRowsA,
+                          unsigned &EstimatedRowsB,
+                          unsigned &EstimatedColumns) {
+  auto UpdateEstimate = [&Info, &EstimatedRowsA, &EstimatedRowsB,
+                         &EstimatedColumns](CmpInst::Predicate Pred, Value *A,
+                                            Value *B) {
+    SmallVector<Value *> NewVars;
+    auto R = Info.getConstraint(Pred, A, B, NewVars);
+
+    // We offset it by 1 due to logic in addFact.
+    unsigned NewEstimate =
+        count_if(R.Coefficients, [](int64_t C) { return C != 0; }) + 1;
+
+    EstimatedColumns = std::max(EstimatedColumns, NewEstimate);
+    if (R.IsSigned)
+      ++EstimatedRowsA;
+    else
+      ++EstimatedRowsB;
+  };
+
+  UpdateEstimate(Pred, A, B);
+
+  // What follows is a dry-run of transferToOtherSystem.
+  auto IsKnownNonNegative = [&Info](Value *V) {
+    return Info.doesHold(CmpInst::ICMP_SGE, V,
+                         ConstantInt::get(V->getType(), 0)) ||
+           isKnownNonNegative(V, Info.getDataLayout(),
+                              MaxAnalysisRecursionDepth - 1);
+  };
+
+  if (!A->getType()->isIntegerTy())
+    return;
+
+  switch (Pred) {
+  default:
+    break;
+  case CmpInst::ICMP_ULT:
+  case CmpInst::ICMP_ULE:
+    if (IsKnownNonNegative(B)) {
+      UpdateEstimate(CmpInst::ICMP_SGE, A, ConstantInt::get(B->getType(), 0));
+      UpdateEstimate(CmpInst::getSignedPredicate(Pred), A, B);
+    }
+    break;
+  case CmpInst::ICMP_UGE:
+  case CmpInst::ICMP_UGT:
+    if (IsKnownNonNegative(A)) {
+      UpdateEstimate(CmpInst::ICMP_SGE, B, ConstantInt::get(B->getType(), 0));
+      UpdateEstimate(CmpInst::getSignedPredicate(Pred), A, B);
+    }
+    break;
+  case CmpInst::ICMP_SLT:
+    if (IsKnownNonNegative(A))
+      UpdateEstimate(CmpInst::ICMP_ULT, A, B);
+    break;
+  case CmpInst::ICMP_SGT:
+    if (Info.doesHold(CmpInst::ICMP_SGE, B, ConstantInt::get(B->getType(), -1)))
+      UpdateEstimate(CmpInst::ICMP_UGE, A, ConstantInt::get(B->getType(), 0));
+    if (IsKnownNonNegative(B))
+      UpdateEstimate(CmpInst::ICMP_UGT, A, B);
+    break;
+  case CmpInst::ICMP_SGE:
+    if (IsKnownNonNegative(B))
+      UpdateEstimate(CmpInst::ICMP_UGE, A, B);
+    break;
+  }
+}
+
+/// Performs a dry run of the transform, computing a conservative estimate of
+/// the total number of columns we need in the underlying storage.
+static std::tuple<State, unsigned, unsigned>
+dryRun(Function &F, DominatorTree &DT, LoopInfo &LI, ScalarEvolution &SE) {
   DT.updateDFSNumbers();
   SmallVector<Value *> FunctionArgs;
   for (Value &Arg : F.args())
     FunctionArgs.push_back(&Arg);
-  ConstraintInfo Info(F.getDataLayout(), FunctionArgs);
   State S(DT, LI, SE);
-  std::unique_ptr<Module> ReproducerModule(
-      DumpReproducers ? new Module(F.getName(), F.getContext()) : nullptr);
+  unsigned EstimatedColumns = FunctionArgs.size() + 1;
+
+  // EstimatedRowsA corresponds to SignedCS, and EstimatedRowsB corresponds to
+  // UnsignedCS.
+  unsigned EstimatedRowsA = 0, EstimatedRowsB = 1;
+  ConstraintInfo Info(F.getDataLayout(), FunctionArgs);
 
   // First, collect conditions implied by branches and blocks with their
   // Dominator DFS in and out numbers.
@@ -1725,12 +1802,90 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI,
     return A.NumIn < B.NumIn;
   });
 
+  for (const FactOrCheck &CB : S.WorkList) {
+    ICmpInst::Predicate Pred;
+    Value *A, *B;
+    if (CB.isCheck()) {
+      // What follows is a dry-run of checkOrAndOpImpliedByOther, without
+      // assuming that instructions have been simplified, as they would have
+      // during the course of normal operation.
+      auto *ContextInst = CB.getContextInst();
+      if (auto *Cmp =
+              dyn_cast_or_null<ICmpInst>(CB.getInstructionToSimplify())) {
+        unsigned OtherOpIdx = ContextInst->getOperand(0) == Cmp ? 1 : 0;
+        if (match(ContextInst, m_LogicalOp()) &&
+            match(ContextInst->getOperand(OtherOpIdx),
+                  m_ICmp(Pred, m_Value(A), m_Value(B)))) {
+          if (match(ContextInst, m_LogicalOr()))
+            Pred = CmpInst::getInversePredicate(Pred);
+          dryRunAddFact(Pred, A, B, Info, EstimatedRowsA, EstimatedRowsB,
+                        EstimatedColumns);
+        }
+      }
+      continue;
+    }
+    if (!CB.isConditionFact()) {
+      Value *X;
+      if (match(CB.Inst, m_Intrinsic<Intrinsic::abs>(m_Value(X)))) {
+        if (cast<ConstantInt>(CB.Inst->getOperand(1))->isOne())
+          dryRunAddFact(CmpInst::ICMP_SGE, CB.Inst,
+                        ConstantInt::get(CB.Inst->getType(), 0), Info,
+                        EstimatedRowsA, EstimatedRowsB, EstimatedColumns);
+        dryRunAddFact(CmpInst::ICMP_SGE, CB.Inst, X, Info, EstimatedRowsA,
+                      EstimatedRowsB, EstimatedColumns);
+        continue;
+      }
+
+      if (auto *MinMax = dyn_cast<MinMaxIntrinsic>(CB.Inst)) {
+        Pred = ICmpInst::getNonStrictPredicate(MinMax->getPredicate());
+        dryRunAddFact(Pred, MinMax, MinMax->getLHS(), Info, EstimatedRowsA,
+                      EstimatedRowsB, EstimatedColumns);
+        dryRunAddFact(Pred, MinMax, MinMax->getRHS(), Info, EstimatedRowsA,
+                      EstimatedRowsB, EstimatedColumns);
+        continue;
+      }
+    }
+
+    if (CB.isConditionFact()) {
+      Pred = CB.Cond.Pred;
+      A = CB.Cond.Op0;
+      B = CB.Cond.Op1;
+    } else {
+      bool Matched = match(CB.Inst, m_Intrinsic<Intrinsic::assume>(
+                                        m_ICmp(Pred, m_Value(A), m_Value(B))));
+      (void)Matched;
+      assert(Matched && "Must have an assume intrinsic with a icmp operand");
+    }
+    dryRunAddFact(Pred, A, B, Info, EstimatedRowsA, EstimatedRowsB,
+                  EstimatedColumns);
+  }
+  return {S, std::max(EstimatedRowsA, EstimatedRowsB), EstimatedColumns};
+}
+
+static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI,
+                                 ScalarEvolution &SE,
+                                 OptimizationRemarkEmitter &ORE) {
+  bool Changed = false;
+  const auto &[S, EstimatedRows, EstimatedColumns] = dryRun(F, DT, LI, SE);
+
+  // Fail early if estimates exceed limits. Row estimate could be off by up to
+  // 40%.
+  if (EstimatedRows > 1.4 * MaxRows || EstimatedColumns > MaxColumns)
+    return false;
+
+  SmallVector<Value *> FunctionArgs;
+  for (Value &Arg : F.args())
+    FunctionArgs.push_back(&Arg);
+  ConstraintInfo Info(F.getDataLayout(), FunctionArgs);
+  std::unique_ptr<Module> ReproducerModule(
+      DumpReproducers ? new Module(F.getName(), F.getContext()) : nullptr);
+
   SmallVector<Instruction *> ToRemove;
 
   // Finally, process ordered worklist and eliminate implied conditions.
   SmallVector<StackEntry, 16> DFSInStack;
   SmallVector<ReproducerEntry> ReproducerCondStack;
-  for (FactOrCheck &CB : S.WorkList) {
+  for (const FactOrCheck &CB : S.WorkList) {
     // First, pop entries from the stack that are out-of-scope for CB. Remove
     // the corresponding entry from the constraint system.
     while (!DFSInStack.empty()) {
diff --git a/llvm/test/Transforms/ConstraintElimination/max-row-limit.ll b/llvm/test/Transforms/ConstraintElimination/max-row-column-limit.ll
similarity index 81%
rename from llvm/test/Transforms/ConstraintElimination/max-row-limit.ll
rename to llvm/test/Transforms/ConstraintElimination/max-row-column-limit.ll
index 0e078109ed663..2f3b62dc5dab7 100644
--- a/llvm/test/Transforms/ConstraintElimination/max-row-limit.ll
+++ b/llvm/test/Transforms/ConstraintElimination/max-row-column-limit.ll
@@ -1,7 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=constraint-elimination -S %s | FileCheck --check-prefixes=COMMON,SIMP %s
-; RUN: opt -passes=constraint-elimination -constraint-elimination-max-rows=9 -S %s | FileCheck --check-prefixes=COMMON,SIMP %s
-; RUN: opt -passes=constraint-elimination -constraint-elimination-max-rows=8 -S %s | FileCheck --check-prefixes=COMMON,NOSIMP %s
+; RUN: opt -passes=constraint-elimination -S %s | FileCheck --check-prefixes=SIMP %s
+; RUN: opt -passes=constraint-elimination -constraint-elimination-max-rows=8 -S %s | FileCheck --check-prefixes=SIMP %s
+; RUN: opt -passes=constraint-elimination -constraint-elimination-max-cols=6 -S %s | FileCheck --check-prefixes=SIMP %s
+; RUN: opt -passes=constraint-elimination -constraint-elimination-max-rows=7 -S %s | FileCheck --check-prefixes=NOSIMP %s
+; RUN: opt -passes=constraint-elimination -constraint-elimination-max-cols=5 -S %s | FileCheck --check-prefixes=NOSIMP %s
 
 
 define i1 @test_max_row_limit(i32 %l0, i32 %l1, i32 %l2, i32 %l3, i32 %l4) {
@@ -22,7 +24,8 @@ define i1 @test_max_row_limit(i32 %l0, i32 %l1, i32 %l2, i32 %l3, i32 %l4) {
 ; SIMP-NEXT:    [[C4:%.*]] = icmp uge i32 [[L4:%.*]], 100
 ; SIMP-NEXT:    br i1 [[C4]], label [[BB5:%.*]], label [[EXIT]]
 ; SIMP:       bb5:
-; SIMP-NEXT:    ret i1 true
+; SIMP-NEXT:    [[C5:%.*]] = icmp sge i32 [[L4:%.*]], 100
+; SIMP-NEXT:    ret i1 [[C5]]
 ; SIMP:       exit:
 ; SIMP-NEXT:    ret i1 false
 ;
@@ -43,7 +46,7 @@ define i1 @test_max_row_limit(i32 %l0, i32 %l1, i32 %l2, i32 %l3, i32 %l4) {
 ; NOSIMP-NEXT:    [[C4:%.*]] = icmp uge i32 [[L4:%.*]], 100
 ; NOSIMP-NEXT:    br i1 [[C4]], label [[BB5:%.*]], label [[EXIT]]
 ; NOSIMP:       bb5:
-; NOSIMP-NEXT:    [[C5:%.*]] = icmp uge i32 [[L4]], 100
+; NOSIMP-NEXT:    [[C5:%.*]] = icmp sge i32 [[L4]], 100
 ; NOSIMP-NEXT:    ret i1 [[C5]]
 ; NOSIMP:       exit:
 ; NOSIMP-NEXT:    ret i1 false
@@ -69,11 +72,9 @@ bb4:
   br i1 %c4, label %bb5, label %exit
 
 bb5:
-  %c5 = icmp uge i32 %l4, 100
+  %c5 = icmp sge i32 %l4, 100
   ret i1 %c5
 
 exit:
   ret i1 false
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; COMMON: {{.*}}

From 50c2e95b9a66395d5f6b62a99e67fec1a874f8f5 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra@codasip.com>
Date: Sat, 20 Jul 2024 13:32:16 +0100
Subject: [PATCH 2/6] ConstraintElim: attempt to make estimation cheaper

---
 .../Scalar/ConstraintElimination.cpp          | 70 ++++++++++---------
 1 file changed, 38 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index 369a6daa4d970..2c12c4fafd84a 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -1684,30 +1684,47 @@ static void dryRunAddFact(CmpInst::Predicate Pred, Value *A, Value *B,
   auto UpdateEstimate = [&Info, &EstimatedRowsA, &EstimatedRowsB,
                          &EstimatedColumns](CmpInst::Predicate Pred, Value *A,
                                             Value *B) {
-    SmallVector<Value *> NewVars;
-    auto R = Info.getConstraint(Pred, A, B, NewVars);
+    // What follows is a simplified dry-run of Info.getConstraint and addFact.
+    unsigned NumNewVars = 0;
+    bool IsSigned = false;
+
+    switch (Pred) {
+    case CmpInst::ICMP_UGT:
+    case CmpInst::ICMP_UGE:
+    case CmpInst::ICMP_NE:
+    case CmpInst::ICMP_EQ:
+      break;
+    case CmpInst::ICMP_SGT:
+    case CmpInst::ICMP_SGE:
+      IsSigned = true;
+      break;
+    default:
+      return;
+    }
 
-    // We offset it by 1 due to logic in addFact.
-    unsigned NewEstimate =
-        count_if(R.Coefficients, [](int64_t C) { return C != 0; }) + 1;
+    SmallVector<ConditionTy, 4> Preconditions;
+    auto &Value2Index = Info.getValue2Index(IsSigned);
+    auto ADec = decompose(A->stripPointerCastsSameRepresentation(),
+                          Preconditions, IsSigned, Info.getDataLayout());
+    auto BDec = decompose(B->stripPointerCastsSameRepresentation(),
+                          Preconditions, IsSigned, Info.getDataLayout());
+    for (const auto &KV : concat<DecompEntry>(ADec.Vars, BDec.Vars)) {
+      if (!Value2Index.contains(KV.Variable))
+        ++NumNewVars;
+    }
 
-    EstimatedColumns = std::max(EstimatedColumns, NewEstimate);
-    if (R.IsSigned)
+    if (IsSigned)
       ++EstimatedRowsA;
     else
       ++EstimatedRowsB;
+
+    EstimatedColumns =
+        std::max(EstimatedColumns, NumNewVars + Value2Index.size() + 2);
   };
 
   UpdateEstimate(Pred, A, B);
 
-  // What follows is a dry-run of transferToOtherSystem.
-  auto IsKnownNonNegative = [&Info](Value *V) {
-    return Info.doesHold(CmpInst::ICMP_SGE, V,
-                         ConstantInt::get(V->getType(), 0)) ||
-           isKnownNonNegative(V, Info.getDataLayout(),
-                              MaxAnalysisRecursionDepth - 1);
-  };
-
+  // What follows is a simplified dry-run of transferToOtherSystem.
   if (!A->getType()->isIntegerTy())
     return;
 
@@ -1716,31 +1733,20 @@ static void dryRunAddFact(CmpInst::Predicate Pred, Value *A, Value *B,
     break;
   case CmpInst::ICMP_ULT:
   case CmpInst::ICMP_ULE:
-    if (IsKnownNonNegative(B)) {
-      UpdateEstimate(CmpInst::ICMP_SGE, A, ConstantInt::get(B->getType(), 0));
-      UpdateEstimate(CmpInst::getSignedPredicate(Pred), A, B);
-    }
-    break;
   case CmpInst::ICMP_UGE:
   case CmpInst::ICMP_UGT:
-    if (IsKnownNonNegative(A)) {
-      UpdateEstimate(CmpInst::ICMP_SGE, B, ConstantInt::get(B->getType(), 0));
-      UpdateEstimate(CmpInst::getSignedPredicate(Pred), A, B);
-    }
+    UpdateEstimate(CmpInst::ICMP_SGE, A, ConstantInt::get(B->getType(), 0));
+    UpdateEstimate(CmpInst::getSignedPredicate(Pred), A, B);
     break;
   case CmpInst::ICMP_SLT:
-    if (IsKnownNonNegative(A))
-      UpdateEstimate(CmpInst::ICMP_ULT, A, B);
+    UpdateEstimate(CmpInst::ICMP_ULT, A, B);
     break;
   case CmpInst::ICMP_SGT:
-    if (Info.doesHold(CmpInst::ICMP_SGE, B, ConstantInt::get(B->getType(), -1)))
-      UpdateEstimate(CmpInst::ICMP_UGE, A, ConstantInt::get(B->getType(), 0));
-    if (IsKnownNonNegative(B))
-      UpdateEstimate(CmpInst::ICMP_UGT, A, B);
+    UpdateEstimate(CmpInst::ICMP_UGE, A, ConstantInt::get(B->getType(), 0));
+    UpdateEstimate(CmpInst::ICMP_UGT, A, B);
     break;
   case CmpInst::ICMP_SGE:
-    if (IsKnownNonNegative(B))
-      UpdateEstimate(CmpInst::ICMP_UGE, A, B);
+    UpdateEstimate(CmpInst::ICMP_UGE, A, B);
     break;
   }
 }

From 11b306fae6c365a152d8fb9b61846ca733ac87c1 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra@codasip.com>
Date: Sun, 21 Jul 2024 14:38:05 +0100
Subject: [PATCH 3/6] ConstraintElim: avoid running decompose multiple times

---
 .../Scalar/ConstraintElimination.cpp          | 37 ++++++++++++-------
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index 2c12c4fafd84a..334fb8e5d6752 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -31,6 +31,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Value.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
@@ -1678,7 +1679,7 @@ tryToSimplifyOverflowMath(IntrinsicInst *II, ConstraintInfo &Info,
 /// Performs a dry run of AddFact, computing a conservative estimate of the
 /// number of new variables introduced.
 static void dryRunAddFact(CmpInst::Predicate Pred, Value *A, Value *B,
-                          const ConstraintInfo &Info, unsigned &EstimatedRowsA,
+                          ConstraintInfo &Info, unsigned &EstimatedRowsA,
                           unsigned &EstimatedRowsB,
                           unsigned &EstimatedColumns) {
   auto UpdateEstimate = [&Info, &EstimatedRowsA, &EstimatedRowsB,
@@ -1704,12 +1705,24 @@ static void dryRunAddFact(CmpInst::Predicate Pred, Value *A, Value *B,
 
     SmallVector<ConditionTy, 4> Preconditions;
     auto &Value2Index = Info.getValue2Index(IsSigned);
-    auto ADec = decompose(A->stripPointerCastsSameRepresentation(),
-                          Preconditions, IsSigned, Info.getDataLayout());
-    auto BDec = decompose(B->stripPointerCastsSameRepresentation(),
-                          Preconditions, IsSigned, Info.getDataLayout());
-    for (const auto &KV : concat<DecompEntry>(ADec.Vars, BDec.Vars)) {
-      if (!Value2Index.contains(KV.Variable))
+    Value *AStrip = A->stripPointerCastsSameRepresentation();
+    Value *BStrip = B->stripPointerCastsSameRepresentation();
+    SmallVector<DecompEntry> AVars, BVars;
+
+    if (!Value2Index.contains(AStrip)) {
+      AVars =
+          decompose(AStrip, Preconditions, IsSigned, Info.getDataLayout()).Vars;
+      Value2Index.insert({AStrip, Value2Index.size() + 1});
+    }
+    if (!Value2Index.contains(BStrip)) {
+      BVars =
+          decompose(BStrip, Preconditions, IsSigned, Info.getDataLayout()).Vars;
+      Value2Index.insert({BStrip, Value2Index.size() + 1});
+    }
+
+    for (const auto &KV : concat<DecompEntry>(AVars, BVars)) {
+      if (KV.Variable == AStrip || KV.Variable == BStrip ||
+          !Value2Index.contains(KV.Variable))
         ++NumNewVars;
     }
 
@@ -1718,8 +1731,7 @@ static void dryRunAddFact(CmpInst::Predicate Pred, Value *A, Value *B,
     else
       ++EstimatedRowsB;
 
-    EstimatedColumns =
-        std::max(EstimatedColumns, NumNewVars + Value2Index.size() + 2);
+    EstimatedColumns = std::max(EstimatedColumns, NumNewVars + 2);
   };
 
   UpdateEstimate(Pred, A, B);
@@ -1756,16 +1768,13 @@ static void dryRunAddFact(CmpInst::Predicate Pred, Value *A, Value *B,
 static std::tuple<State, unsigned, unsigned>
 dryRun(Function &F, DominatorTree &DT, LoopInfo &LI, ScalarEvolution &SE) {
   DT.updateDFSNumbers();
-  SmallVector<Value *> FunctionArgs;
-  for (Value &Arg : F.args())
-    FunctionArgs.push_back(&Arg);
   State S(DT, LI, SE);
-  unsigned EstimatedColumns = FunctionArgs.size() + 1;
 
   // EstimatedRowsA corresponds to SignedCS, and EstimatedRowsB corresponds to
   // UnsignedCS.
   unsigned EstimatedRowsA = 0, EstimatedRowsB = 1;
-  ConstraintInfo Info(F.getDataLayout(), FunctionArgs);
+  unsigned EstimatedColumns = 1;
+  ConstraintInfo Info(F.getDataLayout(), {});
 
   // First, collect conditions implied by branches and blocks with their
   // Dominator DFS in and out numbers.

From f0d674c9f5aa601c0e5472f208ff96f3d855d9b6 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra@codasip.com>
Date: Mon, 22 Jul 2024 16:47:42 +0100
Subject: [PATCH 4/6] ConstraintElim: idea for optimizing dry-run further

---
 .../Scalar/ConstraintElimination.cpp          | 56 ++++++++++---------
 1 file changed, 30 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index 334fb8e5d6752..fdd463273edd1 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -1682,13 +1682,11 @@ static void dryRunAddFact(CmpInst::Predicate Pred, Value *A, Value *B,
                           ConstraintInfo &Info, unsigned &EstimatedRowsA,
                           unsigned &EstimatedRowsB,
                           unsigned &EstimatedColumns) {
+  // What follows is a simplified dry-run of Info.getConstraint and addFact.
   auto UpdateEstimate = [&Info, &EstimatedRowsA, &EstimatedRowsB,
                          &EstimatedColumns](CmpInst::Predicate Pred, Value *A,
                                             Value *B) {
-    // What follows is a simplified dry-run of Info.getConstraint and addFact.
-    unsigned NumNewVars = 0;
     bool IsSigned = false;
-
     switch (Pred) {
     case CmpInst::ICMP_UGT:
     case CmpInst::ICMP_UGE:
@@ -1703,35 +1701,41 @@ static void dryRunAddFact(CmpInst::Predicate Pred, Value *A, Value *B,
       return;
     }
 
-    SmallVector<ConditionTy, 4> Preconditions;
-    auto &Value2Index = Info.getValue2Index(IsSigned);
-    Value *AStrip = A->stripPointerCastsSameRepresentation();
-    Value *BStrip = B->stripPointerCastsSameRepresentation();
-    SmallVector<DecompEntry> AVars, BVars;
-
-    if (!Value2Index.contains(AStrip)) {
-      AVars =
-          decompose(AStrip, Preconditions, IsSigned, Info.getDataLayout()).Vars;
-      Value2Index.insert({AStrip, Value2Index.size() + 1});
-    }
-    if (!Value2Index.contains(BStrip)) {
-      BVars =
-          decompose(BStrip, Preconditions, IsSigned, Info.getDataLayout()).Vars;
-      Value2Index.insert({BStrip, Value2Index.size() + 1});
-    }
+    // We abuse the Value2Index map by storing a map of Value -> size of
+    // decompose(Value).Vars. We do this to avoid wastefully calling decompose
+    // on chains we have already seen.
+    auto &DecompSizes = Info.getValue2Index(IsSigned);
+    const auto &DL = Info.getDataLayout();
+
+    auto GetDecompSize = [&DL, &DecompSizes, IsSigned](Value *V) {
+      if (!DecompSizes.contains(V)) {
+        SmallVector<ConditionTy, 4> Preconditions;
+        auto Vars = decompose(V, Preconditions, IsSigned, DL).Vars;
+        for (const auto &[Idx, KV] : enumerate(Vars))
+          // decompose matches the passed Value against some pattern, and
+          // recursively calls itself on the matched inner values, merging
+          // results into the final chain. We conservatively estimate that any
+          // sub-chain in the decompose chain will have the length of the chain
+          // minus the index: the results of two sub-chains could be merged,
+          // leading to a conservative estimate. This is however, not a problem
+          // in practice, as these conservative estimates are anyway less than
+          // the the size of the total chain, and std::max on the estimated
+          // columns will ignore these smaller estiamtes.
+          DecompSizes.insert({KV.Variable, Vars.size() - Idx});
+        DecompSizes.insert({V, Vars.size()});
+      }
+      return DecompSizes.at(V);
+    };
 
-    for (const auto &KV : concat<DecompEntry>(AVars, BVars)) {
-      if (KV.Variable == AStrip || KV.Variable == BStrip ||
-          !Value2Index.contains(KV.Variable))
-        ++NumNewVars;
-    }
+    unsigned ASize = GetDecompSize(A->stripPointerCastsSameRepresentation());
+    unsigned BSize = GetDecompSize(B->stripPointerCastsSameRepresentation());
 
     if (IsSigned)
       ++EstimatedRowsA;
     else
       ++EstimatedRowsB;
 
-    EstimatedColumns = std::max(EstimatedColumns, NumNewVars + 2);
+    EstimatedColumns = std::max(EstimatedColumns, ASize + BSize + 2);
   };
 
   UpdateEstimate(Pred, A, B);
@@ -1764,7 +1768,7 @@ static void dryRunAddFact(CmpInst::Predicate Pred, Value *A, Value *B,
 }
 
 /// Performs a dry run of the transform, computing a conservative estimate of
-/// the total number of columns we need in the underlying storage.
+/// the total number of rows and columns we need in the underlying storage.
 static std::tuple<State, unsigned, unsigned>
 dryRun(Function &F, DominatorTree &DT, LoopInfo &LI, ScalarEvolution &SE) {
   DT.updateDFSNumbers();

From 91468794b11071707b53fddec6bc6d757ea9302f Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra@codasip.com>
Date: Mon, 22 Jul 2024 18:45:07 +0100
Subject: [PATCH 5/6] ConstraintElim: function args have trivial decomp

---
 .../lib/Transforms/Scalar/ConstraintElimination.cpp | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index fdd463273edd1..40d117d49489e 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -1704,7 +1704,7 @@ static void dryRunAddFact(CmpInst::Predicate Pred, Value *A, Value *B,
     // We abuse the Value2Index map by storing a map of Value -> size of
     // decompose(Value).Vars. We do this to avoid wastefully calling decompose
     // on chains we have already seen.
-    auto &DecompSizes = Info.getValue2Index(IsSigned);
+    auto &DecompSizes = Info.getValue2Index(true);
     const auto &DL = Info.getDataLayout();
 
     auto GetDecompSize = [&DL, &DecompSizes, IsSigned](Value *V) {
@@ -1777,9 +1777,18 @@ dryRun(Function &F, DominatorTree &DT, LoopInfo &LI, ScalarEvolution &SE) {
   // EstimatedRowsA corresponds to SignedCS, and EstimatedRowsB corresponds to
   // UnsignedCS.
   unsigned EstimatedRowsA = 0, EstimatedRowsB = 1;
-  unsigned EstimatedColumns = 1;
+  unsigned EstimatedColumns = F.arg_end() - F.arg_begin() + 1;
   ConstraintInfo Info(F.getDataLayout(), {});
 
+  // We abuse the Value2Index map by storing a map of Value -> size of
+  // decompose(Value).Vars. We do this to avoid wastefully calling decompose
+  // on chains we have already seen.
+  //
+  // Function arguments don't decompose into anything non-trivial.
+  auto &DecompSizes = Info.getValue2Index(true);
+  for (Value &A : F.args())
+    DecompSizes.insert({&A, 1});
+
   // First, collect conditions implied by branches and blocks with their
   // Dominator DFS in and out numbers.
   for (BasicBlock &BB : F) {

From 0a2091a11cb0dee1ca6af72e6644050e8ef37f70 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra@codasip.com>
Date: Mon, 29 Jul 2024 16:43:32 +0100
Subject: [PATCH 6/6] ConstraintElim: try 1.6 factor

---
 llvm/lib/Transforms/Scalar/ConstraintElimination.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index 40d117d49489e..d494d75323d41 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -1897,8 +1897,8 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI,
   const auto &[S, EstimatedRows, EstimatedColumns] = dryRun(F, DT, LI, SE);
 
   // Fail early if estimates exceed limits. Row estimate could be off by up to
-  // 40%.
-  if (EstimatedRows > 1.4 * MaxRows || EstimatedColumns > MaxColumns)
+  // 60%.
+  if (EstimatedRows > 1.6 * MaxRows || EstimatedColumns > MaxColumns)
     return false;
 
   SmallVector<Value *> FunctionArgs;