From 618e0784a95512fc0f3fe5b60b21133a28ccf76c Mon Sep 17 00:00:00 2001
From: Sebastian Pop <spop@nvidia.com>
Date: Fri, 5 Dec 2025 16:56:39 -0600
Subject: [PATCH] [DA] batch delinearization

This patch adds support for batch delinearization in DependenceAnalysis,
similar to how Polly processes delinearization. Instead of analyzing pairs
of memory accesses independently, this approach:

1. Collects all memory accesses in the function, grouped by base pointer.
2. For each base pointer, collects delinearization terms from ALL accesses.
3. Computes array dimensions once using all available terms.
4. Caches the results for use during pairwise dependence analysis.

This leads to better precision because more terms are available when
inferring array dimensions, especially for parametric arrays where
dimension information may be spread across multiple accesses.

The BatchDelinearization class is defined in Delinearization.h/cpp as a
standalone module (similar to BatchAAResults pattern). Callers (LoopFuse,
LoopUnrollAndJam, LoopInterchange, DDG) create BatchDelinearization
themselves and pass it to DependenceInfo.
---
 llvm/include/llvm/Analysis/Delinearization.h  |  85 ++++++++++
 .../llvm/Analysis/DependenceAnalysis.h        |  14 +-
 llvm/lib/Analysis/DDG.cpp                     |   5 +-
 llvm/lib/Analysis/Delinearization.cpp         | 137 ++++++++++++++++
 llvm/lib/Analysis/DependenceAnalysis.cpp      |  77 ++++++++-
 llvm/lib/Transforms/Scalar/LoopFuse.cpp       |   4 +
 .../lib/Transforms/Scalar/LoopInterchange.cpp |   5 +-
 .../Scalar/LoopUnrollAndJamPass.cpp           |   5 +-
 llvm/test/Analysis/DDG/basic-loopnest.ll      |   2 -
 .../BatchDelinearization.ll                   | 147 ++++++++++++++++++
 llvm/test/Transforms/LICM/lnicm.ll            |   3 -
 .../loop-interchange-optimization-remarks.ll  |  29 ++--
 .../LoopInterchange/outer-dependency-lte.ll   |   2 +-
 13 files changed, 484 insertions(+), 31 deletions(-)
 create mode 100644 llvm/test/Analysis/DependenceAnalysis/BatchDelinearization.ll
diff --git a/llvm/include/llvm/Analysis/Delinearization.h b/llvm/include/llvm/Analysis/Delinearization.h
index 8fb30925b1ba7..ecac844ea7658 100644
--- a/llvm/include/llvm/Analysis/Delinearization.h
+++ b/llvm/include/llvm/Analysis/Delinearization.h
@@ -16,16 +16,23 @@
 #ifndef LLVM_ANALYSIS_DELINEARIZATION_H
 #define LLVM_ANALYSIS_DELINEARIZATION_H
 
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/IR/Value.h"
+#include "llvm/Support/Compiler.h"
 
 namespace llvm {
+class Function;
 class raw_ostream;
 template <typename T> class SmallVectorImpl;
 class GetElementPtrInst;
 class Instruction;
+class LoopInfo;
 class ScalarEvolution;
 class SCEV;
+class SCEVUnknown;
 
 /// Compute the array dimensions Sizes from the set of Terms extracted from
 /// the memory access function of this SCEVAddRecExpr (second step of
@@ -164,6 +171,84 @@ bool getIndexExpressionsFromGEP(ScalarEvolution &SE,
                                 SmallVectorImpl<const SCEV *> &Subscripts,
                                 SmallVectorImpl<const SCEV *> &Sizes);
 
+/// BatchDelinearization - A wrapper for batch delinearization that caches
+/// results across multiple queries. Similar to BatchAAResults, this class
+/// should be used when analyzing multiple memory accesses to the same base
+/// pointers, as it computes array dimensions once using terms from all
+/// accesses, leading to better precision.
+///
+/// This class collects all memory accesses in a function, groups them by base
+/// pointer, and computes array dimensions for each base pointer using terms
+/// from all accesses. The results are cached for efficient lookups during
+/// dependence analysis.
+///
+/// Usage:
+///   BatchDelinearization BD(F, SE, LI);
+///   BD.populate();  // Compute and cache delinearization info.
+///   // Then pass BD to DependenceInfo or query it directly.
+class LLVM_ABI BatchDelinearization {
+public:
+  BatchDelinearization(Function &F, ScalarEvolution &SE, LoopInfo &LI)
+      : F(F), SE(SE), LI(LI) {}
+
+  /// Populate the cache with delinearization information for all memory
+  /// accesses in the function.
+  void populate();
+
+  /// Check if the cache has been populated.
+  bool isPopulated() const { return Populated; }
+
+  /// Get the cached array sizes for a base pointer.
+  /// Returns nullptr if not found.
+  const SmallVector<const SCEV *, 4> *
+  getArraySizes(const SCEVUnknown *Base) const {
+    auto It = ArraySizes.find(Base);
+    return It != ArraySizes.end() ? &It->second : nullptr;
+  }
+
+  /// Get the cached subscripts for an instruction.
+  /// Returns nullptr if not found.
+  const SmallVector<const SCEV *, 4> *
+  getSubscripts(const Instruction *I) const {
+    auto It = Subscripts.find(I);
+    return It != Subscripts.end() ? &It->second : nullptr;
+  }
+
+  /// Get the cached element size for a base pointer.
+  /// Returns nullptr if not found.
+  const SCEV *getElementSize(const SCEVUnknown *Base) const {
+    auto It = ElementSizes.find(Base);
+    return It != ElementSizes.end() ? It->second : nullptr;
+  }
+
+  /// Get the ScalarEvolution instance.
+  ScalarEvolution &getSE() { return SE; }
+  const ScalarEvolution &getSE() const { return SE; }
+
+  /// Get the LoopInfo instance.
+  LoopInfo &getLI() { return LI; }
+  const LoopInfo &getLI() const { return LI; }
+
+private:
+  Function &F;
+  ScalarEvolution &SE;
+  LoopInfo &LI;
+
+  /// Map from base pointer to computed array dimension sizes.
+  SmallDenseMap<const SCEVUnknown *, SmallVector<const SCEV *, 4>, 8>
+      ArraySizes;
+
+  /// Map from instruction to pre-computed subscripts.
+  SmallDenseMap<const Instruction *, SmallVector<const SCEV *, 4>, 16>
+      Subscripts;
+
+  /// Element size for the array (used for validation).
+  SmallDenseMap<const SCEVUnknown *, const SCEV *, 8> ElementSizes;
+
+  /// Flag indicating whether the cache has been populated.
+  bool Populated = false;
+};
+
 struct DelinearizationPrinterPass
     : public PassInfoMixin<DelinearizationPrinterPass> {
   explicit DelinearizationPrinterPass(raw_ostream &OS);
diff --git a/llvm/include/llvm/Analysis/DependenceAnalysis.h b/llvm/include/llvm/Analysis/DependenceAnalysis.h
index 6dec24fc9f104..21828290fcc08 100644
--- a/llvm/include/llvm/Analysis/DependenceAnalysis.h
+++ b/llvm/include/llvm/Analysis/DependenceAnalysis.h
@@ -49,6 +49,7 @@
 namespace llvm {
 class AAResults;
 template <typename T> class ArrayRef;
+class BatchDelinearization;
 class Loop;
 class LoopInfo;
 class SCEVConstant;
@@ -335,8 +336,9 @@ class LLVM_ABI FullDependence final : public Dependence {
 /// DependenceInfo - This class is the main dependence-analysis driver.
 class DependenceInfo {
 public:
-  DependenceInfo(Function *F, AAResults *AA, ScalarEvolution *SE, LoopInfo *LI)
-      : AA(AA), SE(SE), LI(LI), F(F) {}
+  DependenceInfo(Function *F, AAResults *AA, ScalarEvolution *SE, LoopInfo *LI,
+                 BatchDelinearization *BD = nullptr)
+      : AA(AA), SE(SE), LI(LI), F(F), BatchDelin(BD) {}
 
   /// Handle transitive invalidation when the cached analysis results go away.
   LLVM_ABI bool invalidate(Function &F, const PreservedAnalyses &PA,
@@ -355,11 +357,19 @@ class DependenceInfo {
 
   Function *getFunction() const { return F; }
 
+  /// setBatchDelinearization - Set the BatchDelinearization instance to use
+  /// for cached delinearization results.
+  void setBatchDelinearization(BatchDelinearization *BD) { BatchDelin = BD; }
+
+  /// getBatchDelinearization - Get the BatchDelinearization instance.
+  BatchDelinearization *getBatchDelinearization() const { return BatchDelin; }
+
 private:
   AAResults *AA;
   ScalarEvolution *SE;
   LoopInfo *LI;
   Function *F;
+  BatchDelinearization *BatchDelin;
 
   /// Subscript - This private struct represents a pair of subscripts from
   /// a pair of potentially multi-dimensional array references. We use a
diff --git a/llvm/lib/Analysis/DDG.cpp b/llvm/lib/Analysis/DDG.cpp
index 0907a7fb021fc..d76f993ab5332 100644
--- a/llvm/lib/Analysis/DDG.cpp
+++ b/llvm/lib/Analysis/DDG.cpp
@@ -10,6 +10,7 @@
 //===----------------------------------------------------------------------===//
 #include "llvm/Analysis/DDG.h"
 #include "llvm/ADT/SCCIterator.h"
+#include "llvm/Analysis/Delinearization.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopIterator.h"
 #include "llvm/Support/CommandLine.h"
@@ -308,7 +309,9 @@ bool DDGBuilder::shouldCreatePiBlocks() const { return CreatePiBlocks; }
 DDGAnalysis::Result DDGAnalysis::run(Loop &L, LoopAnalysisManager &AM,
                                      LoopStandardAnalysisResults &AR) {
   Function *F = L.getHeader()->getParent();
-  DependenceInfo DI(F, &AR.AA, &AR.SE, &AR.LI);
+  BatchDelinearization BD(*F, AR.SE, AR.LI);
+  BD.populate();
+  DependenceInfo DI(F, &AR.AA, &AR.SE, &AR.LI, &BD);
   return std::make_unique<DataDependenceGraph>(L, AR.LI, DI);
 }
 AnalysisKey DDGAnalysis::Key;
diff --git a/llvm/lib/Analysis/Delinearization.cpp b/llvm/lib/Analysis/Delinearization.cpp
index 7bf83ccf9c172..5a386840231ed 100644
--- a/llvm/lib/Analysis/Delinearization.cpp
+++ b/llvm/lib/Analysis/Delinearization.cpp
@@ -960,3 +960,140 @@ PreservedAnalyses DelinearizationPrinterPass::run(Function &F,
                        &AM.getResult<ScalarEvolutionAnalysis>(F));
   return PreservedAnalyses::all();
 }
+
+//===----------------------------------------------------------------------===//
+// BatchDelinearization Implementation
+//===----------------------------------------------------------------------===//
+
+/// Return true for a Load or Store instruction.
+static bool isLoadOrStore(const Instruction *I) {
+  return isa<LoadInst>(I) || isa<StoreInst>(I);
+}
+
+void BatchDelinearization::populate() {
+  if (Populated)
+    return;
+
+  Populated = true;
+
+  // Step 1: Collect all memory accesses grouped by base pointer.
+  // Map from base pointer to list of (Instruction, AccessFunction) pairs.
+  SmallDenseMap<const SCEVUnknown *,
+                SmallVector<std::pair<Instruction *, const SCEV *>, 4>, 8>
+      AccessesByBase;
+
+  for (Instruction &I : instructions(F)) {
+    if (!isLoadOrStore(&I))
+      continue;
+
+    Value *Ptr = getLoadStorePointerOperand(&I);
+    Loop *L = LI.getLoopFor(I.getParent());
+    const SCEV *AccessFn = SE.getSCEVAtScope(Ptr, L);
+    const SCEVUnknown *Base =
+        dyn_cast<SCEVUnknown>(SE.getPointerBase(AccessFn));
+
+    if (!Base)
+      continue;
+
+    // Only consider accesses where the base is loop invariant.
+    if (L && !SE.isLoopInvariant(Base, L))
+      continue;
+
+    AccessesByBase[Base].push_back({&I, AccessFn});
+  }
+
+  // Step 2: For each base pointer, collect terms from ALL accesses and
+  // compute array dimensions once.
+  for (auto &Entry : AccessesByBase) {
+    const SCEVUnknown *Base = Entry.first;
+    auto &Accesses = Entry.second;
+
+    // Skip if there's only one access - no benefit from batch processing.
+    if (Accesses.size() < 2)
+      continue;
+
+    // Determine element size - use the smallest among all accesses.
+    const SCEV *ElemSize = nullptr;
+    for (auto &Access : Accesses) {
+      const SCEV *EltSize = SE.getElementSize(Access.first);
+      if (!ElemSize)
+        ElemSize = EltSize;
+      else if (SE.isKnownPredicate(ICmpInst::ICMP_ULT, EltSize, ElemSize))
+        ElemSize = EltSize;
+    }
+
+    if (!ElemSize)
+      continue;
+
+    ElementSizes[Base] = ElemSize;
+
+    // Collect parametric terms from all accesses to this base.
+    SmallVector<const SCEV *, 8> Terms;
+    for (auto &Access : Accesses) {
+      const SCEV *AccessFn = Access.second;
+      const SCEV *OffsetSCEV = SE.getMinusSCEV(AccessFn, Base);
+      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OffsetSCEV);
+      if (AR && AR->isAffine())
+        collectParametricTerms(SE, AR, Terms);
+    }
+
+    // Find array dimensions using all collected terms.
+    SmallVector<const SCEV *, 4> Sizes;
+    findArrayDimensions(SE, Terms, Sizes, ElemSize);
+
+    // Skip if we couldn't determine dimensions.
+    if (Sizes.size() < 2)
+      continue;
+
+    ArraySizes[Base] = Sizes;
+
+    // Pre-compute subscripts for each access using parametric sizes.
+    for (auto &Access : Accesses) {
+      Instruction *Inst = Access.first;
+      const SCEV *AccessFn = Access.second;
+      const SCEV *OffsetSCEV = SE.getMinusSCEV(AccessFn, Base);
+      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OffsetSCEV);
+
+      if (!AR || !AR->isAffine())
+        continue;
+
+      SmallVector<const SCEV *, 4> Subs;
+      computeAccessFunctions(SE, AR, Subs, Sizes);
+
+      if (Subs.size() >= 2)
+        Subscripts[Inst] = std::move(Subs);
+    }
+  }
+
+  // Step 3: Try fixed-size array delinearization for accesses not yet cached.
+  // This handles arrays with known compile-time dimensions.
+  for (auto &Entry : AccessesByBase) {
+    auto &Accesses = Entry.second;
+
+    for (auto &Access : Accesses) {
+      Instruction *Inst = Access.first;
+
+      // Skip if already cached from parametric delinearization.
+      if (Subscripts.count(Inst))
+        continue;
+
+      const SCEV *AccessFn = Access.second;
+      const SCEV *ElemSize = SE.getElementSize(Inst);
+      SmallVector<const SCEV *, 4> Subs, Sizes;
+
+      if (delinearizeFixedSizeArray(SE, SE.removePointerBase(AccessFn), Subs,
+                                    Sizes, ElemSize) &&
+          Subs.size() >= 2) {
+        Subscripts[Inst] = std::move(Subs);
+      }
+    }
+  }
+
+  LLVM_DEBUG({
+    dbgs() << "Batch delinearization cache populated:\n";
+    dbgs() << "  Base pointers with cached dimensions: " << ArraySizes.size()
+           << "\n";
+    dbgs() << "  Instructions with cached subscripts: " << Subscripts.size()
+           << "\n";
+  });
+}
diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp
index 9b9c80a9b3266..a72ae360de464 100644
--- a/llvm/lib/Analysis/DependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/DependenceAnalysis.cpp
@@ -3255,6 +3255,19 @@ bool DependenceInfo::tryDelinearizeFixedSize(
            "expected src and dst scev unknowns to be equal");
   });
 
+  // Try to use cached subscripts from BatchDelinearization.
+  if (BatchDelin && BatchDelin->isPopulated()) {
+    const auto *SrcSubs = BatchDelin->getSubscripts(Src);
+    const auto *DstSubs = BatchDelin->getSubscripts(Dst);
+    if (SrcSubs && DstSubs && SrcSubs->size() >= 2 && DstSubs->size() >= 2 &&
+        SrcSubs->size() == DstSubs->size()) {
+      SrcSubscripts.assign(SrcSubs->begin(), SrcSubs->end());
+      DstSubscripts.assign(DstSubs->begin(), DstSubs->end());
+      LLVM_DEBUG(dbgs() << "Using cached fixed-size delinearization results\n");
+      return true;
+    }
+  }
+
   const SCEV *ElemSize = SE->getElementSize(Src);
   assert(ElemSize == SE->getElementSize(Dst) && "Different element sizes");
   SmallVector<const SCEV *, 4> SrcSizes, DstSizes;
@@ -3328,16 +3341,78 @@ bool DependenceInfo::tryDelinearizeParametricSize(
   if (!SrcAR || !DstAR || !SrcAR->isAffine() || !DstAR->isAffine())
     return false;
 
+  SmallVector<const SCEV *, 4> Sizes;
+
+  // Try to use cached results from BatchDelinearization.
+  // This provides better precision by using terms from all accesses.
+  if (BatchDelin && BatchDelin->isPopulated()) {
+    const auto *CachedSizes = BatchDelin->getArraySizes(SrcBase);
+    if (CachedSizes) {
+      // Check element size compatibility.
+      const SCEV *CachedElemSize = BatchDelin->getElementSize(SrcBase);
+      if (CachedElemSize && CachedElemSize == ElementSize) {
+        Sizes.assign(CachedSizes->begin(), CachedSizes->end());
+
+        // Try to use pre-computed subscripts if available.
+        const auto *SrcSubs = BatchDelin->getSubscripts(Src);
+        const auto *DstSubs = BatchDelin->getSubscripts(Dst);
+        if (SrcSubs && DstSubs) {
+          SrcSubscripts.assign(SrcSubs->begin(), SrcSubs->end());
+          DstSubscripts.assign(DstSubs->begin(), DstSubs->end());
+
+          if (SrcSubscripts.size() >= 2 && DstSubscripts.size() >= 2 &&
+              SrcSubscripts.size() == DstSubscripts.size()) {
+            LLVM_DEBUG(dbgs() << "Using cached delinearization results\n");
+
+            // Validate the cached subscripts.
+            if (!DisableDelinearizationChecks)
+              if (!validateDelinearizationResult(*SE, Sizes, SrcSubscripts,
+                                                 SrcPtr) ||
+                  !validateDelinearizationResult(*SE, Sizes, DstSubscripts,
+                                                 DstPtr))
+                return false;
+
+            return true;
+          }
+        }
+
+        // Cache had sizes but not pre-computed subscripts for these
+        // instructions, or pre-computed subscripts failed validation.
+        // Compute subscripts using cached sizes.
+        LLVM_DEBUG(dbgs() << "Using cached array sizes for delinearization\n");
+        SrcSubscripts.clear();
+        DstSubscripts.clear();
+        computeAccessFunctions(*SE, SrcAR, SrcSubscripts, Sizes);
+        computeAccessFunctions(*SE, DstAR, DstSubscripts, Sizes);
+
+        if (SrcSubscripts.size() >= 2 && DstSubscripts.size() >= 2 &&
+            SrcSubscripts.size() == DstSubscripts.size()) {
+          if (!DisableDelinearizationChecks)
+            if (!validateDelinearizationResult(*SE, Sizes, SrcSubscripts,
+                                               SrcPtr) ||
+                !validateDelinearizationResult(*SE, Sizes, DstSubscripts,
+                                               DstPtr))
+              return false;
+
+          return true;
+        }
+      }
+    }
+  }
+
+  // Fall back to pairwise delinearization.
   // First step: collect parametric terms in both array references.
   SmallVector<const SCEV *, 4> Terms;
   collectParametricTerms(*SE, SrcAR, Terms);
   collectParametricTerms(*SE, DstAR, Terms);
 
   // Second step: find subscript sizes.
-  SmallVector<const SCEV *, 4> Sizes;
+  Sizes.clear();
   findArrayDimensions(*SE, Terms, Sizes, ElementSize);
 
   // Third step: compute the access functions for each subscript.
+  SrcSubscripts.clear();
+  DstSubscripts.clear();
   computeAccessFunctions(*SE, SrcAR, SrcSubscripts, Sizes);
   computeAccessFunctions(*SE, DstAR, DstSubscripts, Sizes);
 
diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
index 9ffa602416b05..be74482aedfa0 100644
--- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
@@ -47,6 +47,7 @@
 #include "llvm/Transforms/Scalar/LoopFuse.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/Delinearization.h"
 #include "llvm/Analysis/DependenceAnalysis.h"
 #include "llvm/Analysis/DomTreeUpdater.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -2143,6 +2144,9 @@ PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) {
   auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
   auto &DI = AM.getResult<DependenceAnalysis>(F);
   auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+  BatchDelinearization BD(F, SE, LI);
+  BD.populate();
+  DI.setBatchDelinearization(&BD);
   auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
   auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
   auto &AC = AM.getResult<AssumptionAnalysis>(F);
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 330b4abb9942f..bef285332a6bd 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -19,6 +19,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/Delinearization.h"
 #include "llvm/Analysis/DependenceAnalysis.h"
 #include "llvm/Analysis/LoopCacheAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -2139,7 +2140,9 @@ PreservedAnalyses LoopInterchangePass::run(LoopNest &LN,
            << "Computed dependence info, invoking the transform.";
   });
 
-  DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
+  BatchDelinearization BD(F, AR.SE, AR.LI);
+  BD.populate();
+  DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI, &BD);
   if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &AR, &ORE).run(LN))
     return PreservedAnalyses::all();
   U.markLoopNestChanged(true);
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
index 4fe74c7c3bbcd..34e5b443608b1 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
@@ -17,6 +17,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/Delinearization.h"
 #include "llvm/Analysis/DependenceAnalysis.h"
 #include "llvm/Analysis/LoopAnalysisManager.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -457,7 +458,9 @@ PreservedAnalyses LoopUnrollAndJamPass::run(LoopNest &LN,
                                             LPMUpdater &U) {
   Function &F = *LN.getParent();
 
-  DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
+  BatchDelinearization BD(F, AR.SE, AR.LI);
+  BD.populate();
+  DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI, &BD);
   OptimizationRemarkEmitter ORE(&F);
 
   bool AnyLoopRemoved = false;
diff --git a/llvm/test/Analysis/DDG/basic-loopnest.ll b/llvm/test/Analysis/DDG/basic-loopnest.ll
index 75efff570048b..61003298438f6 100644
--- a/llvm/test/Analysis/DDG/basic-loopnest.ll
+++ b/llvm/test/Analysis/DDG/basic-loopnest.ll
@@ -1,7 +1,5 @@
 ; RUN: opt < %s -disable-output "-passes=print<ddg>" 2>&1 | FileCheck %s
 
-; XFAIL: *
-; At the moment, DependenceAnalysis cannot infer `n` to be positive.
 
 
 ; CHECK-LABEL: 'DDG' for loop 'test1.for.cond1.preheader':
diff --git a/llvm/test/Analysis/DependenceAnalysis/BatchDelinearization.ll b/llvm/test/Analysis/DependenceAnalysis/BatchDelinearization.ll
new file mode 100644
index 0000000000000..27ef4c6db3f27
--- /dev/null
+++ b/llvm/test/Analysis/DependenceAnalysis/BatchDelinearization.ll
@@ -0,0 +1,147 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -disable-output "-passes=print<da>" -aa-pipeline=basic-aa 2>&1 \
+; RUN: | FileCheck %s
+
+; Test case for batch delinearization. When multiple accesses to the same
+; base pointer are analyzed together, terms from all accesses are collected
+; to determine array dimensions, leading to better precision.
+;
+; This test has three accesses to array A:
+;   A[i*m + j]  (in the write)
+;   A[i*m + j]  (in the read)
+;   A[k*m + l]  (third access that provides additional context)
+;
+; The third access helps provide more terms for delinearization,
+; which can improve precision when analyzing the first two accesses.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Three accesses to the same 2D array A[n][m].
+; Batch delinearization collects terms from all accesses.
+define void @batch_delin_test(i64 %n, i64 %m, ptr nocapture %A) {
+; CHECK-LABEL: 'batch_delin_test'
+; CHECK-NEXT:  Src: %load1 = load double, ptr %arrayidx1, align 8 --> Dst: %load1 = load double, ptr %arrayidx1, align 8
+; CHECK-NEXT:    da analyze - input [* *]!
+; CHECK-NEXT:  Src: %load1 = load double, ptr %arrayidx1, align 8 --> Dst: store double %add, ptr %arrayidx1, align 8
+; CHECK-NEXT:    da analyze - anti [* *|<]!
+; CHECK-NEXT:  Src: %load1 = load double, ptr %arrayidx1, align 8 --> Dst: %load2 = load double, ptr %arrayidx2, align 8
+; CHECK-NEXT:    da analyze - input [<> *]!
+; CHECK-NEXT:  Src: store double %add, ptr %arrayidx1, align 8 --> Dst: store double %add, ptr %arrayidx1, align 8
+; CHECK-NEXT:    da analyze - output [* *]!
+; CHECK-NEXT:  Src: store double %add, ptr %arrayidx1, align 8 --> Dst: %load2 = load double, ptr %arrayidx2, align 8
+; CHECK-NEXT:    da analyze - flow [<> *]!
+; CHECK-NEXT:  Src: %load2 = load double, ptr %arrayidx2, align 8 --> Dst: %load2 = load double, ptr %arrayidx2, align 8
+; CHECK-NEXT:    da analyze - input [* *]!
+;
+entry:
+  %cmp1 = icmp sgt i64 %n, 0
+  %cmp2 = icmp sgt i64 %m, 0
+  %cond = and i1 %cmp1, %cmp2
+  br i1 %cond, label %loop.i.preheader, label %exit
+
+loop.i.preheader:
+  br label %loop.i
+
+loop.i:
+  %i = phi i64 [ 0, %loop.i.preheader ], [ %i.next, %loop.i.latch ]
+  br label %loop.j
+
+loop.j:
+  %j = phi i64 [ 0, %loop.i ], [ %j.next, %loop.j ]
+  ; Compute linear index: i*m + j
+  %mul1 = mul nsw i64 %i, %m
+  %idx1 = add nsw i64 %mul1, %j
+  %arrayidx1 = getelementptr inbounds double, ptr %A, i64 %idx1
+  ; First access: load A[i*m + j]
+  %load1 = load double, ptr %arrayidx1, align 8
+  %add = fadd double %load1, 1.0
+  ; Second access: store A[i*m + j]
+  store double %add, ptr %arrayidx1, align 8
+  ; Third access at a different index: load A[(i+1)*m + j]
+  ; This provides additional terms for delinearization.
+  %i_plus_1 = add nsw i64 %i, 1
+  %mul2 = mul nsw i64 %i_plus_1, %m
+  %idx2 = add nsw i64 %mul2, %j
+  %arrayidx2 = getelementptr inbounds double, ptr %A, i64 %idx2
+  %load2 = load double, ptr %arrayidx2, align 8
+  %j.next = add nuw nsw i64 %j, 1
+  %j.cond = icmp slt i64 %j.next, %m
+  br i1 %j.cond, label %loop.j, label %loop.i.latch
+
+loop.i.latch:
+  %i.next = add nuw nsw i64 %i, 1
+  %i.cond = icmp slt i64 %i.next, %n
+  br i1 %i.cond, label %loop.i, label %exit
+
+exit:
+  ret void
+}
+
+; Test with parametric sizes where batch delinearization helps.
+; Two separate loop nests accessing the same array.
+define void @batch_delin_two_nests(i64 %n, i64 %m, ptr nocapture %A) {
+; CHECK-LABEL: 'batch_delin_two_nests'
+; CHECK-NEXT:  Src: store double 1.000000e+00, ptr %arrayidx1, align 8 --> Dst: store double 1.000000e+00, ptr %arrayidx1, align 8
+; CHECK-NEXT:    da analyze - output [* *]!
+; CHECK-NEXT:  Src: store double 1.000000e+00, ptr %arrayidx1, align 8 --> Dst: %load = load double, ptr %arrayidx2, align 8
+; CHECK-NEXT:    da analyze - flow [|<]!
+; CHECK-NEXT:  Src: %load = load double, ptr %arrayidx2, align 8 --> Dst: %load = load double, ptr %arrayidx2, align 8
+; CHECK-NEXT:    da analyze - input [* *]!
+;
+entry:
+  %cmp1 = icmp sgt i64 %n, 0
+  %cmp2 = icmp sgt i64 %m, 0
+  %cond = and i1 %cmp1, %cmp2
+  br i1 %cond, label %nest1.i.preheader, label %exit
+
+; First loop nest: stores to A[i*m + j]
+nest1.i.preheader:
+  br label %nest1.i
+
+nest1.i:
+  %i1 = phi i64 [ 0, %nest1.i.preheader ], [ %i1.next, %nest1.i.latch ]
+  br label %nest1.j
+
+nest1.j:
+  %j1 = phi i64 [ 0, %nest1.i ], [ %j1.next, %nest1.j ]
+  %mul1 = mul nsw i64 %i1, %m
+  %idx1 = add nsw i64 %mul1, %j1
+  %arrayidx1 = getelementptr inbounds double, ptr %A, i64 %idx1
+  store double 1.0, ptr %arrayidx1, align 8
+  %j1.next = add nuw nsw i64 %j1, 1
+  %j1.cond = icmp slt i64 %j1.next, %m
+  br i1 %j1.cond, label %nest1.j, label %nest1.i.latch
+
+nest1.i.latch:
+  %i1.next = add nuw nsw i64 %i1, 1
+  %i1.cond = icmp slt i64 %i1.next, %n
+  br i1 %i1.cond, label %nest1.i, label %nest2.i.preheader
+
+; Second loop nest: reads from A[k*m + l]
+nest2.i.preheader:
+  br label %nest2.i
+
+nest2.i:
+  %i2 = phi i64 [ 0, %nest2.i.preheader ], [ %i2.next, %nest2.i.latch ]
+  br label %nest2.j
+
+nest2.j:
+  %j2 = phi i64 [ 0, %nest2.i ], [ %j2.next, %nest2.j ]
+  %mul2 = mul nsw i64 %i2, %m
+  %idx2 = add nsw i64 %mul2, %j2
+  %arrayidx2 = getelementptr inbounds double, ptr %A, i64 %idx2
+  %load = load double, ptr %arrayidx2, align 8
+  %j2.next = add nuw nsw i64 %j2, 1
+  %j2.cond = icmp slt i64 %j2.next, %m
+  br i1 %j2.cond, label %nest2.j, label %nest2.i.latch
+
+nest2.i.latch:
+  %i2.next = add nuw nsw i64 %i2, 1
+  %i2.cond = icmp slt i64 %i2.next, %n
+  br i1 %i2.cond, label %nest2.i, label %exit
+
+exit:
+  ret void
+}
+
diff --git a/llvm/test/Transforms/LICM/lnicm.ll b/llvm/test/Transforms/LICM/lnicm.ll
index e331ab7d39e83..814f964666305 100644
--- a/llvm/test/Transforms/LICM/lnicm.ll
+++ b/llvm/test/Transforms/LICM/lnicm.ll
@@ -3,9 +3,6 @@
 ; RUN: opt -aa-pipeline=basic-aa -passes='loop-mssa(lnicm),loop(loop-interchange)' -cache-line-size=64 -S %s | FileCheck %s --check-prefixes LNICM
 ; RUN: opt -aa-pipeline=basic-aa -passes='loop-mssa(licm),loop(loop-interchange)' -cache-line-size=64 -S %s | FileCheck %s --check-prefixes LICM
 
-; XFAIL: *
-; Loop interchange currently fails due to a failure in dependence analysis.
-
 ; This test represents the following function:
 ; void test(int n, int m, int x[m][n], int y[n], int *z) {
 ;   for (int k = 0; k < n; k++) {
diff --git a/llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll b/llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll
index 14836ba73433d..a5cd1cb924e84 100644
--- a/llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll
+++ b/llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll
@@ -58,21 +58,17 @@ for.end19:
   ret void
 }
 
+; With batch delinearization, the dependences are now computed correctly.
+; The interchange is still not profitable, but for a different reason.
 ; CHECK: --- !Analysis
 ; CHECK-NEXT: Pass:            loop-interchange
 ; CHECK-NEXT: Name:            Dependence
 ; CHECK-NEXT: Function:        test01
-; CHECK-NEXT: Args:
-; CHECK-NEXT:   - String:          Computed dependence info, invoking the transform.
-; CHECK-NEXT: ...
 
 ; CHECK: --- !Missed
 ; CHECK-NEXT: Pass:            loop-interchange
-; CHECK-NEXT: Name:            Dependence
+; CHECK-NEXT: Name:            InterchangeNotProfitable
 ; CHECK-NEXT: Function:        test01
-; CHECK-NEXT: Args:
-; CHECK-NEXT:   - String:          All loops have dependencies in all directions.
-; CHECK-NEXT: ...
 
 ; DELIN: --- !Analysis
 ; DELIN-NEXT: Pass:            loop-interchange
@@ -134,21 +130,17 @@ define void @test02(i32 %k, i32 %N) {
    ret void
 }
 
+; With batch delinearization, the dependences are now computed correctly
+; and the loop can be interchanged (same behavior as DELIN).
 ; CHECK: --- !Analysis
 ; CHECK-NEXT: Pass:            loop-interchange
 ; CHECK-NEXT: Name:            Dependence
 ; CHECK-NEXT: Function:        test02
-; CHECK-NEXT: Args:
-; CHECK-NEXT:   - String:          Computed dependence info, invoking the transform.
-; CHECK-NEXT: ...
 
-; CHECK: --- !Missed
+; CHECK: --- !Passed
 ; CHECK-NEXT: Pass:            loop-interchange
-; CHECK-NEXT: Name:            Dependence
+; CHECK-NEXT: Name:            Interchanged
 ; CHECK-NEXT: Function:        test02
-; CHECK-NEXT: Args:
-; CHECK-NEXT:   - String:          All loops have dependencies in all directions.
-; CHECK-NEXT: ...
 
 ; DELIN: --- !Analysis
 ; DELIN-NEXT: Pass:            loop-interchange
@@ -285,13 +277,12 @@ for.end17:
   ret void
 }
 
+; With batch delinearization, the dependences are now computed correctly.
+; The real reason for not interchanging is that loops are not tightly nested.
 ; CHECK: --- !Missed
 ; CHECK-NEXT: Pass:            loop-interchange
-; CHECK-NEXT: Name:            Dependence
+; CHECK-NEXT: Name:            NotTightlyNested
 ; CHECK-NEXT: Function:        test04
-; CHECK-NEXT: Args:
-; CHECK-NEXT:   - String:          All loops have dependencies in all directions.
-; CHECK-NEXT: ...
 
 ; DELIN: --- !Missed
 ; DELIN-NEXT: Pass:            loop-interchange
diff --git a/llvm/test/Transforms/LoopInterchange/outer-dependency-lte.ll b/llvm/test/Transforms/LoopInterchange/outer-dependency-lte.ll
index 4aba99f35678e..c8e79dc169b1d 100644
--- a/llvm/test/Transforms/LoopInterchange/outer-dependency-lte.ll
+++ b/llvm/test/Transforms/LoopInterchange/outer-dependency-lte.ll
@@ -22,7 +22,7 @@
 ; CHECK-NEXT: Name:            Dependence
 ; CHECK-NEXT: Function:        f
 ; CHECK-NEXT: Args:
-; CHECK-NEXT:   - String:          All loops have dependencies in all directions.
+; CHECK-NEXT:   - String:          Cannot interchange loops due to dependences.
 ; CHECK-NEXT: ...