diff --git a/llvm/include/llvm/Analysis/Delinearization.h b/llvm/include/llvm/Analysis/Delinearization.h index 8fb30925b1ba7..ecac844ea7658 100644 --- a/llvm/include/llvm/Analysis/Delinearization.h +++ b/llvm/include/llvm/Analysis/Delinearization.h @@ -16,16 +16,23 @@ #ifndef LLVM_ANALYSIS_DELINEARIZATION_H #define LLVM_ANALYSIS_DELINEARIZATION_H +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/IR/PassManager.h" #include "llvm/IR/Value.h" +#include "llvm/Support/Compiler.h" namespace llvm { +class Function; class raw_ostream; template class SmallVectorImpl; class GetElementPtrInst; class Instruction; +class LoopInfo; class ScalarEvolution; class SCEV; +class SCEVUnknown; /// Compute the array dimensions Sizes from the set of Terms extracted from /// the memory access function of this SCEVAddRecExpr (second step of @@ -164,6 +171,84 @@ bool getIndexExpressionsFromGEP(ScalarEvolution &SE, SmallVectorImpl &Subscripts, SmallVectorImpl &Sizes); +/// BatchDelinearization - A wrapper for batch delinearization that caches +/// results across multiple queries. Similar to BatchAAResults, this class +/// should be used when analyzing multiple memory accesses to the same base +/// pointers, as it computes array dimensions once using terms from all +/// accesses, leading to better precision. +/// +/// This class collects all load and store accesses in a function, groups them +/// by base pointer, and computes array dimensions for each base pointer with +/// at least two accesses, using terms from all of them. The results are cached +/// for efficient lookups during dependence analysis. +/// +/// Usage: +/// BatchDelinearization BD(F, SE, LI); +/// BD.populate(); // Compute and cache delinearization info. +/// // Then pass BD to DependenceInfo (BD must outlive it) or query it directly. 
+class LLVM_ABI BatchDelinearization { +public: + BatchDelinearization(Function &F, ScalarEvolution &SE, LoopInfo &LI) + : F(F), SE(SE), LI(LI) {} + + /// Populate the cache with delinearization information for all load and + /// store accesses in the function. Calls after the first one are no-ops. + void populate(); + + /// Check whether populate() has been called. + bool isPopulated() const { return Populated; } + + /// Get the cached array sizes for a base pointer. + /// Returns nullptr if not found. + const SmallVector * + getArraySizes(const SCEVUnknown *Base) const { + auto It = ArraySizes.find(Base); + return It != ArraySizes.end() ? &It->second : nullptr; + } + + /// Get the cached subscripts for an instruction. + /// Returns nullptr if not found. + const SmallVector * + getSubscripts(const Instruction *I) const { + auto It = Subscripts.find(I); + return It != Subscripts.end() ? &It->second : nullptr; + } + + /// Get the cached element size for a base pointer. + /// Returns nullptr if not found. + const SCEV *getElementSize(const SCEVUnknown *Base) const { + auto It = ElementSizes.find(Base); + return It != ElementSizes.end() ? It->second : nullptr; + } + + /// Get the ScalarEvolution instance. + ScalarEvolution &getSE() { return SE; } + const ScalarEvolution &getSE() const { return SE; } + + /// Get the LoopInfo instance. + LoopInfo &getLI() { return LI; } + const LoopInfo &getLI() const { return LI; } + +private: + Function &F; + ScalarEvolution &SE; + LoopInfo &LI; + + /// Map from base pointer to computed array dimension sizes. + SmallDenseMap, 8> + ArraySizes; + + /// Map from instruction to pre-computed subscripts. + SmallDenseMap, 16> + Subscripts; + + /// Map from base pointer to the element size chosen for its accesses (used for validation). + SmallDenseMap ElementSizes; + + /// Flag indicating whether the cache has been populated. 
+ bool Populated = false; +}; + struct DelinearizationPrinterPass : public PassInfoMixin { explicit DelinearizationPrinterPass(raw_ostream &OS); diff --git a/llvm/include/llvm/Analysis/DependenceAnalysis.h b/llvm/include/llvm/Analysis/DependenceAnalysis.h index 6dec24fc9f104..21828290fcc08 100644 --- a/llvm/include/llvm/Analysis/DependenceAnalysis.h +++ b/llvm/include/llvm/Analysis/DependenceAnalysis.h @@ -49,6 +49,7 @@ namespace llvm { class AAResults; template class ArrayRef; +class BatchDelinearization; class Loop; class LoopInfo; class SCEVConstant; @@ -335,8 +336,9 @@ class LLVM_ABI FullDependence final : public Dependence { /// DependenceInfo - This class is the main dependence-analysis driver. class DependenceInfo { public: - DependenceInfo(Function *F, AAResults *AA, ScalarEvolution *SE, LoopInfo *LI) - : AA(AA), SE(SE), LI(LI), F(F) {} + DependenceInfo(Function *F, AAResults *AA, ScalarEvolution *SE, LoopInfo *LI, + BatchDelinearization *BD = nullptr) + : AA(AA), SE(SE), LI(LI), F(F), BatchDelin(BD) {} /// Handle transitive invalidation when the cached analysis results go away. LLVM_ABI bool invalidate(Function &F, const PreservedAnalyses &PA, @@ -355,11 +357,19 @@ class DependenceInfo { Function *getFunction() const { return F; } + /// setBatchDelinearization - Set the BatchDelinearization instance to use + /// for cached delinearization results. \p BD is not owned and may be null; it must outlive this object or be reset to nullptr first. + void setBatchDelinearization(BatchDelinearization *BD) { BatchDelin = BD; } + + /// getBatchDelinearization - Get the BatchDelinearization instance (may be null). + BatchDelinearization *getBatchDelinearization() const { return BatchDelin; } + private: AAResults *AA; ScalarEvolution *SE; LoopInfo *LI; Function *F; + BatchDelinearization *BatchDelin; /// Subscript - This private struct represents a pair of subscripts from /// a pair of potentially multi-dimensional array references. 
We use a diff --git a/llvm/lib/Analysis/DDG.cpp b/llvm/lib/Analysis/DDG.cpp index 0907a7fb021fc..d76f993ab5332 100644 --- a/llvm/lib/Analysis/DDG.cpp +++ b/llvm/lib/Analysis/DDG.cpp @@ -10,6 +10,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/DDG.h" #include "llvm/ADT/SCCIterator.h" +#include "llvm/Analysis/Delinearization.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Support/CommandLine.h" @@ -308,7 +309,9 @@ bool DDGBuilder::shouldCreatePiBlocks() const { return CreatePiBlocks; } DDGAnalysis::Result DDGAnalysis::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR) { Function *F = L.getHeader()->getParent(); - DependenceInfo DI(F, &AR.AA, &AR.SE, &AR.LI); + BatchDelinearization BD(*F, AR.SE, AR.LI); + BD.populate(); + DependenceInfo DI(F, &AR.AA, &AR.SE, &AR.LI, &BD); return std::make_unique(L, AR.LI, DI); } AnalysisKey DDGAnalysis::Key; diff --git a/llvm/lib/Analysis/Delinearization.cpp b/llvm/lib/Analysis/Delinearization.cpp index 7bf83ccf9c172..5a386840231ed 100644 --- a/llvm/lib/Analysis/Delinearization.cpp +++ b/llvm/lib/Analysis/Delinearization.cpp @@ -960,3 +960,140 @@ PreservedAnalyses DelinearizationPrinterPass::run(Function &F, &AM.getResult(F)); return PreservedAnalyses::all(); } + +//===----------------------------------------------------------------------===// +// BatchDelinearization Implementation +//===----------------------------------------------------------------------===// + +/// Return true for a Load or Store instruction. +static bool isLoadOrStore(const Instruction *I) { + return isa(I) || isa(I); +} + +void BatchDelinearization::populate() { + if (Populated) + return; + + Populated = true; + + // Step 1: Collect all load/store accesses grouped by base pointer; accesses whose base is not a SCEVUnknown, or not invariant in the enclosing loop, are skipped. + // Map from base pointer to list of (Instruction, AccessFunction) pairs. 
+ SmallDenseMap, 4>, 8> + AccessesByBase; + + for (Instruction &I : instructions(F)) { + if (!isLoadOrStore(&I)) + continue; + + Value *Ptr = getLoadStorePointerOperand(&I); + Loop *L = LI.getLoopFor(I.getParent()); + const SCEV *AccessFn = SE.getSCEVAtScope(Ptr, L); + const SCEVUnknown *Base = + dyn_cast(SE.getPointerBase(AccessFn)); + + if (!Base) + continue; + + // Only consider accesses where the base is loop invariant. + if (L && !SE.isLoopInvariant(Base, L)) + continue; + + AccessesByBase[Base].push_back({&I, AccessFn}); + } + + // Step 2: For each base pointer, collect terms from ALL accesses and + // compute array dimensions once. + for (auto &Entry : AccessesByBase) { + const SCEVUnknown *Base = Entry.first; + auto &Accesses = Entry.second; + + // Skip if there's only one access - no benefit from batch processing. + if (Accesses.size() < 2) + continue; + + // Determine element size - use the smallest among all accesses; a candidate only replaces the current choice when SCEV can prove it is unsigned-less-than it. + const SCEV *ElemSize = nullptr; + for (auto &Access : Accesses) { + const SCEV *EltSize = SE.getElementSize(Access.first); + if (!ElemSize) + ElemSize = EltSize; + else if (SE.isKnownPredicate(ICmpInst::ICMP_ULT, EltSize, ElemSize)) + ElemSize = EltSize; + } + + if (!ElemSize) + continue; + + ElementSizes[Base] = ElemSize; + + // Collect parametric terms from all affine accesses to this base. + SmallVector Terms; + for (auto &Access : Accesses) { + const SCEV *AccessFn = Access.second; + const SCEV *OffsetSCEV = SE.getMinusSCEV(AccessFn, Base); + const SCEVAddRecExpr *AR = dyn_cast(OffsetSCEV); + if (AR && AR->isAffine()) + collectParametricTerms(SE, AR, Terms); + } + + // Find array dimensions using all collected terms. + SmallVector Sizes; + findArrayDimensions(SE, Terms, Sizes, ElemSize); + + // Skip unless at least two dimensions were found. + if (Sizes.size() < 2) + continue; + + ArraySizes[Base] = Sizes; + + // Pre-compute subscripts for each access using parametric sizes. 
+ for (auto &Access : Accesses) { + Instruction *Inst = Access.first; + const SCEV *AccessFn = Access.second; + const SCEV *OffsetSCEV = SE.getMinusSCEV(AccessFn, Base); + const SCEVAddRecExpr *AR = dyn_cast(OffsetSCEV); + + if (!AR || !AR->isAffine()) + continue; + + SmallVector Subs; + computeAccessFunctions(SE, AR, Subs, Sizes); + + if (Subs.size() >= 2) + Subscripts[Inst] = std::move(Subs); + } + } + + // Step 3: Try fixed-size array delinearization for accesses not yet cached. + // This handles arrays with known compile-time dimensions. NOTE(review): these entries share the Subscripts map with the parametric results from Step 2 and the Sizes computed here are discarded, so a consumer cannot tell which scheme produced a cached entry - confirm this is intended. + for (auto &Entry : AccessesByBase) { + auto &Accesses = Entry.second; + + for (auto &Access : Accesses) { + Instruction *Inst = Access.first; + + // Skip if already cached from parametric delinearization. + if (Subscripts.count(Inst)) + continue; + + const SCEV *AccessFn = Access.second; + const SCEV *ElemSize = SE.getElementSize(Inst); + SmallVector Subs, Sizes; + + if (delinearizeFixedSizeArray(SE, SE.removePointerBase(AccessFn), Subs, + Sizes, ElemSize) && + Subs.size() >= 2) { + Subscripts[Inst] = std::move(Subs); + } + } + } + + LLVM_DEBUG({ + dbgs() << "Batch delinearization cache populated:\n"; + dbgs() << " Base pointers with cached dimensions: " << ArraySizes.size() + << "\n"; + dbgs() << " Instructions with cached subscripts: " << Subscripts.size() + << "\n"; + }); +} diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index 9b9c80a9b3266..a72ae360de464 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -3255,6 +3255,19 @@ bool DependenceInfo::tryDelinearizeFixedSize( "expected src and dst scev unknowns to be equal"); }); + // Try to use cached subscripts from BatchDelinearization. 
NOTE(review): the shortcut below returns true without any validateDelinearizationResult call, unlike the cached parametric path - confirm that skipping validation here is sound.
+ if (BatchDelin && BatchDelin->isPopulated()) { + const auto *SrcSubs = BatchDelin->getSubscripts(Src); + const auto *DstSubs = BatchDelin->getSubscripts(Dst); + if (SrcSubs && DstSubs && SrcSubs->size() >= 2 && DstSubs->size() >= 2 && + SrcSubs->size() == DstSubs->size()) { + SrcSubscripts.assign(SrcSubs->begin(), SrcSubs->end()); + DstSubscripts.assign(DstSubs->begin(), DstSubs->end()); + LLVM_DEBUG(dbgs() << "Using cached fixed-size delinearization results\n"); + return true; + } + } + const SCEV *ElemSize = SE->getElementSize(Src); assert(ElemSize == SE->getElementSize(Dst) && "Different element sizes"); SmallVector SrcSizes, DstSizes; @@ -3328,16 +3341,78 @@ bool DependenceInfo::tryDelinearizeParametricSize( if (!SrcAR || !DstAR || !SrcAR->isAffine() || !DstAR->isAffine()) return false; + SmallVector Sizes; + + // Try to use cached results from BatchDelinearization. + // This provides better precision by using terms from all accesses. + if (BatchDelin && BatchDelin->isPopulated()) { + const auto *CachedSizes = BatchDelin->getArraySizes(SrcBase); + if (CachedSizes) { + // Only use the cache when it was built with this pair's element size. + const SCEV *CachedElemSize = BatchDelin->getElementSize(SrcBase); + if (CachedElemSize && CachedElemSize == ElementSize) { + Sizes.assign(CachedSizes->begin(), CachedSizes->end()); + + // Try to use pre-computed subscripts if available. + const auto *SrcSubs = BatchDelin->getSubscripts(Src); + const auto *DstSubs = BatchDelin->getSubscripts(Dst); + if (SrcSubs && DstSubs) { + SrcSubscripts.assign(SrcSubs->begin(), SrcSubs->end()); + DstSubscripts.assign(DstSubs->begin(), DstSubs->end()); + + if (SrcSubscripts.size() >= 2 && DstSubscripts.size() >= 2 && + SrcSubscripts.size() == DstSubscripts.size()) { + LLVM_DEBUG(dbgs() << "Using cached delinearization results\n"); + + // Validate the cached subscripts. 
+ if (!DisableDelinearizationChecks) + if (!validateDelinearizationResult(*SE, Sizes, SrcSubscripts, + SrcPtr) || + !validateDelinearizationResult(*SE, Sizes, DstSubscripts, + DstPtr)) + return false; + + return true; + } + } + + // Cache had sizes but not pre-computed subscripts for both + // instructions, or the cached subscript lists differ in length. + // Recompute subscripts using cached sizes; a validation failure above returns false and never reaches here. + LLVM_DEBUG(dbgs() << "Using cached array sizes for delinearization\n"); + SrcSubscripts.clear(); + DstSubscripts.clear(); + computeAccessFunctions(*SE, SrcAR, SrcSubscripts, Sizes); + computeAccessFunctions(*SE, DstAR, DstSubscripts, Sizes); + + if (SrcSubscripts.size() >= 2 && DstSubscripts.size() >= 2 && + SrcSubscripts.size() == DstSubscripts.size()) { + if (!DisableDelinearizationChecks) + if (!validateDelinearizationResult(*SE, Sizes, SrcSubscripts, + SrcPtr) || + !validateDelinearizationResult(*SE, Sizes, DstSubscripts, + DstPtr)) + return false; + + return true; + } + } + } + } + + // Fall back to pairwise delinearization. // First step: collect parametric terms in both array references. SmallVector Terms; collectParametricTerms(*SE, SrcAR, Terms); collectParametricTerms(*SE, DstAR, Terms); // Second step: find subscript sizes. - SmallVector Sizes; + Sizes.clear(); findArrayDimensions(*SE, Terms, Sizes, ElementSize); // Third step: compute the access functions for each subscript. 
+ SrcSubscripts.clear(); + DstSubscripts.clear(); computeAccessFunctions(*SE, SrcAR, SrcSubscripts, Sizes); computeAccessFunctions(*SE, DstAR, DstSubscripts, Sizes); diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp index 9ffa602416b05..be74482aedfa0 100644 --- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -47,6 +47,8 @@ #include "llvm/Transforms/Scalar/LoopFuse.h" +#include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/Delinearization.h" #include "llvm/Analysis/DependenceAnalysis.h" #include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/LoopInfo.h" @@ -2143,6 +2145,12 @@ PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) { auto &DT = AM.getResult(F); auto &DI = AM.getResult(F); auto &SE = AM.getResult(F); + BatchDelinearization BD(F, SE, LI); + BD.populate(); + DI.setBatchDelinearization(&BD); + // DI comes from the analysis manager, not from this function, so it can outlive the stack-local BD; detach BD on every exit path so DI never holds a dangling pointer. 
+ auto DetachBD = + llvm::make_scope_exit([&DI] { DI.setBatchDelinearization(nullptr); }); auto &PDT = AM.getResult(F); auto &ORE = AM.getResult(F); auto &AC = AM.getResult(F); diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index 330b4abb9942f..bef285332a6bd 100644 --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -19,6 +19,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/Delinearization.h" #include "llvm/Analysis/DependenceAnalysis.h" #include "llvm/Analysis/LoopCacheAnalysis.h" #include "llvm/Analysis/LoopInfo.h" @@ -2139,7 +2140,9 @@ PreservedAnalyses LoopInterchangePass::run(LoopNest &LN, << "Computed dependence info, invoking the transform."; }); - DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI); + BatchDelinearization BD(F, AR.SE, AR.LI); + BD.populate(); + DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI, &BD); if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &AR, &ORE).run(LN)) return 
PreservedAnalyses::all(); U.markLoopNestChanged(true); diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp index 4fe74c7c3bbcd..34e5b443608b1 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/Delinearization.h" #include "llvm/Analysis/DependenceAnalysis.h" #include "llvm/Analysis/LoopAnalysisManager.h" #include "llvm/Analysis/LoopInfo.h" @@ -457,7 +458,9 @@ PreservedAnalyses LoopUnrollAndJamPass::run(LoopNest &LN, LPMUpdater &U) { Function &F = *LN.getParent(); - DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI); + BatchDelinearization BD(F, AR.SE, AR.LI); + BD.populate(); + DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI, &BD); OptimizationRemarkEmitter ORE(&F); bool AnyLoopRemoved = false; diff --git a/llvm/test/Analysis/DDG/basic-loopnest.ll b/llvm/test/Analysis/DDG/basic-loopnest.ll index 75efff570048b..61003298438f6 100644 --- a/llvm/test/Analysis/DDG/basic-loopnest.ll +++ b/llvm/test/Analysis/DDG/basic-loopnest.ll @@ -1,7 +1,5 @@ ; RUN: opt < %s -disable-output "-passes=print" 2>&1 | FileCheck %s -; XFAIL: * -; At the moment, DependenceAnalysis cannot infer `n` to be positive. ; CHECK-LABEL: 'DDG' for loop 'test1.for.cond1.preheader': diff --git a/llvm/test/Analysis/DependenceAnalysis/BatchDelinearization.ll b/llvm/test/Analysis/DependenceAnalysis/BatchDelinearization.ll new file mode 100644 index 0000000000000..27ef4c6db3f27 --- /dev/null +++ b/llvm/test/Analysis/DependenceAnalysis/BatchDelinearization.ll @@ -0,0 +1,147 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -disable-output "-passes=print" -aa-pipeline=basic-aa 2>&1 \ +; RUN: | FileCheck %s + +; Test case for batch delinearization. 
When multiple accesses to the same +; base pointer are analyzed together, terms from all accesses are collected +; to determine array dimensions, leading to better precision. +; +; This test has three accesses to array A: +; A[i*m + j] (first load) +; A[i*m + j] (store) +; A[(i+1)*m + j] (second load, which provides additional context) +; +; The third access helps provide more terms for delinearization, +; which can improve precision when analyzing the first two accesses. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Three accesses to the same 2D array A[n][m]. +; Batch delinearization collects terms from all accesses. +define void @batch_delin_test(i64 %n, i64 %m, ptr nocapture %A) { +; CHECK-LABEL: 'batch_delin_test' +; CHECK-NEXT: Src: %load1 = load double, ptr %arrayidx1, align 8 --> Dst: %load1 = load double, ptr %arrayidx1, align 8 +; CHECK-NEXT: da analyze - input [* *]! +; CHECK-NEXT: Src: %load1 = load double, ptr %arrayidx1, align 8 --> Dst: store double %add, ptr %arrayidx1, align 8 +; CHECK-NEXT: da analyze - anti [* *|<]! +; CHECK-NEXT: Src: %load1 = load double, ptr %arrayidx1, align 8 --> Dst: %load2 = load double, ptr %arrayidx2, align 8 +; CHECK-NEXT: da analyze - input [<> *]! +; CHECK-NEXT: Src: store double %add, ptr %arrayidx1, align 8 --> Dst: store double %add, ptr %arrayidx1, align 8 +; CHECK-NEXT: da analyze - output [* *]! +; CHECK-NEXT: Src: store double %add, ptr %arrayidx1, align 8 --> Dst: %load2 = load double, ptr %arrayidx2, align 8 +; CHECK-NEXT: da analyze - flow [<> *]! +; CHECK-NEXT: Src: %load2 = load double, ptr %arrayidx2, align 8 --> Dst: %load2 = load double, ptr %arrayidx2, align 8 +; CHECK-NEXT: da analyze - input [* *]! 
+; +entry: + %cmp1 = icmp sgt i64 %n, 0 + %cmp2 = icmp sgt i64 %m, 0 + %cond = and i1 %cmp1, %cmp2 + br i1 %cond, label %loop.i.preheader, label %exit + +loop.i.preheader: + br label %loop.i + +loop.i: + %i = phi i64 [ 0, %loop.i.preheader ], [ %i.next, %loop.i.latch ] + br label %loop.j + +loop.j: + %j = phi i64 [ 0, %loop.i ], [ %j.next, %loop.j ] + ; Compute linear index: i*m + j + %mul1 = mul nsw i64 %i, %m + %idx1 = add nsw i64 %mul1, %j + %arrayidx1 = getelementptr inbounds double, ptr %A, i64 %idx1 + ; First access: load A[i*m + j] + %load1 = load double, ptr %arrayidx1, align 8 + %add = fadd double %load1, 1.0 + ; Second access: store A[i*m + j] + store double %add, ptr %arrayidx1, align 8 + ; Third access at a different index: load A[(i+1)*m + j] + ; This provides additional terms for delinearization. + %i_plus_1 = add nsw i64 %i, 1 + %mul2 = mul nsw i64 %i_plus_1, %m + %idx2 = add nsw i64 %mul2, %j + %arrayidx2 = getelementptr inbounds double, ptr %A, i64 %idx2 + %load2 = load double, ptr %arrayidx2, align 8 + %j.next = add nuw nsw i64 %j, 1 + %j.cond = icmp slt i64 %j.next, %m + br i1 %j.cond, label %loop.j, label %loop.i.latch + +loop.i.latch: + %i.next = add nuw nsw i64 %i, 1 + %i.cond = icmp slt i64 %i.next, %n + br i1 %i.cond, label %loop.i, label %exit + +exit: + ret void +} + +; Test with parametric sizes where batch delinearization helps. +; Two separate loop nests accessing the same array. +define void @batch_delin_two_nests(i64 %n, i64 %m, ptr nocapture %A) { +; CHECK-LABEL: 'batch_delin_two_nests' +; CHECK-NEXT: Src: store double 1.000000e+00, ptr %arrayidx1, align 8 --> Dst: store double 1.000000e+00, ptr %arrayidx1, align 8 +; CHECK-NEXT: da analyze - output [* *]! +; CHECK-NEXT: Src: store double 1.000000e+00, ptr %arrayidx1, align 8 --> Dst: %load = load double, ptr %arrayidx2, align 8 +; CHECK-NEXT: da analyze - flow [|<]! 
+; CHECK-NEXT: Src: %load = load double, ptr %arrayidx2, align 8 --> Dst: %load = load double, ptr %arrayidx2, align 8 +; CHECK-NEXT: da analyze - input [* *]! +; +entry: + %cmp1 = icmp sgt i64 %n, 0 + %cmp2 = icmp sgt i64 %m, 0 + %cond = and i1 %cmp1, %cmp2 + br i1 %cond, label %nest1.i.preheader, label %exit + +; First loop nest: stores to A[i*m + j] +nest1.i.preheader: + br label %nest1.i + +nest1.i: + %i1 = phi i64 [ 0, %nest1.i.preheader ], [ %i1.next, %nest1.i.latch ] + br label %nest1.j + +nest1.j: + %j1 = phi i64 [ 0, %nest1.i ], [ %j1.next, %nest1.j ] + %mul1 = mul nsw i64 %i1, %m + %idx1 = add nsw i64 %mul1, %j1 + %arrayidx1 = getelementptr inbounds double, ptr %A, i64 %idx1 + store double 1.0, ptr %arrayidx1, align 8 + %j1.next = add nuw nsw i64 %j1, 1 + %j1.cond = icmp slt i64 %j1.next, %m + br i1 %j1.cond, label %nest1.j, label %nest1.i.latch + +nest1.i.latch: + %i1.next = add nuw nsw i64 %i1, 1 + %i1.cond = icmp slt i64 %i1.next, %n + br i1 %i1.cond, label %nest1.i, label %nest2.i.preheader + +; Second loop nest: reads from A[k*m + l] +nest2.i.preheader: + br label %nest2.i + +nest2.i: + %i2 = phi i64 [ 0, %nest2.i.preheader ], [ %i2.next, %nest2.i.latch ] + br label %nest2.j + +nest2.j: + %j2 = phi i64 [ 0, %nest2.i ], [ %j2.next, %nest2.j ] + %mul2 = mul nsw i64 %i2, %m + %idx2 = add nsw i64 %mul2, %j2 + %arrayidx2 = getelementptr inbounds double, ptr %A, i64 %idx2 + %load = load double, ptr %arrayidx2, align 8 + %j2.next = add nuw nsw i64 %j2, 1 + %j2.cond = icmp slt i64 %j2.next, %m + br i1 %j2.cond, label %nest2.j, label %nest2.i.latch + +nest2.i.latch: + %i2.next = add nuw nsw i64 %i2, 1 + %i2.cond = icmp slt i64 %i2.next, %n + br i1 %i2.cond, label %nest2.i, label %exit + +exit: + ret void +} + diff --git a/llvm/test/Transforms/LICM/lnicm.ll b/llvm/test/Transforms/LICM/lnicm.ll index e331ab7d39e83..814f964666305 100644 --- a/llvm/test/Transforms/LICM/lnicm.ll +++ b/llvm/test/Transforms/LICM/lnicm.ll @@ -3,9 +3,6 @@ ; RUN: opt 
-aa-pipeline=basic-aa -passes='loop-mssa(lnicm),loop(loop-interchange)' -cache-line-size=64 -S %s | FileCheck %s --check-prefixes LNICM ; RUN: opt -aa-pipeline=basic-aa -passes='loop-mssa(licm),loop(loop-interchange)' -cache-line-size=64 -S %s | FileCheck %s --check-prefixes LICM -; XFAIL: * -; Loop interchange currently fails due to a failure in dependence analysis. - ; This test represents the following function: ; void test(int n, int m, int x[m][n], int y[n], int *z) { ; for (int k = 0; k < n; k++) { diff --git a/llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll b/llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll index 14836ba73433d..a5cd1cb924e84 100644 --- a/llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll +++ b/llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll @@ -58,21 +58,17 @@ for.end19: ret void } +; With batch delinearization, the dependences are now computed correctly. +; The interchange is still not performed, but now because it is not profitable rather than because of dependences. ; CHECK: --- !Analysis ; CHECK-NEXT: Pass: loop-interchange ; CHECK-NEXT: Name: Dependence ; CHECK-NEXT: Function: test01 -; CHECK-NEXT: Args: -; CHECK-NEXT: - String: Computed dependence info, invoking the transform. -; CHECK-NEXT: ... ; CHECK: --- !Missed ; CHECK-NEXT: Pass: loop-interchange -; CHECK-NEXT: Name: Dependence +; CHECK-NEXT: Name: InterchangeNotProfitable ; CHECK-NEXT: Function: test01 -; CHECK-NEXT: Args: -; CHECK-NEXT: - String: All loops have dependencies in all directions. -; CHECK-NEXT: ... ; DELIN: --- !Analysis ; DELIN-NEXT: Pass: loop-interchange @@ -134,21 +130,17 @@ define void @test02(i32 %k, i32 %N) { ret void } +; With batch delinearization, the dependences are now computed correctly +; and the loop can be interchanged (same behavior as DELIN). 
; CHECK: --- !Analysis ; CHECK-NEXT: Pass: loop-interchange ; CHECK-NEXT: Name: Dependence ; CHECK-NEXT: Function: test02 -; CHECK-NEXT: Args: -; CHECK-NEXT: - String: Computed dependence info, invoking the transform. -; CHECK-NEXT: ... -; CHECK: --- !Missed +; CHECK: --- !Passed ; CHECK-NEXT: Pass: loop-interchange -; CHECK-NEXT: Name: Dependence +; CHECK-NEXT: Name: Interchanged ; CHECK-NEXT: Function: test02 -; CHECK-NEXT: Args: -; CHECK-NEXT: - String: All loops have dependencies in all directions. -; CHECK-NEXT: ... ; DELIN: --- !Analysis ; DELIN-NEXT: Pass: loop-interchange @@ -285,13 +277,12 @@ for.end17: ret void } +; With batch delinearization, the dependences are now computed correctly. +; The real reason for not interchanging is that loops are not tightly nested. ; CHECK: --- !Missed ; CHECK-NEXT: Pass: loop-interchange -; CHECK-NEXT: Name: Dependence +; CHECK-NEXT: Name: NotTightlyNested ; CHECK-NEXT: Function: test04 -; CHECK-NEXT: Args: -; CHECK-NEXT: - String: All loops have dependencies in all directions. -; CHECK-NEXT: ... ; DELIN: --- !Missed ; DELIN-NEXT: Pass: loop-interchange diff --git a/llvm/test/Transforms/LoopInterchange/outer-dependency-lte.ll b/llvm/test/Transforms/LoopInterchange/outer-dependency-lte.ll index 4aba99f35678e..c8e79dc169b1d 100644 --- a/llvm/test/Transforms/LoopInterchange/outer-dependency-lte.ll +++ b/llvm/test/Transforms/LoopInterchange/outer-dependency-lte.ll @@ -22,7 +22,7 @@ ; CHECK-NEXT: Name: Dependence ; CHECK-NEXT: Function: f ; CHECK-NEXT: Args: -; CHECK-NEXT: - String: All loops have dependencies in all directions. +; CHECK-NEXT: - String: Cannot interchange loops due to dependences. ; CHECK-NEXT: ...