llvm · kasuga-fj · Jul 8, 2025 · Jul 3, 2025 · Jul 3, 2025 · Jul 7, 2025
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -407,6 +407,33 @@ class LoopInterchangeLegality {
   SmallVector<PHINode *, 8> InnerLoopInductions;
 };
 
+/// Manages information utilized by the profitability check for cache. The main
+/// purpose of this class is to delay the computation of CacheCost until it is
+/// actually needed.
+class LoopInterchangeCacheCostManager {
+  Loop *OutermostLoop;
+  LoopStandardAnalysisResults *AR;
+  DependenceInfo *DI;
+
+  /// CacheCost for \ref OutermostLoop. Once it is computed, it is cached. Note
+  /// that the result can be nullptr.
+  std::optional<std::unique_ptr<CacheCost>> CC;
+
+  /// Maps each loop to an index representing the optimal position within the
+  /// loop-nest, as determined by the cache cost analysis.
+  DenseMap<const Loop *, unsigned> CostMap;
+
+  void computeIfUnitinialized();
+
+public:
+  LoopInterchangeCacheCostManager(Loop *OutermostLoop,
+                                  LoopStandardAnalysisResults *AR,
+                                  DependenceInfo *DI)
+      : OutermostLoop(OutermostLoop), AR(AR), DI(DI) {}
+  std::unique_ptr<CacheCost> &getCacheCost();
+  const DenseMap<const Loop *, unsigned> &getCostMap();
+};
+
 /// LoopInterchangeProfitability checks if it is profitable to interchange the
 /// loop.
 class LoopInterchangeProfitability {
@@ -419,8 +446,7 @@ class LoopInterchangeProfitability {
   bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
                     unsigned InnerLoopId, unsigned OuterLoopId,
                     CharMatrix &DepMatrix,
-                    const DenseMap<const Loop *, unsigned> &CostMap,
-                    std::unique_ptr<CacheCost> &CC);
+                    LoopInterchangeCacheCostManager &LICCM);
 
 private:
   int getInstrOrderCost();
@@ -477,15 +503,15 @@ struct LoopInterchange {
   LoopInfo *LI = nullptr;
   DependenceInfo *DI = nullptr;
   DominatorTree *DT = nullptr;
-  std::unique_ptr<CacheCost> CC = nullptr;
+  LoopStandardAnalysisResults *AR = nullptr;
 
   /// Interface to emit optimization remarks.
   OptimizationRemarkEmitter *ORE;
 
   LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI,
-                  DominatorTree *DT, std::unique_ptr<CacheCost> &CC,
+                  DominatorTree *DT, LoopStandardAnalysisResults *AR,
                   OptimizationRemarkEmitter *ORE)
-      : SE(SE), LI(LI), DI(DI), DT(DT), CC(std::move(CC)), ORE(ORE) {}
+      : SE(SE), LI(LI), DI(DI), DT(DT), AR(AR), ORE(ORE) {}
 
   bool run(Loop *L) {
     if (L->getParentLoop())
@@ -540,19 +566,7 @@ struct LoopInterchange {
     }
 
     unsigned SelecLoopId = selectLoopForInterchange(LoopList);
-    // Obtain the loop vector returned from loop cache analysis beforehand,
-    // and put each <Loop, index> pair into a map for constant time query
-    // later. Indices in loop vector reprsent the optimal order of the
-    // corresponding loop, e.g., given a loopnest with depth N, index 0
-    // indicates the loop should be placed as the outermost loop and index N
-    // indicates the loop should be placed as the innermost loop.
-    //
-    // For the old pass manager CacheCost would be null.
-    DenseMap<const Loop *, unsigned> CostMap;
-    if (CC != nullptr) {
-      for (const auto &[Idx, Cost] : enumerate(CC->getLoopCosts()))
-        CostMap[Cost.first] = Idx;
-    }
+    LoopInterchangeCacheCostManager LICCM(LoopList[0], AR, DI);
     // We try to achieve the globally optimal memory access for the loopnest,
     // and do interchange based on a bubble-sort fasion. We start from
     // the innermost loop, move it outwards to the best possible position
@@ -561,7 +575,7 @@ struct LoopInterchange {
       bool ChangedPerIter = false;
       for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
         bool Interchanged =
-            processLoop(LoopList, i, i - 1, DependencyMatrix, CostMap);
+            processLoop(LoopList, i, i - 1, DependencyMatrix, LICCM);
         ChangedPerIter |= Interchanged;
         Changed |= Interchanged;
       }
@@ -576,7 +590,7 @@ struct LoopInterchange {
   bool processLoop(SmallVectorImpl<Loop *> &LoopList, unsigned InnerLoopId,
                    unsigned OuterLoopId,
                    std::vector<std::vector<char>> &DependencyMatrix,
-                   const DenseMap<const Loop *, unsigned> &CostMap) {
+                   LoopInterchangeCacheCostManager &LICCM) {
     Loop *OuterLoop = LoopList[OuterLoopId];
     Loop *InnerLoop = LoopList[InnerLoopId];
     LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
@@ -589,7 +603,7 @@ struct LoopInterchange {
     LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
     LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
     if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
-                          DependencyMatrix, CostMap, CC)) {
+                          DependencyMatrix, LICCM)) {
       LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
       return false;
     }
@@ -1122,6 +1136,36 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
   return true;
 }
 
+void LoopInterchangeCacheCostManager::computeIfUnitinialized() {
+  if (CC.has_value())
+    return;
+
+  LLVM_DEBUG(dbgs() << "Compute CacheCost.\n");
+  CC = CacheCost::getCacheCost(*OutermostLoop, *AR, *DI);
+  // Obtain the loop vector returned from loop cache analysis beforehand,
+  // and put each <Loop, index> pair into a map for constant time query
+  // later. Indices in loop vector reprsent the optimal order of the
+  // corresponding loop, e.g., given a loopnest with depth N, index 0
+  // indicates the loop should be placed as the outermost loop and index N
+  // indicates the loop should be placed as the innermost loop.
+  //
+  // For the old pass manager CacheCost would be null.
+  if (*CC != nullptr)
+    for (const auto &[Idx, Cost] : enumerate((*CC)->getLoopCosts()))
+      CostMap[Cost.first] = Idx;
+}
+
+std::unique_ptr<CacheCost> &LoopInterchangeCacheCostManager::getCacheCost() {
+  computeIfUnitinialized();
+  return *CC;
+}
+
+const DenseMap<const Loop *, unsigned> &
+LoopInterchangeCacheCostManager::getCostMap() {
+  computeIfUnitinialized();
+  return CostMap;
+}
+
 int LoopInterchangeProfitability::getInstrOrderCost() {
   unsigned GoodOrder, BadOrder;
   BadOrder = GoodOrder = 0;
@@ -1247,8 +1291,7 @@ std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
 bool LoopInterchangeProfitability::isProfitable(
     const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
     unsigned OuterLoopId, CharMatrix &DepMatrix,
-    const DenseMap<const Loop *, unsigned> &CostMap,
-    std::unique_ptr<CacheCost> &CC) {
+    LoopInterchangeCacheCostManager &LICCM) {
   // isProfitable() is structured to avoid endless loop interchange. If the
   // highest priority rule (isProfitablePerLoopCacheAnalysis by default) could
   // decide the profitability then, profitability check will stop and return the
@@ -1261,9 +1304,12 @@ bool LoopInterchangeProfitability::isProfitable(
   std::optional<bool> shouldInterchange;
   for (RuleTy RT : Profitabilities) {
     switch (RT) {
-    case RuleTy::PerLoopCacheAnalysis:
+    case RuleTy::PerLoopCacheAnalysis: {
+      std::unique_ptr<CacheCost> &CC = LICCM.getCacheCost();
+      const DenseMap<const Loop *, unsigned> &CostMap = LICCM.getCostMap();
       shouldInterchange = isProfitablePerLoopCacheAnalysis(CostMap, CC);
       break;
+    }
     case RuleTy::PerInstrOrderCost:
       shouldInterchange = isProfitablePerInstrOrderCost();
       break;
@@ -1841,10 +1887,7 @@ PreservedAnalyses LoopInterchangePass::run(LoopNest &LN,
   });
 
   DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
-  std::unique_ptr<CacheCost> CC =
-      CacheCost::getCacheCost(LN.getOutermostLoop(), AR, DI);
-
-  if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, CC, &ORE).run(LN))
+  if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &AR, &ORE).run(LN))
     return PreservedAnalyses::all();
   U.markLoopNestChanged(true);
   return getLoopPassPreservedAnalyses();

diff --git a/llvm/test/Transforms/LoopInterchange/delay-cachecost-calculation.ll b/llvm/test/Transforms/LoopInterchange/delay-cachecost-calculation.ll
@@ -0,0 +1,77 @@
+; REQUIRES: asserts
+
+; RUN: opt -passes=loop-interchange -debug -disable-output %s 2>&1 | FileCheck %s
+
+@A = global [16 x [16 x i32]] zeroinitializer
+
+; Check that the CacheCost is calculated only when required. In this case, it
+; is computed after passing the legality check.
+;
+; for (i = 0; i < 16; i++)
+;   for (j = 0; j < 16; j++)
+;     A[j][i] += 1;
+
+; CHECK: Loops are legal to interchange
+; CHECK: Compute CacheCost
+define void @legal_to_interchange() {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.next, %for.j ]
+  %idx = getelementptr inbounds [16 x [16 x i32]], ptr @A, i32 %j, i32 %i
+  %val = load i32, ptr %idx
+  %inc = add i32 %val, 1
+  store i32 %inc, ptr %idx
+  %j.next = add i32 %j, 1
+  %j.exit = icmp eq i32 %j.next, 16
+  br i1 %j.exit, label %for.i.latch, label %for.j
+
+for.i.latch:
+  %i.next = add i32 %i, 1
+  %i.exit = icmp eq i32 %i.next, 16
+  br i1 %i.exit, label %exit, label %for.i.header
+
+exit:
+  ret void
+}
+
+; Check that the CacheCost is not calculated when not required. In this case,
+; the legality check always fails so that we do not need to compute the
+; CacheCost.
+;
+; for (i = 0; i < 16; i++)
+;   for (j = 0; j < 16; j++)
+;     A[j][i] = A[i][j];
+
+; CHECK-NOT: Compute CacheCost
+define void @illegal_to_interchange() {
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %for.i.latch ]
+  br label %for.j
+
+for.j:
+  %j = phi i32 [ 0, %for.i.header ], [ %j.next, %for.j ]
+  %idx.load = getelementptr inbounds [16 x [16 x i32]], ptr @A, i32 %i, i32 %j
+  %idx.store = getelementptr inbounds [16 x [16 x i32]], ptr @A, i32 %j, i32 %i
+  %val = load i32, ptr %idx.load
+  store i32 %val, ptr %idx.store
+  %j.next = add i32 %j, 1
+  %j.exit = icmp eq i32 %j.next, 16
+  br i1 %j.exit, label %for.i.latch, label %for.j
+
+for.i.latch:
+  %i.next = add i32 %i, 1
+  %i.exit = icmp eq i32 %i.next, 16
+  br i1 %i.exit, label %exit, label %for.i.header
+
+exit:
+  ret void
+}