llvm · kasuga-fj · Feb 17, 2025 · Mar 3, 2025 · Mar 10, 2025 · Mar 10, 2025
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -51,6 +51,18 @@ using namespace llvm;
 
 #define DEBUG_TYPE "loop-interchange"
 
+/// @{
+/// Names of the follow-up metadata applied to loops after an interchange.
+static const char *const LLVMLoopInterchangeFollowupAll =
+    "llvm.loop.interchange.followup_all";
+static const char *const LLVMLoopInterchangeFollowupNextOuter =
+    "llvm.loop.interchange.followup_next_outer";
+static const char *const LLVMLoopInterchangeFollowupOuter =
+    "llvm.loop.interchange.followup_outer";
+static const char *const LLVMLoopInterchangeFollowupInner =
+    "llvm.loop.interchange.followup_inner";
+/// @}
+
 STATISTIC(LoopsInterchanged, "Number of loops interchanged");
 
 static cl::opt<int> LoopInterchangeCostThreshold(
@@ -65,6 +77,14 @@ static cl::opt<unsigned int> MaxMemInstrCount(
         "in the dependency matrix. Higher value may lead to more interchanges "
         "at the cost of compile-time"));
 
+// If set, interchange only loops explicitly enabled via loop metadata.
+// TODO: Once this pass is enabled by default, remove this option and use the
+// value of PipelineTuningOptions.
+static cl::opt<bool> OnlyWhenForced(
+    "loop-interchange-only-when-forced", cl::init(false), cl::ReallyHidden,
+    cl::desc(
+        "Apply interchanges only when explicitly specified metadata exists"));
+
 namespace {
 
 using LoopVector = SmallVector<Loop *, 8>;
@@ -297,6 +317,16 @@ static bool isComputableLoopNest(ScalarEvolution *SE,
   return true;
 }
 
+static std::optional<bool> findMetadata(Loop *L) {
+  // Returns the flag of "llvm.loop.interchange.enable", or std::nullopt if the
+  // attribute is absent or carries no ConstantInt value (rather than crashing
+  // on such metadata in builds where asserts are compiled out).
+  auto Value = findStringMetadataForLoop(L, "llvm.loop.interchange.enable");
+  if (!Value || !*Value || !mdconst::hasa<ConstantInt>(**Value))
+    return std::nullopt;
+  return mdconst::extract<ConstantInt>(**Value)->getZExtValue() != 0;
+}
+
 namespace {
 
 /// LoopInterchangeLegality checks if it is legal to interchange the loop.
@@ -504,6 +534,12 @@ struct LoopInterchange {
         CostMap[LoopCosts[i].first] = i;
       }
     }
+
+    // If OnlyWhenForced is true, only process loops for which interchange is
+    // explicitly enabled.
+    if (OnlyWhenForced)
+      return processEnabledLoop(LoopList, DependencyMatrix, CostMap);
+
     // We try to achieve the globally optimal memory access for the loopnest,
     // and do interchange based on a bubble-sort fasion. We start from
     // the innermost loop, move it outwards to the best possible position
@@ -532,6 +568,10 @@ struct LoopInterchange {
     Loop *InnerLoop = LoopList[InnerLoopId];
     LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
                       << " and OuterLoopId = " << OuterLoopId << "\n");
+    if (findMetadata(OuterLoop) == false || findMetadata(InnerLoop) == false) {
+      LLVM_DEBUG(dbgs() << "Not interchanging loops. It is disabled.\n");
+      return false;
+    }
     LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE);
     if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
       LLVM_DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n");
@@ -569,6 +609,152 @@ struct LoopInterchange {
 
     return true;
   }
+
+  /// Interchanges loops explicitly enabled by metadata, innermost first, and
+  /// applies follow-up metadata. Returns true if any loop was interchanged.
+  bool processEnabledLoop(SmallVectorImpl<Loop *> &LoopList,
+                          std::vector<std::vector<char>> &DependencyMatrix,
+                          const DenseMap<const Loop *, unsigned> &CostMap) {
+    bool Changed = false;
+
+    // Manage the index so that LoopList[Loop2Index[L]] == L for each loop L.
+    DenseMap<Loop *, unsigned> Loop2Index;
+    for (unsigned I = 0; I != LoopList.size(); I++)
+      Loop2Index[LoopList[I]] = I;
+
+    // Hold outer loops to be exchanged (i.e., loops whose
+    // "llvm.loop.interchange.enable" is true), in the current nest order.
+    SmallVector<Loop *, 4> Worklist;
+
+    // Helper function that tries to add a new loop to the Worklist. Returns
+    // false if the loop overlaps with the last loop already in the Worklist.
+    auto AddLoopIfEnabled = [&](Loop *L) {
+      if (findMetadata(L) == true) {
+        if (!Worklist.empty()) {
+          // Because the loops are sorted in the order of the current nest, it
+          // is sufficient to compare with the last element.
+          unsigned InnerLoopId = Loop2Index[Worklist.back()] + 1;
+          unsigned OuterLoopId = Loop2Index[L];
+          if (OuterLoopId <= InnerLoopId) {
+            ORE->emit([&]() {
+              return OptimizationRemarkMissed(DEBUG_TYPE, "AmbiguousOrder",
+                                              L->getStartLoc(), L->getHeader())
+                     << "The loops to be interchanged are overlapping.";
+            });
+            return false;
+          }
+        }
+        Worklist.push_back(L);
+      }
+      return true;
+    };
+
+    // Initialize Worklist. To process the loops in inner-loop-first order, add
+    // them to the worklist in the outer-loop-first order.
+    for (unsigned I = 0; I != LoopList.size(); I++)
+      if (!AddLoopIfEnabled(LoopList[I]))
+        return Changed;
+
+    // Bound the number of transformations to avoid an infinite loop. The
+    // current value (square of the size of LoopList) has no deep meaning.
+    // TODO: Is this really necessary?
+    const unsigned MaxAttemptsCount = LoopList.size() * LoopList.size();
+    unsigned Attempts = 0;
+
+    // Process the loops. An exchange is applied to two loops, but metadata can
+    // be replaced on three loops: the two loops plus the next outer loop, if it
+    // exists. This is needed to express the order in which interchanges are
+    // applied when the target loops to be exchanged are overlapping, e.g.,
+    //
+    // #pragma clang loop interchange(enable)
+    // for(int i=0;i<N;i++)
+    //   #pragma clang loop interchange(enable)
+    //   for (int j=0;j<N;j++)
+    //     for (int k=0;k<N;k++)
+    //       ...
+    //
+    // In this case we will exchange the innermost two loops first; the
+    // follow-up metadata that enables interchange is attached to the outermost
+    // loop, and that loop is enqueued as the next candidate to be processed.
+    while (!Worklist.empty() && Attempts < MaxAttemptsCount) {
+      Loop *TargetLoop = Worklist.pop_back_val();
+      assert(findMetadata(TargetLoop) == true &&
+             "Some metadata was unexpectedly removed");
+      unsigned OuterLoopId = Loop2Index[TargetLoop];
+      unsigned InnerLoopId = OuterLoopId + 1;
+      if (InnerLoopId >= LoopList.size()) {
+        ORE->emit([&]() {
+          return OptimizationRemarkMissed(DEBUG_TYPE, "InnermostLoop",
+                                          TargetLoop->getStartLoc(),
+                                          TargetLoop->getHeader())
+                 << "The metadata is invalid with an innermost loop.";
+        });
+        break;
+      }
+      MDNode *LoopID = TargetLoop->getLoopID();
+      bool Interchanged = processLoop(LoopList, InnerLoopId, OuterLoopId,
+                                      DependencyMatrix, CostMap);
+      if (!Interchanged) {
+        ORE->emit([&]() {
+          return OptimizationRemarkMissed(DEBUG_TYPE, "NotInterchanged",
+                                          TargetLoop->getStartLoc(),
+                                          TargetLoop->getHeader())
+                 << "Failed to perform explicitly specified loop interchange.";
+        });
+        break;
+      }
+
+      // The next outer loop, or nullptr if TargetLoop is the outermost one.
+      Loop *NextOuterLoop = nullptr;
+      if (0 < OuterLoopId)
+        NextOuterLoop = LoopList[OuterLoopId - 1];
+      Loop *OuterLoop = LoopList[OuterLoopId];
+      Loop *InnerLoop = LoopList[InnerLoopId];
+      Attempts++;
+      Changed = true;
+      Loop2Index[OuterLoop] = OuterLoopId;
+      Loop2Index[InnerLoop] = InnerLoopId;
+
+      // Update the metadata.
+      std::optional<MDNode *> MDNextOuterLoopID =
+          makeFollowupLoopID(LoopID, {LLVMLoopInterchangeFollowupAll,
+                                      LLVMLoopInterchangeFollowupNextOuter});
+      std::optional<MDNode *> MDOuterLoopID =
+          makeFollowupLoopID(LoopID, {LLVMLoopInterchangeFollowupAll,
+                                      LLVMLoopInterchangeFollowupOuter});
+      std::optional<MDNode *> MDInnerLoopID =
+          makeFollowupLoopID(LoopID, {LLVMLoopInterchangeFollowupAll,
+                                      LLVMLoopInterchangeFollowupInner});
+      if (MDNextOuterLoopID) {
+        if (NextOuterLoop) {
+          NextOuterLoop->setLoopID(*MDNextOuterLoopID);
+        } else {
+          LLVM_DEBUG(dbgs()
+                     << "New metadata for the next outer loop is ignored.\n");
+        }
+      }
+      if (MDOuterLoopID)
+        OuterLoop->setLoopID(*MDOuterLoopID);
+      if (MDInnerLoopID)
+        InnerLoop->setLoopID(*MDInnerLoopID);
+
+      // Add new elements, paying attention to the order.
+      bool Valid = true;
+      if (NextOuterLoop)
+        Valid &= AddLoopIfEnabled(NextOuterLoop);
+      Valid &= AddLoopIfEnabled(OuterLoop);
+      Valid &= AddLoopIfEnabled(InnerLoop);
+      if (!Valid)
+        break;
+    }
+
+    LLVM_DEBUG({
+      if (!Worklist.empty() && Attempts >= MaxAttemptsCount)
+        dbgs() << "Some metadata was ignored because the maximum number of "
+                  "attempts was reached.\n";
+    });
+    return Changed;
+  }
 };
 
 } // end anonymous namespace

diff --git a/llvm/test/Transforms/LoopInterchange/metadata-disable.ll b/llvm/test/Transforms/LoopInterchange/metadata-disable.ll
@@ -0,0 +1,109 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=loop-interchange -loop-interchange-only-when-forced=0 --cache-line-size=64 -S < %s | FileCheck %s
+
+; Check that the interchange is not applied to the loop that is disabled by
+; metadata. The original code is as below:
+;
+; for (int i=0; i<128; i++)
+;   for (int j=0; j<128; j++)
+;     #pragma clang loop interchange(disable)
+;     for (int k=0; k<128; k++)
+;       for (int l=0; l<128; l++)
+;         a[l][k][j][i]++;
+;
+; Since interchange is not applied to the k-loop, the pair (i, j) is the
+; only candidate for exchange.
+
+@a = dso_local local_unnamed_addr global [128 x [128 x [128 x [128 x i32]]]] zeroinitializer, align 4
+
+define void @f() {
+; CHECK-LABEL: define void @f() {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br label %[[FOR_J_HEADER_PREHEADER:.*]]
+; CHECK:       [[FOR_I_HEADER_PREHEADER:.*]]:
+; CHECK-NEXT:    br label %[[FOR_I_HEADER:.*]]
+; CHECK:       [[FOR_I_HEADER]]:
+; CHECK-NEXT:    [[IV_I:%.*]] = phi i64 [ [[IV_I_NEXT:%.*]], %[[FOR_I_CLEANUP:.*]] ], [ 0, %[[FOR_I_HEADER_PREHEADER]] ]
+; CHECK-NEXT:    br label %[[FOR_K_HEADER:.*]]
+; CHECK:       [[FOR_J_HEADER_PREHEADER]]:
+; CHECK-NEXT:    br label %[[FOR_J_HEADER:.*]]
+; CHECK:       [[FOR_J_HEADER]]:
+; CHECK-NEXT:    [[IV_J:%.*]] = phi i64 [ [[IV_J_NEXT:%.*]], %[[FOR_J_CLEANUP:.*]] ], [ 0, %[[FOR_J_HEADER_PREHEADER]] ]
+; CHECK-NEXT:    br label %[[FOR_I_HEADER_PREHEADER]]
+; CHECK:       [[FOR_K_HEADER]]:
+; CHECK-NEXT:    [[IV_K:%.*]] = phi i64 [ 0, %[[FOR_I_HEADER]] ], [ [[IV_K_NEXT:%.*]], %[[FOR_K_CLEANUP:.*]] ]
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[IV_L:%.*]] = phi i64 [ 0, %[[FOR_K_HEADER]] ], [ [[TMP0:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr inbounds nuw [128 x [128 x [128 x [128 x i32]]]], ptr @a, i64 [[IV_L]], i64 [[IV_K]], i64 [[IV_J]], i64 [[IV_I]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[PTR]], align 4
+; CHECK-NEXT:    [[INC:%.*]] = add nuw nsw i32 [[VAL]], 1
+; CHECK-NEXT:    store i32 [[INC]], ptr [[PTR]], align 4
+; CHECK-NEXT:    [[TMP0]] = add nuw nsw i64 [[IV_L]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[TMP0]], 128
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[FOR_K_CLEANUP]], label %[[FOR_BODY]]
+; CHECK:       [[FOR_K_CLEANUP]]:
+; CHECK-NEXT:    [[IV_K_NEXT]] = add nuw nsw i64 [[IV_K]], 1
+; CHECK-NEXT:    [[EXITCOND_K:%.*]] = icmp eq i64 [[IV_K_NEXT]], 128
+; CHECK-NEXT:    br i1 [[EXITCOND_K]], label %[[FOR_I_CLEANUP]], label %[[FOR_K_HEADER]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[FOR_J_CLEANUP]]:
+; CHECK-NEXT:    [[IV_J_NEXT]] = add nuw nsw i64 [[IV_J]], 1
+; CHECK-NEXT:    [[EXITCOND_J:%.*]] = icmp eq i64 [[IV_J_NEXT]], 128
+; CHECK-NEXT:    br i1 [[EXITCOND_J]], label %[[EXIT:.*]], label %[[FOR_J_HEADER]]
+; CHECK:       [[FOR_I_CLEANUP]]:
+; CHECK-NEXT:    [[IV_I_NEXT]] = add nuw nsw i64 [[IV_I]], 1
+; CHECK-NEXT:    [[EXITCOND_I:%.*]] = icmp eq i64 [[IV_I_NEXT]], 128
+; CHECK-NEXT:    br i1 [[EXITCOND_I]], label %[[FOR_J_CLEANUP]], label %[[FOR_I_HEADER]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.i.header
+
+for.i.header:
+  %iv.i = phi i64 [ 0, %entry ], [ %iv.i.next, %for.i.cleanup ]
+  br label %for.j.header
+
+for.j.header:
+  %iv.j = phi i64 [ 0, %for.i.header ], [ %iv.j.next, %for.j.cleanup ]
+  br label %for.k.header
+
+for.k.header:
+  %iv.k = phi i64 [ 0, %for.j.header ], [ %iv.k.next, %for.k.cleanup ]
+  br label %for.body
+
+for.body:
+  %iv.l = phi i64 [ 0, %for.k.header ], [ %iv.l.next, %for.body ]
+  %ptr = getelementptr inbounds nuw [128 x [128 x [128 x [128 x i32]]]], ptr @a, i64 %iv.l, i64 %iv.k, i64 %iv.j, i64 %iv.i
+  %val = load i32, ptr %ptr, align 4
+  %inc = add nuw nsw i32 %val, 1
+  store i32 %inc, ptr %ptr, align 4
+  %iv.l.next = add nuw nsw i64 %iv.l, 1
+  %exitcond.l = icmp eq i64 %iv.l.next, 128
+  br i1 %exitcond.l, label %for.k.cleanup, label %for.body
+
+for.k.cleanup:
+  %iv.k.next = add nuw nsw i64 %iv.k, 1
+  %exitcond.k = icmp eq i64 %iv.k.next, 128
+  br i1 %exitcond.k, label %for.j.cleanup, label %for.k.header, !llvm.loop !0
+
+for.j.cleanup:
+  %iv.j.next = add nuw nsw i64 %iv.j, 1
+  %exitcond.j = icmp eq i64 %iv.j.next, 128
+  br i1 %exitcond.j, label %for.i.cleanup, label %for.j.header
+
+for.i.cleanup:
+  %iv.i.next = add nuw nsw i64 %iv.i, 1
+  %exitcond.i = icmp eq i64 %iv.i.next, 128
+  br i1 %exitcond.i, label %exit, label %for.i.header
+
+exit:
+  ret void
+}
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.interchange.enable", i1 false}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.interchange.enable", i1 false}
+;.