From 0e41f800886c656ae72eb9e2ad243de4b31b604c Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro@fujitsu.com>
Date: Wed, 5 Feb 2025 13:01:08 +0000
Subject: [PATCH] [LoopInterchange] Avoid using CacheCost if cache line size is
 zero

Profitability decisions based on `CacheCost` sometimes gave unexpected
results when the cache line size was zero, because the cost model is not
meaningful without a cache line size. This patch stops `CacheCost` from
being used in that case and leaves the profitability decision to the
subsequent checks. It also skips computing `CacheCost` altogether in that
case, which may reduce compilation time.
---
 .../lib/Transforms/Scalar/LoopInterchange.cpp | 14 ++++-
 .../LoopInterchange/cache-line-size-zero.ll   | 59 +++++++++++++++++++
 2 files changed, 71 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopInterchange/cache-line-size-zero.ll
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index d88fdf41db7a8..adefad9285e42 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -1130,6 +1130,12 @@ std::optional<bool>
 LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis(
     const DenseMap<const Loop *, unsigned> &CostMap,
     std::unique_ptr<CacheCost> &CC) {
+  // `CacheCost` is not computed when using it is not considered worthwhile
+  // (e.g. the cache line size is zero). In that case, defer the profitability
+  // decision to the checks that follow.
+  if (CC == nullptr)
+    return std::nullopt;
+
   // This is the new cost model returned from loop cache analysis.
   // A smaller index means the loop should be placed an outer loop, and vice
   // versa.
@@ -1773,8 +1779,12 @@ PreservedAnalyses LoopInterchangePass::run(LoopNest &LN,
   });
 
   DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
-  std::unique_ptr<CacheCost> CC =
-      CacheCost::getCacheCost(LN.getOutermostLoop(), AR, DI);
+
+  std::unique_ptr<CacheCost> CC;
+  // If the cache line size is set to zero, it doesn't make sense to use
+  // `CacheCost` for profitability decisions. Avoid computing it in this case.
+  if (AR.TTI.getCacheLineSize() != 0)
+    CC = CacheCost::getCacheCost(LN.getOutermostLoop(), AR, DI);
 
   if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, CC, &ORE).run(LN))
     return PreservedAnalyses::all();
diff --git a/llvm/test/Transforms/LoopInterchange/cache-line-size-zero.ll b/llvm/test/Transforms/LoopInterchange/cache-line-size-zero.ll
new file mode 100644
index 0000000000000..bce47bce52325
--- /dev/null
+++ b/llvm/test/Transforms/LoopInterchange/cache-line-size-zero.ll
@@ -0,0 +1,59 @@
+; RUN: opt %s -passes=loop-interchange -cache-line-size=0 -pass-remarks-output=%t -verify-dom-info -verify-loop-info \
+; RUN:     -pass-remarks=loop-interchange -pass-remarks-missed=loop-interchange -disable-output
+; RUN: FileCheck -input-file %t %s
+
+;; Interchanging the loops below is unprofitable, and that must still be the
+;; decision when the cache line size is zero. The default cache line size can
+;; be zero, e.g., when the target processor is not specified.
+;;
+;; #define N 100
+;; #define M 100
+;;
+;; // Extracted from SingleSource/Benchmarks/Polybench/datamining/correlation/correlation.c
+;; // in llvm-test-suite
+;; void f(double data[N][M], double mean[M], double stddev[M]) {
+;;   for (int i = 0; i < N; i++) {
+;;     for (int j = 0; j < M; j++) {
+;;       data[i][j] -= mean[j];
+;;       data[i][j] /= stddev[j];
+;;     }
+;;   }
+;; }
+
+; CHECK:      --- !Missed
+; CHECK-NEXT: Pass:            loop-interchange
+; CHECK:      Name:            InterchangeNotProfitable
+; CHECK-NEXT: Function:        f
+
+define void @f(ptr noundef captures(none) %data, ptr noundef readonly captures(none) %mean, ptr noundef readonly captures(none) %stddev) {
+entry: ; IR for the two-deep loop nest in the C snippet of the header comment
+  br label %for.cond1.preheader
+
+for.cond1.preheader: ; outer loop header (i)
+  %indvars.iv30 = phi i64 [ 0, %entry ], [ %indvars.iv.next31, %for.cond.cleanup3 ] ; i
+  br label %for.body4
+
+for.cond.cleanup: ; exit of the whole loop nest
+  ret void
+
+for.cond.cleanup3: ; outer loop latch
+  %indvars.iv.next31 = add nuw nsw i64 %indvars.iv30, 1 ; i + 1
+  %exitcond33 = icmp ne i64 %indvars.iv.next31, 100 ; outer trip count is 100
+  br i1 %exitcond33, label %for.cond1.preheader, label %for.cond.cleanup
+
+for.body4: ; inner loop (j): header, body and latch in one block
+  %indvars.iv = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next, %for.body4 ] ; j
+  %arrayidx = getelementptr inbounds nuw double, ptr %mean, i64 %indvars.iv ; &mean[j]
+  %0 = load double, ptr %arrayidx, align 8
+  %arrayidx8 = getelementptr inbounds nuw [100 x double], ptr %data, i64 %indvars.iv30, i64 %indvars.iv ; &data[i][j]
+  %1 = load double, ptr %arrayidx8, align 8
+  %sub = fsub double %1, %0 ; data[i][j] - mean[j]
+  store double %sub, ptr %arrayidx8, align 8
+  %arrayidx10 = getelementptr inbounds nuw double, ptr %stddev, i64 %indvars.iv ; &stddev[j]
+  %2 = load double, ptr %arrayidx10, align 8
+  %div = fdiv double %sub, %2 ; (data[i][j] - mean[j]) / stddev[j]
+  store double %div, ptr %arrayidx8, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 ; j + 1
+  %exitcond = icmp ne i64 %indvars.iv.next, 100 ; inner trip count is 100
+  br i1 %exitcond, label %for.body4, label %for.cond.cleanup3
+}