From 23b3a7fe966996aa5c6dd9bb3b9e18c840a33075 Mon Sep 17 00:00:00 2001
From: linuxlonelyeagle <2020382038@qq.com>
Date: Mon, 17 Feb 2025 16:58:22 +0800
Subject: [PATCH 1/8] Support unrolling of affine.for loops nested in gpu.launch.

---
 .../Dialect/Affine/Analysis/LoopAnalysis.h    |   4 +
 mlir/include/mlir/Dialect/Affine/LoopUtils.h  |   3 +
 mlir/include/mlir/Dialect/GPU/IR/GPUOps.td    |   6 +
 .../Dialect/Affine/Analysis/LoopAnalysis.cpp  | 110 +++++++++++++++---
 mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp   |  57 +++++++--
 mlir/lib/Dialect/GPU/IR/GPUDialect.cpp        |  20 ++++
 mlir/test/Dialect/Affine/unroll.mlir          | 110 ++++++++++++++++++
 7 files changed, 285 insertions(+), 25 deletions(-)
diff --git a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
index ed3c21d952a01..2bd540b9af2eb 100644
--- a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
+++ b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
@@ -43,6 +43,10 @@ void getTripCountMapAndOperands(AffineForOp forOp, AffineMap *map,
 /// constant trip count in non-trivial cases.
 std::optional<uint64_t> getConstantTripCount(AffineForOp forOp);
 
+/// In the GPU, the number of trip of each thread in the loop is inconsistent.
+/// This function returns the maximum number of trip.
+std::optional<uint64_t> getMaxConstantTripCount(AffineForOp forOp);
+
 /// Returns the greatest known integral divisor of the trip count. Affine
 /// expression analysis is used (indirectly through getTripCount), and
 /// this method is thus able to determine non-trivial divisors.
diff --git a/mlir/include/mlir/Dialect/Affine/LoopUtils.h b/mlir/include/mlir/Dialect/Affine/LoopUtils.h
index 7fe1f6d48ceeb..1d1d6d94d2382 100644
--- a/mlir/include/mlir/Dialect/Affine/LoopUtils.h
+++ b/mlir/include/mlir/Dialect/Affine/LoopUtils.h
@@ -86,6 +86,9 @@ LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp,
 /// was known to have a single iteration.
 LogicalResult promoteIfSingleIteration(AffineForOp forOp);
 
+/// Eliminate loops that will never actually execute.
+LogicalResult removeInvalidLoop(AffineForOp forOp);
+
 /// Promotes all single iteration AffineForOp's in the Function, i.e., moves
 /// their body into the containing Block.
 void promoteSingleIterationLoops(func::FuncOp f);
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 2b1ce573effd0..940d47c5ef2c8 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1035,6 +1035,12 @@ def GPU_LaunchOp : GPU_Op<"launch", [
     static StringRef getNumWorkgroupAttributionsAttrName() {
       return "workgroup_attributions";
     }
+
+    /// Find BlockSize via the BlockArgument of gpu.launch.
+    Value getBlockSizeOnAxis(Value threadId);
+
+    ///  Find BlockSize via the Dimension Information.
+    Value getBlockSizeOnAxis(Dimension dimension);
   }];
 
   let hasCanonicalizer = 1;
diff --git a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
index 0d4b0ea1668e0..15a5376fa922e 100644
--- a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
+++ b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
@@ -18,6 +18,7 @@
 #include "mlir/Dialect/Affine/Analysis/NestedMatcher.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "llvm/Support/MathExtras.h"
 
 #include "llvm/ADT/DenseSet.h"
@@ -84,6 +85,67 @@ void mlir::affine::getTripCountMapAndOperands(
                             tripCountValueMap.getOperands().end());
 }
 
+/// Replace thread_id with its maximum value, if `replaceWithZero` is true,
+/// thread_id will be replaced by its minimum value 0.
+static void replaceGPUOperands(AffineForOp forOp,
+                               SmallVectorImpl<Value> &operands,
+                               SmallVectorImpl<AffineExpr> &symReplacements,
+                               unsigned numDim, bool replaceWithZero = false) {
+  auto launchOp = forOp->getParentOfType<gpu::LaunchOp>();
+  if (!launchOp)
+    return;
+
+  // `b` is only used to create `AffineExpr`.
+  Builder b(forOp.getContext());
+  unsigned idx = 0;
+
+  for (unsigned i = numDim, e = operands.size(); i < e; ++i) {
+    Value operand = operands[i];
+    if (Value blockSize = launchOp.getBlockSizeOnAxis(operand)) {
+      operands[i] = blockSize;
+      if (!replaceWithZero)
+        symReplacements.push_back(b.getAffineSymbolExpr(idx++) - 1);
+      else
+        symReplacements.push_back(b.getAffineConstantExpr(0));
+      continue;
+    }
+
+    Operation *defOp = operand.getDefiningOp();
+    if (!defOp) {
+      ++idx;
+      continue;
+    }
+
+    if (auto threadIdOp = mlir::dyn_cast<gpu::ThreadIdOp>(defOp)) {
+      gpu::Dimension dimension = threadIdOp.getDimension();
+      operands[i] = launchOp.getBlockSizeOnAxis(dimension);
+      if (!replaceWithZero)
+        symReplacements.push_back(b.getAffineSymbolExpr(idx++) - 1);
+      else
+        symReplacements.push_back(b.getAffineConstantExpr(0));
+      continue;
+    }
+    ++idx;
+  }
+}
+
+/// Take the min if all trip counts are constant.
+static std::optional<uint64_t>
+getConstantTripCountFromAffineMap(AffineMap map) {
+  std::optional<uint64_t> tripCount;
+  for (auto resultExpr : map.getResults()) {
+    auto constExpr = dyn_cast<AffineConstantExpr>(resultExpr);
+    if (!constExpr)
+      return std::nullopt;
+    if (tripCount.has_value())
+      tripCount =
+          std::min(*tripCount, static_cast<uint64_t>(constExpr.getValue()));
+    else
+      tripCount = constExpr.getValue();
+  }
+  return tripCount;
+}
+
 /// Returns the trip count of the loop if it's a constant, std::nullopt
 /// otherwise. This method uses affine expression analysis (in turn using
 /// getTripCount) and is able to determine constant trip count in non-trivial
@@ -95,20 +157,34 @@ std::optional<uint64_t> mlir::affine::getConstantTripCount(AffineForOp forOp) {
 
   if (!map)
     return std::nullopt;
+  SmallVector<AffineExpr, 4> symReplacements;
+  replaceGPUOperands(forOp, operands, symReplacements, map.getNumDims());
+  map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(),
+                                  map.getNumSymbols());
+  affine::AffineValueMap valueMap(map, operands);
+  (void)valueMap.canonicalize();
+  map = valueMap.getAffineMap();
+  return getConstantTripCountFromAffineMap(map);
+}
 
-  // Take the min if all trip counts are constant.
-  std::optional<uint64_t> tripCount;
-  for (auto resultExpr : map.getResults()) {
-    if (auto constExpr = dyn_cast<AffineConstantExpr>(resultExpr)) {
-      if (tripCount.has_value())
-        tripCount =
-            std::min(*tripCount, static_cast<uint64_t>(constExpr.getValue()));
-      else
-        tripCount = constExpr.getValue();
-    } else
-      return std::nullopt;
-  }
-  return tripCount;
+/// In some scenarios, such as GPU, the number of trip of each thread in the
+/// loop is inconsistent. This function returns the maximum number of trip.
+std::optional<uint64_t>
+mlir::affine::getMaxConstantTripCount(AffineForOp forOp) {
+  SmallVector<Value, 4> operands;
+  AffineMap map;
+  getTripCountMapAndOperands(forOp, &map, &operands);
+
+  if (!map)
+    return std::nullopt;
+  SmallVector<AffineExpr, 4> symReplacements;
+  replaceGPUOperands(forOp, operands, symReplacements, map.getNumDims(), true);
+  map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(),
+                                  map.getNumSymbols());
+  affine::AffineValueMap valueMap(map, operands);
+  (void)valueMap.canonicalize();
+  map = valueMap.getAffineMap();
+  return getConstantTripCountFromAffineMap(map);
 }
 
 /// Returns the greatest known integral divisor of the trip count. Affine
@@ -121,7 +197,13 @@ uint64_t mlir::affine::getLargestDivisorOfTripCount(AffineForOp forOp) {
 
   if (!map)
     return 1;
-
+  SmallVector<AffineExpr, 4> symReplacements;
+  replaceGPUOperands(forOp, operands, symReplacements, map.getNumDims());
+  map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(),
+                                  map.getNumSymbols());
+  affine::AffineValueMap valueMap(map, operands);
+  (void)valueMap.canonicalize();
+  map = valueMap.getAffineMap();
   // The largest divisor of the trip count is the GCD of the individual largest
   // divisors.
   assert(map.getNumResults() >= 1 && "expected one or more results");
diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
index 4e02559a08949..69ceb0f80095b 100644
--- a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
@@ -17,6 +17,7 @@
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
 #include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/IR/IRMapping.h"
@@ -113,11 +114,29 @@ static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
     std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
 }
 
+/// Eliminate loops that will never actually execute
+LogicalResult mlir::affine::removeInvalidLoop(AffineForOp forOp) {
+  std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
+  std::optional<uint64_t> maxTripCount = getMaxConstantTripCount(forOp);
+  if (!tripCount || *tripCount > 0 || !maxTripCount || *maxTripCount > 0)
+    return failure();
+
+  auto iterOperands = forOp.getInits();
+  auto results = forOp.getResults();
+  for (auto [result, operand] : llvm::zip(results, iterOperands))
+    result.replaceAllUsesWith(operand);
+
+  IRRewriter b(forOp);
+  b.eraseOp(forOp);
+  return success();
+}
+
 /// Promotes the loop body of a forOp to its containing block if the forOp
 /// was known to have a single iteration.
 LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
   std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
-  if (!tripCount || *tripCount != 1)
+  std::optional<uint64_t> maxTripCount = getMaxConstantTripCount(forOp);
+  if (!tripCount || *tripCount != 1 || !maxTripCount || *maxTripCount != 1)
     return failure();
 
   // TODO: extend this for arbitrary affine bounds.
@@ -160,7 +179,8 @@ LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
   forOp.getBody()->back().erase();
   parentBlock->getOperations().splice(Block::iterator(forOp),
                                       forOp.getBody()->getOperations());
-  forOp.erase();
+  IRRewriter b(forOp.getContext());
+  b.eraseOp(forOp);
   return success();
 }
 
@@ -884,15 +904,27 @@ void mlir::affine::getTileableBands(
 /// Unrolls this loop completely.
 LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) {
   std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
-  if (mayBeConstantTripCount.has_value()) {
-    uint64_t tripCount = *mayBeConstantTripCount;
-    if (tripCount == 0)
-      return success();
-    if (tripCount == 1)
-      return promoteIfSingleIteration(forOp);
-    return loopUnrollByFactor(forOp, tripCount);
-  }
-  return failure();
+  std::optional<uint64_t> maxMayBeConstantTripCount =
+      getMaxConstantTripCount(forOp);
+
+  if (!mayBeConstantTripCount.has_value() ||
+      !maxMayBeConstantTripCount.has_value())
+    return failure();
+
+  uint64_t tripCount = *mayBeConstantTripCount;
+  uint64_t maxTripCount = *maxMayBeConstantTripCount;
+
+  // The values of Trip are all 0, and the invalid loop is deleted.
+  if (tripCount <= 0 && maxTripCount <= 0)
+    return removeInvalidLoop(forOp);
+
+  // In special cases, such as in a GPU, only some threads execute this loop.
+  if (tripCount == 0 && maxTripCount == 1)
+    return success();
+
+  if (tripCount == 1 && maxTripCount == 1)
+    return promoteIfSingleIteration(forOp);
+  return loopUnrollByFactor(forOp, tripCount);
 }
 
 /// Unrolls this loop by the specified factor or by the trip count (if constant)
@@ -1013,8 +1045,11 @@ LogicalResult mlir::affine::loopUnrollByFactor(
   assert(unrollFactor > 0 && "unroll factor should be positive");
 
   std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
+  std::optional<uint64_t> maxMayBeConstantTripCount =
+      getMaxConstantTripCount(forOp);
   if (unrollFactor == 1) {
     if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
+        maxMayBeConstantTripCount && *maxMayBeConstantTripCount == 1 &&
         failed(promoteIfSingleIteration(forOp)))
       return failure();
     return success();
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index d06f10d3137a1..31051ed7e55a2 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -799,6 +799,26 @@ std::optional<KernelDim3> LaunchOp::getClusterSizeOperandValues() {
   return KernelDim3{operands[6], operands[7], operands[8]};
 }
 
+Value LaunchOp::getBlockSizeOnAxis(Dimension dimension) {
+  if (dimension == Dimension::x)
+    return getBlockSizeX();
+  else if (dimension == Dimension::y)
+    return getBlockSizeY();
+  else
+    return getBlockSizeZ();
+}
+
+Value LaunchOp::getBlockSizeOnAxis(Value threadId) {
+  KernelDim3 threadIds = getThreadIds();
+  if (threadIds.x == threadId)
+    return getBlockSizeX();
+  else if (threadIds.y == threadId)
+    return getBlockSizeY();
+  else if (threadIds.z == threadId)
+    return getBlockSizeZ();
+  return {};
+}
+
 LogicalResult LaunchOp::verify() {
   if (!(hasClusterSize()) &&
       (getClusterSizeX() || getClusterSizeY() || getClusterSizeZ()))
diff --git a/mlir/test/Dialect/Affine/unroll.mlir b/mlir/test/Dialect/Affine/unroll.mlir
index 574e9f41494af..a2bb0b2cac4e3 100644
--- a/mlir/test/Dialect/Affine/unroll.mlir
+++ b/mlir/test/Dialect/Affine/unroll.mlir
@@ -23,6 +23,7 @@
 // UNROLL-BY-4-DAG: [[$MAP5:#map[0-9]*]] = affine_map<(d0)[s0] -> (d0 + s0 + 1)>
 // UNROLL-BY-4-DAG: [[$MAP6:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 * 16 + d1)>
 // UNROLL-BY-4-DAG: [[$MAP11:#map[0-9]*]] = affine_map<(d0) -> (d0)>
+// UNROLL-BY-4-DAG: [[$MAP7:#map[0-9]*]] = affine_map<()[s0] -> (s0 + (((-s0 + 11) ceildiv 2) floordiv 4) * 8)>
 
 // UNROLL-FULL-LABEL: func @loop_nest_simplest() {
 func.func @loop_nest_simplest() {
@@ -258,6 +259,89 @@ gpu.module @unroll_full {
   }
 }
 
+// UNROLL-FULL-LABEL: func @thread_partial_execution
+func.func @thread_partial_execution() {
+  %0 = arith.constant 0 :index
+  %1 = arith.constant 2 : index    
+  // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
+  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
+             threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
+    affine.for %iv = %tx to 3 step 2 iter_args(%arg = %0) -> index {
+      %3 = arith.addi %arg, %0 : index
+      affine.yield %3 : index
+    }
+    // UNROLL-FULL: %{{.*}} = affine.for %{{.*}} = %{{.*}} to 3 step 2 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) {
+    // UNROLL-FULL:   %[[SUM:.*]] = arith.addi %[[ARG]], %[[C0]] : index
+    // UNROLL-FULL:   affine.yield %[[SUM]] : index
+    // UNROLL-FULL: }
+    gpu.terminator
+  }
+  return
+}
+
+// UNROLL-FULL-LABEL: func @invalid_loop
+func.func @invalid_loop() {
+  %0 = arith.constant 0 :index
+  %1 = arith.constant 2 : index
+  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
+             threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
+    %threadid = gpu.thread_id x
+    affine.for %iv = %tx to 0 step 2 iter_args(%arg = %0) -> index {
+      %3 = arith.addi %arg, %0 : index
+      affine.yield %3 : index
+    }
+    gpu.terminator
+    // UNROLL-FULL-CHECK: %{{.*}} = gpu.thread_id  x
+    // UNROLL-FULL-CHECK: gpu.terminator
+  }
+  return
+}
+
+// UNROLL-FULL-LABEL: func @unroll_all_thread
+func.func @unroll_all_thread() {
+  %0 = arith.constant 0 :index
+  %1 = arith.constant 2 : index
+  // UNROLL-FULL-CHECK: %[[C0:.*]] = arith.constant 0 : index
+  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
+             threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
+    %threadid = gpu.thread_id x
+    %4 = affine.for %iv = %threadid to 6 step 2 iter_args(%arg = %0) -> index {
+      %3 = arith.addi %arg, %0 : index
+      affine.yield %3 : index
+    }
+    // UNROLL-FULL-CHECK: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
+    // UNROLL-FULL-CHECK: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
+    // UNROLL-FULL-CHECK: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
+    gpu.terminator
+  }
+  return
+}
+
+// UNROLL-FULL-LABEL:   func.func @partial_unroll_factor_4
+func.func @partial_unroll_factor_4() {
+  %0 = arith.constant 0 :index
+  %1 = arith.constant 2 : index
+  // UNROLL-FULL:           %[[C0:.*]] = arith.constant 0 : index
+  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
+             threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
+    %threadid = gpu.thread_id x
+    affine.for %iv = %threadid to 9 step 2 iter_args(%arg = %0) -> index {
+      %3 = arith.addi %arg, %0 : index
+      affine.yield %3 : index
+    }
+    gpu.terminator
+  }
+  // UNROLL-FULL: %[[ID:.*]] = gpu.thread_id  x
+  // UNROLL-FULL: affine.for %{{.*}} = %[[ID]] to 9 step 8 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) {
+  // UNROLL-FULL:   %[[SUM_0:.*]] = arith.addi %[[ARG]], %[[C0]] : index
+  // UNROLL-FULL:   %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
+  // UNROLL-FULL:   %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
+  // UNROLL-FULL:   %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index
+  // UNROLL-FULL:   affine.yield %[[SUM_3]] : index
+  // UNROLL-FULL: }
+  return
+}
+
 // SHORT-LABEL: func @loop_nest_outer_unroll() {
 func.func @loop_nest_outer_unroll() {
   // SHORT:      affine.for %arg0 = 0 to 4 {
@@ -701,6 +785,32 @@ func.func @unroll_with_iter_args_and_promotion(%arg0 : f32, %arg1 : f32) -> f32
   return %sum : f32
 }
 
+// UNROLL-BY-4-LABEL: func @gpu_launch_unroll_by_factor_4
+func.func @gpu_launch_unroll_by_factor_4() {
+  %0 = arith.constant 0 :index
+  %1 = arith.constant 2 : index
+  // UNROLL-BY-4: %[[C0:.*]] = arith.constant 0 : index
+  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
+             threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
+    %threadid = gpu.thread_id x
+    affine.for %iv = %threadid to 11 step 2 iter_args(%arg = %0) -> index {
+      %3 = arith.addi %arg, %0 : index
+      affine.yield %3 : index
+    }
+    gpu.terminator
+  }
+  // UNROLL-BY-4: %[[ID:.*]] = gpu.thread_id  x
+  // UNROLL-BY-4: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
+  // UNROLL-BY-4: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
+  // UNROLL-BY-4: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
+  // UNROLL-BY-4: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index
+  // UNROLL-BY-4: affine.for %[[VAL_20:.*]] = [[$MAP7]](){{\[}}%[[ID]]] to 11 step 2 iter_args(%[[ARG:.*]] = %[[SUM_3]]) -> (index) {
+  // UNROLL-BY-4:   %[[SUM_4:.*]] = arith.addi %[[ARG]], %[[C0]] : index
+  // UNROLL-BY-4:   affine.yield %[[SUM_4]] : index
+  // UNROLL-BY-4: }
+  return
+}
+
 // UNROLL-FULL: func @unroll_zero_trip_count_case
 func.func @unroll_zero_trip_count_case() {
   // CHECK-NEXT: affine.for %{{.*}} = 0 to 0

From c834f4d70494daa5abac945447b6d307d3900bac Mon Sep 17 00:00:00 2001
From: linuxlonelyeagle <2020382038@qq.com>
Date: Sat, 22 Feb 2025 17:53:19 +0800
Subject: [PATCH 2/8] Remove the feature that eliminates invalid loops.

---
 .../Dialect/Affine/Analysis/LoopAnalysis.h    |  4 +-
 mlir/include/mlir/Dialect/Affine/LoopUtils.h  |  3 -
 mlir/include/mlir/Dialect/GPU/IR/GPUOps.td    |  2 +-
 mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp   | 25 +------
 mlir/test/Dialect/Affine/unroll.mlir          | 68 +++++++------------
 5 files changed, 30 insertions(+), 72 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
index 2bd540b9af2eb..591533d17c960 100644
--- a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
+++ b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
@@ -43,8 +43,8 @@ void getTripCountMapAndOperands(AffineForOp forOp, AffineMap *map,
 /// constant trip count in non-trivial cases.
 std::optional<uint64_t> getConstantTripCount(AffineForOp forOp);
 
-/// In the GPU, the number of trip of each thread in the loop is inconsistent.
-/// This function returns the maximum number of trip.
+/// In some scenarios, such as GPU, the number of trip of each thread in the
+/// loop is inconsistent. This function returns the maximum number of trip.
 std::optional<uint64_t> getMaxConstantTripCount(AffineForOp forOp);
 
 /// Returns the greatest known integral divisor of the trip count. Affine
diff --git a/mlir/include/mlir/Dialect/Affine/LoopUtils.h b/mlir/include/mlir/Dialect/Affine/LoopUtils.h
index 1d1d6d94d2382..7fe1f6d48ceeb 100644
--- a/mlir/include/mlir/Dialect/Affine/LoopUtils.h
+++ b/mlir/include/mlir/Dialect/Affine/LoopUtils.h
@@ -86,9 +86,6 @@ LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp,
 /// was known to have a single iteration.
 LogicalResult promoteIfSingleIteration(AffineForOp forOp);
 
-/// Eliminate loops that will never actually execute.
-LogicalResult removeInvalidLoop(AffineForOp forOp);
-
 /// Promotes all single iteration AffineForOp's in the Function, i.e., moves
 /// their body into the containing Block.
 void promoteSingleIterationLoops(func::FuncOp f);
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 940d47c5ef2c8..fde1ad482ae2d 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1039,7 +1039,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
     /// Find BlockSize via the BlockArgument of gpu.launch.
     Value getBlockSizeOnAxis(Value threadId);
 
-    ///  Find BlockSize via the Dimension Information.
+    /// Find BlockSize via the Dimension Information.
     Value getBlockSizeOnAxis(Dimension dimension);
   }];
 
diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
index 69ceb0f80095b..b6471ac179b22 100644
--- a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
@@ -114,23 +114,6 @@ static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
     std::get<0>(e).replaceAllUsesWith(std::get<1>(e));
 }
 
-/// Eliminate loops that will never actually execute
-LogicalResult mlir::affine::removeInvalidLoop(AffineForOp forOp) {
-  std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
-  std::optional<uint64_t> maxTripCount = getMaxConstantTripCount(forOp);
-  if (!tripCount || *tripCount > 0 || !maxTripCount || *maxTripCount > 0)
-    return failure();
-
-  auto iterOperands = forOp.getInits();
-  auto results = forOp.getResults();
-  for (auto [result, operand] : llvm::zip(results, iterOperands))
-    result.replaceAllUsesWith(operand);
-
-  IRRewriter b(forOp);
-  b.eraseOp(forOp);
-  return success();
-}
-
 /// Promotes the loop body of a forOp to its containing block if the forOp
 /// was known to have a single iteration.
 LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
@@ -914,12 +897,8 @@ LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) {
   uint64_t tripCount = *mayBeConstantTripCount;
   uint64_t maxTripCount = *maxMayBeConstantTripCount;
 
-  // The values of Trip are all 0, and the invalid loop is deleted.
-  if (tripCount <= 0 && maxTripCount <= 0)
-    return removeInvalidLoop(forOp);
-
-  // In special cases, such as in a GPU, only some threads execute this loop.
-  if (tripCount == 0 && maxTripCount == 1)
+  // Trip equals 0, this loop cannot unroll.
+  if (tripCount <= 0)
     return success();
 
   if (tripCount == 1 && maxTripCount == 1)
diff --git a/mlir/test/Dialect/Affine/unroll.mlir b/mlir/test/Dialect/Affine/unroll.mlir
index a2bb0b2cac4e3..ab73c5ac7e9c4 100644
--- a/mlir/test/Dialect/Affine/unroll.mlir
+++ b/mlir/test/Dialect/Affine/unroll.mlir
@@ -270,38 +270,20 @@ func.func @thread_partial_execution() {
       %3 = arith.addi %arg, %0 : index
       affine.yield %3 : index
     }
-    // UNROLL-FULL: %{{.*}} = affine.for %{{.*}} = %{{.*}} to 3 step 2 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) {
-    // UNROLL-FULL:   %[[SUM:.*]] = arith.addi %[[ARG]], %[[C0]] : index
-    // UNROLL-FULL:   affine.yield %[[SUM]] : index
-    // UNROLL-FULL: }
+    // UNROLL-FULL: affine.for %{{.*}} = %{{.*}} to 3 step 2 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) {
+    // UNROLL-FULL-NEXT:   %[[SUM:.*]] = arith.addi %[[ARG]], %[[C0]] : index
+    // UNROLL-FULL-NEXT:   affine.yield %[[SUM]] : index
+    // UNROLL-FULL-NEXT: }
     gpu.terminator
   }
   return
 }
 
-// UNROLL-FULL-LABEL: func @invalid_loop
-func.func @invalid_loop() {
-  %0 = arith.constant 0 :index
-  %1 = arith.constant 2 : index
-  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
-             threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
-    %threadid = gpu.thread_id x
-    affine.for %iv = %tx to 0 step 2 iter_args(%arg = %0) -> index {
-      %3 = arith.addi %arg, %0 : index
-      affine.yield %3 : index
-    }
-    gpu.terminator
-    // UNROLL-FULL-CHECK: %{{.*}} = gpu.thread_id  x
-    // UNROLL-FULL-CHECK: gpu.terminator
-  }
-  return
-}
-
 // UNROLL-FULL-LABEL: func @unroll_all_thread
 func.func @unroll_all_thread() {
   %0 = arith.constant 0 :index
   %1 = arith.constant 2 : index
-  // UNROLL-FULL-CHECK: %[[C0:.*]] = arith.constant 0 : index
+  // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
   gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
              threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
     %threadid = gpu.thread_id x
@@ -309,19 +291,19 @@ func.func @unroll_all_thread() {
       %3 = arith.addi %arg, %0 : index
       affine.yield %3 : index
     }
-    // UNROLL-FULL-CHECK: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
-    // UNROLL-FULL-CHECK: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
-    // UNROLL-FULL-CHECK: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
+    // UNROLL-FULL: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
+    // UNROLL-FULL-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
+    // UNROLL-FULL-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
     gpu.terminator
   }
   return
 }
 
-// UNROLL-FULL-LABEL:   func.func @partial_unroll_factor_4
+// UNROLL-FULL-LABEL: func.func @partial_unroll_factor_4
 func.func @partial_unroll_factor_4() {
   %0 = arith.constant 0 :index
   %1 = arith.constant 2 : index
-  // UNROLL-FULL:           %[[C0:.*]] = arith.constant 0 : index
+  // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
   gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
              threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
     %threadid = gpu.thread_id x
@@ -332,13 +314,13 @@ func.func @partial_unroll_factor_4() {
     gpu.terminator
   }
   // UNROLL-FULL: %[[ID:.*]] = gpu.thread_id  x
-  // UNROLL-FULL: affine.for %{{.*}} = %[[ID]] to 9 step 8 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) {
-  // UNROLL-FULL:   %[[SUM_0:.*]] = arith.addi %[[ARG]], %[[C0]] : index
-  // UNROLL-FULL:   %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
-  // UNROLL-FULL:   %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
-  // UNROLL-FULL:   %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index
-  // UNROLL-FULL:   affine.yield %[[SUM_3]] : index
-  // UNROLL-FULL: }
+  // UNROLL-FULL-NEXT: affine.for %{{.*}} = %[[ID]] to 9 step 8 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) {
+  // UNROLL-FULL-NEXT:   %[[SUM_0:.*]] = arith.addi %[[ARG]], %[[C0]] : index
+  // UNROLL-FULL-NEXT:   %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
+  // UNROLL-FULL-NEXT:   %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
+  // UNROLL-FULL-NEXT:   %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index
+  // UNROLL-FULL-NEXT:   affine.yield %[[SUM_3]] : index
+  // UNROLL-FULL-NEXT: }
   return
 }
 
@@ -800,14 +782,14 @@ func.func @gpu_launch_unroll_by_factor_4() {
     gpu.terminator
   }
   // UNROLL-BY-4: %[[ID:.*]] = gpu.thread_id  x
-  // UNROLL-BY-4: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
-  // UNROLL-BY-4: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
-  // UNROLL-BY-4: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
-  // UNROLL-BY-4: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index
-  // UNROLL-BY-4: affine.for %[[VAL_20:.*]] = [[$MAP7]](){{\[}}%[[ID]]] to 11 step 2 iter_args(%[[ARG:.*]] = %[[SUM_3]]) -> (index) {
-  // UNROLL-BY-4:   %[[SUM_4:.*]] = arith.addi %[[ARG]], %[[C0]] : index
-  // UNROLL-BY-4:   affine.yield %[[SUM_4]] : index
-  // UNROLL-BY-4: }
+  // UNROLL-BY-4-NEXT: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
+  // UNROLL-BY-4-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
+  // UNROLL-BY-4-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
+  // UNROLL-BY-4-NEXT: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index
+  // UNROLL-BY-4-NEXT: affine.for %[[VAL_20:.*]] = [[$MAP7]](){{\[}}%[[ID]]] to 11 step 2 iter_args(%[[ARG:.*]] = %[[SUM_3]]) -> (index) {
+  // UNROLL-BY-4-NEXT:   %[[SUM_4:.*]] = arith.addi %[[ARG]], %[[C0]] : index
+  // UNROLL-BY-4-NEXT:   affine.yield %[[SUM_4]] : index
+  // UNROLL-BY-4-NEXT: }
   return
 }
 

From e865351424ee36285133ee14ceccd924ea21dda3 Mon Sep 17 00:00:00 2001
From: linuxlonelyeagle <2020382038@qq.com>
Date: Wed, 26 Feb 2025 10:55:07 +0800
Subject: [PATCH 3/8] Use IntegerRangeAnalysis and update
 LaunchOp::inferResultRanges.

---
 .../Dialect/Affine/Analysis/LoopAnalysis.h    |  7 +-
 mlir/include/mlir/Dialect/GPU/IR/GPUOps.td    |  6 --
 .../Dialect/Affine/Analysis/LoopAnalysis.cpp  | 90 ++++++++++---------
 mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp   |  7 +-
 mlir/lib/Dialect/GPU/IR/GPUDialect.cpp        | 20 -----
 .../GPU/IR/InferIntRangeInterfaceImpls.cpp    | 34 ++++---
 6 files changed, 74 insertions(+), 90 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
index 591533d17c960..f5b6794d42794 100644
--- a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
+++ b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
@@ -43,9 +43,10 @@ void getTripCountMapAndOperands(AffineForOp forOp, AffineMap *map,
 /// constant trip count in non-trivial cases.
 std::optional<uint64_t> getConstantTripCount(AffineForOp forOp);
 
-/// In some scenarios, such as GPU, the number of trip of each thread in the
-/// loop is inconsistent. This function returns the maximum number of trip.
-std::optional<uint64_t> getMaxConstantTripCount(AffineForOp forOp);
+/// Returns the maximum trip count when the operand of forOp has a range. If the
+/// operand of forOp is a constant, the return value is the same as
+/// `getConstantTripCount`.
+std::optional<uint64_t> getUpperBoundOnTripCount(AffineForOp forOp);
 
 /// Returns the greatest known integral divisor of the trip count. Affine
 /// expression analysis is used (indirectly through getTripCount), and
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index fde1ad482ae2d..2b1ce573effd0 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1035,12 +1035,6 @@ def GPU_LaunchOp : GPU_Op<"launch", [
     static StringRef getNumWorkgroupAttributionsAttrName() {
       return "workgroup_attributions";
     }
-
-    /// Find BlockSize via the BlockArgument of gpu.launch.
-    Value getBlockSizeOnAxis(Value threadId);
-
-    /// Find BlockSize via the Dimension Information.
-    Value getBlockSizeOnAxis(Dimension dimension);
   }];
 
   let hasCanonicalizer = 1;
diff --git a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
index 15a5376fa922e..5ed11d8bde029 100644
--- a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
+++ b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
@@ -12,13 +12,15 @@
 
 #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
 
+#include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h"
+#include "mlir/Analysis/DataFlow/IntegerRangeAnalysis.h"
 #include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
 #include "mlir/Dialect/Affine/Analysis/AffineStructures.h"
 #include "mlir/Dialect/Affine/Analysis/NestedMatcher.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
-#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
 #include "llvm/Support/MathExtras.h"
 
 #include "llvm/ADT/DenseSet.h"
@@ -31,6 +33,7 @@
 
 using namespace mlir;
 using namespace mlir::affine;
+using namespace mlir::dataflow;
 
 #define DEBUG_TYPE "affine-loop-analysis"
 
@@ -85,48 +88,54 @@ void mlir::affine::getTripCountMapAndOperands(
                             tripCountValueMap.getOperands().end());
 }
 
-/// Replace thread_id with its maximum value, if `replaceWithZero` is true,
-/// thread_id will be replaced by its minimum value 0.
-static void replaceGPUOperands(AffineForOp forOp,
-                               SmallVectorImpl<Value> &operands,
-                               SmallVectorImpl<AffineExpr> &symReplacements,
-                               unsigned numDim, bool replaceWithZero = false) {
-  auto launchOp = forOp->getParentOfType<gpu::LaunchOp>();
-  if (!launchOp)
+/// By running `IntegerRangeAnalysis` to get the ranges of operand, then fill
+/// the `symReplacements` with range. If `replaceByMin` is set to true,
+/// construct `replacement` using the smallest value.By default, the largest
+/// value will be used for constructing `replacement`.
+static void replaceOperandByRange(AffineForOp forOp,
+                                  SmallVectorImpl<Value> &operands,
+                                  SmallVectorImpl<AffineExpr> &symReplacements,
+                                  unsigned numDim, bool replaceByMin = false) {
+  DataFlowSolver solver;
+  solver.load<DeadCodeAnalysis>();
+  solver.load<IntegerRangeAnalysis>();
+  if (failed(solver.initializeAndRun(
+          forOp->getParentOfType<FunctionOpInterface>())))
     return;
 
-  // `b` is only used to create `AffineExpr`.
+  // `b` is used to create affineExpr
   Builder b(forOp.getContext());
-  unsigned idx = 0;
-
   for (unsigned i = numDim, e = operands.size(); i < e; ++i) {
     Value operand = operands[i];
-    if (Value blockSize = launchOp.getBlockSizeOnAxis(operand)) {
-      operands[i] = blockSize;
-      if (!replaceWithZero)
-        symReplacements.push_back(b.getAffineSymbolExpr(idx++) - 1);
-      else
-        symReplacements.push_back(b.getAffineConstantExpr(0));
+    auto lattice =
+        solver.lookupState<dataflow::IntegerValueRangeLattice>(operand);
+    if (!lattice) {
+      symReplacements.push_back(b.getAffineSymbolExpr(i - numDim));
       continue;
     }
 
-    Operation *defOp = operand.getDefiningOp();
-    if (!defOp) {
-      ++idx;
+    if (lattice->getValue().isUninitialized()) {
+      symReplacements.push_back(b.getAffineSymbolExpr(i - numDim));
       continue;
     }
 
-    if (auto threadIdOp = mlir::dyn_cast<gpu::ThreadIdOp>(defOp)) {
-      gpu::Dimension dimension = threadIdOp.getDimension();
-      operands[i] = launchOp.getBlockSizeOnAxis(dimension);
-      if (!replaceWithZero)
-        symReplacements.push_back(b.getAffineSymbolExpr(idx++) - 1);
-      else
-        symReplacements.push_back(b.getAffineConstantExpr(0));
+    ConstantIntRanges range = lattice->getValue().getValue();
+    APInt max = range.smax();
+    APInt min = range.smin();
+    unsigned bitNums = max.getBitWidth();
+
+    if (APInt::getSignedMaxValue(bitNums) == max &&
+        APInt::getSignedMinValue(bitNums) == min) {
+      symReplacements.push_back(b.getAffineSymbolExpr(i - numDim));
       continue;
     }
-    ++idx;
+
+    if (!replaceByMin)
+      symReplacements.push_back(b.getAffineConstantExpr(max.getZExtValue()));
+    else
+      symReplacements.push_back(b.getAffineConstantExpr(min.getZExtValue()));
   }
+  return;
 }
 
 /// Take the min if all trip counts are constant.
@@ -158,19 +167,17 @@ std::optional<uint64_t> mlir::affine::getConstantTripCount(AffineForOp forOp) {
   if (!map)
     return std::nullopt;
   SmallVector<AffineExpr, 4> symReplacements;
-  replaceGPUOperands(forOp, operands, symReplacements, map.getNumDims());
+  replaceOperandByRange(forOp, operands, symReplacements, map.getNumDims());
   map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(),
                                   map.getNumSymbols());
-  affine::AffineValueMap valueMap(map, operands);
-  (void)valueMap.canonicalize();
-  map = valueMap.getAffineMap();
   return getConstantTripCountFromAffineMap(map);
 }
 
-/// In some scenarios, such as GPU, the number of trip of each thread in the
-/// loop is inconsistent. This function returns the maximum number of trip.
+/// Returns the maximum trip count when the operand of forOp has a range. If the
+/// operand of forOp is a constant, the return value is the same as
+/// `getConstantTripCount`.
 std::optional<uint64_t>
-mlir::affine::getMaxConstantTripCount(AffineForOp forOp) {
+mlir::affine::getUpperBoundOnTripCount(AffineForOp forOp) {
   SmallVector<Value, 4> operands;
   AffineMap map;
   getTripCountMapAndOperands(forOp, &map, &operands);
@@ -178,12 +185,10 @@ mlir::affine::getMaxConstantTripCount(AffineForOp forOp) {
   if (!map)
     return std::nullopt;
   SmallVector<AffineExpr, 4> symReplacements;
-  replaceGPUOperands(forOp, operands, symReplacements, map.getNumDims(), true);
+  replaceOperandByRange(forOp, operands, symReplacements, map.getNumDims(),
+                        true);
   map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(),
                                   map.getNumSymbols());
-  affine::AffineValueMap valueMap(map, operands);
-  (void)valueMap.canonicalize();
-  map = valueMap.getAffineMap();
   return getConstantTripCountFromAffineMap(map);
 }
 
@@ -198,12 +203,9 @@ uint64_t mlir::affine::getLargestDivisorOfTripCount(AffineForOp forOp) {
   if (!map)
     return 1;
   SmallVector<AffineExpr, 4> symReplacements;
-  replaceGPUOperands(forOp, operands, symReplacements, map.getNumDims());
+  replaceOperandByRange(forOp, operands, symReplacements, map.getNumDims());
   map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(),
                                   map.getNumSymbols());
-  affine::AffineValueMap valueMap(map, operands);
-  (void)valueMap.canonicalize();
-  map = valueMap.getAffineMap();
   // The largest divisor of the trip count is the GCD of the individual largest
   // divisors.
   assert(map.getNumResults() >= 1 && "expected one or more results");
diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
index b6471ac179b22..a344bc8f9bffe 100644
--- a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
@@ -17,7 +17,6 @@
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
 #include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/IR/IRMapping.h"
@@ -118,7 +117,7 @@ static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
 /// was known to have a single iteration.
 LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
   std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
-  std::optional<uint64_t> maxTripCount = getMaxConstantTripCount(forOp);
+  std::optional<uint64_t> maxTripCount = getUpperBoundOnTripCount(forOp);
   if (!tripCount || *tripCount != 1 || !maxTripCount || *maxTripCount != 1)
     return failure();
 
@@ -888,7 +887,7 @@ void mlir::affine::getTileableBands(
 LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) {
   std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
   std::optional<uint64_t> maxMayBeConstantTripCount =
-      getMaxConstantTripCount(forOp);
+      getUpperBoundOnTripCount(forOp);
 
   if (!mayBeConstantTripCount.has_value() &&
       !maxMayBeConstantTripCount.has_value())
@@ -1025,7 +1024,7 @@ LogicalResult mlir::affine::loopUnrollByFactor(
 
   std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
   std::optional<uint64_t> maxMayBeConstantTripCount =
-      getMaxConstantTripCount(forOp);
+      getUpperBoundOnTripCount(forOp);
   if (unrollFactor == 1) {
     if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
         maxMayBeConstantTripCount && *maxMayBeConstantTripCount == 1 &&
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 31051ed7e55a2..d06f10d3137a1 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -799,26 +799,6 @@ std::optional<KernelDim3> LaunchOp::getClusterSizeOperandValues() {
   return KernelDim3{operands[6], operands[7], operands[8]};
 }
 
-Value LaunchOp::getBlockSizeOnAxis(Dimension dimension) {
-  if (dimension == Dimension::x)
-    return getBlockSizeX();
-  else if (dimension == Dimension::y)
-    return getBlockSizeY();
-  else
-    return getBlockSizeZ();
-}
-
-Value LaunchOp::getBlockSizeOnAxis(Value threadId) {
-  KernelDim3 threadIds = getThreadIds();
-  if (threadIds.x == threadId)
-    return getBlockSizeX();
-  else if (threadIds.y == threadId)
-    return getBlockSizeY();
-  else if (threadIds.z == threadId)
-    return getBlockSizeZ();
-  return {};
-}
-
 LogicalResult LaunchOp::verify() {
   if (!(hasClusterSize()) &&
       (getClusterSizeX() || getClusterSizeY() || getClusterSizeZ()))
diff --git a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
index f5e30a278f06b..f62d01d719633 100644
--- a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
+++ b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
@@ -250,26 +250,34 @@ void SubgroupSizeOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
 void LaunchOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
                                  SetIntRangeFn setResultRange) {
   auto setRange = [&](const ConstantIntRanges &argRange, Value dimResult,
-                      Value idxResult) {
+                      Value idxResult, Value size) {
     if (argRange.umin().getBitWidth() != IndexType::kInternalStorageBitWidth)
       return;
-    ConstantIntRanges dimRange =
-        argRange.intersection(getIndexRange(1, kMaxDim));
-    setResultRange(dimResult, dimRange);
-    ConstantIntRanges idxRange =
-        getIndexRange(0, dimRange.umax().getZExtValue() - 1);
-    setResultRange(idxResult, idxRange);
+    APInt sizeInt;
+    if (matchPattern(size, m_ConstantInt(&sizeInt))) {
+      ConstantIntRanges dimRange = ConstantIntRanges::constant(sizeInt);
+      setResultRange(dimResult, dimRange);
+      ConstantIntRanges idxRange = getIndexRange(0, sizeInt.getZExtValue() - 1);
+      setResultRange(idxResult, idxRange);
+    } else {
+      ConstantIntRanges dimRange =
+          argRange.intersection(getIndexRange(1, kMaxDim));
+      setResultRange(dimResult, dimRange);
+      ConstantIntRanges idxRange =
+          getIndexRange(0, dimRange.umax().getZExtValue() - 1);
+      setResultRange(idxResult, idxRange);
+    }
   };
 
   argRanges = argRanges.drop_front(getAsyncDependencies().size());
   KernelDim3 gridDims = getGridSize();
   KernelDim3 blockIds = getBlockIds();
-  setRange(argRanges[0], gridDims.x, blockIds.x);
-  setRange(argRanges[1], gridDims.y, blockIds.y);
-  setRange(argRanges[2], gridDims.z, blockIds.z);
+  setRange(argRanges[0], gridDims.x, blockIds.x, getGridSizeX());
+  setRange(argRanges[1], gridDims.y, blockIds.y, getGridSizeY());
+  setRange(argRanges[2], gridDims.z, blockIds.z, getGridSizeZ());
   KernelDim3 blockDims = getBlockSize();
   KernelDim3 threadIds = getThreadIds();
-  setRange(argRanges[3], blockDims.x, threadIds.x);
-  setRange(argRanges[4], blockDims.y, threadIds.y);
-  setRange(argRanges[5], blockDims.z, threadIds.z);
+  setRange(argRanges[3], blockDims.x, threadIds.x, getBlockSizeX());
+  setRange(argRanges[4], blockDims.y, threadIds.y, getBlockSizeY());
+  setRange(argRanges[5], blockDims.z, threadIds.z, getBlockSizeZ());
 }

From 0b30c4e9d9747dd1040280a471df17389eed00cb Mon Sep 17 00:00:00 2001
From: linuxlonelyeagle <2020382038@qq.com>
Date: Fri, 28 Feb 2025 20:45:38 +0800
Subject: [PATCH 4/8] use ValueBoundsOpInterface.

---
 .../Dialect/Affine/Analysis/LoopAnalysis.cpp  | 117 ++++++------------
 .../GPU/IR/InferIntRangeInterfaceImpls.cpp    |  34 ++---
 .../lib/Interfaces/ValueBoundsOpInterface.cpp |   3 +-
 3 files changed, 54 insertions(+), 100 deletions(-)

diff --git a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
index 5ed11d8bde029..bcb31db6b1a93 100644
--- a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
+++ b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
@@ -12,8 +12,6 @@
 
 #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
 
-#include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h"
-#include "mlir/Analysis/DataFlow/IntegerRangeAnalysis.h"
 #include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
 #include "mlir/Dialect/Affine/Analysis/AffineStructures.h"
@@ -21,6 +19,7 @@
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
 #include "mlir/Interfaces/FunctionInterfaces.h"
+#include "mlir/Interfaces/ValueBoundsOpInterface.h"
 #include "llvm/Support/MathExtras.h"
 
 #include "llvm/ADT/DenseSet.h"
@@ -33,7 +32,6 @@
 
 using namespace mlir;
 using namespace mlir::affine;
-using namespace mlir::dataflow;
 
 #define DEBUG_TYPE "affine-loop-analysis"
 
@@ -88,69 +86,37 @@ void mlir::affine::getTripCountMapAndOperands(
                             tripCountValueMap.getOperands().end());
 }
 
-/// By running `IntegerRangeAnalysis` to get the ranges of operand, then fill
-/// the `symReplacements` with range. If `replaceByMin` is set to true,
-/// construct `replacement` using the smallest value.By default, the largest
-/// value will be used for constructing `replacement`.
-static void replaceOperandByRange(AffineForOp forOp,
-                                  SmallVectorImpl<Value> &operands,
-                                  SmallVectorImpl<AffineExpr> &symReplacements,
-                                  unsigned numDim, bool replaceByMin = false) {
-  DataFlowSolver solver;
-  solver.load<DeadCodeAnalysis>();
-  solver.load<IntegerRangeAnalysis>();
-  if (failed(solver.initializeAndRun(
-          forOp->getParentOfType<FunctionOpInterface>())))
-    return;
-
-  // `b` is used to create affineExpr
-  Builder b(forOp.getContext());
-  for (unsigned i = numDim, e = operands.size(); i < e; ++i) {
-    Value operand = operands[i];
-    auto lattice =
-        solver.lookupState<dataflow::IntegerValueRangeLattice>(operand);
-    if (!lattice) {
-      symReplacements.push_back(b.getAffineSymbolExpr(i - numDim));
-      continue;
-    }
-
-    if (lattice->getValue().isUninitialized()) {
-      symReplacements.push_back(b.getAffineSymbolExpr(i - numDim));
-      continue;
-    }
-
-    ConstantIntRanges range = lattice->getValue().getValue();
-    APInt max = range.smax();
-    APInt min = range.smin();
-    unsigned bitNums = max.getBitWidth();
-
-    if (APInt::getSignedMaxValue(bitNums) == max &&
-        APInt::getSignedMinValue(bitNums) == min) {
-      symReplacements.push_back(b.getAffineSymbolExpr(i - numDim));
-      continue;
-    }
-
-    if (!replaceByMin)
-      symReplacements.push_back(b.getAffineConstantExpr(max.getZExtValue()));
-    else
-      symReplacements.push_back(b.getAffineConstantExpr(min.getZExtValue()));
-  }
-  return;
-}
-
 /// Take the min if all trip counts are constant.
 static std::optional<uint64_t>
-getConstantTripCountFromAffineMap(AffineMap map) {
+getConstantTripCountFromAffineMap(AffineMap map,
+                                  SmallVectorImpl<Value> &operands,
+                                  presburger::BoundType type) {
   std::optional<uint64_t> tripCount;
   for (auto resultExpr : map.getResults()) {
-    auto constExpr = dyn_cast<AffineConstantExpr>(resultExpr);
-    if (!constExpr)
+    AffineMap subMap =
+        AffineMap::get(map.getNumDims(), map.getNumSymbols(), resultExpr);
+    ValueBoundsConstraintSet::Variable var(subMap, operands);
+    auto lbBound = ValueBoundsConstraintSet::computeConstantBound(
+        mlir::presburger::BoundType::LB, var);
+    auto ubBound = ValueBoundsConstraintSet::computeConstantBound(
+        mlir::presburger::BoundType::UB, var, nullptr, true);
+    if (failed(lbBound) || failed(ubBound))
       return std::nullopt;
-    if (tripCount.has_value())
-      tripCount =
-          std::min(*tripCount, static_cast<uint64_t>(constExpr.getValue()));
-    else
-      tripCount = constExpr.getValue();
+    if (type == presburger::BoundType::LB) {
+      if (tripCount.has_value())
+        tripCount =
+            std::min(*tripCount, static_cast<uint64_t>(lbBound.value()));
+      else
+        tripCount = lbBound.value();
+    } else if (type == presburger::BoundType::UB) {
+      if (tripCount.has_value())
+        tripCount =
+            std::min(*tripCount, static_cast<uint64_t>(ubBound.value()));
+      else
+        tripCount = ubBound.value();
+    } else {
+      return std::nullopt;
+    }
   }
   return tripCount;
 }
@@ -166,11 +132,8 @@ std::optional<uint64_t> mlir::affine::getConstantTripCount(AffineForOp forOp) {
 
   if (!map)
     return std::nullopt;
-  SmallVector<AffineExpr, 4> symReplacements;
-  replaceOperandByRange(forOp, operands, symReplacements, map.getNumDims());
-  map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(),
-                                  map.getNumSymbols());
-  return getConstantTripCountFromAffineMap(map);
+  return getConstantTripCountFromAffineMap(map, operands,
+                                           presburger::BoundType::LB);
 }
 
 /// Returns the maximum trip count when the operand of forOp has a range. If the
@@ -184,12 +147,8 @@ mlir::affine::getUpperBoundOnTripCount(AffineForOp forOp) {
 
   if (!map)
     return std::nullopt;
-  SmallVector<AffineExpr, 4> symReplacements;
-  replaceOperandByRange(forOp, operands, symReplacements, map.getNumDims(),
-                        true);
-  map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(),
-                                  map.getNumSymbols());
-  return getConstantTripCountFromAffineMap(map);
+  return getConstantTripCountFromAffineMap(map, operands,
+                                           presburger::BoundType::UB);
 }
 
 /// Returns the greatest known integral divisor of the trip count. Affine
@@ -202,18 +161,20 @@ uint64_t mlir::affine::getLargestDivisorOfTripCount(AffineForOp forOp) {
 
   if (!map)
     return 1;
-  SmallVector<AffineExpr, 4> symReplacements;
-  replaceOperandByRange(forOp, operands, symReplacements, map.getNumDims());
-  map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(),
-                                  map.getNumSymbols());
+
   // The largest divisor of the trip count is the GCD of the individual largest
   // divisors.
   assert(map.getNumResults() >= 1 && "expected one or more results");
   std::optional<uint64_t> gcd;
   for (auto resultExpr : map.getResults()) {
     uint64_t thisGcd;
-    if (auto constExpr = dyn_cast<AffineConstantExpr>(resultExpr)) {
-      uint64_t tripCount = constExpr.getValue();
+    AffineMap subMap =
+        AffineMap::get(map.getNumDims(), map.getNumSymbols(), resultExpr);
+    ValueBoundsConstraintSet::Variable var(subMap, operands);
+    auto lbBound = ValueBoundsConstraintSet::computeConstantBound(
+        mlir::presburger::BoundType::LB, var);
+    if (!failed(lbBound)) {
+      uint64_t tripCount = lbBound.value();
       // 0 iteration loops (greatest divisor is 2^64 - 1).
       if (tripCount == 0)
         thisGcd = std::numeric_limits<uint64_t>::max();
diff --git a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
index f62d01d719633..f5e30a278f06b 100644
--- a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
+++ b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
@@ -250,34 +250,26 @@ void SubgroupSizeOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
 void LaunchOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
                                  SetIntRangeFn setResultRange) {
   auto setRange = [&](const ConstantIntRanges &argRange, Value dimResult,
-                      Value idxResult, Value size) {
+                      Value idxResult) {
     if (argRange.umin().getBitWidth() != IndexType::kInternalStorageBitWidth)
       return;
-    APInt sizeInt;
-    if (matchPattern(size, m_ConstantInt(&sizeInt))) {
-      ConstantIntRanges dimRange = ConstantIntRanges::constant(sizeInt);
-      setResultRange(dimResult, dimRange);
-      ConstantIntRanges idxRange = getIndexRange(0, sizeInt.getZExtValue() - 1);
-      setResultRange(idxResult, idxRange);
-    } else {
-      ConstantIntRanges dimRange =
-          argRange.intersection(getIndexRange(1, kMaxDim));
-      setResultRange(dimResult, dimRange);
-      ConstantIntRanges idxRange =
-          getIndexRange(0, dimRange.umax().getZExtValue() - 1);
-      setResultRange(idxResult, idxRange);
-    }
+    ConstantIntRanges dimRange =
+        argRange.intersection(getIndexRange(1, kMaxDim));
+    setResultRange(dimResult, dimRange);
+    ConstantIntRanges idxRange =
+        getIndexRange(0, dimRange.umax().getZExtValue() - 1);
+    setResultRange(idxResult, idxRange);
   };
 
   argRanges = argRanges.drop_front(getAsyncDependencies().size());
   KernelDim3 gridDims = getGridSize();
   KernelDim3 blockIds = getBlockIds();
-  setRange(argRanges[0], gridDims.x, blockIds.x, getGridSizeX());
-  setRange(argRanges[1], gridDims.y, blockIds.y, getGridSizeY());
-  setRange(argRanges[2], gridDims.z, blockIds.z, getGridSizeZ());
+  setRange(argRanges[0], gridDims.x, blockIds.x);
+  setRange(argRanges[1], gridDims.y, blockIds.y);
+  setRange(argRanges[2], gridDims.z, blockIds.z);
   KernelDim3 blockDims = getBlockSize();
   KernelDim3 threadIds = getThreadIds();
-  setRange(argRanges[3], blockDims.x, threadIds.x, getBlockSizeX());
-  setRange(argRanges[4], blockDims.y, threadIds.y, getBlockSizeY());
-  setRange(argRanges[5], blockDims.z, threadIds.z, getBlockSizeZ());
+  setRange(argRanges[3], blockDims.x, threadIds.x);
+  setRange(argRanges[4], blockDims.y, threadIds.y);
+  setRange(argRanges[5], blockDims.z, threadIds.z);
 }
diff --git a/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp b/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp
index 87f883c2e6485..f4408fa9417b5 100644
--- a/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp
+++ b/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp
@@ -646,7 +646,8 @@ FailureOr<int64_t> ValueBoundsConstraintSet::computeConstantBound(
   // Compute constant bound for `valueDim`.
   int64_t ubAdjustment = closedUB ? 0 : 1;
   if (auto bound = cstr.cstr.getConstantBound64(type, pos))
-    return type == BoundType::UB ? *bound + ubAdjustment : *bound;
+    if (bound.has_value())
+      return type == BoundType::UB ? *bound + ubAdjustment : *bound;
   return failure();
 }
 

From 82e48ee7ed45cda6b0d410e59097c02bead73a58 Mon Sep 17 00:00:00 2001
From: linuxlonelyeagle <2020382038@qq.com>
Date: Tue, 18 Mar 2025 10:54:27 +0800
Subject: [PATCH 5/8] update getKnownTripCountBound function name and rename
 tripCount to minTripCount.

---
 mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp | 11 ++++-------
 mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp       |  5 +++--
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
index f1a723c919f7e..c8f38cfd8c328 100644
--- a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
+++ b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
@@ -216,9 +216,8 @@ void mlir::affine::getTripCountMapAndOperands(
 
 /// Take the min if all trip counts are constant.
 static std::optional<uint64_t>
-getConstantTripCountFromAffineMap(AffineMap map,
-                                  SmallVectorImpl<Value> &operands,
-                                  presburger::BoundType type) {
+getKnownTripCountBound(AffineMap map, SmallVectorImpl<Value> &operands,
+                       presburger::BoundType type) {
   std::optional<uint64_t> tripCount;
   for (auto resultExpr : map.getResults()) {
     AffineMap subMap =
@@ -260,8 +259,7 @@ std::optional<uint64_t> mlir::affine::getConstantTripCount(AffineForOp forOp) {
 
   if (!map)
     return std::nullopt;
-  return getConstantTripCountFromAffineMap(map, operands,
-                                           presburger::BoundType::LB);
+  return getKnownTripCountBound(map, operands, presburger::BoundType::LB);
 }
 
 /// Returns the maximum trip count when the operand of forOp has a range. If the
@@ -275,8 +273,7 @@ mlir::affine::getUpperBoundOnTripCount(AffineForOp forOp) {
 
   if (!map)
     return std::nullopt;
-  return getConstantTripCountFromAffineMap(map, operands,
-                                           presburger::BoundType::UB);
+  return getKnownTripCountBound(map, operands, presburger::BoundType::UB);
 }
 
 /// Returns the greatest known integral divisor of the trip count. Affine
diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
index 37e58b1332712..efbc87ec740bb 100644
--- a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
@@ -116,9 +116,10 @@ static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
 /// Promotes the loop body of a forOp to its containing block if the forOp
 /// was known to have a single iteration.
 LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
-  std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
+  std::optional<uint64_t> minTripCount = getConstantTripCount(forOp);
   std::optional<uint64_t> maxTripCount = getUpperBoundOnTripCount(forOp);
-  if (!tripCount || *tripCount != 1 || !maxTripCount || *maxTripCount != 1)
+  if (!minTripCount || *minTripCount != 1 || !maxTripCount ||
+      *maxTripCount != 1)
     return failure();
 
   // TODO: extend this for arbitrary affine bounds.

From e31ff46584802cc972160f8b1feda6b2f6a5948f Mon Sep 17 00:00:00 2001
From: linuxlonelyeagle <2020382038@qq.com>
Date: Wed, 9 Apr 2025 11:47:26 +0800
Subject: [PATCH 6/8] improve doc and nit.

---
 .../Dialect/Affine/Analysis/LoopAnalysis.cpp  | 22 ++++----
 mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp   |  9 ++-
 mlir/test/Dialect/Affine/unroll.mlir          | 56 +++++++++----------
 3 files changed, 44 insertions(+), 43 deletions(-)

diff --git a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
index c8f38cfd8c328..133d7c754589b 100644
--- a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
+++ b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
@@ -214,19 +214,21 @@ void mlir::affine::getTripCountMapAndOperands(
                             tripCountValueMap.getOperands().end());
 }
 
-/// Take the min if all trip counts are constant.
+/// Bounds the trip count given by `map` applied to `operands`, which may only
+/// be known as a range. Returns the smallest constant lower bound over the map
+/// results for BoundType::LB and the largest constant upper bound for
+/// BoundType::UB; returns std::nullopt if any result cannot be bounded.
 static std::optional<uint64_t>
 getKnownTripCountBound(AffineMap map, SmallVectorImpl<Value> &operands,
                        presburger::BoundType type) {
   std::optional<uint64_t> tripCount;
-  for (auto resultExpr : map.getResults()) {
-    AffineMap subMap =
-        AffineMap::get(map.getNumDims(), map.getNumSymbols(), resultExpr);
+  for (unsigned i = 0, e = map.getResults().size(); i < e; ++i) {
+    AffineMap subMap = map.getSubMap(i);
     ValueBoundsConstraintSet::Variable var(subMap, operands);
     auto lbBound = ValueBoundsConstraintSet::computeConstantBound(
         mlir::presburger::BoundType::LB, var);
     auto ubBound = ValueBoundsConstraintSet::computeConstantBound(
-        mlir::presburger::BoundType::UB, var, nullptr, true);
+        mlir::presburger::BoundType::UB, var, nullptr, /*closedUB*/ true);
     if (failed(lbBound) || failed(ubBound))
       return std::nullopt;
     if (type == presburger::BoundType::LB) {
@@ -238,7 +240,7 @@ getKnownTripCountBound(AffineMap map, SmallVectorImpl<Value> &operands,
     } else if (type == presburger::BoundType::UB) {
       if (tripCount.has_value())
         tripCount =
-            std::min(*tripCount, static_cast<uint64_t>(ubBound.value()));
+            std::max(*tripCount, static_cast<uint64_t>(ubBound.value()));
       else
         tripCount = ubBound.value();
     } else {
@@ -253,7 +255,7 @@ getKnownTripCountBound(AffineMap map, SmallVectorImpl<Value> &operands,
 /// getTripCount) and is able to determine constant trip count in non-trivial
 /// cases.
 std::optional<uint64_t> mlir::affine::getConstantTripCount(AffineForOp forOp) {
-  SmallVector<Value, 4> operands;
+  SmallVector<Value> operands;
   AffineMap map;
   getTripCountMapAndOperands(forOp, &map, &operands);
 
@@ -262,12 +264,12 @@ std::optional<uint64_t> mlir::affine::getConstantTripCount(AffineForOp forOp) {
   return getKnownTripCountBound(map, operands, presburger::BoundType::LB);
 }
 
-/// Returns the maximum trip count when the operand of forOp has a range. If the
-/// operand of forOp is a constant, the return value is the same as
+/// Returns the maximum trip count when the operand of forOp has a range.
+/// If the operand of forOp is a constant, the return value is the same as
 /// `getConstantTripCount`.
 std::optional<uint64_t>
 mlir::affine::getUpperBoundOnTripCount(AffineForOp forOp) {
-  SmallVector<Value, 4> operands;
+  SmallVector<Value> operands;
   AffineMap map;
   getTripCountMapAndOperands(forOp, &map, &operands);
 
diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
index efbc87ec740bb..eb7232b9a97d6 100644
--- a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
@@ -162,8 +162,7 @@ LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
   forOp.getBody()->back().erase();
   parentBlock->getOperations().splice(Block::iterator(forOp),
                                       forOp.getBody()->getOperations());
-  IRRewriter b(forOp.getContext());
-  b.eraseOp(forOp);
+  forOp.erase();
   return success();
 }
 
@@ -895,14 +894,14 @@ LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) {
     return failure();
 
   uint64_t tripCount = *mayBeConstantTripCount;
-  uint64_t maxTripCount = *maxMayBeConstantTripCount;
 
   // Trip equals 0, this loop cannot unroll.
   if (tripCount <= 0)
     return success();
 
-  if (tripCount == 1 && maxTripCount == 1)
-    return promoteIfSingleIteration(forOp);
+  if (succeeded(promoteIfSingleIteration(forOp)))
+    return success();
+
   return loopUnrollByFactor(forOp, tripCount);
 }
 
diff --git a/mlir/test/Dialect/Affine/unroll.mlir b/mlir/test/Dialect/Affine/unroll.mlir
index ab73c5ac7e9c4..857a25f5d8567 100644
--- a/mlir/test/Dialect/Affine/unroll.mlir
+++ b/mlir/test/Dialect/Affine/unroll.mlir
@@ -261,14 +261,14 @@ gpu.module @unroll_full {
 
 // UNROLL-FULL-LABEL: func @thread_partial_execution
 func.func @thread_partial_execution() {
-  %0 = arith.constant 0 :index
-  %1 = arith.constant 2 : index    
+  %c0 = arith.constant 0 :index
+  %c2 = arith.constant 2 : index    
   // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
-  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
-             threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
-    affine.for %iv = %tx to 3 step 2 iter_args(%arg = %0) -> index {
-      %3 = arith.addi %arg, %0 : index
-      affine.yield %3 : index
+  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c2)
+             threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c2, %sz_tz = %c2) {
+    affine.for %iv = %tx to 3 step 2 iter_args(%arg = %c0) -> index {
+      %sum = arith.addi %arg, %c0 : index
+      affine.yield %sum : index
     }
     // UNROLL-FULL: affine.for %{{.*}} = %{{.*}} to 3 step 2 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) {
     // UNROLL-FULL-NEXT:   %[[SUM:.*]] = arith.addi %[[ARG]], %[[C0]] : index
@@ -281,15 +281,15 @@ func.func @thread_partial_execution() {
 
 // UNROLL-FULL-LABEL: func @unroll_all_thread
 func.func @unroll_all_thread() {
-  %0 = arith.constant 0 :index
-  %1 = arith.constant 2 : index
+  %c0 = arith.constant 0 :index
+  %c2 = arith.constant 2 : index
   // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
-  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
-             threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
+  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c2)
+             threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c2, %sz_tz = %c2) {
     %threadid = gpu.thread_id x
-    %4 = affine.for %iv = %threadid to 6 step 2 iter_args(%arg = %0) -> index {
-      %3 = arith.addi %arg, %0 : index
-      affine.yield %3 : index
+    affine.for %iv = %threadid to 6 step 2 iter_args(%arg = %c0) -> index {
+      %sum = arith.addi %arg, %c0 : index
+      affine.yield %sum : index
     }
     // UNROLL-FULL: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
     // UNROLL-FULL-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
@@ -301,15 +301,15 @@ func.func @unroll_all_thread() {
 
 // UNROLL-FULL-LABEL: func.func @partial_unroll_factor_4
 func.func @partial_unroll_factor_4() {
-  %0 = arith.constant 0 :index
-  %1 = arith.constant 2 : index
+  %c0 = arith.constant 0 :index
+  %c2 = arith.constant 2 : index
   // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
-  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
-             threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
+  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c2)
+             threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c2, %sz_tz = %c2) {
     %threadid = gpu.thread_id x
-    affine.for %iv = %threadid to 9 step 2 iter_args(%arg = %0) -> index {
-      %3 = arith.addi %arg, %0 : index
-      affine.yield %3 : index
+    affine.for %iv = %threadid to 9 step 2 iter_args(%arg = %c0) -> index {
+      %sum = arith.addi %arg, %c0 : index
+      affine.yield %sum : index
     }
     gpu.terminator
   }
@@ -769,15 +769,15 @@ func.func @unroll_with_iter_args_and_promotion(%arg0 : f32, %arg1 : f32) -> f32
 
 // UNROLL-BY-4-LABEL: func @gpu_launch_unroll_by_factor_4
 func.func @gpu_launch_unroll_by_factor_4() {
-  %0 = arith.constant 0 :index
-  %1 = arith.constant 2 : index
+  %c0 = arith.constant 0 :index
+  %c2 = arith.constant 2 : index
   // UNROLL-BY-4: %[[C0:.*]] = arith.constant 0 : index
-  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
-             threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
+  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c2)
+             threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c2, %sz_tz = %c2) {
     %threadid = gpu.thread_id x
-    affine.for %iv = %threadid to 11 step 2 iter_args(%arg = %0) -> index {
-      %3 = arith.addi %arg, %0 : index
-      affine.yield %3 : index
+    affine.for %iv = %threadid to 11 step 2 iter_args(%arg = %c0) -> index {
+      %sum = arith.addi %arg, %c0 : index
+      affine.yield %sum : index
     }
     gpu.terminator
   }

From e58e115e8a385615bb7793c81ee81c2369f14d19 Mon Sep 17 00:00:00 2001
From: linuxlonelyeagle <2020382038@qq.com>
Date: Wed, 9 Apr 2025 17:48:35 +0800
Subject: [PATCH 7/8] Fix tests; emit cleanup loop for ranged trip counts.

---
 .../Dialect/Affine/Analysis/LoopAnalysis.cpp  |  8 +++----
 mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp   |  5 +++-
 mlir/test/Dialect/Affine/unroll.mlir          | 24 ++++++++++---------
 3 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
index 133d7c754589b..8b5b64b7092eb 100644
--- a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
+++ b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
@@ -293,10 +293,9 @@ uint64_t mlir::affine::getLargestDivisorOfTripCount(AffineForOp forOp) {
   // divisors.
   assert(map.getNumResults() >= 1 && "expected one or more results");
   std::optional<uint64_t> gcd;
-  for (auto resultExpr : map.getResults()) {
+  for (unsigned i = 0, e = map.getResults().size(); i < e; ++i) {
     uint64_t thisGcd;
-    AffineMap subMap =
-        AffineMap::get(map.getNumDims(), map.getNumSymbols(), resultExpr);
+    AffineMap subMap = map.getSubMap(i);
     ValueBoundsConstraintSet::Variable var(subMap, operands);
     auto lbBound = ValueBoundsConstraintSet::computeConstantBound(
         mlir::presburger::BoundType::LB, var);
@@ -310,7 +309,8 @@ uint64_t mlir::affine::getLargestDivisorOfTripCount(AffineForOp forOp) {
         thisGcd = tripCount;
     } else {
       // Trip count is not a known constant; return its largest known divisor.
-      thisGcd = resultExpr.getLargestKnownDivisor();
+      thisGcd = map.getResult(i).getLargestKnownDivisor();
+      ;
     }
     if (gcd.has_value())
       gcd = std::gcd(*gcd, thisGcd);
diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
index eb7232b9a97d6..84039804fa66a 100644
--- a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
@@ -1048,7 +1048,10 @@ LogicalResult mlir::affine::loopUnrollByFactor(
   }
 
   // Generate the cleanup loop if trip count isn't a multiple of unrollFactor.
-  if (getLargestDivisorOfTripCount(forOp) % unrollFactor != 0) {
+  // If the trip count has a range, a cleanup loop needs to be generated.
+  if ((mayBeConstantTripCount && maxMayBeConstantTripCount &&
+       *mayBeConstantTripCount != *maxMayBeConstantTripCount) ||
+      getLargestDivisorOfTripCount(forOp) % unrollFactor != 0) {
     // Loops where the lower bound is a max expression or the upper bound is
     // a min expression and the trip count doesn't divide the unroll factor
     // can't be unrolled since the lower bound of the cleanup loop in such cases
diff --git a/mlir/test/Dialect/Affine/unroll.mlir b/mlir/test/Dialect/Affine/unroll.mlir
index 857a25f5d8567..24df89bf8a76e 100644
--- a/mlir/test/Dialect/Affine/unroll.mlir
+++ b/mlir/test/Dialect/Affine/unroll.mlir
@@ -12,6 +12,7 @@
 // UNROLL-FULL-DAG: [[$MAP4:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 + 1)>
 // UNROLL-FULL-DAG: [[$MAP5:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 + 3)>
 // UNROLL-FULL-DAG: [[$MAP6:#map[0-9]*]] = affine_map<(d0)[s0] -> (d0 + s0 + 1)>
+// UNROLL-FULL-DAG: [[$MAP7:#map[0-9]*]] = affine_map<()[s0] -> (s0 + (((-s0 + 9) ceildiv 2) floordiv 4) * 8)>
 
 // SHORT-DAG: [[$MAP0:#map[0-9]*]] = affine_map<(d0) -> (d0 + 1)>
 
@@ -22,8 +23,8 @@
 // UNROLL-BY-4-DAG: [[$MAP4:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 + 3)>
 // UNROLL-BY-4-DAG: [[$MAP5:#map[0-9]*]] = affine_map<(d0)[s0] -> (d0 + s0 + 1)>
 // UNROLL-BY-4-DAG: [[$MAP6:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 * 16 + d1)>
-// UNROLL-BY-4-DAG: [[$MAP11:#map[0-9]*]] = affine_map<(d0) -> (d0)>
-// UNROLL-BY-4-DAG: [[$MAP7:#map[0-9]*]] = affine_map<()[s0] -> (s0 + (((-s0 + 11) ceildiv 2) floordiv 4) * 8)>
+// UNROLL-BY-4-DAG: [[$MAP7:#map[0-9]*]] = affine_map<(d0) -> (d0)>
+// UNROLL-BY-4-DAG: [[$MAP8:#map[0-9]*]] = affine_map<()[s0] -> (s0 + (((-s0 + 11) ceildiv 2) floordiv 4) * 8)>
 
 // UNROLL-FULL-LABEL: func @loop_nest_simplest() {
 func.func @loop_nest_simplest() {
@@ -314,12 +315,13 @@ func.func @partial_unroll_factor_4() {
     gpu.terminator
   }
   // UNROLL-FULL: %[[ID:.*]] = gpu.thread_id  x
-  // UNROLL-FULL-NEXT: affine.for %{{.*}} = %[[ID]] to 9 step 8 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) {
-  // UNROLL-FULL-NEXT:   %[[SUM_0:.*]] = arith.addi %[[ARG]], %[[C0]] : index
-  // UNROLL-FULL-NEXT:   %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
-  // UNROLL-FULL-NEXT:   %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
-  // UNROLL-FULL-NEXT:   %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index
-  // UNROLL-FULL-NEXT:   affine.yield %[[SUM_3]] : index
+  // UNROLL-FULL-NEXT: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
+  // UNROLL-FULL-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
+  // UNROLL-FULL-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
+  // UNROLL-FULL-NEXT: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index
+  // UNROLL-FULL-NEXT: affine.for %{{.*}} = [[$MAP7]]()[%[[ID]]] to 9 step 2 iter_args(%[[ARG:.*]] = %[[SUM_3]]) -> (index) {
+  // UNROLL-FULL-NEXT:   %[[SUM_4:.*]] = arith.addi %[[ARG]], %[[C0]] : index
+  // UNROLL-FULL-NEXT:   affine.yield %[[SUM_4]] : index
   // UNROLL-FULL-NEXT: }
   return
 }
@@ -536,7 +538,7 @@ func.func @loop_nest_operand1() {
 // UNROLL-BY-4-LABEL: func @loop_nest_operand2() {
 func.func @loop_nest_operand2() {
 // UNROLL-BY-4:      affine.for %arg0 = 0 to 100 step 2 {
-// UNROLL-BY-4-NEXT:   affine.for %arg1 = [[$MAP11]](%arg0) to #map{{[0-9]*}}(%arg0) step 4 {
+// UNROLL-BY-4-NEXT:   affine.for %arg1 = [[$MAP7]](%arg0) to #map{{[0-9]*}}(%arg0) step 4 {
 // UNROLL-BY-4-NEXT:     %0 = "foo"() : () -> i32
 // UNROLL-BY-4-NEXT:     %1 = "foo"() : () -> i32
 // UNROLL-BY-4-NEXT:     %2 = "foo"() : () -> i32
@@ -582,7 +584,7 @@ func.func @floordiv_mod_ub(%M : index, %N : index) {
 func.func @loop_nest_operand3() {
   // UNROLL-BY-4: affine.for %arg0 = 0 to 100 step 2 {
   affine.for %i = 0 to 100 step 2 {
-    // UNROLL-BY-4: affine.for %arg1 = [[$MAP11]](%arg0) to #map{{[0-9]*}}(%arg0) step 4 {
+    // UNROLL-BY-4: affine.for %arg1 = [[$MAP7]](%arg0) to #map{{[0-9]*}}(%arg0) step 4 {
     // UNROLL-BY-4-NEXT: %1 = "foo"() : () -> i32
     // UNROLL-BY-4-NEXT: %2 = "foo"() : () -> i32
     // UNROLL-BY-4-NEXT: %3 = "foo"() : () -> i32
@@ -786,7 +788,7 @@ func.func @gpu_launch_unroll_by_factor_4() {
   // UNROLL-BY-4-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
   // UNROLL-BY-4-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
   // UNROLL-BY-4-NEXT: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index
-  // UNROLL-BY-4-NEXT: affine.for %[[VAL_20:.*]] = [[$MAP7]](){{\[}}%[[ID]]] to 11 step 2 iter_args(%[[ARG:.*]] = %[[SUM_3]]) -> (index) {
+  // UNROLL-BY-4-NEXT: affine.for %[[VAL_20:.*]] = [[$MAP8]](){{\[}}%[[ID]]] to 11 step 2 iter_args(%[[ARG:.*]] = %[[SUM_3]]) -> (index) {
   // UNROLL-BY-4-NEXT:   %[[SUM_4:.*]] = arith.addi %[[ARG]], %[[C0]] : index
   // UNROLL-BY-4-NEXT:   affine.yield %[[SUM_4]] : index
   // UNROLL-BY-4-NEXT: }

From 80243dd4fc8a77c3bc301de380a1e52da345049a Mon Sep 17 00:00:00 2001
From: linuxlonelyeagle <2020382038@qq.com>
Date: Mon, 4 Aug 2025 07:22:53 +0000
Subject: [PATCH 8/8] Update tests to use the test.value_with_bounds op.

---
 .../Dialect/Affine/Analysis/LoopAnalysis.cpp  |  7 +-
 mlir/test/Dialect/Affine/unroll.mlir          | 89 ++++++++-----------
 2 files changed, 39 insertions(+), 57 deletions(-)

diff --git a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
index 4815600e8fa54..df1c156e22075 100644
--- a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
+++ b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
@@ -60,8 +60,8 @@ class DirectedOpGraph {
 
   void printEdges() {
     for (auto &en : edges) {
-      llvm::dbgs() << *en.first << " (" << en.first << ")"
-                   << " has " << en.second.size() << " edges:\n";
+      llvm::dbgs() << *en.first << " (" << en.first << ")" << " has "
+                   << en.second.size() << " edges:\n";
       for (auto *node : en.second) {
         llvm::dbgs() << '\t' << *node->op << '\n';
       }
@@ -72,7 +72,7 @@ class DirectedOpGraph {
   /// A node of a directed graph between MLIR Operations to model various
   /// relationships. This is meant to be used internally.
   struct DGNode {
-    DGNode(Operation *op) : op(op) {};
+    DGNode(Operation *op) : op(op){};
     Operation *op;
 
     // Start and finish visit numbers are standard in DFS to implement things
@@ -310,7 +310,6 @@ uint64_t mlir::affine::getLargestDivisorOfTripCount(AffineForOp forOp) {
     } else {
       // Trip count is not a known constant; return its largest known divisor.
       thisGcd = map.getResult(i).getLargestKnownDivisor();
-      ;
     }
     if (gcd.has_value())
       gcd = std::gcd(*gcd, thisGcd);
diff --git a/mlir/test/Dialect/Affine/unroll.mlir b/mlir/test/Dialect/Affine/unroll.mlir
index 24df89bf8a76e..baffb1ba0799b 100644
--- a/mlir/test/Dialect/Affine/unroll.mlir
+++ b/mlir/test/Dialect/Affine/unroll.mlir
@@ -260,72 +260,59 @@ gpu.module @unroll_full {
   }
 }
 
-// UNROLL-FULL-LABEL: func @thread_partial_execution
-func.func @thread_partial_execution() {
+// UNROLL-FULL-LABEL: func @bound_unroll_partial
+func.func @bound_unroll_partial() {
   %c0 = arith.constant 0 :index
-  %c2 = arith.constant 2 : index    
   // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
-  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c2)
-             threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c2, %sz_tz = %c2) {
-    affine.for %iv = %tx to 3 step 2 iter_args(%arg = %c0) -> index {
+  %bound = test.value_with_bounds { min = 0 : index, max = 1 : index}
+  affine.for %iv = %bound to 3 step 2 iter_args(%arg = %c0) -> index {
       %sum = arith.addi %arg, %c0 : index
       affine.yield %sum : index
-    }
-    // UNROLL-FULL: affine.for %{{.*}} = %{{.*}} to 3 step 2 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) {
-    // UNROLL-FULL-NEXT:   %[[SUM:.*]] = arith.addi %[[ARG]], %[[C0]] : index
-    // UNROLL-FULL-NEXT:   affine.yield %[[SUM]] : index
-    // UNROLL-FULL-NEXT: }
-    gpu.terminator
   }
+  // UNROLL-FULL: affine.for %{{.*}} = %{{.*}} to 3 step 2 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) {
+  // UNROLL-FULL-NEXT:   %[[SUM:.*]] = arith.addi %[[ARG]], %[[C0]] : index
+  // UNROLL-FULL-NEXT:   affine.yield %[[SUM]] : index
+  // UNROLL-FULL-NEXT: }
   return
 }
 
-// UNROLL-FULL-LABEL: func @unroll_all_thread
-func.func @unroll_all_thread() {
+// UNROLL-FULL-LABEL: func @bound_unroll_all
+func.func @bound_unroll_all() {
   %c0 = arith.constant 0 :index
-  %c2 = arith.constant 2 : index
   // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
-  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c2)
-             threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c2, %sz_tz = %c2) {
-    %threadid = gpu.thread_id x
-    affine.for %iv = %threadid to 6 step 2 iter_args(%arg = %c0) -> index {
-      %sum = arith.addi %arg, %c0 : index
-      affine.yield %sum : index
-    }
-    // UNROLL-FULL: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
-    // UNROLL-FULL-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
-    // UNROLL-FULL-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
-    gpu.terminator
+  %bound = test.value_with_bounds { min = 0 : index, max = 1 : index}
+  affine.for %iv = %bound to 6 step 2 iter_args(%arg = %c0) -> index {
+    %sum = arith.addi %arg, %c0 : index
+    affine.yield %sum : index
   }
+  // UNROLL-FULL: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
+  // UNROLL-FULL-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
+  // UNROLL-FULL-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
   return
 }
 
-// UNROLL-FULL-LABEL: func.func @partial_unroll_factor_4
-func.func @partial_unroll_factor_4() {
-  %c0 = arith.constant 0 :index
-  %c2 = arith.constant 2 : index
+// UNROLL-FULL-LABEL: func.func @bound_partial_unroll_factor_4
+func.func @bound_partial_unroll_factor_4() {
+  %c0 = arith.constant 0 :index 
   // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
-  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c2)
-             threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c2, %sz_tz = %c2) {
-    %threadid = gpu.thread_id x
-    affine.for %iv = %threadid to 9 step 2 iter_args(%arg = %c0) -> index {
-      %sum = arith.addi %arg, %c0 : index
-      affine.yield %sum : index
-    }
-    gpu.terminator
+  // UNROLL-FULL: %[[Bound:.*]] = test.value_with_bounds {max = 1 : index, min = 0 : index}
+  %bound = test.value_with_bounds { min = 0 : index, max = 1 : index}
+  affine.for %iv = %bound to 9 step 2 iter_args(%arg = %c0) -> index {
+    %sum = arith.addi %arg, %c0 : index
+    affine.yield %sum : index
   }
-  // UNROLL-FULL: %[[ID:.*]] = gpu.thread_id  x
   // UNROLL-FULL-NEXT: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
   // UNROLL-FULL-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
   // UNROLL-FULL-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
   // UNROLL-FULL-NEXT: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index
-  // UNROLL-FULL-NEXT: affine.for %{{.*}} = [[$MAP7]]()[%[[ID]]] to 9 step 2 iter_args(%[[ARG:.*]] = %[[SUM_3]]) -> (index) {
+  // UNROLL-FULL-NEXT: affine.for %{{.*}} = [[$MAP7]]()[%[[Bound]]] to 9 step 2 iter_args(%[[ARG:.*]] = %[[SUM_3]]) -> (index) {
   // UNROLL-FULL-NEXT:   %[[SUM_4:.*]] = arith.addi %[[ARG]], %[[C0]] : index
   // UNROLL-FULL-NEXT:   affine.yield %[[SUM_4]] : index
   // UNROLL-FULL-NEXT: }
   return
 }
 
+
 // SHORT-LABEL: func @loop_nest_outer_unroll() {
 func.func @loop_nest_outer_unroll() {
   // SHORT:      affine.for %arg0 = 0 to 4 {
@@ -769,32 +756,28 @@ func.func @unroll_with_iter_args_and_promotion(%arg0 : f32, %arg1 : f32) -> f32
   return %sum : f32
 }
 
-// UNROLL-BY-4-LABEL: func @gpu_launch_unroll_by_factor_4
-func.func @gpu_launch_unroll_by_factor_4() {
+// UNROLL-BY-4-LABEL: func @bound_unroll_by_factor_4
+func.func @bound_unroll_by_factor_4() {
   %c0 = arith.constant 0 :index
-  %c2 = arith.constant 2 : index
   // UNROLL-BY-4: %[[C0:.*]] = arith.constant 0 : index
-  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c2)
-             threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c2, %sz_tz = %c2) {
-    %threadid = gpu.thread_id x
-    affine.for %iv = %threadid to 11 step 2 iter_args(%arg = %c0) -> index {
-      %sum = arith.addi %arg, %c0 : index
-      affine.yield %sum : index
-    }
-    gpu.terminator
+  %bound = test.value_with_bounds { min = 0 : index, max = 1 : index}
+  // UNROLL-BY-4: %[[Bound:.*]] = test.value_with_bounds {max = 1 : index, min = 0 : index}
+  affine.for %iv = %bound to 11 step 2 iter_args(%arg = %c0) -> index {
+    %sum = arith.addi %arg, %c0 : index
+    affine.yield %sum : index
   }
-  // UNROLL-BY-4: %[[ID:.*]] = gpu.thread_id  x
   // UNROLL-BY-4-NEXT: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
   // UNROLL-BY-4-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
   // UNROLL-BY-4-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
   // UNROLL-BY-4-NEXT: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index
-  // UNROLL-BY-4-NEXT: affine.for %[[VAL_20:.*]] = [[$MAP8]](){{\[}}%[[ID]]] to 11 step 2 iter_args(%[[ARG:.*]] = %[[SUM_3]]) -> (index) {
+  // UNROLL-BY-4-NEXT: affine.for %[[VAL_20:.*]] = [[$MAP8]](){{\[}}%[[Bound]]] to 11 step 2 iter_args(%[[ARG:.*]] = %[[SUM_3]]) -> (index) {
   // UNROLL-BY-4-NEXT:   %[[SUM_4:.*]] = arith.addi %[[ARG]], %[[C0]] : index
   // UNROLL-BY-4-NEXT:   affine.yield %[[SUM_4]] : index
   // UNROLL-BY-4-NEXT: }
   return
 }
 
+
 // UNROLL-FULL: func @unroll_zero_trip_count_case
 func.func @unroll_zero_trip_count_case() {
   // CHECK-NEXT: affine.for %{{.*}} = 0 to 0