llvm · linuxlonelyeagle · Feb 17, 2025 · Feb 22, 2025 · Feb 26, 2025 · Feb 28, 2025
diff --git a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
@@ -43,6 +43,11 @@ void getTripCountMapAndOperands(AffineForOp forOp, AffineMap *map,
 /// constant trip count in non-trivial cases.
 std::optional<uint64_t> getConstantTripCount(AffineForOp forOp);
 
+/// Returns the maximum trip count of `forOp` when its bound operands have a
+/// known integer range, and std::nullopt if it cannot be determined. If the
+/// operands are constant, this is the same as `getConstantTripCount`.
+std::optional<uint64_t> getUpperBoundOnTripCount(AffineForOp forOp);
+
 /// Returns the greatest known integral divisor of the trip count. Affine
 /// expression analysis is used (indirectly through getTripCount), and
 /// this method is thus able to determine non-trivial divisors.

diff --git a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
@@ -12,12 +12,15 @@
 
 #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
 
+#include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h"
+#include "mlir/Analysis/DataFlow/IntegerRangeAnalysis.h"
 #include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
 #include "mlir/Dialect/Affine/Analysis/AffineStructures.h"
 #include "mlir/Dialect/Affine/Analysis/NestedMatcher.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
 #include "llvm/Support/MathExtras.h"
 
 #include "llvm/ADT/DenseSet.h"
@@ -30,6 +33,7 @@
 
 using namespace mlir;
 using namespace mlir::affine;
+using namespace mlir::dataflow;
 
 #define DEBUG_TYPE "affine-loop-analysis"
 
@@ -84,6 +88,73 @@ void mlir::affine::getTripCountMapAndOperands(
                             tripCountValueMap.getOperands().end());
 }
 
+/// Populates `symReplacements` with one affine expression per symbol operand
+/// of `forOp`'s trip count map, i.e. for `operands[numDim..]`. The signed range
+/// of each such operand is queried from `IntegerRangeAnalysis`, which is run on
+/// the enclosing function. An operand with a known, bounded range is replaced
+/// by a constant: the signed maximum of the range by default, or the signed
+/// minimum if `replaceByMin` is true. Any other operand is mapped back to its
+/// own symbol. `symReplacements` is left untouched if there is no enclosing
+/// function or if the analysis fails.
+static void replaceOperandByRange(AffineForOp forOp,
+                                  SmallVectorImpl<Value> &operands,
+                                  SmallVectorImpl<AffineExpr> &symReplacements,
+                                  unsigned numDim, bool replaceByMin = false) {
+  // The analysis needs a root operation to run on.
+  auto funcOp = forOp->getParentOfType<FunctionOpInterface>();
+  if (!funcOp)
+    return;
+
+  DataFlowSolver solver;
+  solver.load<DeadCodeAnalysis>();
+  solver.load<IntegerRangeAnalysis>();
+  if (failed(solver.initializeAndRun(funcOp)))
+    return;
+
+  // `b` is only used to create affine expressions.
+  Builder b(forOp.getContext());
+  for (unsigned i = numDim, e = operands.size(); i < e; ++i) {
+    AffineExpr symbol = b.getAffineSymbolExpr(i - numDim);
+    const auto *lattice =
+        solver.lookupState<dataflow::IntegerValueRangeLattice>(operands[i]);
+    // No usable range information: keep the symbol as is.
+    if (!lattice || lattice->getValue().isUninitialized()) {
+      symReplacements.push_back(symbol);
+      continue;
+    }
+
+    const ConstantIntRanges &range = lattice->getValue().getValue();
+    const APInt &max = range.smax();
+    const APInt &min = range.smin();
+    // A range spanning every signed value carries no information either.
+    if (max.isMaxSignedValue() && min.isMinSignedValue()) {
+      symReplacements.push_back(symbol);
+      continue;
+    }
+
+    // The bounds are signed, so they have to be sign-extended.
+    const APInt &bound = replaceByMin ? min : max;
+    symReplacements.push_back(b.getAffineConstantExpr(bound.getSExtValue()));
+  }
+}
+
+/// Returns the minimum over all results of `map` if every result is a constant
+/// expression, and std::nullopt otherwise (or if `map` has no results). A
+/// negative result denotes a loop that never executes and is clamped to zero.
+static std::optional<uint64_t>
+getConstantTripCountFromAffineMap(AffineMap map) {
+  std::optional<uint64_t> tripCount;
+  for (AffineExpr resultExpr : map.getResults()) {
+    auto constExpr = dyn_cast<AffineConstantExpr>(resultExpr);
+    if (!constExpr)
+      return std::nullopt;
+    // Clamp before converting: a negative value would wrap to a huge count.
+    uint64_t count = std::max<int64_t>(constExpr.getValue(), 0);
+    tripCount = tripCount ? std::min(*tripCount, count) : count;
+  }
+  return tripCount;
+}
+
 /// Returns the trip count of the loop if it's a constant, std::nullopt
 /// otherwise. This method uses affine expression analysis (in turn using
 /// getTripCount) and is able to determine constant trip count in non-trivial
@@ -95,20 +166,30 @@ std::optional<uint64_t> mlir::affine::getConstantTripCount(AffineForOp forOp) {
 
   if (!map)
     return std::nullopt;
+  SmallVector<AffineExpr, 4> symReplacements;
+  replaceOperandByRange(forOp, operands, symReplacements, map.getNumDims());
+  map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(),
+                                  map.getNumSymbols());
+  return getConstantTripCountFromAffineMap(map);
+}
 
-  // Take the min if all trip counts are constant.
-  std::optional<uint64_t> tripCount;
-  for (auto resultExpr : map.getResults()) {
-    if (auto constExpr = dyn_cast<AffineConstantExpr>(resultExpr)) {
-      if (tripCount.has_value())
-        tripCount =
-            std::min(*tripCount, static_cast<uint64_t>(constExpr.getValue()));
-      else
-        tripCount = constExpr.getValue();
-    } else
-      return std::nullopt;
-  }
-  return tripCount;
+/// Computes the trip count of `forOp` after substituting, for every symbol
+/// operand of the trip count map with a known integer range, the signed
+/// minimum of that range. Yields std::nullopt if there is no trip count map or
+/// if some result of the map is still not constant after the substitution.
+std::optional<uint64_t>
+mlir::affine::getUpperBoundOnTripCount(AffineForOp forOp) {
+  AffineMap tripCountMap;
+  SmallVector<Value, 4> tripCountOperands;
+  getTripCountMapAndOperands(forOp, &tripCountMap, &tripCountOperands);
+  if (!tripCountMap)
+    return std::nullopt;
+  const unsigned numDims = tripCountMap.getNumDims();
+  SmallVector<AffineExpr, 4> replacements;
+  replaceOperandByRange(forOp, tripCountOperands, replacements, numDims,
+                        /*replaceByMin=*/true);
+  return getConstantTripCountFromAffineMap(tripCountMap.replaceDimsAndSymbols(
+      {}, replacements, numDims, tripCountMap.getNumSymbols()));
 }
 
 /// Returns the greatest known integral divisor of the trip count. Affine
@@ -121,7 +202,10 @@ uint64_t mlir::affine::getLargestDivisorOfTripCount(AffineForOp forOp) {
 
   if (!map)
     return 1;
-
+  SmallVector<AffineExpr, 4> symReplacements;
+  replaceOperandByRange(forOp, operands, symReplacements, map.getNumDims());
+  map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(),
+                                  map.getNumSymbols());
   // The largest divisor of the trip count is the GCD of the individual largest
   // divisors.
   assert(map.getNumResults() >= 1 && "expected one or more results");

diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
@@ -117,7 +117,8 @@ static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
 /// was known to have a single iteration.
 LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
   std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
-  if (!tripCount || *tripCount != 1)
+  std::optional<uint64_t> maxTripCount = getUpperBoundOnTripCount(forOp);
+  if (!tripCount || *tripCount != 1 || !maxTripCount || *maxTripCount != 1)
     return failure();
 
   // TODO: extend this for arbitrary affine bounds.
@@ -160,7 +161,8 @@ LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
   forOp.getBody()->back().erase();
   parentBlock->getOperations().splice(Block::iterator(forOp),
                                       forOp.getBody()->getOperations());
-  forOp.erase();
+  IRRewriter b(forOp.getContext());
+  b.eraseOp(forOp);
   return success();
 }
 
@@ -884,15 +886,23 @@ void mlir::affine::getTileableBands(
 /// Unrolls this loop completely.
 LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) {
   std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
-  if (mayBeConstantTripCount.has_value()) {
-    uint64_t tripCount = *mayBeConstantTripCount;
-    if (tripCount == 0)
-      return success();
-    if (tripCount == 1)
-      return promoteIfSingleIteration(forOp);
-    return loopUnrollByFactor(forOp, tripCount);
-  }
-  return failure();
+  std::optional<uint64_t> maxMayBeConstantTripCount =
+      getUpperBoundOnTripCount(forOp);
+
+  // The trip count doubles as the unroll factor, so it has to be known. It is
+  // the only optional dereferenced here; the upper bound is read defensively.
+  if (!mayBeConstantTripCount.has_value())
+    return failure();
+  uint64_t tripCount = *mayBeConstantTripCount;
+
+  // A loop that never executes has nothing to unroll.
+  if (tripCount == 0)
+    return success();
+
+  // Promote only if the upper bound is known and is also a single iteration.
+  if (tripCount == 1 && maxMayBeConstantTripCount.value_or(0) == 1)
+    return promoteIfSingleIteration(forOp);
+  return loopUnrollByFactor(forOp, tripCount);
 }
 
 /// Unrolls this loop by the specified factor or by the trip count (if constant)
@@ -1013,8 +1023,11 @@ LogicalResult mlir::affine::loopUnrollByFactor(
   assert(unrollFactor > 0 && "unroll factor should be positive");
 
   std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
+  std::optional<uint64_t> maxMayBeConstantTripCount =
+      getUpperBoundOnTripCount(forOp);
   if (unrollFactor == 1) {
     if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
+        maxMayBeConstantTripCount && *maxMayBeConstantTripCount == 1 &&
         failed(promoteIfSingleIteration(forOp)))
       return failure();
     return success();

diff --git a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
@@ -250,26 +250,34 @@ void SubgroupSizeOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
 void LaunchOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
                                  SetIntRangeFn setResultRange) {
   auto setRange = [&](const ConstantIntRanges &argRange, Value dimResult,
-                      Value idxResult) {
+                      Value idxResult, Value size) {
     if (argRange.umin().getBitWidth() != IndexType::kInternalStorageBitWidth)
       return;
-    ConstantIntRanges dimRange =
-        argRange.intersection(getIndexRange(1, kMaxDim));
-    setResultRange(dimResult, dimRange);
-    ConstantIntRanges idxRange =
-        getIndexRange(0, dimRange.umax().getZExtValue() - 1);
-    setResultRange(idxResult, idxRange);
+    // A launch size that matches a constant integer pins the dimension to that
+    // value; otherwise the incoming range is intersected with [1, kMaxDim].
+    APInt sizeInt;
+    bool isConstantSize = matchPattern(size, m_ConstantInt(&sizeInt));
+    ConstantIntRanges dimRange =
+        isConstantSize ? ConstantIntRanges::constant(sizeInt)
+                       : argRange.intersection(getIndexRange(1, kMaxDim));
+    setResultRange(dimResult, dimRange);
+
+    // The id range starts at zero and ends one below the largest dimension
+    // value. For a constant size, `dimRange.umax()` is that constant.
+    uint64_t maxId = dimRange.umax().getZExtValue() - 1;
+    ConstantIntRanges idxRange = getIndexRange(0, maxId);
+    setResultRange(idxResult, idxRange);
   };
 
   argRanges = argRanges.drop_front(getAsyncDependencies().size());
   KernelDim3 gridDims = getGridSize();
   KernelDim3 blockIds = getBlockIds();
-  setRange(argRanges[0], gridDims.x, blockIds.x);
-  setRange(argRanges[1], gridDims.y, blockIds.y);
-  setRange(argRanges[2], gridDims.z, blockIds.z);
+  setRange(argRanges[0], gridDims.x, blockIds.x, getGridSizeX());
+  setRange(argRanges[1], gridDims.y, blockIds.y, getGridSizeY());
+  setRange(argRanges[2], gridDims.z, blockIds.z, getGridSizeZ());
   KernelDim3 blockDims = getBlockSize();
   KernelDim3 threadIds = getThreadIds();
-  setRange(argRanges[3], blockDims.x, threadIds.x);
-  setRange(argRanges[4], blockDims.y, threadIds.y);
-  setRange(argRanges[5], blockDims.z, threadIds.z);
+  setRange(argRanges[3], blockDims.x, threadIds.x, getBlockSizeX());
+  setRange(argRanges[4], blockDims.y, threadIds.y, getBlockSizeY());
+  setRange(argRanges[5], blockDims.z, threadIds.z, getBlockSizeZ());
 }
diff --git a/mlir/test/Dialect/Affine/unroll.mlir b/mlir/test/Dialect/Affine/unroll.mlir
@@ -23,6 +23,7 @@
 // UNROLL-BY-4-DAG: [[$MAP5:#map[0-9]*]] = affine_map<(d0)[s0] -> (d0 + s0 + 1)>
 // UNROLL-BY-4-DAG: [[$MAP6:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 * 16 + d1)>
 // UNROLL-BY-4-DAG: [[$MAP11:#map[0-9]*]] = affine_map<(d0) -> (d0)>
+// UNROLL-BY-4-DAG: [[$MAP7:#map[0-9]*]] = affine_map<()[s0] -> (s0 + (((-s0 + 11) ceildiv 2) floordiv 4) * 8)>
 
 // UNROLL-FULL-LABEL: func @loop_nest_simplest() {
 func.func @loop_nest_simplest() {
@@ -258,6 +259,71 @@ gpu.module @unroll_full {
   }
 }
 
+// UNROLL-FULL-LABEL: func @thread_partial_execution
+func.func @thread_partial_execution() {
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
+  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c2)
+             threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c2, %sz_tz = %c2) {
+    affine.for %i = %tx to 3 step 2 iter_args(%acc = %c0) -> index {
+      %sum = arith.addi %acc, %c0 : index
+      affine.yield %sum : index
+    }
+    // UNROLL-FULL: affine.for %{{.*}} = %{{.*}} to 3 step 2 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) {
+    // UNROLL-FULL-NEXT:   %[[SUM:.*]] = arith.addi %[[ARG]], %[[C0]] : index
+    // UNROLL-FULL-NEXT:   affine.yield %[[SUM]] : index
+    // UNROLL-FULL-NEXT: }
+    gpu.terminator
+  }
+  return
+}
+
+// UNROLL-FULL-LABEL: func @unroll_all_thread
+func.func @unroll_all_thread() {
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
+  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c2)
+             threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c2, %sz_tz = %c2) {
+    %tid = gpu.thread_id x
+    %res = affine.for %i = %tid to 6 step 2 iter_args(%acc = %c0) -> index {
+      %sum = arith.addi %acc, %c0 : index
+      affine.yield %sum : index
+    }
+    // UNROLL-FULL: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
+    // UNROLL-FULL-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
+    // UNROLL-FULL-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
+    gpu.terminator
+  }
+  return
+}
+
+// UNROLL-FULL-LABEL: func.func @partial_unroll_factor_4
+func.func @partial_unroll_factor_4() {
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
+  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c2)
+             threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c2, %sz_tz = %c2) {
+    %tid = gpu.thread_id x
+    affine.for %i = %tid to 9 step 2 iter_args(%acc = %c0) -> index {
+      %sum = arith.addi %acc, %c0 : index
+      affine.yield %sum : index
+    }
+    gpu.terminator
+  }
+  // UNROLL-FULL: %[[ID:.*]] = gpu.thread_id  x
+  // UNROLL-FULL-NEXT: affine.for %{{.*}} = %[[ID]] to 9 step 8 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) {
+  // UNROLL-FULL-NEXT:   %[[SUM_0:.*]] = arith.addi %[[ARG]], %[[C0]] : index
+  // UNROLL-FULL-NEXT:   %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
+  // UNROLL-FULL-NEXT:   %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
+  // UNROLL-FULL-NEXT:   %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index
+  // UNROLL-FULL-NEXT:   affine.yield %[[SUM_3]] : index
+  // UNROLL-FULL-NEXT: }
+  return
+}
+
 // SHORT-LABEL: func @loop_nest_outer_unroll() {
 func.func @loop_nest_outer_unroll() {
   // SHORT:      affine.for %arg0 = 0 to 4 {
@@ -701,6 +767,32 @@ func.func @unroll_with_iter_args_and_promotion(%arg0 : f32, %arg1 : f32) -> f32
   return %sum : f32
 }
 
+// UNROLL-BY-4-LABEL: func @gpu_launch_unroll_by_factor_4
+func.func @gpu_launch_unroll_by_factor_4() {
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  // UNROLL-BY-4: %[[C0:.*]] = arith.constant 0 : index
+  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c2)
+             threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c2, %sz_tz = %c2) {
+    %tid = gpu.thread_id x
+    affine.for %i = %tid to 11 step 2 iter_args(%acc = %c0) -> index {
+      %sum = arith.addi %acc, %c0 : index
+      affine.yield %sum : index
+    }
+    gpu.terminator
+  }
+  // UNROLL-BY-4: %[[ID:.*]] = gpu.thread_id  x
+  // UNROLL-BY-4-NEXT: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
+  // UNROLL-BY-4-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
+  // UNROLL-BY-4-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
+  // UNROLL-BY-4-NEXT: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index
+  // UNROLL-BY-4-NEXT: affine.for %[[VAL_20:.*]] = [[$MAP7]](){{\[}}%[[ID]]] to 11 step 2 iter_args(%[[ARG:.*]] = %[[SUM_3]]) -> (index) {
+  // UNROLL-BY-4-NEXT:   %[[SUM_4:.*]] = arith.addi %[[ARG]], %[[C0]] : index
+  // UNROLL-BY-4-NEXT:   affine.yield %[[SUM_4]] : index
+  // UNROLL-BY-4-NEXT: }
+  return
+}
+
 // UNROLL-FULL: func @unroll_zero_trip_count_case
 func.func @unroll_zero_trip_count_case() {
   // CHECK-NEXT: affine.for %{{.*}} = 0 to 0