llvm · linuxlonelyeagle · Feb 17, 2025 · Feb 22, 2025 · Feb 26, 2025 · Feb 28, 2025
diff --git a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
@@ -43,6 +43,10 @@ void getTripCountMapAndOperands(AffineForOp forOp, AffineMap *map,
 /// constant trip count in non-trivial cases.
 std::optional<uint64_t> getConstantTripCount(AffineForOp forOp);
 
+/// In some scenarios, such as GPU, the number of trip of each thread in the
+/// loop is inconsistent. This function returns the maximum number of trip.
+std::optional<uint64_t> getMaxConstantTripCount(AffineForOp forOp);
+
 /// Returns the greatest known integral divisor of the trip count. Affine
 /// expression analysis is used (indirectly through getTripCount), and
 /// this method is thus able to determine non-trivial divisors.

diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1035,6 +1035,12 @@ def GPU_LaunchOp : GPU_Op<"launch", [
     static StringRef getNumWorkgroupAttributionsAttrName() {
       return "workgroup_attributions";
     }
+
+    /// Find BlockSize via the BlockArgument of gpu.launch.
+    Value getBlockSizeOnAxis(Value threadId);
+
+    /// Find BlockSize via the Dimension Information.
+    Value getBlockSizeOnAxis(Dimension dimension);
   }];
 
   let hasCanonicalizer = 1;

diff --git a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
@@ -18,6 +18,7 @@
 #include "mlir/Dialect/Affine/Analysis/NestedMatcher.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "llvm/Support/MathExtras.h"
 
 #include "llvm/ADT/DenseSet.h"
@@ -84,6 +85,67 @@ void mlir::affine::getTripCountMapAndOperands(
                             tripCountValueMap.getOperands().end());
 }
 
+/// Replace thread_id with its maximum value, if `replaceWithZero` is true,
+/// thread_id will be replaced by its minimum value 0.
+static void replaceGPUOperands(AffineForOp forOp,
+                               SmallVectorImpl<Value> &operands,
+                               SmallVectorImpl<AffineExpr> &symReplacements,
+                               unsigned numDim, bool replaceWithZero = false) {
+  auto launchOp = forOp->getParentOfType<gpu::LaunchOp>();
+  if (!launchOp)
+    return;
+
+  // `b` is only used to create `AffineExpr`.
+  Builder b(forOp.getContext());
+  unsigned idx = 0;
+
+  for (unsigned i = numDim, e = operands.size(); i < e; ++i) {
+    Value operand = operands[i];
+    if (Value blockSize = launchOp.getBlockSizeOnAxis(operand)) {
+      operands[i] = blockSize;
+      if (!replaceWithZero)
+        symReplacements.push_back(b.getAffineSymbolExpr(idx++) - 1);
+      else
+        symReplacements.push_back(b.getAffineConstantExpr(0));
+      continue;
+    }
+
+    Operation *defOp = operand.getDefiningOp();
+    if (!defOp) {
+      ++idx;
+      continue;
+    }
+
+    if (auto threadIdOp = mlir::dyn_cast<gpu::ThreadIdOp>(defOp)) {
+      gpu::Dimension dimension = threadIdOp.getDimension();
+      operands[i] = launchOp.getBlockSizeOnAxis(dimension);
+      if (!replaceWithZero)
+        symReplacements.push_back(b.getAffineSymbolExpr(idx++) - 1);
+      else
+        symReplacements.push_back(b.getAffineConstantExpr(0));
+      continue;
+    }
+    ++idx;
+  }
+}
+
+/// Take the min if all trip counts are constant.
+static std::optional<uint64_t>
+getConstantTripCountFromAffineMap(AffineMap map) {
+  std::optional<uint64_t> tripCount;
+  for (auto resultExpr : map.getResults()) {
+    auto constExpr = dyn_cast<AffineConstantExpr>(resultExpr);
+    if (!constExpr)
+      return std::nullopt;
+    if (tripCount.has_value())
+      tripCount =
+          std::min(*tripCount, static_cast<uint64_t>(constExpr.getValue()));
+    else
+      tripCount = constExpr.getValue();
+  }
+  return tripCount;
+}
+
 /// Returns the trip count of the loop if it's a constant, std::nullopt
 /// otherwise. This method uses affine expression analysis (in turn using
 /// getTripCount) and is able to determine constant trip count in non-trivial
@@ -95,20 +157,34 @@ std::optional<uint64_t> mlir::affine::getConstantTripCount(AffineForOp forOp) {
 
   if (!map)
     return std::nullopt;
+  SmallVector<AffineExpr, 4> symReplacements;
+  replaceGPUOperands(forOp, operands, symReplacements, map.getNumDims());
+  map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(),
+                                  map.getNumSymbols());
+  affine::AffineValueMap valueMap(map, operands);
+  (void)valueMap.canonicalize();
+  map = valueMap.getAffineMap();
+  return getConstantTripCountFromAffineMap(map);
+}
 
-  // Take the min if all trip counts are constant.
-  std::optional<uint64_t> tripCount;
-  for (auto resultExpr : map.getResults()) {
-    if (auto constExpr = dyn_cast<AffineConstantExpr>(resultExpr)) {
-      if (tripCount.has_value())
-        tripCount =
-            std::min(*tripCount, static_cast<uint64_t>(constExpr.getValue()));
-      else
-        tripCount = constExpr.getValue();
-    } else
-      return std::nullopt;
-  }
-  return tripCount;
+/// In some scenarios, such as GPU, the number of trip of each thread in the
+/// loop is inconsistent. This function returns the maximum number of trip.
+std::optional<uint64_t>
+mlir::affine::getMaxConstantTripCount(AffineForOp forOp) {
+  SmallVector<Value, 4> operands;
+  AffineMap map;
+  getTripCountMapAndOperands(forOp, &map, &operands);
+
+  if (!map)
+    return std::nullopt;
+  SmallVector<AffineExpr, 4> symReplacements;
+  replaceGPUOperands(forOp, operands, symReplacements, map.getNumDims(), true);
+  map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(),
+                                  map.getNumSymbols());
+  affine::AffineValueMap valueMap(map, operands);
+  (void)valueMap.canonicalize();
+  map = valueMap.getAffineMap();
+  return getConstantTripCountFromAffineMap(map);
 }
 
 /// Returns the greatest known integral divisor of the trip count. Affine
@@ -121,7 +197,13 @@ uint64_t mlir::affine::getLargestDivisorOfTripCount(AffineForOp forOp) {
 
   if (!map)
     return 1;
-
+  SmallVector<AffineExpr, 4> symReplacements;
+  replaceGPUOperands(forOp, operands, symReplacements, map.getNumDims());
+  map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(),
+                                  map.getNumSymbols());
+  affine::AffineValueMap valueMap(map, operands);
+  (void)valueMap.canonicalize();
+  map = valueMap.getAffineMap();
   // The largest divisor of the trip count is the GCD of the individual largest
   // divisors.
   assert(map.getNumResults() >= 1 && "expected one or more results");

diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
@@ -17,6 +17,7 @@
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
 #include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/IR/IRMapping.h"
@@ -117,7 +118,8 @@ static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
 /// was known to have a single iteration.
 LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
   std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
-  if (!tripCount || *tripCount != 1)
+  std::optional<uint64_t> maxTripCount = getMaxConstantTripCount(forOp);
+  if (!tripCount || *tripCount != 1 || !maxTripCount || *maxTripCount != 1)
     return failure();
 
   // TODO: extend this for arbitrary affine bounds.
@@ -160,7 +162,8 @@ LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
   forOp.getBody()->back().erase();
   parentBlock->getOperations().splice(Block::iterator(forOp),
                                       forOp.getBody()->getOperations());
-  forOp.erase();
+  IRRewriter b(forOp.getContext());
+  b.eraseOp(forOp);
   return success();
 }
 
@@ -884,15 +887,23 @@ void mlir::affine::getTileableBands(
 /// Unrolls this loop completely.
 LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) {
   std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
-  if (mayBeConstantTripCount.has_value()) {
-    uint64_t tripCount = *mayBeConstantTripCount;
-    if (tripCount == 0)
-      return success();
-    if (tripCount == 1)
-      return promoteIfSingleIteration(forOp);
-    return loopUnrollByFactor(forOp, tripCount);
-  }
-  return failure();
+  std::optional<uint64_t> maxMayBeConstantTripCount =
+      getMaxConstantTripCount(forOp);
+
+  if (!mayBeConstantTripCount.has_value() &&
+      !maxMayBeConstantTripCount.has_value())
+    return failure();
+
+  uint64_t tripCount = *mayBeConstantTripCount;
+  uint64_t maxTripCount = *maxMayBeConstantTripCount;
+
+  // Trip equals 0, this loop cannot unroll.
+  if (tripCount <= 0)
+    return success();
+
+  if (tripCount == 1 && maxTripCount == 1)
+    return promoteIfSingleIteration(forOp);
+  return loopUnrollByFactor(forOp, tripCount);
 }
 
 /// Unrolls this loop by the specified factor or by the trip count (if constant)
@@ -1013,8 +1024,11 @@ LogicalResult mlir::affine::loopUnrollByFactor(
   assert(unrollFactor > 0 && "unroll factor should be positive");
 
   std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
+  std::optional<uint64_t> maxMayBeConstantTripCount =
+      getMaxConstantTripCount(forOp);
   if (unrollFactor == 1) {
     if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
+        maxMayBeConstantTripCount && *maxMayBeConstantTripCount == 1 &&
         failed(promoteIfSingleIteration(forOp)))
       return failure();
     return success();

diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -799,6 +799,26 @@ std::optional<KernelDim3> LaunchOp::getClusterSizeOperandValues() {
   return KernelDim3{operands[6], operands[7], operands[8]};
 }
 
+Value LaunchOp::getBlockSizeOnAxis(Dimension dimension) {
+  if (dimension == Dimension::x)
+    return getBlockSizeX();
+  else if (dimension == Dimension::y)
+    return getBlockSizeY();
+  else
+    return getBlockSizeZ();
+}
+
+Value LaunchOp::getBlockSizeOnAxis(Value threadId) {
+  KernelDim3 threadIds = getThreadIds();
+  if (threadIds.x == threadId)
+    return getBlockSizeX();
+  else if (threadIds.y == threadId)
+    return getBlockSizeY();
+  else if (threadIds.z == threadId)
+    return getBlockSizeZ();
+  return {};
+}
+
 LogicalResult LaunchOp::verify() {
   if (!(hasClusterSize()) &&
       (getClusterSizeX() || getClusterSizeY() || getClusterSizeZ()))

diff --git a/mlir/test/Dialect/Affine/unroll.mlir b/mlir/test/Dialect/Affine/unroll.mlir
@@ -23,6 +23,7 @@
 // UNROLL-BY-4-DAG: [[$MAP5:#map[0-9]*]] = affine_map<(d0)[s0] -> (d0 + s0 + 1)>
 // UNROLL-BY-4-DAG: [[$MAP6:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 * 16 + d1)>
 // UNROLL-BY-4-DAG: [[$MAP11:#map[0-9]*]] = affine_map<(d0) -> (d0)>
+// UNROLL-BY-4-DAG: [[$MAP7:#map[0-9]*]] = affine_map<()[s0] -> (s0 + (((-s0 + 11) ceildiv 2) floordiv 4) * 8)>
 
 // UNROLL-FULL-LABEL: func @loop_nest_simplest() {
 func.func @loop_nest_simplest() {
@@ -258,6 +259,71 @@ gpu.module @unroll_full {
   }
 }
 
+// UNROLL-FULL-LABEL: func @thread_partial_execution
+func.func @thread_partial_execution() {
+  %0 = arith.constant 0 :index
+  %1 = arith.constant 2 : index    
+  // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
+  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
+             threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
+    affine.for %iv = %tx to 3 step 2 iter_args(%arg = %0) -> index {
+      %3 = arith.addi %arg, %0 : index
+      affine.yield %3 : index
+    }
+    // UNROLL-FULL: affine.for %{{.*}} = %{{.*}} to 3 step 2 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) {
+    // UNROLL-FULL-NEXT:   %[[SUM:.*]] = arith.addi %[[ARG]], %[[C0]] : index
+    // UNROLL-FULL-NEXT:   affine.yield %[[SUM]] : index
+    // UNROLL-FULL-NEXT: }
+    gpu.terminator
+  }
+  return
+}
+
+// UNROLL-FULL-LABEL: func @unroll_all_thread
+func.func @unroll_all_thread() {
+  %0 = arith.constant 0 :index
+  %1 = arith.constant 2 : index
+  // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
+  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
+             threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
+    %threadid = gpu.thread_id x
+    %4 = affine.for %iv = %threadid to 6 step 2 iter_args(%arg = %0) -> index {
+      %3 = arith.addi %arg, %0 : index
+      affine.yield %3 : index
+    }
+    // UNROLL-FULL: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
+    // UNROLL-FULL-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
+    // UNROLL-FULL-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
+    gpu.terminator
+  }
+  return
+}
+
+// UNROLL-FULL-LABEL: func.func @partial_unroll_factor_4
+func.func @partial_unroll_factor_4() {
+  %0 = arith.constant 0 :index
+  %1 = arith.constant 2 : index
+  // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
+  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
+             threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
+    %threadid = gpu.thread_id x
+    affine.for %iv = %threadid to 9 step 2 iter_args(%arg = %0) -> index {
+      %3 = arith.addi %arg, %0 : index
+      affine.yield %3 : index
+    }
+    gpu.terminator
+  }
+  // UNROLL-FULL: %[[ID:.*]] = gpu.thread_id  x
+  // UNROLL-FULL-NEXT: affine.for %{{.*}} = %[[ID]] to 9 step 8 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) {
+  // UNROLL-FULL-NEXT:   %[[SUM_0:.*]] = arith.addi %[[ARG]], %[[C0]] : index
+  // UNROLL-FULL-NEXT:   %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
+  // UNROLL-FULL-NEXT:   %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
+  // UNROLL-FULL-NEXT:   %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index
+  // UNROLL-FULL-NEXT:   affine.yield %[[SUM_3]] : index
+  // UNROLL-FULL-NEXT: }
+  return
+}
+
 // SHORT-LABEL: func @loop_nest_outer_unroll() {
 func.func @loop_nest_outer_unroll() {
   // SHORT:      affine.for %arg0 = 0 to 4 {
@@ -701,6 +767,32 @@ func.func @unroll_with_iter_args_and_promotion(%arg0 : f32, %arg1 : f32) -> f32
   return %sum : f32
 }
 
+// UNROLL-BY-4-LABEL: func @gpu_launch_unroll_by_factor_4
+func.func @gpu_launch_unroll_by_factor_4() {
+  %0 = arith.constant 0 :index
+  %1 = arith.constant 2 : index
+  // UNROLL-BY-4: %[[C0:.*]] = arith.constant 0 : index
+  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
+             threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
+    %threadid = gpu.thread_id x
+    affine.for %iv = %threadid to 11 step 2 iter_args(%arg = %0) -> index {
+      %3 = arith.addi %arg, %0 : index
+      affine.yield %3 : index
+    }
+    gpu.terminator
+  }
+  // UNROLL-BY-4: %[[ID:.*]] = gpu.thread_id  x
+  // UNROLL-BY-4-NEXT: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
+  // UNROLL-BY-4-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
+  // UNROLL-BY-4-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
+  // UNROLL-BY-4-NEXT: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index
+  // UNROLL-BY-4-NEXT: affine.for %[[VAL_20:.*]] = [[$MAP7]](){{\[}}%[[ID]]] to 11 step 2 iter_args(%[[ARG:.*]] = %[[SUM_3]]) -> (index) {
+  // UNROLL-BY-4-NEXT:   %[[SUM_4:.*]] = arith.addi %[[ARG]], %[[C0]] : index
+  // UNROLL-BY-4-NEXT:   affine.yield %[[SUM_4]] : index
+  // UNROLL-BY-4-NEXT: }
+  return
+}
+
 // UNROLL-FULL: func @unroll_zero_trip_count_case
 func.func @unroll_zero_trip_count_case() {
   // CHECK-NEXT: affine.for %{{.*}} = 0 to 0