From 23b3a7fe966996aa5c6dd9bb3b9e18c840a33075 Mon Sep 17 00:00:00 2001 From: linuxlonelyeagle <2020382038@qq.com> Date: Mon, 17 Feb 2025 16:58:22 +0800 Subject: [PATCH 1/8] support unroll by the gpu.launchOp. --- .../Dialect/Affine/Analysis/LoopAnalysis.h | 4 + mlir/include/mlir/Dialect/Affine/LoopUtils.h | 3 + mlir/include/mlir/Dialect/GPU/IR/GPUOps.td | 6 + .../Dialect/Affine/Analysis/LoopAnalysis.cpp | 110 +++++++++++++++--- mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp | 57 +++++++-- mlir/lib/Dialect/GPU/IR/GPUDialect.cpp | 20 ++++ mlir/test/Dialect/Affine/unroll.mlir | 110 ++++++++++++++++++ 7 files changed, 285 insertions(+), 25 deletions(-) diff --git a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h index ed3c21d952a01..2bd540b9af2eb 100644 --- a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h +++ b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h @@ -43,6 +43,10 @@ void getTripCountMapAndOperands(AffineForOp forOp, AffineMap *map, /// constant trip count in non-trivial cases. std::optional getConstantTripCount(AffineForOp forOp); +/// In the GPU, the number of trip of each thread in the loop is inconsistent. +/// This function returns the maximum number of trip. +std::optional getMaxConstantTripCount(AffineForOp forOp); + /// Returns the greatest known integral divisor of the trip count. Affine /// expression analysis is used (indirectly through getTripCount), and /// this method is thus able to determine non-trivial divisors. diff --git a/mlir/include/mlir/Dialect/Affine/LoopUtils.h b/mlir/include/mlir/Dialect/Affine/LoopUtils.h index 7fe1f6d48ceeb..1d1d6d94d2382 100644 --- a/mlir/include/mlir/Dialect/Affine/LoopUtils.h +++ b/mlir/include/mlir/Dialect/Affine/LoopUtils.h @@ -86,6 +86,9 @@ LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp, /// was known to have a single iteration. 
LogicalResult promoteIfSingleIteration(AffineForOp forOp); +/// Eliminate loops that will never actually execute. +LogicalResult removeInvalidLoop(AffineForOp forOp); + /// Promotes all single iteration AffineForOp's in the Function, i.e., moves /// their body into the containing Block. void promoteSingleIterationLoops(func::FuncOp f); diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td index 2b1ce573effd0..940d47c5ef2c8 100644 --- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td @@ -1035,6 +1035,12 @@ def GPU_LaunchOp : GPU_Op<"launch", [ static StringRef getNumWorkgroupAttributionsAttrName() { return "workgroup_attributions"; } + + /// Find BlockSize via the BlockArgument of gpu.launch. + Value getBlockSizeOnAxis(Value threadId); + + /// Find BlockSize via the Dimension Information. + Value getBlockSizeOnAxis(Dimension dimension); }]; let hasCanonicalizer = 1; diff --git a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp index 0d4b0ea1668e0..15a5376fa922e 100644 --- a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp +++ b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp @@ -18,6 +18,7 @@ #include "mlir/Dialect/Affine/Analysis/NestedMatcher.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Affine/IR/AffineValueMap.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "llvm/Support/MathExtras.h" #include "llvm/ADT/DenseSet.h" @@ -84,6 +85,67 @@ void mlir::affine::getTripCountMapAndOperands( tripCountValueMap.getOperands().end()); } +/// Replace thread_id with its maximum value, if `replaceWithZero` is true, +/// thread_id will be replaced by its minimum value 0. 
+static void replaceGPUOperands(AffineForOp forOp, + SmallVectorImpl &operands, + SmallVectorImpl &symReplacements, + unsigned numDim, bool replaceWithZero = false) { + auto launchOp = forOp->getParentOfType(); + if (!launchOp) + return; + + // `b` is only used to create `AffineExpr`. + Builder b(forOp.getContext()); + unsigned idx = 0; + + for (unsigned i = numDim, e = operands.size(); i < e; ++i) { + Value operand = operands[i]; + if (Value blockSize = launchOp.getBlockSizeOnAxis(operand)) { + operands[i] = blockSize; + if (!replaceWithZero) + symReplacements.push_back(b.getAffineSymbolExpr(idx++) - 1); + else + symReplacements.push_back(b.getAffineConstantExpr(0)); + continue; + } + + Operation *defOp = operand.getDefiningOp(); + if (!defOp) { + ++idx; + continue; + } + + if (auto threadIdOp = mlir::dyn_cast(defOp)) { + gpu::Dimension dimension = threadIdOp.getDimension(); + operands[i] = launchOp.getBlockSizeOnAxis(dimension); + if (!replaceWithZero) + symReplacements.push_back(b.getAffineSymbolExpr(idx++) - 1); + else + symReplacements.push_back(b.getAffineConstantExpr(0)); + continue; + } + ++idx; + } +} + +/// Take the min if all trip counts are constant. +static std::optional +getConstantTripCountFromAffineMap(AffineMap map) { + std::optional tripCount; + for (auto resultExpr : map.getResults()) { + auto constExpr = dyn_cast(resultExpr); + if (!constExpr) + return std::nullopt; + if (tripCount.has_value()) + tripCount = + std::min(*tripCount, static_cast(constExpr.getValue())); + else + tripCount = constExpr.getValue(); + } + return tripCount; +} + /// Returns the trip count of the loop if it's a constant, std::nullopt /// otherwise. 
This method uses affine expression analysis (in turn using /// getTripCount) and is able to determine constant trip count in non-trivial @@ -95,20 +157,34 @@ std::optional mlir::affine::getConstantTripCount(AffineForOp forOp) { if (!map) return std::nullopt; + SmallVector symReplacements; + replaceGPUOperands(forOp, operands, symReplacements, map.getNumDims()); + map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(), + map.getNumSymbols()); + affine::AffineValueMap valueMap(map, operands); + (void)valueMap.canonicalize(); + map = valueMap.getAffineMap(); + return getConstantTripCountFromAffineMap(map); +} - // Take the min if all trip counts are constant. - std::optional tripCount; - for (auto resultExpr : map.getResults()) { - if (auto constExpr = dyn_cast(resultExpr)) { - if (tripCount.has_value()) - tripCount = - std::min(*tripCount, static_cast(constExpr.getValue())); - else - tripCount = constExpr.getValue(); - } else - return std::nullopt; - } - return tripCount; +/// In some scenarios, such as GPU, the number of trip of each thread in the +/// loop is inconsistent. This function returns the maximum number of trip. +std::optional +mlir::affine::getMaxConstantTripCount(AffineForOp forOp) { + SmallVector operands; + AffineMap map; + getTripCountMapAndOperands(forOp, &map, &operands); + + if (!map) + return std::nullopt; + SmallVector symReplacements; + replaceGPUOperands(forOp, operands, symReplacements, map.getNumDims(), true); + map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(), + map.getNumSymbols()); + affine::AffineValueMap valueMap(map, operands); + (void)valueMap.canonicalize(); + map = valueMap.getAffineMap(); + return getConstantTripCountFromAffineMap(map); } /// Returns the greatest known integral divisor of the trip count. 
Affine @@ -121,7 +197,13 @@ uint64_t mlir::affine::getLargestDivisorOfTripCount(AffineForOp forOp) { if (!map) return 1; - + SmallVector symReplacements; + replaceGPUOperands(forOp, operands, symReplacements, map.getNumDims()); + map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(), + map.getNumSymbols()); + affine::AffineValueMap valueMap(map, operands); + (void)valueMap.canonicalize(); + map = valueMap.getAffineMap(); // The largest divisor of the trip count is the GCD of the individual largest // divisors. assert(map.getNumResults() >= 1 && "expected one or more results"); diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp index 4e02559a08949..69ceb0f80095b 100644 --- a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp +++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp @@ -17,6 +17,7 @@ #include "mlir/Dialect/Affine/IR/AffineValueMap.h" #include "mlir/Dialect/Affine/Utils.h" #include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/IR/IRMapping.h" @@ -113,11 +114,29 @@ static void replaceIterArgsAndYieldResults(AffineForOp forOp) { std::get<0>(e).replaceAllUsesWith(std::get<1>(e)); } +/// Eliminate loops that will never actually execute +LogicalResult mlir::affine::removeInvalidLoop(AffineForOp forOp) { + std::optional tripCount = getConstantTripCount(forOp); + std::optional maxTripCount = getMaxConstantTripCount(forOp); + if (!tripCount || *tripCount > 0 || !maxTripCount || *maxTripCount > 0) + return failure(); + + auto iterOperands = forOp.getInits(); + auto results = forOp.getResults(); + for (auto [result, operand] : llvm::zip(results, iterOperands)) + result.replaceAllUsesWith(operand); + + IRRewriter b(forOp); + b.eraseOp(forOp); + return success(); +} + /// Promotes the loop body of a forOp to its containing block if the forOp /// was known to have a single iteration. 
LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) { std::optional tripCount = getConstantTripCount(forOp); - if (!tripCount || *tripCount != 1) + std::optional maxTripCount = getMaxConstantTripCount(forOp); + if (!tripCount || *tripCount != 1 || !maxTripCount || *maxTripCount != 1) return failure(); // TODO: extend this for arbitrary affine bounds. @@ -160,7 +179,8 @@ LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) { forOp.getBody()->back().erase(); parentBlock->getOperations().splice(Block::iterator(forOp), forOp.getBody()->getOperations()); - forOp.erase(); + IRRewriter b(forOp.getContext()); + b.eraseOp(forOp); return success(); } @@ -884,15 +904,27 @@ void mlir::affine::getTileableBands( /// Unrolls this loop completely. LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) { std::optional mayBeConstantTripCount = getConstantTripCount(forOp); - if (mayBeConstantTripCount.has_value()) { - uint64_t tripCount = *mayBeConstantTripCount; - if (tripCount == 0) - return success(); - if (tripCount == 1) - return promoteIfSingleIteration(forOp); - return loopUnrollByFactor(forOp, tripCount); - } - return failure(); + std::optional maxMayBeConstantTripCount = + getMaxConstantTripCount(forOp); + + if (!mayBeConstantTripCount.has_value() && + !maxMayBeConstantTripCount.has_value()) + return failure(); + + uint64_t tripCount = *mayBeConstantTripCount; + uint64_t maxTripCount = *maxMayBeConstantTripCount; + + // The values of Trip are all 0, and the invalid loop is deleted. + if (tripCount <= 0 && maxTripCount <= 0) + return removeInvalidLoop(forOp); + + // In special cases, such as in a GPU, only some threads execute this loop. 
+ if (tripCount == 0 && maxTripCount == 1) + return success(); + + if (tripCount == 1 && maxTripCount == 1) + return promoteIfSingleIteration(forOp); + return loopUnrollByFactor(forOp, tripCount); } /// Unrolls this loop by the specified factor or by the trip count (if constant) @@ -1013,8 +1045,11 @@ LogicalResult mlir::affine::loopUnrollByFactor( assert(unrollFactor > 0 && "unroll factor should be positive"); std::optional mayBeConstantTripCount = getConstantTripCount(forOp); + std::optional maxMayBeConstantTripCount = + getMaxConstantTripCount(forOp); if (unrollFactor == 1) { if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 && + maxMayBeConstantTripCount && *maxMayBeConstantTripCount == 1 && failed(promoteIfSingleIteration(forOp))) return failure(); return success(); diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp index d06f10d3137a1..31051ed7e55a2 100644 --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -799,6 +799,26 @@ std::optional LaunchOp::getClusterSizeOperandValues() { return KernelDim3{operands[6], operands[7], operands[8]}; } +Value LaunchOp::getBlockSizeOnAxis(Dimension dimension) { + if (dimension == Dimension::x) + return getBlockSizeX(); + else if (dimension == Dimension::y) + return getBlockSizeY(); + else + return getBlockSizeZ(); +} + +Value LaunchOp::getBlockSizeOnAxis(Value threadId) { + KernelDim3 threadIds = getThreadIds(); + if (threadIds.x == threadId) + return getBlockSizeX(); + else if (threadIds.y == threadId) + return getBlockSizeY(); + else if (threadIds.z == threadId) + return getBlockSizeZ(); + return {}; +} + LogicalResult LaunchOp::verify() { if (!(hasClusterSize()) && (getClusterSizeX() || getClusterSizeY() || getClusterSizeZ())) diff --git a/mlir/test/Dialect/Affine/unroll.mlir b/mlir/test/Dialect/Affine/unroll.mlir index 574e9f41494af..a2bb0b2cac4e3 100644 --- a/mlir/test/Dialect/Affine/unroll.mlir +++ 
b/mlir/test/Dialect/Affine/unroll.mlir @@ -23,6 +23,7 @@ // UNROLL-BY-4-DAG: [[$MAP5:#map[0-9]*]] = affine_map<(d0)[s0] -> (d0 + s0 + 1)> // UNROLL-BY-4-DAG: [[$MAP6:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 * 16 + d1)> // UNROLL-BY-4-DAG: [[$MAP11:#map[0-9]*]] = affine_map<(d0) -> (d0)> +// UNROLL-BY-4-DAG: [[$MAP7:#map[0-9]*]] = affine_map<()[s0] -> (s0 + (((-s0 + 11) ceildiv 2) floordiv 4) * 8)> // UNROLL-FULL-LABEL: func @loop_nest_simplest() { func.func @loop_nest_simplest() { @@ -258,6 +259,89 @@ gpu.module @unroll_full { } } +// UNROLL-FULL-LABEL: func @thread_partial_execution +func.func @thread_partial_execution() { + %0 = arith.constant 0 :index + %1 = arith.constant 2 : index + // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index + gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1) + threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) { + affine.for %iv = %tx to 3 step 2 iter_args(%arg = %0) -> index { + %3 = arith.addi %arg, %0 : index + affine.yield %3 : index + } + // UNROLL-FULL: %{{.*}} = affine.for %{{.*}} = %{{.*}} to 3 step 2 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) { + // UNROLL-FULL: %[[SUM:.*]] = arith.addi %[[ARG]], %[[C0]] : index + // UNROLL-FULL: affine.yield %[[SUM]] : index + // UNROLL-FULL: } + gpu.terminator + } + return +} + +// UNROLL-FULL-LABEL: func @invalid_loop +func.func @invalid_loop() { + %0 = arith.constant 0 :index + %1 = arith.constant 2 : index + gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1) + threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) { + %threadid = gpu.thread_id x + affine.for %iv = %tx to 0 step 2 iter_args(%arg = %0) -> index { + %3 = arith.addi %arg, %0 : index + affine.yield %3 : index + } + gpu.terminator + // UNROLL-FULL-CHECK: %{{.*}} = gpu.thread_id x + // UNROLL-FULL-CHECK: gpu.terminator + } + return +} + +// UNROLL-FULL-LABEL: func @unroll_all_thread +func.func @unroll_all_thread() { + %0 = arith.constant 0 :index 
+ %1 = arith.constant 2 : index + // UNROLL-FULL-CHECK: %[[C0:.*]] = arith.constant 0 : index + gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1) + threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) { + %threadid = gpu.thread_id x + %4 = affine.for %iv = %threadid to 6 step 2 iter_args(%arg = %0) -> index { + %3 = arith.addi %arg, %0 : index + affine.yield %3 : index + } + // UNROLL-FULL-CHECK: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index + // UNROLL-FULL-CHECK: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index + // UNROLL-FULL-CHECK: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index + gpu.terminator + } + return +} + +// UNROLL-FULL-LABEL: func.func @partial_unroll_factor_4 +func.func @partial_unroll_factor_4() { + %0 = arith.constant 0 :index + %1 = arith.constant 2 : index + // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index + gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1) + threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) { + %threadid = gpu.thread_id x + affine.for %iv = %threadid to 9 step 2 iter_args(%arg = %0) -> index { + %3 = arith.addi %arg, %0 : index + affine.yield %3 : index + } + gpu.terminator + } + // UNROLL-FULL: %[[ID:.*]] = gpu.thread_id x + // UNROLL-FULL: affine.for %{{.*}} = %[[ID]] to 9 step 8 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) { + // UNROLL-FULL: %[[SUM_0:.*]] = arith.addi %[[ARG]], %[[C0]] : index + // UNROLL-FULL: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index + // UNROLL-FULL: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index + // UNROLL-FULL: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index + // UNROLL-FULL: affine.yield %[[SUM_3]] : index + // UNROLL-FULL: } + return +} + // SHORT-LABEL: func @loop_nest_outer_unroll() { func.func @loop_nest_outer_unroll() { // SHORT: affine.for %arg0 = 0 to 4 { @@ -701,6 +785,32 @@ func.func @unroll_with_iter_args_and_promotion(%arg0 : f32, %arg1 : f32) -> f32 return 
%sum : f32 } +// UNROLL-BY-4-LABEL: func @gpu_launch_unroll_by_factor_4 +func.func @gpu_launch_unroll_by_factor_4() { + %0 = arith.constant 0 :index + %1 = arith.constant 2 : index + // UNROLL-BY-4: %[[C0:.*]] = arith.constant 0 : index + gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1) + threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) { + %threadid = gpu.thread_id x + affine.for %iv = %threadid to 11 step 2 iter_args(%arg = %0) -> index { + %3 = arith.addi %arg, %0 : index + affine.yield %3 : index + } + gpu.terminator + } + // UNROLL-BY-4: %[[ID:.*]] = gpu.thread_id x + // UNROLL-BY-4: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index + // UNROLL-BY-4: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index + // UNROLL-BY-4: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index + // UNROLL-BY-4: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index + // UNROLL-BY-4: affine.for %[[VAL_20:.*]] = [[$MAP7]](){{\[}}%[[ID]]] to 11 step 2 iter_args(%[[ARG:.*]] = %[[SUM_3]]) -> (index) { + // UNROLL-BY-4: %[[SUM_4:.*]] = arith.addi %[[ARG]], %[[C0]] : index + // UNROLL-BY-4: affine.yield %[[SUM_4]] : index + // UNROLL-BY-4: } + return +} + // UNROLL-FULL: func @unroll_zero_trip_count_case func.func @unroll_zero_trip_count_case() { // CHECK-NEXT: affine.for %{{.*}} = 0 to 0 From c834f4d70494daa5abac945447b6d307d3900bac Mon Sep 17 00:00:00 2001 From: linuxlonelyeagle <2020382038@qq.com> Date: Sat, 22 Feb 2025 17:53:19 +0800 Subject: [PATCH 2/8] delete the feature of remove invalid loops. 
--- .../Dialect/Affine/Analysis/LoopAnalysis.h | 4 +- mlir/include/mlir/Dialect/Affine/LoopUtils.h | 3 - mlir/include/mlir/Dialect/GPU/IR/GPUOps.td | 2 +- mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp | 25 +------ mlir/test/Dialect/Affine/unroll.mlir | 68 +++++++------------ 5 files changed, 30 insertions(+), 72 deletions(-) diff --git a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h index 2bd540b9af2eb..591533d17c960 100644 --- a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h +++ b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h @@ -43,8 +43,8 @@ void getTripCountMapAndOperands(AffineForOp forOp, AffineMap *map, /// constant trip count in non-trivial cases. std::optional getConstantTripCount(AffineForOp forOp); -/// In the GPU, the number of trip of each thread in the loop is inconsistent. -/// This function returns the maximum number of trip. +/// In some scenarios, such as GPU, the number of trip of each thread in the +/// loop is inconsistent. This function returns the maximum number of trip. std::optional getMaxConstantTripCount(AffineForOp forOp); /// Returns the greatest known integral divisor of the trip count. Affine diff --git a/mlir/include/mlir/Dialect/Affine/LoopUtils.h b/mlir/include/mlir/Dialect/Affine/LoopUtils.h index 1d1d6d94d2382..7fe1f6d48ceeb 100644 --- a/mlir/include/mlir/Dialect/Affine/LoopUtils.h +++ b/mlir/include/mlir/Dialect/Affine/LoopUtils.h @@ -86,9 +86,6 @@ LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp, /// was known to have a single iteration. LogicalResult promoteIfSingleIteration(AffineForOp forOp); -/// Eliminate loops that will never actually execute. -LogicalResult removeInvalidLoop(AffineForOp forOp); - /// Promotes all single iteration AffineForOp's in the Function, i.e., moves /// their body into the containing Block. 
void promoteSingleIterationLoops(func::FuncOp f); diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td index 940d47c5ef2c8..fde1ad482ae2d 100644 --- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td @@ -1039,7 +1039,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [ /// Find BlockSize via the BlockArgument of gpu.launch. Value getBlockSizeOnAxis(Value threadId); - /// Find BlockSize via the Dimension Information. + /// Find BlockSize via the Dimension Information. Value getBlockSizeOnAxis(Dimension dimension); }]; diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp index 69ceb0f80095b..b6471ac179b22 100644 --- a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp +++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp @@ -114,23 +114,6 @@ static void replaceIterArgsAndYieldResults(AffineForOp forOp) { std::get<0>(e).replaceAllUsesWith(std::get<1>(e)); } -/// Eliminate loops that will never actually execute -LogicalResult mlir::affine::removeInvalidLoop(AffineForOp forOp) { - std::optional tripCount = getConstantTripCount(forOp); - std::optional maxTripCount = getMaxConstantTripCount(forOp); - if (!tripCount || *tripCount > 0 || !maxTripCount || *maxTripCount > 0) - return failure(); - - auto iterOperands = forOp.getInits(); - auto results = forOp.getResults(); - for (auto [result, operand] : llvm::zip(results, iterOperands)) - result.replaceAllUsesWith(operand); - - IRRewriter b(forOp); - b.eraseOp(forOp); - return success(); -} - /// Promotes the loop body of a forOp to its containing block if the forOp /// was known to have a single iteration. 
LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) { @@ -914,12 +897,8 @@ LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) { uint64_t tripCount = *mayBeConstantTripCount; uint64_t maxTripCount = *maxMayBeConstantTripCount; - // The values of Trip are all 0, and the invalid loop is deleted. - if (tripCount <= 0 && maxTripCount <= 0) - return removeInvalidLoop(forOp); - - // In special cases, such as in a GPU, only some threads execute this loop. - if (tripCount == 0 && maxTripCount == 1) + // Trip equals 0, this loop cannot unroll. + if (tripCount <= 0) return success(); if (tripCount == 1 && maxTripCount == 1) diff --git a/mlir/test/Dialect/Affine/unroll.mlir b/mlir/test/Dialect/Affine/unroll.mlir index a2bb0b2cac4e3..ab73c5ac7e9c4 100644 --- a/mlir/test/Dialect/Affine/unroll.mlir +++ b/mlir/test/Dialect/Affine/unroll.mlir @@ -270,38 +270,20 @@ func.func @thread_partial_execution() { %3 = arith.addi %arg, %0 : index affine.yield %3 : index } - // UNROLL-FULL: %{{.*}} = affine.for %{{.*}} = %{{.*}} to 3 step 2 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) { - // UNROLL-FULL: %[[SUM:.*]] = arith.addi %[[ARG]], %[[C0]] : index - // UNROLL-FULL: affine.yield %[[SUM]] : index - // UNROLL-FULL: } + // UNROLL-FULL: affine.for %{{.*}} = %{{.*}} to 3 step 2 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) { + // UNROLL-FULL-NEXT: %[[SUM:.*]] = arith.addi %[[ARG]], %[[C0]] : index + // UNROLL-FULL-NEXT: affine.yield %[[SUM]] : index + // UNROLL-FULL-NEXT: } gpu.terminator } return } -// UNROLL-FULL-LABEL: func @invalid_loop -func.func @invalid_loop() { - %0 = arith.constant 0 :index - %1 = arith.constant 2 : index - gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1) - threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) { - %threadid = gpu.thread_id x - affine.for %iv = %tx to 0 step 2 iter_args(%arg = %0) -> index { - %3 = arith.addi %arg, %0 : index - affine.yield %3 : index - } - gpu.terminator - // 
UNROLL-FULL-CHECK: %{{.*}} = gpu.thread_id x - // UNROLL-FULL-CHECK: gpu.terminator - } - return -} - // UNROLL-FULL-LABEL: func @unroll_all_thread func.func @unroll_all_thread() { %0 = arith.constant 0 :index %1 = arith.constant 2 : index - // UNROLL-FULL-CHECK: %[[C0:.*]] = arith.constant 0 : index + // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1) threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) { %threadid = gpu.thread_id x @@ -309,19 +291,19 @@ func.func @unroll_all_thread() { %3 = arith.addi %arg, %0 : index affine.yield %3 : index } - // UNROLL-FULL-CHECK: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index - // UNROLL-FULL-CHECK: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index - // UNROLL-FULL-CHECK: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index + // UNROLL-FULL: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index + // UNROLL-FULL-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index + // UNROLL-FULL-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index gpu.terminator } return } -// UNROLL-FULL-LABEL: func.func @partial_unroll_factor_4 +// UNROLL-FULL-LABEL: func.func @partial_unroll_factor_4 func.func @partial_unroll_factor_4() { %0 = arith.constant 0 :index %1 = arith.constant 2 : index - // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index + // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1) threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) { %threadid = gpu.thread_id x @@ -332,13 +314,13 @@ func.func @partial_unroll_factor_4() { gpu.terminator } // UNROLL-FULL: %[[ID:.*]] = gpu.thread_id x - // UNROLL-FULL: affine.for %{{.*}} = %[[ID]] to 9 step 8 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) { - // UNROLL-FULL: %[[SUM_0:.*]] = arith.addi %[[ARG]], %[[C0]] : index - // UNROLL-FULL: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index 
- // UNROLL-FULL: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index - // UNROLL-FULL: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index - // UNROLL-FULL: affine.yield %[[SUM_3]] : index - // UNROLL-FULL: } + // UNROLL-FULL-NEXT: affine.for %{{.*}} = %[[ID]] to 9 step 8 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) { + // UNROLL-FULL-NEXT: %[[SUM_0:.*]] = arith.addi %[[ARG]], %[[C0]] : index + // UNROLL-FULL-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index + // UNROLL-FULL-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index + // UNROLL-FULL-NEXT: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index + // UNROLL-FULL-NEXT: affine.yield %[[SUM_3]] : index + // UNROLL-FULL-NEXT: } return } @@ -800,14 +782,14 @@ func.func @gpu_launch_unroll_by_factor_4() { gpu.terminator } // UNROLL-BY-4: %[[ID:.*]] = gpu.thread_id x - // UNROLL-BY-4: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index - // UNROLL-BY-4: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index - // UNROLL-BY-4: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index - // UNROLL-BY-4: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index - // UNROLL-BY-4: affine.for %[[VAL_20:.*]] = [[$MAP7]](){{\[}}%[[ID]]] to 11 step 2 iter_args(%[[ARG:.*]] = %[[SUM_3]]) -> (index) { - // UNROLL-BY-4: %[[SUM_4:.*]] = arith.addi %[[ARG]], %[[C0]] : index - // UNROLL-BY-4: affine.yield %[[SUM_4]] : index - // UNROLL-BY-4: } + // UNROLL-BY-4-NEXT: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index + // UNROLL-BY-4-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index + // UNROLL-BY-4-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index + // UNROLL-BY-4-NEXT: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index + // UNROLL-BY-4-NEXT: affine.for %[[VAL_20:.*]] = [[$MAP7]](){{\[}}%[[ID]]] to 11 step 2 iter_args(%[[ARG:.*]] = %[[SUM_3]]) -> (index) { + // UNROLL-BY-4-NEXT: %[[SUM_4:.*]] = arith.addi %[[ARG]], %[[C0]] : index + // UNROLL-BY-4-NEXT: affine.yield %[[SUM_4]] : index + 
// UNROLL-BY-4-NEXT: } return } From e865351424ee36285133ee14ceccd924ea21dda3 Mon Sep 17 00:00:00 2001 From: linuxlonelyeagle <2020382038@qq.com> Date: Wed, 26 Feb 2025 10:55:07 +0800 Subject: [PATCH 3/8] use IntegerRangeAnalysis and update launchOp::inferResultRanges. --- .../Dialect/Affine/Analysis/LoopAnalysis.h | 7 +- mlir/include/mlir/Dialect/GPU/IR/GPUOps.td | 6 -- .../Dialect/Affine/Analysis/LoopAnalysis.cpp | 90 ++++++++++--------- mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp | 7 +- mlir/lib/Dialect/GPU/IR/GPUDialect.cpp | 20 ----- .../GPU/IR/InferIntRangeInterfaceImpls.cpp | 34 ++++--- 6 files changed, 74 insertions(+), 90 deletions(-) diff --git a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h index 591533d17c960..f5b6794d42794 100644 --- a/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h +++ b/mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h @@ -43,9 +43,10 @@ void getTripCountMapAndOperands(AffineForOp forOp, AffineMap *map, /// constant trip count in non-trivial cases. std::optional getConstantTripCount(AffineForOp forOp); -/// In some scenarios, such as GPU, the number of trip of each thread in the -/// loop is inconsistent. This function returns the maximum number of trip. -std::optional getMaxConstantTripCount(AffineForOp forOp); +/// Returns the maximum trip count when the operand of forOp has a range. If the +/// operand of forOp is a constant, the return value is the same as +/// `getConstantTripCount`. +std::optional getUpperBoundOnTripCount(AffineForOp forOp); /// Returns the greatest known integral divisor of the trip count. 
Affine /// expression analysis is used (indirectly through getTripCount), and diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td index fde1ad482ae2d..2b1ce573effd0 100644 --- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td @@ -1035,12 +1035,6 @@ def GPU_LaunchOp : GPU_Op<"launch", [ static StringRef getNumWorkgroupAttributionsAttrName() { return "workgroup_attributions"; } - - /// Find BlockSize via the BlockArgument of gpu.launch. - Value getBlockSizeOnAxis(Value threadId); - - /// Find BlockSize via the Dimension Information. - Value getBlockSizeOnAxis(Dimension dimension); }]; let hasCanonicalizer = 1; diff --git a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp index 15a5376fa922e..5ed11d8bde029 100644 --- a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp +++ b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp @@ -12,13 +12,15 @@ #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h" +#include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h" +#include "mlir/Analysis/DataFlow/IntegerRangeAnalysis.h" #include "mlir/Analysis/SliceAnalysis.h" #include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h" #include "mlir/Dialect/Affine/Analysis/AffineStructures.h" #include "mlir/Dialect/Affine/Analysis/NestedMatcher.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Affine/IR/AffineValueMap.h" -#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Interfaces/FunctionInterfaces.h" #include "llvm/Support/MathExtras.h" #include "llvm/ADT/DenseSet.h" @@ -31,6 +33,7 @@ using namespace mlir; using namespace mlir::affine; +using namespace mlir::dataflow; #define DEBUG_TYPE "affine-loop-analysis" @@ -85,48 +88,54 @@ void mlir::affine::getTripCountMapAndOperands( tripCountValueMap.getOperands().end()); } -/// Replace thread_id with its maximum value, if `replaceWithZero` is true, -/// thread_id will be replaced 
by its minimum value 0. -static void replaceGPUOperands(AffineForOp forOp, - SmallVectorImpl &operands, - SmallVectorImpl &symReplacements, - unsigned numDim, bool replaceWithZero = false) { - auto launchOp = forOp->getParentOfType(); - if (!launchOp) +/// By running `IntegerRangeAnalysis` to get the ranges of operand, then fill +/// the `symReplacements` with range. If `replaceByMin` is set to true, +/// construct `replacement` using the smallest value.By default, the largest +/// value will be used for constructing `replacement`. +static void replaceOperandByRange(AffineForOp forOp, + SmallVectorImpl &operands, + SmallVectorImpl &symReplacements, + unsigned numDim, bool replaceByMin = false) { + DataFlowSolver solver; + solver.load(); + solver.load(); + if (failed(solver.initializeAndRun( + forOp->getParentOfType()))) return; - // `b` is only used to create `AffineExpr`. + // `b` is used to create affineExpr Builder b(forOp.getContext()); - unsigned idx = 0; - for (unsigned i = numDim, e = operands.size(); i < e; ++i) { Value operand = operands[i]; - if (Value blockSize = launchOp.getBlockSizeOnAxis(operand)) { - operands[i] = blockSize; - if (!replaceWithZero) - symReplacements.push_back(b.getAffineSymbolExpr(idx++) - 1); - else - symReplacements.push_back(b.getAffineConstantExpr(0)); + auto lattice = + solver.lookupState(operand); + if (!lattice) { + symReplacements.push_back(b.getAffineSymbolExpr(i - numDim)); continue; } - Operation *defOp = operand.getDefiningOp(); - if (!defOp) { - ++idx; + if (lattice->getValue().isUninitialized()) { + symReplacements.push_back(b.getAffineSymbolExpr(i - numDim)); continue; } - if (auto threadIdOp = mlir::dyn_cast(defOp)) { - gpu::Dimension dimension = threadIdOp.getDimension(); - operands[i] = launchOp.getBlockSizeOnAxis(dimension); - if (!replaceWithZero) - symReplacements.push_back(b.getAffineSymbolExpr(idx++) - 1); - else - symReplacements.push_back(b.getAffineConstantExpr(0)); + ConstantIntRanges range = 
lattice->getValue().getValue(); + APInt max = range.smax(); + APInt min = range.smin(); + unsigned bitNums = max.getBitWidth(); + + if (APInt::getSignedMaxValue(bitNums) == max && + APInt::getSignedMinValue(bitNums) == min) { + symReplacements.push_back(b.getAffineSymbolExpr(i - numDim)); continue; } - ++idx; + + if (!replaceByMin) + symReplacements.push_back(b.getAffineConstantExpr(max.getZExtValue())); + else + symReplacements.push_back(b.getAffineConstantExpr(min.getZExtValue())); } + return; } /// Take the min if all trip counts are constant. @@ -158,19 +167,17 @@ std::optional mlir::affine::getConstantTripCount(AffineForOp forOp) { if (!map) return std::nullopt; SmallVector symReplacements; - replaceGPUOperands(forOp, operands, symReplacements, map.getNumDims()); + replaceOperandByRange(forOp, operands, symReplacements, map.getNumDims()); map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(), map.getNumSymbols()); - affine::AffineValueMap valueMap(map, operands); - (void)valueMap.canonicalize(); - map = valueMap.getAffineMap(); return getConstantTripCountFromAffineMap(map); } -/// In some scenarios, such as GPU, the number of trip of each thread in the -/// loop is inconsistent. This function returns the maximum number of trip. +/// Returns the maximum trip count when the operand of forOp has a range. If the +/// operand of forOp is a constant, the return value is the same as +/// `getConstantTripCount`. 
std::optional -mlir::affine::getMaxConstantTripCount(AffineForOp forOp) { +mlir::affine::getUpperBoundOnTripCount(AffineForOp forOp) { SmallVector operands; AffineMap map; getTripCountMapAndOperands(forOp, &map, &operands); @@ -178,12 +185,10 @@ mlir::affine::getMaxConstantTripCount(AffineForOp forOp) { if (!map) return std::nullopt; SmallVector symReplacements; - replaceGPUOperands(forOp, operands, symReplacements, map.getNumDims(), true); + replaceOperandByRange(forOp, operands, symReplacements, map.getNumDims(), + true); map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(), map.getNumSymbols()); - affine::AffineValueMap valueMap(map, operands); - (void)valueMap.canonicalize(); - map = valueMap.getAffineMap(); return getConstantTripCountFromAffineMap(map); } @@ -198,12 +203,9 @@ uint64_t mlir::affine::getLargestDivisorOfTripCount(AffineForOp forOp) { if (!map) return 1; SmallVector symReplacements; - replaceGPUOperands(forOp, operands, symReplacements, map.getNumDims()); + replaceOperandByRange(forOp, operands, symReplacements, map.getNumDims()); map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(), map.getNumSymbols()); - affine::AffineValueMap valueMap(map, operands); - (void)valueMap.canonicalize(); - map = valueMap.getAffineMap(); // The largest divisor of the trip count is the GCD of the individual largest // divisors. 
assert(map.getNumResults() >= 1 && "expected one or more results"); diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp index b6471ac179b22..a344bc8f9bffe 100644 --- a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp +++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp @@ -17,7 +17,6 @@ #include "mlir/Dialect/Affine/IR/AffineValueMap.h" #include "mlir/Dialect/Affine/Utils.h" #include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/IR/IRMapping.h" @@ -118,7 +117,7 @@ static void replaceIterArgsAndYieldResults(AffineForOp forOp) { /// was known to have a single iteration. LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) { std::optional tripCount = getConstantTripCount(forOp); - std::optional maxTripCount = getMaxConstantTripCount(forOp); + std::optional maxTripCount = getUpperBoundOnTripCount(forOp); if (!tripCount || *tripCount != 1 || !maxTripCount || *maxTripCount != 1) return failure(); @@ -888,7 +887,7 @@ void mlir::affine::getTileableBands( LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) { std::optional mayBeConstantTripCount = getConstantTripCount(forOp); std::optional maxMayBeConstantTripCount = - getMaxConstantTripCount(forOp); + getUpperBoundOnTripCount(forOp); if (!mayBeConstantTripCount.has_value() && !maxMayBeConstantTripCount.has_value()) @@ -1025,7 +1024,7 @@ LogicalResult mlir::affine::loopUnrollByFactor( std::optional mayBeConstantTripCount = getConstantTripCount(forOp); std::optional maxMayBeConstantTripCount = - getMaxConstantTripCount(forOp); + getUpperBoundOnTripCount(forOp); if (unrollFactor == 1) { if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 && maxMayBeConstantTripCount && *maxMayBeConstantTripCount == 1 && diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp index 
31051ed7e55a2..d06f10d3137a1 100644 --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -799,26 +799,6 @@ std::optional LaunchOp::getClusterSizeOperandValues() { return KernelDim3{operands[6], operands[7], operands[8]}; } -Value LaunchOp::getBlockSizeOnAxis(Dimension dimension) { - if (dimension == Dimension::x) - return getBlockSizeX(); - else if (dimension == Dimension::y) - return getBlockSizeY(); - else - return getBlockSizeZ(); -} - -Value LaunchOp::getBlockSizeOnAxis(Value threadId) { - KernelDim3 threadIds = getThreadIds(); - if (threadIds.x == threadId) - return getBlockSizeX(); - else if (threadIds.y == threadId) - return getBlockSizeY(); - else if (threadIds.z == threadId) - return getBlockSizeZ(); - return {}; -} - LogicalResult LaunchOp::verify() { if (!(hasClusterSize()) && (getClusterSizeX() || getClusterSizeY() || getClusterSizeZ())) diff --git a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp index f5e30a278f06b..f62d01d719633 100644 --- a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp +++ b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp @@ -250,26 +250,34 @@ void SubgroupSizeOp::inferResultRanges(ArrayRef, void LaunchOp::inferResultRanges(ArrayRef argRanges, SetIntRangeFn setResultRange) { auto setRange = [&](const ConstantIntRanges &argRange, Value dimResult, - Value idxResult) { + Value idxResult, Value size) { if (argRange.umin().getBitWidth() != IndexType::kInternalStorageBitWidth) return; - ConstantIntRanges dimRange = - argRange.intersection(getIndexRange(1, kMaxDim)); - setResultRange(dimResult, dimRange); - ConstantIntRanges idxRange = - getIndexRange(0, dimRange.umax().getZExtValue() - 1); - setResultRange(idxResult, idxRange); + APInt sizeInt; + if (matchPattern(size, m_ConstantInt(&sizeInt))) { + ConstantIntRanges dimRange = ConstantIntRanges::constant(sizeInt); + setResultRange(dimResult, dimRange); + 
ConstantIntRanges idxRange = getIndexRange(0, sizeInt.getZExtValue() - 1); + setResultRange(idxResult, idxRange); + } else { + ConstantIntRanges dimRange = + argRange.intersection(getIndexRange(1, kMaxDim)); + setResultRange(dimResult, dimRange); + ConstantIntRanges idxRange = + getIndexRange(0, dimRange.umax().getZExtValue() - 1); + setResultRange(idxResult, idxRange); + } }; argRanges = argRanges.drop_front(getAsyncDependencies().size()); KernelDim3 gridDims = getGridSize(); KernelDim3 blockIds = getBlockIds(); - setRange(argRanges[0], gridDims.x, blockIds.x); - setRange(argRanges[1], gridDims.y, blockIds.y); - setRange(argRanges[2], gridDims.z, blockIds.z); + setRange(argRanges[0], gridDims.x, blockIds.x, getGridSizeX()); + setRange(argRanges[1], gridDims.y, blockIds.y, getGridSizeY()); + setRange(argRanges[2], gridDims.z, blockIds.z, getGridSizeZ()); KernelDim3 blockDims = getBlockSize(); KernelDim3 threadIds = getThreadIds(); - setRange(argRanges[3], blockDims.x, threadIds.x); - setRange(argRanges[4], blockDims.y, threadIds.y); - setRange(argRanges[5], blockDims.z, threadIds.z); + setRange(argRanges[3], blockDims.x, threadIds.x, getBlockSizeX()); + setRange(argRanges[4], blockDims.y, threadIds.y, getBlockSizeY()); + setRange(argRanges[5], blockDims.z, threadIds.z, getBlockSizeZ()); } From 0b30c4e9d9747dd1040280a471df17389eed00cb Mon Sep 17 00:00:00 2001 From: linuxlonelyeagle <2020382038@qq.com> Date: Fri, 28 Feb 2025 20:45:38 +0800 Subject: [PATCH 4/8] use ValueBoundsOpInterface. 
--- .../Dialect/Affine/Analysis/LoopAnalysis.cpp | 117 ++++++------------ .../GPU/IR/InferIntRangeInterfaceImpls.cpp | 34 ++--- .../lib/Interfaces/ValueBoundsOpInterface.cpp | 3 +- 3 files changed, 54 insertions(+), 100 deletions(-) diff --git a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp index 5ed11d8bde029..bcb31db6b1a93 100644 --- a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp +++ b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp @@ -12,8 +12,6 @@ #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h" -#include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h" -#include "mlir/Analysis/DataFlow/IntegerRangeAnalysis.h" #include "mlir/Analysis/SliceAnalysis.h" #include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h" #include "mlir/Dialect/Affine/Analysis/AffineStructures.h" @@ -21,6 +19,7 @@ #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Affine/IR/AffineValueMap.h" #include "mlir/Interfaces/FunctionInterfaces.h" +#include "mlir/Interfaces/ValueBoundsOpInterface.h" #include "llvm/Support/MathExtras.h" #include "llvm/ADT/DenseSet.h" @@ -33,7 +32,6 @@ using namespace mlir; using namespace mlir::affine; -using namespace mlir::dataflow; #define DEBUG_TYPE "affine-loop-analysis" @@ -88,69 +86,37 @@ void mlir::affine::getTripCountMapAndOperands( tripCountValueMap.getOperands().end()); } -/// By running `IntegerRangeAnalysis` to get the ranges of operand, then fill -/// the `symReplacements` with range. If `replaceByMin` is set to true, -/// construct `replacement` using the smallest value.By default, the largest -/// value will be used for constructing `replacement`. 
-static void replaceOperandByRange(AffineForOp forOp, - SmallVectorImpl &operands, - SmallVectorImpl &symReplacements, - unsigned numDim, bool replaceByMin = false) { - DataFlowSolver solver; - solver.load(); - solver.load(); - if (failed(solver.initializeAndRun( - forOp->getParentOfType()))) - return; - - // `b` is used to create affineExpr - Builder b(forOp.getContext()); - for (unsigned i = numDim, e = operands.size(); i < e; ++i) { - Value operand = operands[i]; - auto lattice = - solver.lookupState(operand); - if (!lattice) { - symReplacements.push_back(b.getAffineSymbolExpr(i - numDim)); - continue; - } - - if (lattice->getValue().isUninitialized()) { - symReplacements.push_back(b.getAffineSymbolExpr(i - numDim)); - continue; - } - - ConstantIntRanges range = lattice->getValue().getValue(); - APInt max = range.smax(); - APInt min = range.smin(); - unsigned bitNums = max.getBitWidth(); - - if (APInt::getSignedMaxValue(bitNums) == max && - APInt::getSignedMinValue(bitNums) == min) { - symReplacements.push_back(b.getAffineSymbolExpr(i - numDim)); - continue; - } - - if (!replaceByMin) - symReplacements.push_back(b.getAffineConstantExpr(max.getZExtValue())); - else - symReplacements.push_back(b.getAffineConstantExpr(min.getZExtValue())); - } - return; -} - /// Take the min if all trip counts are constant. 
static std::optional -getConstantTripCountFromAffineMap(AffineMap map) { +getConstantTripCountFromAffineMap(AffineMap map, + SmallVectorImpl &operands, + presburger::BoundType type) { std::optional tripCount; for (auto resultExpr : map.getResults()) { - auto constExpr = dyn_cast(resultExpr); - if (!constExpr) + AffineMap subMap = + AffineMap::get(map.getNumDims(), map.getNumSymbols(), resultExpr); + ValueBoundsConstraintSet::Variable var(subMap, operands); + auto lbBound = ValueBoundsConstraintSet::computeConstantBound( + mlir::presburger::BoundType::LB, var); + auto ubBound = ValueBoundsConstraintSet::computeConstantBound( + mlir::presburger::BoundType::UB, var, nullptr, true); + if (failed(lbBound) || failed(ubBound)) return std::nullopt; - if (tripCount.has_value()) - tripCount = - std::min(*tripCount, static_cast(constExpr.getValue())); - else - tripCount = constExpr.getValue(); + if (type == presburger::BoundType::LB) { + if (tripCount.has_value()) + tripCount = + std::min(*tripCount, static_cast(lbBound.value())); + else + tripCount = lbBound.value(); + } else if (type == presburger::BoundType::UB) { + if (tripCount.has_value()) + tripCount = + std::min(*tripCount, static_cast(ubBound.value())); + else + tripCount = ubBound.value(); + } else { + return std::nullopt; + } } return tripCount; } @@ -166,11 +132,8 @@ std::optional mlir::affine::getConstantTripCount(AffineForOp forOp) { if (!map) return std::nullopt; - SmallVector symReplacements; - replaceOperandByRange(forOp, operands, symReplacements, map.getNumDims()); - map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(), - map.getNumSymbols()); - return getConstantTripCountFromAffineMap(map); + return getConstantTripCountFromAffineMap(map, operands, + presburger::BoundType::LB); } /// Returns the maximum trip count when the operand of forOp has a range. 
If the @@ -184,12 +147,8 @@ mlir::affine::getUpperBoundOnTripCount(AffineForOp forOp) { if (!map) return std::nullopt; - SmallVector symReplacements; - replaceOperandByRange(forOp, operands, symReplacements, map.getNumDims(), - true); - map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(), - map.getNumSymbols()); - return getConstantTripCountFromAffineMap(map); + return getConstantTripCountFromAffineMap(map, operands, + presburger::BoundType::UB); } /// Returns the greatest known integral divisor of the trip count. Affine @@ -202,18 +161,20 @@ uint64_t mlir::affine::getLargestDivisorOfTripCount(AffineForOp forOp) { if (!map) return 1; - SmallVector symReplacements; - replaceOperandByRange(forOp, operands, symReplacements, map.getNumDims()); - map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(), - map.getNumSymbols()); + // The largest divisor of the trip count is the GCD of the individual largest // divisors. assert(map.getNumResults() >= 1 && "expected one or more results"); std::optional gcd; for (auto resultExpr : map.getResults()) { uint64_t thisGcd; - if (auto constExpr = dyn_cast(resultExpr)) { - uint64_t tripCount = constExpr.getValue(); + AffineMap subMap = + AffineMap::get(map.getNumDims(), map.getNumSymbols(), resultExpr); + ValueBoundsConstraintSet::Variable var(subMap, operands); + auto lbBound = ValueBoundsConstraintSet::computeConstantBound( + mlir::presburger::BoundType::LB, var); + if (!failed(lbBound)) { + uint64_t tripCount = lbBound.value(); // 0 iteration loops (greatest divisor is 2^64 - 1). 
if (tripCount == 0) thisGcd = std::numeric_limits::max(); diff --git a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp index f62d01d719633..f5e30a278f06b 100644 --- a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp +++ b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp @@ -250,34 +250,26 @@ void SubgroupSizeOp::inferResultRanges(ArrayRef, void LaunchOp::inferResultRanges(ArrayRef argRanges, SetIntRangeFn setResultRange) { auto setRange = [&](const ConstantIntRanges &argRange, Value dimResult, - Value idxResult, Value size) { + Value idxResult) { if (argRange.umin().getBitWidth() != IndexType::kInternalStorageBitWidth) return; - APInt sizeInt; - if (matchPattern(size, m_ConstantInt(&sizeInt))) { - ConstantIntRanges dimRange = ConstantIntRanges::constant(sizeInt); - setResultRange(dimResult, dimRange); - ConstantIntRanges idxRange = getIndexRange(0, sizeInt.getZExtValue() - 1); - setResultRange(idxResult, idxRange); - } else { - ConstantIntRanges dimRange = - argRange.intersection(getIndexRange(1, kMaxDim)); - setResultRange(dimResult, dimRange); - ConstantIntRanges idxRange = - getIndexRange(0, dimRange.umax().getZExtValue() - 1); - setResultRange(idxResult, idxRange); - } + ConstantIntRanges dimRange = + argRange.intersection(getIndexRange(1, kMaxDim)); + setResultRange(dimResult, dimRange); + ConstantIntRanges idxRange = + getIndexRange(0, dimRange.umax().getZExtValue() - 1); + setResultRange(idxResult, idxRange); }; argRanges = argRanges.drop_front(getAsyncDependencies().size()); KernelDim3 gridDims = getGridSize(); KernelDim3 blockIds = getBlockIds(); - setRange(argRanges[0], gridDims.x, blockIds.x, getGridSizeX()); - setRange(argRanges[1], gridDims.y, blockIds.y, getGridSizeY()); - setRange(argRanges[2], gridDims.z, blockIds.z, getGridSizeZ()); + setRange(argRanges[0], gridDims.x, blockIds.x); + setRange(argRanges[1], gridDims.y, blockIds.y); + setRange(argRanges[2], gridDims.z, 
blockIds.z); KernelDim3 blockDims = getBlockSize(); KernelDim3 threadIds = getThreadIds(); - setRange(argRanges[3], blockDims.x, threadIds.x, getBlockSizeX()); - setRange(argRanges[4], blockDims.y, threadIds.y, getBlockSizeY()); - setRange(argRanges[5], blockDims.z, threadIds.z, getBlockSizeZ()); + setRange(argRanges[3], blockDims.x, threadIds.x); + setRange(argRanges[4], blockDims.y, threadIds.y); + setRange(argRanges[5], blockDims.z, threadIds.z); } diff --git a/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp b/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp index 87f883c2e6485..f4408fa9417b5 100644 --- a/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp +++ b/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp @@ -646,7 +646,8 @@ FailureOr ValueBoundsConstraintSet::computeConstantBound( // Compute constant bound for `valueDim`. int64_t ubAdjustment = closedUB ? 0 : 1; if (auto bound = cstr.cstr.getConstantBound64(type, pos)) - return type == BoundType::UB ? *bound + ubAdjustment : *bound; + if (bound.has_value()) + return type == BoundType::UB ? *bound + ubAdjustment : *bound; return failure(); } From 82e48ee7ed45cda6b0d410e59097c02bead73a58 Mon Sep 17 00:00:00 2001 From: linuxlonelyeagle <2020382038@qq.com> Date: Tue, 18 Mar 2025 10:54:27 +0800 Subject: [PATCH 5/8] update getKnownTripCountBound function name and rename tripCount to minTripCount. --- mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp | 11 ++++------- mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp | 5 +++-- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp index f1a723c919f7e..c8f38cfd8c328 100644 --- a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp +++ b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp @@ -216,9 +216,8 @@ void mlir::affine::getTripCountMapAndOperands( /// Take the min if all trip counts are constant. 
static std::optional -getConstantTripCountFromAffineMap(AffineMap map, - SmallVectorImpl &operands, - presburger::BoundType type) { +getKnownTripCountBound(AffineMap map, SmallVectorImpl &operands, + presburger::BoundType type) { std::optional tripCount; for (auto resultExpr : map.getResults()) { AffineMap subMap = @@ -260,8 +259,7 @@ std::optional mlir::affine::getConstantTripCount(AffineForOp forOp) { if (!map) return std::nullopt; - return getConstantTripCountFromAffineMap(map, operands, - presburger::BoundType::LB); + return getKnownTripCountBound(map, operands, presburger::BoundType::LB); } /// Returns the maximum trip count when the operand of forOp has a range. If the @@ -275,8 +273,7 @@ mlir::affine::getUpperBoundOnTripCount(AffineForOp forOp) { if (!map) return std::nullopt; - return getConstantTripCountFromAffineMap(map, operands, - presburger::BoundType::UB); + return getKnownTripCountBound(map, operands, presburger::BoundType::UB); } /// Returns the greatest known integral divisor of the trip count. Affine diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp index 37e58b1332712..efbc87ec740bb 100644 --- a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp +++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp @@ -116,9 +116,10 @@ static void replaceIterArgsAndYieldResults(AffineForOp forOp) { /// Promotes the loop body of a forOp to its containing block if the forOp /// was known to have a single iteration. LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) { - std::optional tripCount = getConstantTripCount(forOp); + std::optional minTripCount = getConstantTripCount(forOp); std::optional maxTripCount = getUpperBoundOnTripCount(forOp); - if (!tripCount || *tripCount != 1 || !maxTripCount || *maxTripCount != 1) + if (!minTripCount || *minTripCount != 1 || !maxTripCount || + *maxTripCount != 1) return failure(); // TODO: extend this for arbitrary affine bounds. 
From e31ff46584802cc972160f8b1feda6b2f6a5948f Mon Sep 17 00:00:00 2001 From: linuxlonelyeagle <2020382038@qq.com> Date: Wed, 9 Apr 2025 11:47:26 +0800 Subject: [PATCH 6/8] improve doc and nit. --- .../Dialect/Affine/Analysis/LoopAnalysis.cpp | 22 ++++---- mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp | 9 ++- mlir/test/Dialect/Affine/unroll.mlir | 56 +++++++++---------- 3 files changed, 44 insertions(+), 43 deletions(-) diff --git a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp index c8f38cfd8c328..133d7c754589b 100644 --- a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp +++ b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp @@ -214,19 +214,21 @@ void mlir::affine::getTripCountMapAndOperands( tripCountValueMap.getOperands().end()); } -/// Take the min if all trip counts are constant. +/// Evaluates `map` on the given `operands` to obtain the trip count, which +/// may span a range of values when any operand has a range. If `type` is +/// BoundType::LB, the minimum trip count is returned; if `type` is +/// BoundType::UB, the maximum trip count is returned.
static std::optional getKnownTripCountBound(AffineMap map, SmallVectorImpl &operands, presburger::BoundType type) { std::optional tripCount; - for (auto resultExpr : map.getResults()) { - AffineMap subMap = - AffineMap::get(map.getNumDims(), map.getNumSymbols(), resultExpr); + for (unsigned i = 0, e = map.getResults().size(); i < e; ++i) { + AffineMap subMap = map.getSubMap(i); ValueBoundsConstraintSet::Variable var(subMap, operands); auto lbBound = ValueBoundsConstraintSet::computeConstantBound( mlir::presburger::BoundType::LB, var); auto ubBound = ValueBoundsConstraintSet::computeConstantBound( - mlir::presburger::BoundType::UB, var, nullptr, true); + mlir::presburger::BoundType::UB, var, nullptr, /*closedUB*/ true); if (failed(lbBound) || failed(ubBound)) return std::nullopt; if (type == presburger::BoundType::LB) { @@ -238,7 +240,7 @@ getKnownTripCountBound(AffineMap map, SmallVectorImpl &operands, } else if (type == presburger::BoundType::UB) { if (tripCount.has_value()) tripCount = - std::min(*tripCount, static_cast(ubBound.value())); + std::max(*tripCount, static_cast(ubBound.value())); else tripCount = ubBound.value(); } else { @@ -253,7 +255,7 @@ getKnownTripCountBound(AffineMap map, SmallVectorImpl &operands, /// getTripCount) and is able to determine constant trip count in non-trivial /// cases. std::optional mlir::affine::getConstantTripCount(AffineForOp forOp) { - SmallVector operands; + SmallVector operands; AffineMap map; getTripCountMapAndOperands(forOp, &map, &operands); @@ -262,12 +264,12 @@ std::optional mlir::affine::getConstantTripCount(AffineForOp forOp) { return getKnownTripCountBound(map, operands, presburger::BoundType::LB); } -/// Returns the maximum trip count when the operand of forOp has a range. If the -/// operand of forOp is a constant, the return value is the same as +/// Returns the maximum trip count when the operand of forOp has a range. 
+/// If the operand of forOp is a constant, the return value is the same as /// `getConstantTripCount`. std::optional mlir::affine::getUpperBoundOnTripCount(AffineForOp forOp) { - SmallVector operands; + SmallVector operands; AffineMap map; getTripCountMapAndOperands(forOp, &map, &operands); diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp index efbc87ec740bb..eb7232b9a97d6 100644 --- a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp +++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp @@ -162,8 +162,7 @@ LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) { forOp.getBody()->back().erase(); parentBlock->getOperations().splice(Block::iterator(forOp), forOp.getBody()->getOperations()); - IRRewriter b(forOp.getContext()); - b.eraseOp(forOp); + forOp.erase(); return success(); } @@ -895,14 +894,14 @@ LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) { return failure(); uint64_t tripCount = *mayBeConstantTripCount; - uint64_t maxTripCount = *maxMayBeConstantTripCount; // Trip equals 0, this loop cannot unroll. 
if (tripCount <= 0) return success(); - if (tripCount == 1 && maxTripCount == 1) - return promoteIfSingleIteration(forOp); + if (succeeded(promoteIfSingleIteration(forOp))) + return success(); + return loopUnrollByFactor(forOp, tripCount); } diff --git a/mlir/test/Dialect/Affine/unroll.mlir b/mlir/test/Dialect/Affine/unroll.mlir index ab73c5ac7e9c4..857a25f5d8567 100644 --- a/mlir/test/Dialect/Affine/unroll.mlir +++ b/mlir/test/Dialect/Affine/unroll.mlir @@ -261,14 +261,14 @@ gpu.module @unroll_full { // UNROLL-FULL-LABEL: func @thread_partial_execution func.func @thread_partial_execution() { - %0 = arith.constant 0 :index - %1 = arith.constant 2 : index + %c0 = arith.constant 0 :index + %c2 = arith.constant 2 : index // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index - gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1) - threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) { - affine.for %iv = %tx to 3 step 2 iter_args(%arg = %0) -> index { - %3 = arith.addi %arg, %0 : index - affine.yield %3 : index + gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c2) + threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c2, %sz_tz = %c2) { + affine.for %iv = %tx to 3 step 2 iter_args(%arg = %c0) -> index { + %sum = arith.addi %arg, %c0 : index + affine.yield %sum : index } // UNROLL-FULL: affine.for %{{.*}} = %{{.*}} to 3 step 2 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) { // UNROLL-FULL-NEXT: %[[SUM:.*]] = arith.addi %[[ARG]], %[[C0]] : index @@ -281,15 +281,15 @@ func.func @thread_partial_execution() { // UNROLL-FULL-LABEL: func @unroll_all_thread func.func @unroll_all_thread() { - %0 = arith.constant 0 :index - %1 = arith.constant 2 : index + %c0 = arith.constant 0 :index + %c2 = arith.constant 2 : index // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index - gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1) - threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) { + 
gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c2) + threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c2, %sz_tz = %c2) { %threadid = gpu.thread_id x - %4 = affine.for %iv = %threadid to 6 step 2 iter_args(%arg = %0) -> index { - %3 = arith.addi %arg, %0 : index - affine.yield %3 : index + affine.for %iv = %threadid to 6 step 2 iter_args(%arg = %c0) -> index { + %sum = arith.addi %arg, %c0 : index + affine.yield %sum : index } // UNROLL-FULL: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index // UNROLL-FULL-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index @@ -301,15 +301,15 @@ func.func @unroll_all_thread() { // UNROLL-FULL-LABEL: func.func @partial_unroll_factor_4 func.func @partial_unroll_factor_4() { - %0 = arith.constant 0 :index - %1 = arith.constant 2 : index + %c0 = arith.constant 0 :index + %c2 = arith.constant 2 : index // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index - gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1) - threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) { + gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c2) + threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c2, %sz_tz = %c2) { %threadid = gpu.thread_id x - affine.for %iv = %threadid to 9 step 2 iter_args(%arg = %0) -> index { - %3 = arith.addi %arg, %0 : index - affine.yield %3 : index + affine.for %iv = %threadid to 9 step 2 iter_args(%arg = %c0) -> index { + %sum = arith.addi %arg, %c0 : index + affine.yield %sum : index } gpu.terminator } @@ -769,15 +769,15 @@ func.func @unroll_with_iter_args_and_promotion(%arg0 : f32, %arg1 : f32) -> f32 // UNROLL-BY-4-LABEL: func @gpu_launch_unroll_by_factor_4 func.func @gpu_launch_unroll_by_factor_4() { - %0 = arith.constant 0 :index - %1 = arith.constant 2 : index + %c0 = arith.constant 0 :index + %c2 = arith.constant 2 : index // UNROLL-BY-4: %[[C0:.*]] = arith.constant 0 : index - gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, 
%sz_by = %1, %sz_bz = %1) - threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) { + gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c2) + threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c2, %sz_tz = %c2) { %threadid = gpu.thread_id x - affine.for %iv = %threadid to 11 step 2 iter_args(%arg = %0) -> index { - %3 = arith.addi %arg, %0 : index - affine.yield %3 : index + affine.for %iv = %threadid to 11 step 2 iter_args(%arg = %c0) -> index { + %sum = arith.addi %arg, %c0 : index + affine.yield %sum : index } gpu.terminator } From e58e115e8a385615bb7793c81ee81c2369f14d19 Mon Sep 17 00:00:00 2001 From: linuxlonelyeagle <2020382038@qq.com> Date: Wed, 9 Apr 2025 17:48:35 +0800 Subject: [PATCH 7/8] fix test. --- .../Dialect/Affine/Analysis/LoopAnalysis.cpp | 8 +++---- mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp | 5 +++- mlir/test/Dialect/Affine/unroll.mlir | 24 ++++++++++--------- 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp index 133d7c754589b..8b5b64b7092eb 100644 --- a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp +++ b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp @@ -293,10 +293,9 @@ uint64_t mlir::affine::getLargestDivisorOfTripCount(AffineForOp forOp) { // divisors. 
assert(map.getNumResults() >= 1 && "expected one or more results"); std::optional gcd; - for (auto resultExpr : map.getResults()) { + for (unsigned i = 0, e = map.getResults().size(); i < e; ++i) { uint64_t thisGcd; - AffineMap subMap = - AffineMap::get(map.getNumDims(), map.getNumSymbols(), resultExpr); + AffineMap subMap = map.getSubMap(i); ValueBoundsConstraintSet::Variable var(subMap, operands); auto lbBound = ValueBoundsConstraintSet::computeConstantBound( mlir::presburger::BoundType::LB, var); @@ -310,7 +309,8 @@ uint64_t mlir::affine::getLargestDivisorOfTripCount(AffineForOp forOp) { thisGcd = tripCount; } else { // Trip count is not a known constant; return its largest known divisor. - thisGcd = resultExpr.getLargestKnownDivisor(); + thisGcd = map.getResult(i).getLargestKnownDivisor(); + ; } if (gcd.has_value()) gcd = std::gcd(*gcd, thisGcd); diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp index eb7232b9a97d6..84039804fa66a 100644 --- a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp +++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp @@ -1048,7 +1048,10 @@ LogicalResult mlir::affine::loopUnrollByFactor( } // Generate the cleanup loop if trip count isn't a multiple of unrollFactor. - if (getLargestDivisorOfTripCount(forOp) % unrollFactor != 0) { + // If the trip count has a range, a clean up loop needs to be generated. 
+ if ((mayBeConstantTripCount && maxMayBeConstantTripCount && + *mayBeConstantTripCount != *maxMayBeConstantTripCount) || + getLargestDivisorOfTripCount(forOp) % unrollFactor != 0) { // Loops where the lower bound is a max expression or the upper bound is // a min expression and the trip count doesn't divide the unroll factor // can't be unrolled since the lower bound of the cleanup loop in such cases diff --git a/mlir/test/Dialect/Affine/unroll.mlir b/mlir/test/Dialect/Affine/unroll.mlir index 857a25f5d8567..24df89bf8a76e 100644 --- a/mlir/test/Dialect/Affine/unroll.mlir +++ b/mlir/test/Dialect/Affine/unroll.mlir @@ -12,6 +12,7 @@ // UNROLL-FULL-DAG: [[$MAP4:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 + 1)> // UNROLL-FULL-DAG: [[$MAP5:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 + 3)> // UNROLL-FULL-DAG: [[$MAP6:#map[0-9]*]] = affine_map<(d0)[s0] -> (d0 + s0 + 1)> +// UNROLL-FULL-DAG: [[$MAP7:#map[0-9]*]] = affine_map<()[s0] -> (s0 + (((-s0 + 9) ceildiv 2) floordiv 4) * 8)> // SHORT-DAG: [[$MAP0:#map[0-9]*]] = affine_map<(d0) -> (d0 + 1)> @@ -22,8 +23,8 @@ // UNROLL-BY-4-DAG: [[$MAP4:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 + 3)> // UNROLL-BY-4-DAG: [[$MAP5:#map[0-9]*]] = affine_map<(d0)[s0] -> (d0 + s0 + 1)> // UNROLL-BY-4-DAG: [[$MAP6:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 * 16 + d1)> -// UNROLL-BY-4-DAG: [[$MAP11:#map[0-9]*]] = affine_map<(d0) -> (d0)> -// UNROLL-BY-4-DAG: [[$MAP7:#map[0-9]*]] = affine_map<()[s0] -> (s0 + (((-s0 + 11) ceildiv 2) floordiv 4) * 8)> +// UNROLL-BY-4-DAG: [[$MAP7:#map[0-9]*]] = affine_map<(d0) -> (d0)> +// UNROLL-BY-4-DAG: [[$MAP8:#map[0-9]*]] = affine_map<()[s0] -> (s0 + (((-s0 + 11) ceildiv 2) floordiv 4) * 8)> // UNROLL-FULL-LABEL: func @loop_nest_simplest() { func.func @loop_nest_simplest() { @@ -314,12 +315,13 @@ func.func @partial_unroll_factor_4() { gpu.terminator } // UNROLL-FULL: %[[ID:.*]] = gpu.thread_id x - // UNROLL-FULL-NEXT: affine.for %{{.*}} = %[[ID]] to 9 step 8 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) { - // 
UNROLL-FULL-NEXT: %[[SUM_0:.*]] = arith.addi %[[ARG]], %[[C0]] : index - // UNROLL-FULL-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index - // UNROLL-FULL-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index - // UNROLL-FULL-NEXT: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index - // UNROLL-FULL-NEXT: affine.yield %[[SUM_3]] : index + // UNROLL-FULL-NEXT: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index + // UNROLL-FULL-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index + // UNROLL-FULL-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index + // UNROLL-FULL-NEXT: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index + // UNROLL-FULL-NEXT: affine.for %{{.*}} = [[$MAP7]]()[%[[ID]]] to 9 step 2 iter_args(%[[ARG:.*]] = %[[SUM_3]]) -> (index) { + // UNROLL-FULL-NEXT: %[[SUM_4:.*]] = arith.addi %[[ARG]], %[[C0]] : index + // UNROLL-FULL-NEXT: affine.yield %[[SUM_4]] : index // UNROLL-FULL-NEXT: } return } @@ -536,7 +538,7 @@ func.func @loop_nest_operand1() { // UNROLL-BY-4-LABEL: func @loop_nest_operand2() { func.func @loop_nest_operand2() { // UNROLL-BY-4: affine.for %arg0 = 0 to 100 step 2 { -// UNROLL-BY-4-NEXT: affine.for %arg1 = [[$MAP11]](%arg0) to #map{{[0-9]*}}(%arg0) step 4 { +// UNROLL-BY-4-NEXT: affine.for %arg1 = [[$MAP7]](%arg0) to #map{{[0-9]*}}(%arg0) step 4 { // UNROLL-BY-4-NEXT: %0 = "foo"() : () -> i32 // UNROLL-BY-4-NEXT: %1 = "foo"() : () -> i32 // UNROLL-BY-4-NEXT: %2 = "foo"() : () -> i32 @@ -582,7 +584,7 @@ func.func @floordiv_mod_ub(%M : index, %N : index) { func.func @loop_nest_operand3() { // UNROLL-BY-4: affine.for %arg0 = 0 to 100 step 2 { affine.for %i = 0 to 100 step 2 { - // UNROLL-BY-4: affine.for %arg1 = [[$MAP11]](%arg0) to #map{{[0-9]*}}(%arg0) step 4 { + // UNROLL-BY-4: affine.for %arg1 = [[$MAP7]](%arg0) to #map{{[0-9]*}}(%arg0) step 4 { // UNROLL-BY-4-NEXT: %1 = "foo"() : () -> i32 // UNROLL-BY-4-NEXT: %2 = "foo"() : () -> i32 // UNROLL-BY-4-NEXT: %3 = "foo"() : () -> i32 @@ -786,7 +788,7 
@@ func.func @gpu_launch_unroll_by_factor_4() { // UNROLL-BY-4-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index // UNROLL-BY-4-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index // UNROLL-BY-4-NEXT: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index - // UNROLL-BY-4-NEXT: affine.for %[[VAL_20:.*]] = [[$MAP7]](){{\[}}%[[ID]]] to 11 step 2 iter_args(%[[ARG:.*]] = %[[SUM_3]]) -> (index) { + // UNROLL-BY-4-NEXT: affine.for %[[VAL_20:.*]] = [[$MAP8]](){{\[}}%[[ID]]] to 11 step 2 iter_args(%[[ARG:.*]] = %[[SUM_3]]) -> (index) { // UNROLL-BY-4-NEXT: %[[SUM_4:.*]] = arith.addi %[[ARG]], %[[C0]] : index // UNROLL-BY-4-NEXT: affine.yield %[[SUM_4]] : index // UNROLL-BY-4-NEXT: } From 80243dd4fc8a77c3bc301de380a1e52da345049a Mon Sep 17 00:00:00 2001 From: linuxlonelyeagle <2020382038@qq.com> Date: Mon, 4 Aug 2025 07:22:53 +0000 Subject: [PATCH 8/8] use test.value_with_bounds op update test. --- .../Dialect/Affine/Analysis/LoopAnalysis.cpp | 7 +- mlir/test/Dialect/Affine/unroll.mlir | 89 ++++++++----------- 2 files changed, 39 insertions(+), 57 deletions(-) diff --git a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp index 4815600e8fa54..df1c156e22075 100644 --- a/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp +++ b/mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp @@ -60,8 +60,8 @@ class DirectedOpGraph { void printEdges() { for (auto &en : edges) { - llvm::dbgs() << *en.first << " (" << en.first << ")" - << " has " << en.second.size() << " edges:\n"; + llvm::dbgs() << *en.first << " (" << en.first << ")" << " has " + << en.second.size() << " edges:\n"; for (auto *node : en.second) { llvm::dbgs() << '\t' << *node->op << '\n'; } @@ -72,7 +72,7 @@ class DirectedOpGraph { /// A node of a directed graph between MLIR Operations to model various /// relationships. This is meant to be used internally. 
struct DGNode { - DGNode(Operation *op) : op(op) {}; + DGNode(Operation *op) : op(op){}; Operation *op; // Start and finish visit numbers are standard in DFS to implement things @@ -310,7 +310,6 @@ uint64_t mlir::affine::getLargestDivisorOfTripCount(AffineForOp forOp) { } else { // Trip count is not a known constant; return its largest known divisor. thisGcd = map.getResult(i).getLargestKnownDivisor(); - ; } if (gcd.has_value()) gcd = std::gcd(*gcd, thisGcd); diff --git a/mlir/test/Dialect/Affine/unroll.mlir b/mlir/test/Dialect/Affine/unroll.mlir index 24df89bf8a76e..baffb1ba0799b 100644 --- a/mlir/test/Dialect/Affine/unroll.mlir +++ b/mlir/test/Dialect/Affine/unroll.mlir @@ -260,72 +260,59 @@ gpu.module @unroll_full { } } -// UNROLL-FULL-LABEL: func @thread_partial_execution -func.func @thread_partial_execution() { +// UNROLL-FULL-LABEL: func @bound_unroll_partial +func.func @bound_unroll_partial() { %c0 = arith.constant 0 :index - %c2 = arith.constant 2 : index // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index - gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c2) - threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c2, %sz_tz = %c2) { - affine.for %iv = %tx to 3 step 2 iter_args(%arg = %c0) -> index { + %bound = test.value_with_bounds { min = 0 : index, max = 1 : index} + affine.for %iv = %bound to 3 step 2 iter_args(%arg = %c0) -> index { %sum = arith.addi %arg, %c0 : index affine.yield %sum : index - } - // UNROLL-FULL: affine.for %{{.*}} = %{{.*}} to 3 step 2 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) { - // UNROLL-FULL-NEXT: %[[SUM:.*]] = arith.addi %[[ARG]], %[[C0]] : index - // UNROLL-FULL-NEXT: affine.yield %[[SUM]] : index - // UNROLL-FULL-NEXT: } - gpu.terminator } + // UNROLL-FULL: affine.for %{{.*}} = %{{.*}} to 3 step 2 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) { + // UNROLL-FULL-NEXT: %[[SUM:.*]] = arith.addi %[[ARG]], %[[C0]] : index + // UNROLL-FULL-NEXT: affine.yield %[[SUM]] : index + // UNROLL-FULL-NEXT: } 
return } -// UNROLL-FULL-LABEL: func @unroll_all_thread -func.func @unroll_all_thread() { +// UNROLL-FULL-LABEL: func @bound_unroll_all +func.func @bound_unroll_all() { %c0 = arith.constant 0 :index - %c2 = arith.constant 2 : index // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index - gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c2) - threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c2, %sz_tz = %c2) { - %threadid = gpu.thread_id x - affine.for %iv = %threadid to 6 step 2 iter_args(%arg = %c0) -> index { - %sum = arith.addi %arg, %c0 : index - affine.yield %sum : index - } - // UNROLL-FULL: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index - // UNROLL-FULL-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index - // UNROLL-FULL-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index - gpu.terminator + %bound = test.value_with_bounds { min = 0 : index, max = 1 : index} + affine.for %iv = %bound to 6 step 2 iter_args(%arg = %c0) -> index { + %sum = arith.addi %arg, %c0 : index + affine.yield %sum : index } + // UNROLL-FULL: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index + // UNROLL-FULL-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index + // UNROLL-FULL-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index return } -// UNROLL-FULL-LABEL: func.func @partial_unroll_factor_4 -func.func @partial_unroll_factor_4() { - %c0 = arith.constant 0 :index - %c2 = arith.constant 2 : index +// UNROLL-FULL-LABEL: func.func @bound_partial_unroll_factor_4 +func.func @bound_partial_unroll_factor_4() { + %c0 = arith.constant 0 :index // UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index - gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c2) - threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c2, %sz_tz = %c2) { - %threadid = gpu.thread_id x - affine.for %iv = %threadid to 9 step 2 iter_args(%arg = %c0) -> index { - %sum = arith.addi %arg, %c0 : index - affine.yield %sum : index - } - 
gpu.terminator + // UNROLL-FULL: %[[Bound:.*]] = test.value_with_bounds {max = 1 : index, min = 0 : index} + %bound = test.value_with_bounds { min = 0 : index, max = 1 : index} + affine.for %iv = %bound to 9 step 2 iter_args(%arg = %c0) -> index { + %sum = arith.addi %arg, %c0 : index + affine.yield %sum : index } - // UNROLL-FULL: %[[ID:.*]] = gpu.thread_id x // UNROLL-FULL-NEXT: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index // UNROLL-FULL-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index // UNROLL-FULL-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index // UNROLL-FULL-NEXT: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index - // UNROLL-FULL-NEXT: affine.for %{{.*}} = [[$MAP7]]()[%[[ID]]] to 9 step 2 iter_args(%[[ARG:.*]] = %[[SUM_3]]) -> (index) { + // UNROLL-FULL-NEXT: affine.for %{{.*}} = [[$MAP7]]()[%[[Bound]]] to 9 step 2 iter_args(%[[ARG:.*]] = %[[SUM_3]]) -> (index) { // UNROLL-FULL-NEXT: %[[SUM_4:.*]] = arith.addi %[[ARG]], %[[C0]] : index // UNROLL-FULL-NEXT: affine.yield %[[SUM_4]] : index // UNROLL-FULL-NEXT: } return } + // SHORT-LABEL: func @loop_nest_outer_unroll() { func.func @loop_nest_outer_unroll() { // SHORT: affine.for %arg0 = 0 to 4 { @@ -769,32 +756,28 @@ func.func @unroll_with_iter_args_and_promotion(%arg0 : f32, %arg1 : f32) -> f32 return %sum : f32 } -// UNROLL-BY-4-LABEL: func @gpu_launch_unroll_by_factor_4 -func.func @gpu_launch_unroll_by_factor_4() { +// UNROLL-BY-4-LABEL: func @bound_unroll_by_factor_4 +func.func @bound_unroll_by_factor_4() { %c0 = arith.constant 0 :index - %c2 = arith.constant 2 : index // UNROLL-BY-4: %[[C0:.*]] = arith.constant 0 : index - gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c2) - threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c2, %sz_tz = %c2) { - %threadid = gpu.thread_id x - affine.for %iv = %threadid to 11 step 2 iter_args(%arg = %c0) -> index { - %sum = arith.addi %arg, %c0 : index - affine.yield %sum : index - } - gpu.terminator + %bound 
= test.value_with_bounds { min = 0 : index, max = 1 : index} + // UNROLL-BY-4: %[[Bound:.*]] = test.value_with_bounds {max = 1 : index, min = 0 : index} + affine.for %iv = %bound to 11 step 2 iter_args(%arg = %c0) -> index { + %sum = arith.addi %arg, %c0 : index + affine.yield %sum : index } - // UNROLL-BY-4: %[[ID:.*]] = gpu.thread_id x // UNROLL-BY-4-NEXT: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index // UNROLL-BY-4-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index // UNROLL-BY-4-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index // UNROLL-BY-4-NEXT: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index - // UNROLL-BY-4-NEXT: affine.for %[[VAL_20:.*]] = [[$MAP8]](){{\[}}%[[ID]]] to 11 step 2 iter_args(%[[ARG:.*]] = %[[SUM_3]]) -> (index) { + // UNROLL-BY-4-NEXT: affine.for %[[VAL_20:.*]] = [[$MAP8]](){{\[}}%[[Bound]]] to 11 step 2 iter_args(%[[ARG:.*]] = %[[SUM_3]]) -> (index) { // UNROLL-BY-4-NEXT: %[[SUM_4:.*]] = arith.addi %[[ARG]], %[[C0]] : index // UNROLL-BY-4-NEXT: affine.yield %[[SUM_4]] : index // UNROLL-BY-4-NEXT: } return } + // UNROLL-FULL: func @unroll_zero_trip_count_case func.func @unroll_zero_trip_count_case() { // CHECK-NEXT: affine.for %{{.*}} = 0 to 0