Skip to content
Open
5 changes: 5 additions & 0 deletions mlir/include/mlir/Dialect/Affine/Analysis/LoopAnalysis.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,11 @@ void getTripCountMapAndOperands(AffineForOp forOp, AffineMap *map,
/// constant trip count in non-trivial cases.
std::optional<uint64_t> getConstantTripCount(AffineForOp forOp);

/// Returns a constant upper bound on the trip count of `forOp` when the
/// operands of its bounds have a known integer range, and std::nullopt when no
/// constant bound can be derived. If the operands of `forOp` are constants,
/// the return value is the same as that of `getConstantTripCount`.
std::optional<uint64_t> getUpperBoundOnTripCount(AffineForOp forOp);

/// Returns the greatest known integral divisor of the trip count. Affine
/// expression analysis is used (indirectly through getTripCount), and
/// this method is thus able to determine non-trivial divisors.
Expand Down
112 changes: 98 additions & 14 deletions mlir/lib/Dialect/Affine/Analysis/LoopAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,15 @@

#include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"

#include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h"
#include "mlir/Analysis/DataFlow/IntegerRangeAnalysis.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
#include "mlir/Dialect/Affine/Analysis/AffineStructures.h"
#include "mlir/Dialect/Affine/Analysis/NestedMatcher.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
#include "mlir/Interfaces/FunctionInterfaces.h"
#include "llvm/Support/MathExtras.h"

#include "llvm/ADT/DenseSet.h"
Expand All @@ -30,6 +33,7 @@

using namespace mlir;
using namespace mlir::affine;
using namespace mlir::dataflow;

#define DEBUG_TYPE "affine-loop-analysis"

Expand Down Expand Up @@ -84,6 +88,73 @@ void mlir::affine::getTripCountMapAndOperands(
tripCountValueMap.getOperands().end());
}

/// Runs `IntegerRangeAnalysis` on the function enclosing `forOp` and appends
/// to `symReplacements` one replacement expression per symbol operand, i.e.
/// one for each `operands[i]` with `i >= numDim`. A symbol whose range is
/// unknown (no lattice state, uninitialized state, or a range spanning the
/// whole signed domain) is replaced by itself. Any other symbol is replaced by
/// a constant: the signed maximum of its range by default, or the signed
/// minimum if `replaceByMin` is set to true. If `forOp` is not nested in a
/// function, or the analysis fails, `symReplacements` is left untouched.
static void replaceOperandByRange(AffineForOp forOp,
                                  SmallVectorImpl<Value> &operands,
                                  SmallVectorImpl<AffineExpr> &symReplacements,
                                  unsigned numDim, bool replaceByMin = false) {
  // Without an enclosing function there is no root to run the solver on.
  auto funcOp = forOp->getParentOfType<FunctionOpInterface>();
  if (!funcOp)
    return;

  // NOTE(review): a fresh solver is built and run on every call.
  DataFlowSolver solver;
  solver.load<DeadCodeAnalysis>();
  solver.load<IntegerRangeAnalysis>();
  if (failed(solver.initializeAndRun(funcOp)))
    return;

  // `b` is used to create the replacement affine expressions.
  Builder b(forOp.getContext());
  for (unsigned i = numDim, e = operands.size(); i < e; ++i) {
    const unsigned symPos = i - numDim;
    const auto *lattice =
        solver.lookupState<dataflow::IntegerValueRangeLattice>(operands[i]);

    // No usable analysis state: keep the symbol as is.
    if (!lattice || lattice->getValue().isUninitialized()) {
      symReplacements.push_back(b.getAffineSymbolExpr(symPos));
      continue;
    }

    const ConstantIntRanges &range = lattice->getValue().getValue();
    const APInt &max = range.smax();
    const APInt &min = range.smin();
    const unsigned bitNums = max.getBitWidth();

    // A range covering every signed value carries no information.
    if (APInt::getSignedMaxValue(bitNums) == max &&
        APInt::getSignedMinValue(bitNums) == min) {
      symReplacements.push_back(b.getAffineSymbolExpr(symPos));
      continue;
    }

    // The bounds are signed, so they must be sign-extended (not
    // zero-extended) when converted to the constant's 64-bit value.
    const APInt &chosen = replaceByMin ? min : max;
    symReplacements.push_back(b.getAffineConstantExpr(chosen.getSExtValue()));
  }
}

/// Folds the results of `map` into a single trip count: the smallest result
/// when every result is a constant expression. Returns std::nullopt as soon as
/// a non-constant result is found, or when `map` has no results at all.
static std::optional<uint64_t>
getConstantTripCountFromAffineMap(AffineMap map) {
  std::optional<uint64_t> minTripCount;
  for (AffineExpr expr : map.getResults()) {
    auto cst = dyn_cast<AffineConstantExpr>(expr);
    // A single non-constant result makes the whole trip count non-constant.
    if (!cst)
      return std::nullopt;
    const uint64_t value = static_cast<uint64_t>(cst.getValue());
    minTripCount = minTripCount ? std::min(*minTripCount, value) : value;
  }
  return minTripCount;
}

/// Returns the trip count of the loop if it's a constant, std::nullopt
/// otherwise. This method uses affine expression analysis (in turn using
/// getTripCount) and is able to determine constant trip count in non-trivial
Expand All @@ -95,20 +166,30 @@ std::optional<uint64_t> mlir::affine::getConstantTripCount(AffineForOp forOp) {

if (!map)
return std::nullopt;
SmallVector<AffineExpr, 4> symReplacements;
replaceOperandByRange(forOp, operands, symReplacements, map.getNumDims());
map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(),
map.getNumSymbols());
return getConstantTripCountFromAffineMap(map);
}

// Take the min if all trip counts are constant.
std::optional<uint64_t> tripCount;
for (auto resultExpr : map.getResults()) {
if (auto constExpr = dyn_cast<AffineConstantExpr>(resultExpr)) {
if (tripCount.has_value())
tripCount =
std::min(*tripCount, static_cast<uint64_t>(constExpr.getValue()));
else
tripCount = constExpr.getValue();
} else
return std::nullopt;
}
return tripCount;
/// Returns the maximum trip count when the operand of forOp has a range. If the
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So what this does internally is compute an upper bound on each expression "potential upper bound - single lower bound" and take a minimum of that. Can you provide a mathematical justification as to why this provides a correct (and tight?) upper bound?

/// operand of forOp is a constant, the return value is the same as
/// `getConstantTripCount`.
std::optional<uint64_t>
mlir::affine::getUpperBoundOnTripCount(AffineForOp forOp) {
  // Build the trip count map together with the operands it is expressed over.
  AffineMap tripCountMap;
  SmallVector<Value, 4> tripCountOperands;
  getTripCountMapAndOperands(forOp, &tripCountMap, &tripCountOperands);
  if (!tripCountMap)
    return std::nullopt;

  // Substitute every symbol operand that has a known range with the smallest
  // value of that range.
  // NOTE(review): the minimum maximizes the trip count when the operand feeds
  // the lower bound; confirm the intended behavior for operands that feed the
  // upper bound.
  const unsigned numDims = tripCountMap.getNumDims();
  SmallVector<AffineExpr, 4> replacements;
  replaceOperandByRange(forOp, tripCountOperands, replacements, numDims,
                        /*replaceByMin=*/true);
  AffineMap substituted = tripCountMap.replaceDimsAndSymbols(
      {}, replacements, numDims, tripCountMap.getNumSymbols());
  return getConstantTripCountFromAffineMap(substituted);
}

/// Returns the greatest known integral divisor of the trip count. Affine
Expand All @@ -121,7 +202,10 @@ uint64_t mlir::affine::getLargestDivisorOfTripCount(AffineForOp forOp) {

if (!map)
return 1;

SmallVector<AffineExpr, 4> symReplacements;
replaceOperandByRange(forOp, operands, symReplacements, map.getNumDims());
map = map.replaceDimsAndSymbols({}, symReplacements, map.getNumDims(),
map.getNumSymbols());
// The largest divisor of the trip count is the GCD of the individual largest
// divisors.
assert(map.getNumResults() >= 1 && "expected one or more results");
Expand Down
35 changes: 24 additions & 11 deletions mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,8 @@ static void replaceIterArgsAndYieldResults(AffineForOp forOp) {
/// was known to have a single iteration.
LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
std::optional<uint64_t> tripCount = getConstantTripCount(forOp);
if (!tripCount || *tripCount != 1)
std::optional<uint64_t> maxTripCount = getUpperBoundOnTripCount(forOp);
if (!tripCount || *tripCount != 1 || !maxTripCount || *maxTripCount != 1)
return failure();

// TODO: extend this for arbitrary affine bounds.
Expand Down Expand Up @@ -160,7 +161,8 @@ LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
forOp.getBody()->back().erase();
parentBlock->getOperations().splice(Block::iterator(forOp),
forOp.getBody()->getOperations());
forOp.erase();
IRRewriter b(forOp.getContext());
b.eraseOp(forOp);
return success();
}

Expand Down Expand Up @@ -884,15 +886,23 @@ void mlir::affine::getTileableBands(
/// Unrolls this loop completely.
LogicalResult mlir::affine::loopUnrollFull(AffineForOp forOp) {
std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
if (mayBeConstantTripCount.has_value()) {
uint64_t tripCount = *mayBeConstantTripCount;
if (tripCount == 0)
return success();
if (tripCount == 1)
return promoteIfSingleIteration(forOp);
return loopUnrollByFactor(forOp, tripCount);
}
return failure();
std::optional<uint64_t> maxMayBeConstantTripCount =
    getUpperBoundOnTripCount(forOp);

// Both counts are dereferenced below, so bail out if either one is unknown.
if (!mayBeConstantTripCount.has_value() ||
    !maxMayBeConstantTripCount.has_value())
  return failure();

uint64_t tripCount = *mayBeConstantTripCount;
uint64_t maxTripCount = *maxMayBeConstantTripCount;

// A loop with a zero trip count has nothing to unroll.
if (tripCount == 0)
  return success();

// Only promote the body when every execution runs exactly one iteration.
if (tripCount == 1 && maxTripCount == 1)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the trip count is known to be one, how can the max trip count be anything other than one?!

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe maxTripCount will be equal to 2.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why would getConstantMaxTripCount return a value different from the constant trip count when the trip count is known to be so? It shouldn't - otherwise, it's trivially loose.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are obviously talking about the CPU, where the trip count is indeed constant, but on hardware like a GPU the thread id is dynamic. The smallest thread id is 0 and the largest thread id is blocksize - 1, so the value of (upper - threadId) / step is obviously not constant.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you please look at the comments below (if you have the time)? I'm wondering whether the fact that affine-loop-unroll is not a pattern pass is causing this issue. I'll continue to work on it.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If it can run it will definitely be a huge improvement, it's really exciting.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A lot of this confusion would be cleared up if tripCount were minTripCount

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure it would. The semantics of affine loops is to take a minimum of values produced by each individual expression in the upper bound, it's unclear to me why we would need to reason about the upper bound.

return promoteIfSingleIteration(forOp);
return loopUnrollByFactor(forOp, tripCount);
}

/// Unrolls this loop by the specified factor or by the trip count (if constant)
Expand Down Expand Up @@ -1013,8 +1023,11 @@ LogicalResult mlir::affine::loopUnrollByFactor(
assert(unrollFactor > 0 && "unroll factor should be positive");

std::optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(forOp);
std::optional<uint64_t> maxMayBeConstantTripCount =
getUpperBoundOnTripCount(forOp);
if (unrollFactor == 1) {
if (mayBeConstantTripCount && *mayBeConstantTripCount == 1 &&
maxMayBeConstantTripCount && *maxMayBeConstantTripCount == 1 &&
failed(promoteIfSingleIteration(forOp)))
return failure();
return success();
Expand Down
34 changes: 21 additions & 13 deletions mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -250,26 +250,34 @@ void SubgroupSizeOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
void LaunchOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
SetIntRangeFn setResultRange) {
auto setRange = [&](const ConstantIntRanges &argRange, Value dimResult,
Value idxResult) {
Value idxResult, Value size) {
if (argRange.umin().getBitWidth() != IndexType::kInternalStorageBitWidth)
return;
ConstantIntRanges dimRange =
argRange.intersection(getIndexRange(1, kMaxDim));
setResultRange(dimResult, dimRange);
ConstantIntRanges idxRange =
getIndexRange(0, dimRange.umax().getZExtValue() - 1);
setResultRange(idxResult, idxRange);
APInt sizeInt;
if (matchPattern(size, m_ConstantInt(&sizeInt))) {
ConstantIntRanges dimRange = ConstantIntRanges::constant(sizeInt);
setResultRange(dimResult, dimRange);
ConstantIntRanges idxRange = getIndexRange(0, sizeInt.getZExtValue() - 1);
setResultRange(idxResult, idxRange);
} else {
ConstantIntRanges dimRange =
argRange.intersection(getIndexRange(1, kMaxDim));
setResultRange(dimResult, dimRange);
ConstantIntRanges idxRange =
getIndexRange(0, dimRange.umax().getZExtValue() - 1);
setResultRange(idxResult, idxRange);
}
};

argRanges = argRanges.drop_front(getAsyncDependencies().size());
KernelDim3 gridDims = getGridSize();
KernelDim3 blockIds = getBlockIds();
setRange(argRanges[0], gridDims.x, blockIds.x);
setRange(argRanges[1], gridDims.y, blockIds.y);
setRange(argRanges[2], gridDims.z, blockIds.z);
setRange(argRanges[0], gridDims.x, blockIds.x, getGridSizeX());
setRange(argRanges[1], gridDims.y, blockIds.y, getGridSizeY());
setRange(argRanges[2], gridDims.z, blockIds.z, getGridSizeZ());
KernelDim3 blockDims = getBlockSize();
KernelDim3 threadIds = getThreadIds();
setRange(argRanges[3], blockDims.x, threadIds.x);
setRange(argRanges[4], blockDims.y, threadIds.y);
setRange(argRanges[5], blockDims.z, threadIds.z);
setRange(argRanges[3], blockDims.x, threadIds.x, getBlockSizeX());
setRange(argRanges[4], blockDims.y, threadIds.y, getBlockSizeY());
setRange(argRanges[5], blockDims.z, threadIds.z, getBlockSizeZ());
}
92 changes: 92 additions & 0 deletions mlir/test/Dialect/Affine/unroll.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
// UNROLL-BY-4-DAG: [[$MAP5:#map[0-9]*]] = affine_map<(d0)[s0] -> (d0 + s0 + 1)>
// UNROLL-BY-4-DAG: [[$MAP6:#map[0-9]*]] = affine_map<(d0, d1) -> (d0 * 16 + d1)>
// UNROLL-BY-4-DAG: [[$MAP11:#map[0-9]*]] = affine_map<(d0) -> (d0)>
// UNROLL-BY-4-DAG: [[$MAP7:#map[0-9]*]] = affine_map<()[s0] -> (s0 + (((-s0 + 11) ceildiv 2) floordiv 4) * 8)>

// UNROLL-FULL-LABEL: func @loop_nest_simplest() {
func.func @loop_nest_simplest() {
Expand Down Expand Up @@ -258,6 +259,71 @@ gpu.module @unroll_full {
}
}

// UNROLL-FULL-LABEL: func @thread_partial_execution
// The loop starts at %tx, the x thread id of a launch whose block size is the
// constant 2, so %tx is 0 or 1. Starting from 0 the loop to 3 with step 2 runs
// twice; starting from 1 it runs once. The trip count therefore differs between
// threads and the checks below expect the loop to be left in place.
func.func @thread_partial_execution() {
%0 = arith.constant 0 :index
%1 = arith.constant 2 : index
// UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we avoid using GPU dialect operations here? I suppose we have tests for the bound analysis somewhere that must be using test ops with known bounds, we could use those instead and not spuriously rely on the logic of another dialect here.

threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
affine.for %iv = %tx to 3 step 2 iter_args(%arg = %0) -> index {
%3 = arith.addi %arg, %0 : index
affine.yield %3 : index
}
// UNROLL-FULL: affine.for %{{.*}} = %{{.*}} to 3 step 2 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) {
// UNROLL-FULL-NEXT: %[[SUM:.*]] = arith.addi %[[ARG]], %[[C0]] : index
// UNROLL-FULL-NEXT: affine.yield %[[SUM]] : index
// UNROLL-FULL-NEXT: }
gpu.terminator
}
return
}

// UNROLL-FULL-LABEL: func @unroll_all_thread
// The loop starts at the x thread id inside a launch whose block size is the
// constant 2 (the id is presumably inferred to be 0 or 1). From either value
// the loop to 6 with step 2 runs exactly 3 times, so the checks below expect it
// to be replaced by three straight-line copies of its body.
func.func @unroll_all_thread() {
%0 = arith.constant 0 :index
%1 = arith.constant 2 : index
// UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
%threadid = gpu.thread_id x
%4 = affine.for %iv = %threadid to 6 step 2 iter_args(%arg = %0) -> index {
%3 = arith.addi %arg, %0 : index
affine.yield %3 : index
}
// UNROLL-FULL: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
// UNROLL-FULL-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
// UNROLL-FULL-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
gpu.terminator
}
return
}

// UNROLL-FULL-LABEL: func.func @partial_unroll_factor_4
// The loop starts at the x thread id inside a launch whose block size is the
// constant 2 (the id is presumably inferred to be 0 or 1). The loop to 9 with
// step 2 then runs 5 times from 0 and 4 times from 1. The checks below expect
// it to be rewritten as a loop with step 8 holding four copies of the body.
// NOTE(review): from thread id 0 the original loop runs 5 iterations; confirm
// that the rewritten loop executes the body the same number of times.
func.func @partial_unroll_factor_4() {
%0 = arith.constant 0 :index
%1 = arith.constant 2 : index
// UNROLL-FULL: %[[C0:.*]] = arith.constant 0 : index
gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
%threadid = gpu.thread_id x
affine.for %iv = %threadid to 9 step 2 iter_args(%arg = %0) -> index {
%3 = arith.addi %arg, %0 : index
affine.yield %3 : index
}
gpu.terminator
}
// UNROLL-FULL: %[[ID:.*]] = gpu.thread_id x
// UNROLL-FULL-NEXT: affine.for %{{.*}} = %[[ID]] to 9 step 8 iter_args(%[[ARG:.*]] = %[[C0]]) -> (index) {
// UNROLL-FULL-NEXT: %[[SUM_0:.*]] = arith.addi %[[ARG]], %[[C0]] : index
// UNROLL-FULL-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
// UNROLL-FULL-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
// UNROLL-FULL-NEXT: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index
// UNROLL-FULL-NEXT: affine.yield %[[SUM_3]] : index
// UNROLL-FULL-NEXT: }
return
}

// SHORT-LABEL: func @loop_nest_outer_unroll() {
func.func @loop_nest_outer_unroll() {
// SHORT: affine.for %arg0 = 0 to 4 {
Expand Down Expand Up @@ -701,6 +767,32 @@ func.func @unroll_with_iter_args_and_promotion(%arg0 : f32, %arg1 : f32) -> f32
return %sum : f32
}

// UNROLL-BY-4-LABEL: func @gpu_launch_unroll_by_factor_4
// The loop starts at the x thread id inside a launch whose block size is the
// constant 2 (the id is presumably inferred to be 0 or 1). The loop to 11 with
// step 2 then runs 6 times from 0 and 5 times from 1. The checks below expect
// one straight-line group of four body copies followed by a cleanup loop, with
// a mapped lower bound, that runs the remaining iterations.
func.func @gpu_launch_unroll_by_factor_4() {
%0 = arith.constant 0 :index
%1 = arith.constant 2 : index
// UNROLL-BY-4: %[[C0:.*]] = arith.constant 0 : index
gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %1, %sz_by = %1, %sz_bz = %1)
threads(%tx, %ty, %tz) in (%sz_tx = %1, %sz_ty = %1, %sz_tz = %1) {
%threadid = gpu.thread_id x
affine.for %iv = %threadid to 11 step 2 iter_args(%arg = %0) -> index {
%3 = arith.addi %arg, %0 : index
affine.yield %3 : index
}
gpu.terminator
}
// UNROLL-BY-4: %[[ID:.*]] = gpu.thread_id x
// UNROLL-BY-4-NEXT: %[[SUM_0:.*]] = arith.addi %[[C0]], %[[C0]] : index
// UNROLL-BY-4-NEXT: %[[SUM_1:.*]] = arith.addi %[[SUM_0]], %[[C0]] : index
// UNROLL-BY-4-NEXT: %[[SUM_2:.*]] = arith.addi %[[SUM_1]], %[[C0]] : index
// UNROLL-BY-4-NEXT: %[[SUM_3:.*]] = arith.addi %[[SUM_2]], %[[C0]] : index
// UNROLL-BY-4-NEXT: affine.for %[[VAL_20:.*]] = [[$MAP7]](){{\[}}%[[ID]]] to 11 step 2 iter_args(%[[ARG:.*]] = %[[SUM_3]]) -> (index) {
// UNROLL-BY-4-NEXT: %[[SUM_4:.*]] = arith.addi %[[ARG]], %[[C0]] : index
// UNROLL-BY-4-NEXT: affine.yield %[[SUM_4]] : index
// UNROLL-BY-4-NEXT: }
return
}

// UNROLL-FULL: func @unroll_zero_trip_count_case
func.func @unroll_zero_trip_count_case() {
// CHECK-NEXT: affine.for %{{.*}} = 0 to 0
Expand Down