Skip to content

Commit b64ad5f

Browse files
fabrizio-indirliaokblast
authored andcommitted
[mlir][scf] Add parallelLoopUnrollByFactors() (llvm#163806)
- In the SCF Utils, add the `parallelLoopUnrollByFactors()` function to unroll scf::ParallelOp loops according to the specified unroll factors - Add a test pass "TestParallelLoopUnrolling" and the related LIT test - Expose `mlir::parallelLoopUnrollByFactors()`, `mlir::generateUnrolledLoop()`, and `mlir::scf::computeUbMinusLb()` functions in the mlir/Dialect/SCF/Utils/Utils.h header to make them available to other passes. - In `mlir::generateUnrolledLoop()`, add also an optional `IRMapping *clonedToSrcOpsMap` argument to map the new cloned operations to their original ones. In the function body, change the default `AnnotateFn` type to `static const` to silence potential warnings about dangling references when a function_ref is assigned to a variable with automatic storage. Signed-off-by: Fabrizio Indirli <[email protected]>
1 parent 629f8cd commit b64ad5f

File tree

7 files changed

+443
-35
lines changed

7 files changed

+443
-35
lines changed

mlir/include/mlir/Dialect/SCF/Utils/Utils.h

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,45 @@ FailureOr<scf::ForallOp> normalizeForallOp(RewriterBase &rewriter,
221221
/// 4. Each region iter arg and result has exactly one use
222222
bool isPerfectlyNestedForLoops(MutableArrayRef<LoopLikeOpInterface> loops);
223223

224+
/// Generate unrolled copies of an scf loop's 'loopBodyBlock', with 'iterArgs'
225+
/// and 'yieldedValues' as the block arguments and yielded values of the loop.
226+
/// The content of the loop body is replicated 'unrollFactor' times, calling
227+
/// 'ivRemapFn' to remap 'iv' for each unrolled body. If specified, annotates
228+
/// the Ops in each unrolled iteration using annotateFn. If provided,
229+
/// 'clonedToSrcOpsMap' is populated with the mappings from the cloned ops to
230+
/// the original op.
231+
void generateUnrolledLoop(
232+
Block *loopBodyBlock, Value iv, uint64_t unrollFactor,
233+
function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
234+
function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
235+
ValueRange iterArgs, ValueRange yieldedValues,
236+
IRMapping *clonedToSrcOpsMap = nullptr);
237+
238+
/// Unroll this scf::Parallel loop by the specified unroll factors. Returns the
239+
/// unrolled loop if the unroll succeded; otherwise returns failure if the loop
240+
/// cannot be unrolled either due to restrictions or to invalid unroll factors.
241+
/// Requires positive loop bounds and step. If specified, annotates the Ops in
242+
/// each unrolled iteration by applying `annotateFn`.
243+
/// If provided, 'clonedToSrcOpsMap' is populated with the mappings from the
244+
/// cloned ops to the original op.
245+
FailureOr<scf::ParallelOp> parallelLoopUnrollByFactors(
246+
scf::ParallelOp op, ArrayRef<uint64_t> unrollFactors,
247+
RewriterBase &rewriter,
248+
function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn = nullptr,
249+
IRMapping *clonedToSrcOpsMap = nullptr);
250+
251+
/// Get constant trip counts for each of the induction variables of the given
252+
/// loop operation. If any of the loop's trip counts is not constant, return an
253+
/// empty vector.
254+
llvm::SmallVector<int64_t>
255+
getConstLoopTripCounts(mlir::LoopLikeOpInterface loopOp);
256+
257+
namespace scf {
258+
/// Helper function to compute the difference between two values. This is used
259+
/// by the loop implementations to compute the trip count.
260+
std::optional<llvm::APSInt> computeUbMinusLb(Value lb, Value ub, bool isSigned);
261+
} // namespace scf
262+
224263
} // namespace mlir
225264

226265
#endif // MLIR_DIALECT_SCF_UTILS_UTILS_H_

mlir/lib/Dialect/SCF/IR/SCF.cpp

Lines changed: 1 addition & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
1616
#include "mlir/Dialect/MemRef/IR/MemRef.h"
1717
#include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h"
18+
#include "mlir/Dialect/SCF/Utils/Utils.h"
1819
#include "mlir/Dialect/Tensor/IR/Tensor.h"
1920
#include "mlir/IR/BuiltinAttributes.h"
2021
#include "mlir/IR/IRMapping.h"
@@ -111,24 +112,6 @@ static TerminatorTy verifyAndGetTerminator(Operation *op, Region &region,
111112
return nullptr;
112113
}
113114

114-
/// Helper function to compute the difference between two values. This is used
115-
/// by the loop implementations to compute the trip count.
116-
static std::optional<llvm::APSInt> computeUbMinusLb(Value lb, Value ub,
117-
bool isSigned) {
118-
llvm::APSInt diff;
119-
auto addOp = ub.getDefiningOp<arith::AddIOp>();
120-
if (!addOp)
121-
return std::nullopt;
122-
if ((isSigned && !addOp.hasNoSignedWrap()) ||
123-
(!isSigned && !addOp.hasNoUnsignedWrap()))
124-
return std::nullopt;
125-
126-
if (addOp.getLhs() != lb ||
127-
!matchPattern(addOp.getRhs(), m_ConstantInt(&diff)))
128-
return std::nullopt;
129-
return diff;
130-
}
131-
132115
//===----------------------------------------------------------------------===//
133116
// ExecuteRegionOp
134117
//===----------------------------------------------------------------------===//

mlir/lib/Dialect/SCF/Utils/Utils.cpp

Lines changed: 144 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -291,47 +291,61 @@ static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend,
291291
return arith::DivUIOp::create(builder, loc, sum, divisor);
292292
}
293293

294-
/// Generates unrolled copies of scf::ForOp 'loopBodyBlock', with
295-
/// associated 'forOpIV' by 'unrollFactor', calling 'ivRemapFn' to remap
296-
/// 'forOpIV' for each unrolled body. If specified, annotates the Ops in each
297-
/// unrolled iteration using annotateFn.
298-
static void generateUnrolledLoop(
299-
Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor,
294+
void mlir::generateUnrolledLoop(
295+
Block *loopBodyBlock, Value iv, uint64_t unrollFactor,
300296
function_ref<Value(unsigned, Value, OpBuilder)> ivRemapFn,
301297
function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
302-
ValueRange iterArgs, ValueRange yieldedValues) {
298+
ValueRange iterArgs, ValueRange yieldedValues,
299+
IRMapping *clonedToSrcOpsMap) {
300+
301+
// Check if the op was cloned from another source op, and return it if found
302+
// (or the same op if not found)
303+
auto findOriginalSrcOp =
304+
[](Operation *op, const IRMapping &clonedToSrcOpsMap) -> Operation * {
305+
Operation *srcOp = op;
306+
// If the source op derives from another op: traverse the chain to find the
307+
// original source op
308+
while (srcOp && clonedToSrcOpsMap.contains(srcOp))
309+
srcOp = clonedToSrcOpsMap.lookup(srcOp);
310+
return srcOp;
311+
};
312+
303313
// Builder to insert unrolled bodies just before the terminator of the body of
304-
// 'forOp'.
314+
// the loop.
305315
auto builder = OpBuilder::atBlockTerminator(loopBodyBlock);
306316

307-
constexpr auto defaultAnnotateFn = [](unsigned, Operation *, OpBuilder) {};
317+
static const auto noopAnnotateFn = [](unsigned, Operation *, OpBuilder) {};
308318
if (!annotateFn)
309-
annotateFn = defaultAnnotateFn;
319+
annotateFn = noopAnnotateFn;
310320

311321
// Keep a pointer to the last non-terminator operation in the original block
312322
// so that we know what to clone (since we are doing this in-place).
313323
Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2);
314324

315-
// Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies).
325+
// Unroll the contents of the loop body (append unrollFactor - 1 additional
326+
// copies).
316327
SmallVector<Value, 4> lastYielded(yieldedValues);
317328

318329
for (unsigned i = 1; i < unrollFactor; i++) {
319-
IRMapping operandMap;
320-
321330
// Prepare operand map.
331+
IRMapping operandMap;
322332
operandMap.map(iterArgs, lastYielded);
323333

324334
// If the induction variable is used, create a remapping to the value for
325335
// this unrolled instance.
326-
if (!forOpIV.use_empty()) {
327-
Value ivUnroll = ivRemapFn(i, forOpIV, builder);
328-
operandMap.map(forOpIV, ivUnroll);
336+
if (!iv.use_empty()) {
337+
Value ivUnroll = ivRemapFn(i, iv, builder);
338+
operandMap.map(iv, ivUnroll);
329339
}
330340

331341
// Clone the original body of 'forOp'.
332342
for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) {
333-
Operation *clonedOp = builder.clone(*it, operandMap);
343+
Operation *srcOp = &(*it);
344+
Operation *clonedOp = builder.clone(*srcOp, operandMap);
334345
annotateFn(i, clonedOp, builder);
346+
if (clonedToSrcOpsMap)
347+
clonedToSrcOpsMap->map(clonedOp,
348+
findOriginalSrcOp(srcOp, *clonedToSrcOpsMap));
335349
}
336350

337351
// Update yielded values.
@@ -1544,3 +1558,116 @@ bool mlir::isPerfectlyNestedForLoops(
15441558
}
15451559
return true;
15461560
}
1561+
1562+
std::optional<llvm::APSInt> mlir::scf::computeUbMinusLb(Value lb, Value ub,
1563+
bool isSigned) {
1564+
llvm::APSInt diff;
1565+
auto addOp = ub.getDefiningOp<arith::AddIOp>();
1566+
if (!addOp)
1567+
return std::nullopt;
1568+
if ((isSigned && !addOp.hasNoSignedWrap()) ||
1569+
(!isSigned && !addOp.hasNoUnsignedWrap()))
1570+
return std::nullopt;
1571+
1572+
if (addOp.getLhs() != lb ||
1573+
!matchPattern(addOp.getRhs(), m_ConstantInt(&diff)))
1574+
return std::nullopt;
1575+
return diff;
1576+
}
1577+
1578+
llvm::SmallVector<int64_t>
1579+
mlir::getConstLoopTripCounts(mlir::LoopLikeOpInterface loopOp) {
1580+
std::optional<SmallVector<OpFoldResult>> loBnds = loopOp.getLoopLowerBounds();
1581+
std::optional<SmallVector<OpFoldResult>> upBnds = loopOp.getLoopUpperBounds();
1582+
std::optional<SmallVector<OpFoldResult>> steps = loopOp.getLoopSteps();
1583+
if (!loBnds || !upBnds || !steps)
1584+
return {};
1585+
llvm::SmallVector<int64_t> tripCounts;
1586+
for (auto [lb, ub, step] : llvm::zip(*loBnds, *upBnds, *steps)) {
1587+
std::optional<llvm::APInt> numIter = constantTripCount(
1588+
lb, ub, step, /*isSigned=*/true, scf::computeUbMinusLb);
1589+
if (!numIter)
1590+
return {};
1591+
tripCounts.push_back(numIter->getSExtValue());
1592+
}
1593+
return tripCounts;
1594+
}
1595+
1596+
FailureOr<scf::ParallelOp> mlir::parallelLoopUnrollByFactors(
1597+
scf::ParallelOp op, ArrayRef<uint64_t> unrollFactors,
1598+
RewriterBase &rewriter,
1599+
function_ref<void(unsigned, Operation *, OpBuilder)> annotateFn,
1600+
IRMapping *clonedToSrcOpsMap) {
1601+
const unsigned numLoops = op.getNumLoops();
1602+
assert(llvm::none_of(unrollFactors, [](uint64_t f) { return f == 0; }) &&
1603+
"Expected positive unroll factors");
1604+
assert((!unrollFactors.empty() && (unrollFactors.size() <= numLoops)) &&
1605+
"Expected non-empty unroll factors of size <= to the number of loops");
1606+
1607+
// Bail out if no valid unroll factors were provided
1608+
if (llvm::all_of(unrollFactors, [](uint64_t f) { return f == 1; }))
1609+
return rewriter.notifyMatchFailure(
1610+
op, "Unrolling not applied if all factors are 1");
1611+
1612+
// Return if the loop body is empty.
1613+
if (llvm::hasSingleElement(op.getBody()->getOperations()))
1614+
return rewriter.notifyMatchFailure(op, "Cannot unroll an empty loop body");
1615+
1616+
// If the provided unroll factors do not cover all the loop dims, they are
1617+
// applied to the inner loop dimensions.
1618+
const unsigned firstLoopDimIdx = numLoops - unrollFactors.size();
1619+
1620+
// Make sure that the unroll factors divide the iteration space evenly
1621+
// TODO: Support unrolling loops with dynamic iteration spaces.
1622+
const llvm::SmallVector<int64_t> tripCounts = getConstLoopTripCounts(op);
1623+
if (tripCounts.empty())
1624+
return rewriter.notifyMatchFailure(
1625+
op, "Failed to compute constant trip counts for the loop. Note that "
1626+
"dynamic loop sizes are not supported.");
1627+
1628+
for (unsigned dimIdx = firstLoopDimIdx; dimIdx < numLoops; dimIdx++) {
1629+
const uint64_t unrollFactor = unrollFactors[dimIdx - firstLoopDimIdx];
1630+
if (tripCounts[dimIdx] % unrollFactor)
1631+
return rewriter.notifyMatchFailure(
1632+
op, "Unroll factors don't divide the iteration space evenly");
1633+
}
1634+
1635+
std::optional<SmallVector<OpFoldResult>> maybeFoldSteps = op.getLoopSteps();
1636+
if (!maybeFoldSteps)
1637+
return rewriter.notifyMatchFailure(op, "Failed to retrieve loop steps");
1638+
llvm::SmallVector<size_t> steps{};
1639+
for (auto step : *maybeFoldSteps)
1640+
steps.push_back(static_cast<size_t>(*getConstantIntValue(step)));
1641+
1642+
for (unsigned dimIdx = firstLoopDimIdx; dimIdx < numLoops; dimIdx++) {
1643+
const uint64_t unrollFactor = unrollFactors[dimIdx - firstLoopDimIdx];
1644+
if (unrollFactor == 1)
1645+
continue;
1646+
const size_t origStep = steps[dimIdx];
1647+
const int64_t newStep = origStep * unrollFactor;
1648+
IRMapping clonedToSrcOpsMap;
1649+
1650+
ValueRange iterArgs = ValueRange(op.getRegionIterArgs());
1651+
auto yieldedValues = op.getBody()->getTerminator()->getOperands();
1652+
1653+
generateUnrolledLoop(
1654+
op.getBody(), op.getInductionVars()[dimIdx], unrollFactor,
1655+
[&](unsigned i, Value iv, OpBuilder b) {
1656+
// iv' = iv + step * i;
1657+
const AffineExpr expr = b.getAffineDimExpr(0) + (origStep * i);
1658+
const auto map =
1659+
b.getDimIdentityMap().dropResult(0).insertResult(expr, 0);
1660+
return affine::AffineApplyOp::create(b, iv.getLoc(), map,
1661+
ValueRange{iv});
1662+
},
1663+
/*annotateFn*/ annotateFn, iterArgs, yieldedValues, &clonedToSrcOpsMap);
1664+
1665+
// Update loop step
1666+
auto prevInsertPoint = rewriter.saveInsertionPoint();
1667+
rewriter.setInsertionPoint(op);
1668+
op.getStepMutable()[dimIdx].assign(
1669+
arith::ConstantIndexOp::create(rewriter, op.getLoc(), newStep));
1670+
rewriter.restoreInsertionPoint(prevInsertPoint);
1671+
}
1672+
return op;
1673+
}

0 commit comments

Comments
 (0)