
Commit ec5015f

get rid of template parameter
1 parent e5b53de commit ec5015f

3 files changed: +147 −164 lines changed

mlir/include/mlir/Dialect/GPU/Utils/DistributionUtils.h

Lines changed: 2 additions & 128 deletions
@@ -24,11 +24,11 @@ namespace gpu {
 /// region.
 void moveScalarUniformCode(gpu::WarpExecuteOnLane0Op op);
 
-template <typename T>
 struct WarpDistributionPattern : OpRewritePattern<WarpExecuteOnLane0Op> {
   using OpRewritePattern<WarpExecuteOnLane0Op>::OpRewritePattern;
   virtual LogicalResult
-  matchAndRewrite(T op, PatternRewriter &rewriter) const override = 0;
+  matchAndRewrite(WarpExecuteOnLane0Op op,
+                  PatternRewriter &rewriter) const override = 0;
 
 protected:
   /// Return a value yielded by `warpOp` which satisfies the filter lambda
@@ -60,132 +60,6 @@ struct WarpDistributionPattern : OpRewritePattern<WarpExecuteOnLane0Op> {
                         SmallVectorImpl<Value> &delinearizedIds);
 };
 
-template <typename T>
-WarpExecuteOnLane0Op
-WarpDistributionPattern<T>::moveRegionToNewWarpOpAndReplaceReturns(
-    RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp,
-    ValueRange newYieldedValues, TypeRange newReturnTypes) {
-  // Create a new op before the existing one, with the extra operands.
-  OpBuilder::InsertionGuard g(rewriter);
-  rewriter.setInsertionPoint(warpOp);
-  auto newWarpOp = rewriter.create<WarpExecuteOnLane0Op>(
-      warpOp.getLoc(), newReturnTypes, warpOp.getLaneid(), warpOp.getWarpSize(),
-      warpOp.getArgs(), warpOp.getBody()->getArgumentTypes());
-
-  Region &opBody = warpOp.getBodyRegion();
-  Region &newOpBody = newWarpOp.getBodyRegion();
-  Block &newOpFirstBlock = newOpBody.front();
-  rewriter.inlineRegionBefore(opBody, newOpBody, newOpBody.begin());
-  rewriter.eraseBlock(&newOpFirstBlock);
-  assert(newWarpOp.getWarpRegion().hasOneBlock() &&
-         "expected WarpOp with single block");
-
-  auto yield =
-      cast<gpu::YieldOp>(newOpBody.getBlocks().begin()->getTerminator());
-
-  rewriter.modifyOpInPlace(
-      yield, [&]() { yield.getValuesMutable().assign(newYieldedValues); });
-  return newWarpOp;
-}
-
-template <typename T>
-WarpExecuteOnLane0Op
-WarpDistributionPattern<T>::moveRegionToNewWarpOpAndAppendReturns(
-    RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp,
-    ValueRange newYieldedValues, TypeRange newReturnTypes,
-    llvm::SmallVector<size_t> &indices) {
-  SmallVector<Type> types(warpOp.getResultTypes().begin(),
-                          warpOp.getResultTypes().end());
-  auto yield = cast<gpu::YieldOp>(
-      warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
-  llvm::SmallSetVector<Value, 32> yieldValues(yield.getOperands().begin(),
-                                              yield.getOperands().end());
-  for (auto newRet : llvm::zip(newYieldedValues, newReturnTypes)) {
-    if (yieldValues.insert(std::get<0>(newRet))) {
-      types.push_back(std::get<1>(newRet));
-      indices.push_back(yieldValues.size() - 1);
-    } else {
-      // If the value already exits the region, don't create a new output.
-      for (auto [idx, yieldOperand] :
-           llvm::enumerate(yieldValues.getArrayRef())) {
-        if (yieldOperand == std::get<0>(newRet)) {
-          indices.push_back(idx);
-          break;
-        }
-      }
-    }
-  }
-  yieldValues.insert(newYieldedValues.begin(), newYieldedValues.end());
-  WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndReplaceReturns(
-      rewriter, warpOp, yieldValues.getArrayRef(), types);
-  rewriter.replaceOp(warpOp,
-                     newWarpOp.getResults().take_front(warpOp.getNumResults()));
-  return newWarpOp;
-}
-
-template <typename T>
-OpOperand *WarpDistributionPattern<T>::getWarpResult(
-    WarpExecuteOnLane0Op warpOp, const std::function<bool(Operation *)> &fn) {
-  auto yield = cast<gpu::YieldOp>(
-      warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
-  for (OpOperand &yieldOperand : yield->getOpOperands()) {
-    Value yieldValues = yieldOperand.get();
-    Operation *definedOp = yieldValues.getDefiningOp();
-    if (definedOp && fn(definedOp)) {
-      if (!warpOp.getResult(yieldOperand.getOperandNumber()).use_empty())
-        return &yieldOperand;
-    }
-  }
-  return {};
-}
-
-template <typename T>
-bool WarpDistributionPattern<T>::delinearizeLaneId(
-    OpBuilder &builder, Location loc, ArrayRef<int64_t> originalShape,
-    ArrayRef<int64_t> distributedShape, int64_t warpSize, Value laneId,
-    SmallVectorImpl<Value> &delinearizedIds) {
-  // If the original shape and the distributed shape are the same, we don't
-  // distribute at all--every thread handles the whole shape. In that case we
-  // should not rely on lane IDs later, so just return an empty lane ID vector.
-  if (originalShape == distributedShape) {
-    delinearizedIds.clear();
-    return true;
-  }
-
-  SmallVector<int64_t> sizes;
-  for (auto [large, small] : llvm::zip_equal(originalShape, distributedShape)) {
-    if (large % small != 0)
-      return false;
-    sizes.push_back(large / small);
-  }
-  if (std::accumulate(sizes.begin(), sizes.end(), 1,
-                      std::multiplies<int64_t>()) != warpSize)
-    return false;
-
-  AffineExpr s0, s1;
-  bindSymbols(builder.getContext(), s0, s1);
-
-  int64_t usedThreads = 1;
-
-  Value zero = builder.create<arith::ConstantIndexOp>(loc, 0);
-  delinearizedIds.assign(sizes.size(), zero);
-
-  for (int i = sizes.size() - 1; i >= 0; --i) {
-    usedThreads *= sizes[i];
-    if (usedThreads == warpSize) {
-      // We've used up all available threads; no need to perform a modulo
-      // anymore, and we can stop the calculation for further dimensions.
-      delinearizedIds[i] = laneId;
-      break;
-    }
-    delinearizedIds[i] =
-        affine::makeComposedAffineApply(builder, loc, s0 % sizes[i], {laneId});
-    laneId = affine::makeComposedAffineApply(
-        builder, loc, s0.floorDiv(usedThreads), {laneId});
-  }
-  return true;
-}
-
 } // namespace gpu
 } // namespace mlir
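With the template parameter gone, downstream patterns derive directly from the non-templated WarpDistributionPattern and always match WarpExecuteOnLane0Op. A minimal sketch of what such a pattern now looks like; the pattern name and its (empty) body are hypothetical, and only WarpDistributionPattern, WarpExecuteOnLane0Op, and the matchAndRewrite signature come from the diff above:

// Hypothetical example; illustrates the post-change base class only.
#include "mlir/Dialect/GPU/Utils/DistributionUtils.h"
#include "mlir/IR/PatternMatch.h"

namespace {
using namespace mlir;

struct ExampleWarpPattern : gpu::WarpDistributionPattern {
  using WarpDistributionPattern::WarpDistributionPattern;

  // The override now always takes the concrete op type instead of `T`.
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    // A real pattern would pick a yielded value via getWarpResult(...) and
    // rebuild the op with moveRegionToNewWarpOpAndAppendReturns(...).
    return failure();
  }
};
} // namespace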

mlir/lib/Dialect/GPU/Utils/DistributionUtils.cpp

Lines changed: 122 additions & 0 deletions
@@ -19,3 +19,125 @@
 
 using namespace mlir;
 using namespace mlir::gpu;
+
+WarpExecuteOnLane0Op
+WarpDistributionPattern::moveRegionToNewWarpOpAndReplaceReturns(
+    RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp,
+    ValueRange newYieldedValues, TypeRange newReturnTypes) {
+  // Create a new op before the existing one, with the extra operands.
+  OpBuilder::InsertionGuard g(rewriter);
+  rewriter.setInsertionPoint(warpOp);
+  auto newWarpOp = rewriter.create<WarpExecuteOnLane0Op>(
+      warpOp.getLoc(), newReturnTypes, warpOp.getLaneid(), warpOp.getWarpSize(),
+      warpOp.getArgs(), warpOp.getBody()->getArgumentTypes());
+
+  Region &opBody = warpOp.getBodyRegion();
+  Region &newOpBody = newWarpOp.getBodyRegion();
+  Block &newOpFirstBlock = newOpBody.front();
+  rewriter.inlineRegionBefore(opBody, newOpBody, newOpBody.begin());
+  rewriter.eraseBlock(&newOpFirstBlock);
+  assert(newWarpOp.getWarpRegion().hasOneBlock() &&
+         "expected WarpOp with single block");
+
+  auto yield =
+      cast<gpu::YieldOp>(newOpBody.getBlocks().begin()->getTerminator());
+
+  rewriter.modifyOpInPlace(
+      yield, [&]() { yield.getValuesMutable().assign(newYieldedValues); });
+  return newWarpOp;
+}
+
+WarpExecuteOnLane0Op
+WarpDistributionPattern::moveRegionToNewWarpOpAndAppendReturns(
+    RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp,
+    ValueRange newYieldedValues, TypeRange newReturnTypes,
+    llvm::SmallVector<size_t> &indices) {
+  SmallVector<Type> types(warpOp.getResultTypes().begin(),
+                          warpOp.getResultTypes().end());
+  auto yield = cast<gpu::YieldOp>(
+      warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
+  llvm::SmallSetVector<Value, 32> yieldValues(yield.getOperands().begin(),
+                                              yield.getOperands().end());
+  for (auto newRet : llvm::zip(newYieldedValues, newReturnTypes)) {
+    if (yieldValues.insert(std::get<0>(newRet))) {
+      types.push_back(std::get<1>(newRet));
+      indices.push_back(yieldValues.size() - 1);
+    } else {
+      // If the value already exits the region, don't create a new output.
+      for (auto [idx, yieldOperand] :
+           llvm::enumerate(yieldValues.getArrayRef())) {
+        if (yieldOperand == std::get<0>(newRet)) {
+          indices.push_back(idx);
+          break;
+        }
+      }
+    }
+  }
+  yieldValues.insert(newYieldedValues.begin(), newYieldedValues.end());
+  WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndReplaceReturns(
+      rewriter, warpOp, yieldValues.getArrayRef(), types);
+  rewriter.replaceOp(warpOp,
+                     newWarpOp.getResults().take_front(warpOp.getNumResults()));
+  return newWarpOp;
+}
+
+OpOperand *WarpDistributionPattern::getWarpResult(
+    WarpExecuteOnLane0Op warpOp, const std::function<bool(Operation *)> &fn) {
+  auto yield = cast<gpu::YieldOp>(
+      warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
+  for (OpOperand &yieldOperand : yield->getOpOperands()) {
+    Value yieldValues = yieldOperand.get();
+    Operation *definedOp = yieldValues.getDefiningOp();
+    if (definedOp && fn(definedOp)) {
+      if (!warpOp.getResult(yieldOperand.getOperandNumber()).use_empty())
+        return &yieldOperand;
+    }
+  }
+  return {};
+}
+
+bool WarpDistributionPattern::delinearizeLaneId(
+    OpBuilder &builder, Location loc, ArrayRef<int64_t> originalShape,
+    ArrayRef<int64_t> distributedShape, int64_t warpSize, Value laneId,
+    SmallVectorImpl<Value> &delinearizedIds) {
+  // If the original shape and the distributed shape are the same, we don't
+  // distribute at all--every thread handles the whole shape. In that case we
+  // should not rely on lane IDs later, so just return an empty lane ID vector.
+  if (originalShape == distributedShape) {
+    delinearizedIds.clear();
+    return true;
+  }
+
+  SmallVector<int64_t> sizes;
+  for (auto [large, small] : llvm::zip_equal(originalShape, distributedShape)) {
+    if (large % small != 0)
+      return false;
+    sizes.push_back(large / small);
+  }
+  if (std::accumulate(sizes.begin(), sizes.end(), 1,
+                      std::multiplies<int64_t>()) != warpSize)
+    return false;
+
+  AffineExpr s0, s1;
+  bindSymbols(builder.getContext(), s0, s1);
+
+  int64_t usedThreads = 1;
+
+  Value zero = builder.create<arith::ConstantIndexOp>(loc, 0);
+  delinearizedIds.assign(sizes.size(), zero);
+
+  for (int i = sizes.size() - 1; i >= 0; --i) {
+    usedThreads *= sizes[i];
+    if (usedThreads == warpSize) {
+      // We've used up all available threads; no need to perform a modulo
+      // anymore, and we can stop the calculation for further dimensions.
+      delinearizedIds[i] = laneId;
+      break;
+    }
+    delinearizedIds[i] =
+        affine::makeComposedAffineApply(builder, loc, s0 % sizes[i], {laneId});
+    laneId = affine::makeComposedAffineApply(
+        builder, loc, s0.floorDiv(usedThreads), {laneId});
+  }
+  return true;
+}
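To make the lane-ID delinearization added above easier to follow, here is a small standalone sketch of the same index arithmetic in plain C++ (the real implementation builds it as composed affine.apply ops). The shapes and warp size are made-up values chosen for illustration, not anything from the commit:

// Illustrative only: mirrors the loop in delinearizeLaneId with concrete,
// assumed shapes; the MLIR version emits affine maps instead of integers.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int64_t> originalShape = {4, 8};    // full per-warp shape
  std::vector<int64_t> distributedShape = {1, 2}; // per-lane shape
  int64_t warpSize = 16;

  // sizes[i] = number of lanes dimension i is split across: {4, 4} here.
  std::vector<int64_t> sizes;
  for (size_t i = 0; i < originalShape.size(); ++i)
    sizes.push_back(originalShape[i] / distributedShape[i]);

  for (int64_t lane = 0; lane < warpSize; ++lane) {
    std::vector<int64_t> ids(sizes.size(), 0);
    int64_t laneId = lane;
    int64_t usedThreads = 1;
    // Peel lane bits from the innermost dimension outwards, as the pass does.
    for (int i = (int)sizes.size() - 1; i >= 0; --i) {
      usedThreads *= sizes[i];
      if (usedThreads == warpSize) {
        ids[i] = laneId; // all lanes consumed; remaining id goes to this dim
        break;
      }
      ids[i] = laneId % sizes[i];            // matches s0 % sizes[i]
      laneId = laneId / usedThreads;         // matches s0.floorDiv(usedThreads)
    }
    std::printf("lane %2lld -> (%lld, %lld)\n", (long long)lane,
                (long long)ids[0], (long long)ids[1]);
  }
  return 0;
}

For these assumed shapes the program prints lane 5 -> (1, 1) and lane 15 -> (3, 3): the outer dimension gets laneId / 4 and the inner dimension laneId % 4, which is exactly the pair of affine maps the pass would attach to the distributed accesses.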
