Skip to content

Commit 83a48ce

Browse files
committed
[MLIR] Create GPU utils library & move distribution utils
1 parent 3d6b2d4 commit 83a48ce

File tree

13 files changed

+230
-145
lines changed

13 files changed

+230
-145
lines changed

mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
#ifndef MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_
99
#define MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_
1010

11-
#include "mlir/Dialect/GPU/Transforms/Utils.h"
11+
#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
1212
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
1313
#include "mlir/IR/Builders.h"
1414
#include "mlir/IR/Types.h"

mlir/include/mlir/Dialect/GPU/Transforms/Passes.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
#ifndef MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_
1414
#define MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_
1515

16-
#include "Utils.h"
16+
#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
1717
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
1818
#include "mlir/IR/PatternMatch.h"
1919
#include "mlir/Pass/Pass.h"
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
//===- DistributionUtils.h - Distribution Utilities -------------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef MLIR_DIALECT_GPU_UTILS_DISTRIBUTIONUTILS_H_
#define MLIR_DIALECT_GPU_UTILS_DISTRIBUTIONUTILS_H_

#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/IR/PatternMatch.h"

#include <functional>
#include <utility>

namespace mlir {
namespace gpu {
/// Return the first operand of the terminator of `warpOp` whose defining
/// operation satisfies the filter lambda `fn` and whose corresponding result
/// of `warpOp` is not dead (i.e. has at least one use). Returns nullptr when
/// no such operand exists.
OpOperand *getWarpResult(WarpExecuteOnLane0Op warpOp,
                         const std::function<bool(Operation *)> &fn);

/// Helper to create a new WarpExecuteOnLane0Op with different signature.
/// The new op is created right before `warpOp` with `newReturnTypes` as its
/// result types, the body of `warpOp` is moved into it, and the operands of
/// its terminator are replaced by `newYieldedValues`. `warpOp` itself is not
/// erased by this helper.
WarpExecuteOnLane0Op moveRegionToNewWarpOpAndReplaceReturns(
    RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp,
    ValueRange newYieldedValues, TypeRange newReturnTypes);

/// Helper to create a new WarpExecuteOnLane0Op region with extra outputs.
/// For every value in `newYieldedValues`, `indices` receives the index of the
/// result of the new op that carries it; a value that is already yielded by
/// `warpOp` reuses the existing result instead of adding a new one. `warpOp`
/// is replaced by the leading results of the new op.
WarpExecuteOnLane0Op moveRegionToNewWarpOpAndAppendReturns(
    RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp,
    ValueRange newYieldedValues, TypeRange newReturnTypes,
    llvm::SmallVector<size_t> &indices);

/// Helper to know if an op can be hoisted out of the region: all of its
/// operands must satisfy `definedOutside`, it must be free of memory effects,
/// and it must not have any regions.
bool canBeHoisted(Operation *op, function_ref<bool(Value)> definedOutside);

/// Delinearize the given `laneId` into multiple dimensions, where each
/// dimension's size is determined by `originalShape` and `distributedShape`
/// together. This function expects the total numbers of threads needed for
/// distribution is equal to `warpSize`. Returns true and updates
/// `delinearizedIds` if so. Returns false if a dimension of `originalShape`
/// is not a multiple of the matching dimension of `distributedShape`. When
/// both shapes are equal, `delinearizedIds` is cleared and true is returned.
bool delinearizeLaneId(OpBuilder &builder, Location loc,
                       ArrayRef<int64_t> originalShape,
                       ArrayRef<int64_t> distributedShape, int64_t warpSize,
                       Value laneId, SmallVectorImpl<Value> &delinearizedIds);

} // namespace gpu
} // namespace mlir

#endif // MLIR_DIALECT_GPU_UTILS_DISTRIBUTIONUTILS_H_

mlir/lib/Dialect/GPU/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@ add_mlir_dialect_library(MLIRGPUTransforms
4040
Transforms/ShuffleRewriter.cpp
4141
Transforms/SPIRVAttachTarget.cpp
4242
Transforms/SubgroupReduceLowering.cpp
43-
Transforms/Utils.cpp
4443

4544
OBJECT
4645

@@ -59,6 +58,7 @@ add_mlir_dialect_library(MLIRGPUTransforms
5958
MLIRDataLayoutInterfaces
6059
MLIRExecutionEngineUtils
6160
MLIRGPUDialect
61+
MLIRGPUUtils
6262
MLIRIR
6363
MLIRIndexDialect
6464
MLIRLLVMDialect
@@ -76,3 +76,4 @@ add_mlir_dialect_library(MLIRGPUTransforms
7676

7777
add_subdirectory(TransformOps)
7878
add_subdirectory(Pipelines)
79+
add_subdirectory(Utils)

mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
#include "mlir/Dialect/Async/IR/Async.h"
1717
#include "mlir/Dialect/Func/IR/FuncOps.h"
1818
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
19-
#include "mlir/Dialect/GPU/Transforms/Utils.h"
19+
#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
2020
#include "mlir/IR/Builders.h"
2121
#include "mlir/IR/IRMapping.h"
2222
#include "mlir/IR/PatternMatch.h"

mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
#include "mlir/Dialect/DLTI/DLTI.h"
1919
#include "mlir/Dialect/Func/IR/FuncOps.h"
2020
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
21-
#include "mlir/Dialect/GPU/Transforms/Utils.h"
21+
#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
2222
#include "mlir/Dialect/MemRef/IR/MemRef.h"
2323
#include "mlir/IR/Builders.h"
2424
#include "mlir/IR/BuiltinAttributes.h"

mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
#include "mlir/Dialect/Arith/IR/Arith.h"
1414
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
1515
#include "mlir/Dialect/GPU/Transforms/Passes.h"
16-
#include "mlir/Dialect/GPU/Transforms/Utils.h"
16+
#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
1717
#include "mlir/Dialect/Vector/IR/VectorOps.h"
1818
#include "mlir/IR/BuiltinTypes.h"
1919
#include "mlir/IR/Location.h"
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Utility library for the GPU dialect. MLIRGPUTransforms links against it.
add_mlir_dialect_library(MLIRGPUUtils
  # Sources.
  Utils.cpp
  DistributionUtils.cpp

  # Public headers that belong to this library.
  ADDITIONAL_HEADER_DIRS
  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU/Utils

  # Libraries propagated to every consumer of MLIRGPUUtils.
  LINK_LIBS PUBLIC
  MLIRArithDialect
  MLIRAffineDialect
  MLIRGPUDialect
  MLIRSupport
  MLIRIR
  )
Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
//===- DistributionUtils.cpp - Distribution tools for GPUOps --------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// This file implements distribution utility methods.
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
#include "mlir/Dialect/GPU/Utils/DistributionUtils.h"
14+
#include "mlir/Dialect/Affine/IR/AffineOps.h"
15+
#include "mlir/Dialect/Arith/IR/Arith.h"
16+
#include "mlir/IR/Value.h"
17+
18+
#include <numeric>
19+
20+
using namespace mlir;
21+
using namespace mlir::gpu;
22+
23+
/// Builds a WarpExecuteOnLane0Op immediately before `warpOp` that has
/// `newReturnTypes` as result types but otherwise the same lane id, warp size,
/// arguments and block argument types. The body of `warpOp` is moved into the
/// new op and the operands of its terminator are replaced by
/// `newYieldedValues`. `warpOp` is left in place for the caller to replace or
/// erase. Returns the new op.
WarpExecuteOnLane0Op mlir::gpu::moveRegionToNewWarpOpAndReplaceReturns(
    RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp,
    ValueRange newYieldedValues, TypeRange newReturnTypes) {
  // Restore the caller's insertion point when leaving this function.
  OpBuilder::InsertionGuard guard(rewriter);
  rewriter.setInsertionPoint(warpOp);

  auto replacement = rewriter.create<WarpExecuteOnLane0Op>(
      warpOp.getLoc(), newReturnTypes, warpOp.getLaneid(), warpOp.getWarpSize(),
      warpOp.getArgs(), warpOp.getBody()->getArgumentTypes());

  // Move the original body in front of the block the new op starts out with,
  // then drop that now-superfluous block.
  Region &oldRegion = warpOp.getBodyRegion();
  Region &newRegion = replacement.getBodyRegion();
  Block *placeholder = &newRegion.front();
  rewriter.inlineRegionBefore(oldRegion, newRegion, newRegion.begin());
  rewriter.eraseBlock(placeholder);
  assert(replacement.getWarpRegion().hasOneBlock() &&
         "expected WarpOp with single block");

  // Rewire the terminator so that it yields the requested values.
  auto terminator = cast<gpu::YieldOp>(newRegion.front().getTerminator());
  rewriter.modifyOpInPlace(terminator, [&]() {
    terminator.getValuesMutable().assign(newYieldedValues);
  });
  return replacement;
}
48+
49+
/// Creates a new WarpExecuteOnLane0Op that yields everything `warpOp` yields
/// plus the values in `newYieldedValues` (typed by the matching entry of
/// `newReturnTypes`). For each new value, `indices` receives the index of the
/// result of the new op that carries it; a value that is already yielded
/// reuses the first existing position instead of creating a new result.
/// `warpOp` is replaced by the leading results of the new op, which is
/// returned.
WarpExecuteOnLane0Op mlir::gpu::moveRegionToNewWarpOpAndAppendReturns(
    RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp,
    ValueRange newYieldedValues, TypeRange newReturnTypes,
    llvm::SmallVector<size_t> &indices) {
  assert(newYieldedValues.size() == newReturnTypes.size() &&
         "expected one return type per new yielded value");

  SmallVector<Type> types(warpOp.getResultTypes().begin(),
                          warpOp.getResultTypes().end());
  auto yield = cast<gpu::YieldOp>(
      warpOp.getBodyRegion().getBlocks().begin()->getTerminator());

  // Keep the existing yield operands verbatim (including any value that is
  // yielded at more than one position) so that `yieldValues` and `types` stay
  // in lock-step with the results of `warpOp`.
  SmallVector<Value> yieldValues(yield.getOperands().begin(),
                                 yield.getOperands().end());

  for (auto [value, type] : llvm::zip(newYieldedValues, newReturnTypes)) {
    // If the value already exits the region, don't create a new output.
    auto *existing = llvm::find(yieldValues, value);
    if (existing != yieldValues.end()) {
      indices.push_back(static_cast<size_t>(existing - yieldValues.begin()));
      continue;
    }
    yieldValues.push_back(value);
    types.push_back(type);
    indices.push_back(yieldValues.size() - 1);
  }

  WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndReplaceReturns(
      rewriter, warpOp, yieldValues, types);
  // The leading results of the new op correspond one-to-one to the results of
  // the original op.
  rewriter.replaceOp(warpOp,
                     newWarpOp.getResults().take_front(warpOp.getNumResults()));
  return newWarpOp;
}
81+
82+
/// Returns true when `op` can be hoisted: every operand satisfies
/// `definedOutside`, the op is free of memory effects, and it has no regions.
bool mlir::gpu::canBeHoisted(Operation *op,
                             function_ref<bool(Value)> definedOutside) {
  if (!llvm::all_of(op->getOperands(), definedOutside))
    return false;
  if (!isMemoryEffectFree(op))
    return false;
  return op->getNumRegions() == 0;
}
87+
88+
/// Scans the operands of the terminator of `warpOp` in order and returns the
/// first one whose defining operation satisfies `fn` and whose corresponding
/// result of `warpOp` has at least one use. Returns nullptr if there is none.
OpOperand *
mlir::gpu::getWarpResult(WarpExecuteOnLane0Op warpOp,
                         const std::function<bool(Operation *)> &fn) {
  Block &body = warpOp.getBodyRegion().front();
  auto terminator = cast<gpu::YieldOp>(body.getTerminator());
  for (OpOperand &operand : terminator->getOpOperands()) {
    // Values without a defining op (e.g. block arguments) never match.
    Operation *producer = operand.get().getDefiningOp();
    if (!producer || !fn(producer))
      continue;
    // Skip results nobody uses.
    if (warpOp.getResult(operand.getOperandNumber()).use_empty())
      continue;
    return &operand;
  }
  return nullptr;
}
103+
104+
/// Delinearizes `laneId` into one id per dimension, where the number of lanes
/// along each dimension is `originalShape[i] / distributedShape[i]`.
///
/// Returns false without touching `delinearizedIds` if some dimension of
/// `originalShape` is not a multiple of the matching dimension of
/// `distributedShape`, or if the product of the per-dimension lane counts
/// differs from `warpSize`. Otherwise returns true with `delinearizedIds`
/// holding one value per dimension (or empty when both shapes are equal).
bool mlir::gpu::delinearizeLaneId(OpBuilder &builder, Location loc,
                                  ArrayRef<int64_t> originalShape,
                                  ArrayRef<int64_t> distributedShape,
                                  int64_t warpSize, Value laneId,
                                  SmallVectorImpl<Value> &delinearizedIds) {
  // If the original shape and the distributed shape is the same, we don't
  // distribute at all--every thread is handling the whole. For such case, we
  // should not rely on lane IDs later. So just return an empty lane ID vector.
  if (originalShape == distributedShape) {
    delinearizedIds.clear();
    return true;
  }

  // Number of lanes needed along each dimension.
  SmallVector<int64_t> sizes;
  for (auto [large, small] : llvm::zip_equal(originalShape, distributedShape)) {
    if (large % small != 0)
      return false;
    sizes.push_back(large / small);
  }
  // The accumulator type of std::accumulate is the type of the initial value,
  // so it must be int64_t to avoid truncating the product to int.
  if (std::accumulate(sizes.begin(), sizes.end(), int64_t{1},
                      std::multiplies<int64_t>()) != warpSize)
    return false;

  AffineExpr s0, s1;
  bindSymbols(builder.getContext(), s0, s1);

  int64_t usedThreads = 1;

  // Dimensions that are never reached by the loop below keep this zero id.
  Value zero = builder.create<arith::ConstantIndexOp>(loc, 0);
  delinearizedIds.assign(sizes.size(), zero);

  // Walk the dimensions from innermost to outermost.
  for (int i = sizes.size() - 1; i >= 0; --i) {
    usedThreads *= sizes[i];
    if (usedThreads == warpSize) {
      // We've used up all available threads. Don't need to perform modulo
      // anymore. And we can stop the calculation for further dimensions.
      delinearizedIds[i] = laneId;
      break;
    }
    delinearizedIds[i] =
        affine::makeComposedAffineApply(builder, loc, s0 % sizes[i], {laneId});
    laneId = affine::makeComposedAffineApply(
        builder, loc, s0.floorDiv(usedThreads), {laneId});
  }
  return true;
}

0 commit comments

Comments
 (0)