Commit 89bb0ca

[mlir][transform] Create GPU transform dialect
This revision adds the GPU transform dialect. It also introduces the "transform.gpu" prefix for all ops of this dialect. MLIR already had two GPU transform ops in Linalg; this revision moves them into GPUTransformOps. The ops are as follows:

`transform.structured.map_nested_foreach_thread_to_gpu_blocks` -> `transform.gpu.map_foreach_to_blocks`
This op selects the outermost (top-level) `scf.foreach_thread` and parallelizes it across GPU blocks. It can also generate the `gpu.launch` op.

`transform.structured.map_nested_foreach_thread_to_gpu_threads` -> `transform.gpu.map_nested_foreach_to_threads`
This op parallelizes nested `scf.foreach_thread` ops that are inside a `gpu.launch` across GPU threads.

This change doesn't add new functionality; there is only some minor refactoring of the code.

Reviewed By: ftynse

Differential Revision: https://reviews.llvm.org/D134800
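As an illustrative sketch (not part of this commit), the two renamed ops might be chained in a transform script as follows. The `%arg0`/`%funcop`/`%gpu_launch` names, the `transform.structured.match` step, and the surrounding `transform.sequence` boilerplate are assumptions for illustration; the exact syntax may differ between MLIR revisions:

```mlir
transform.sequence failures(propagate) {
^bb1(%arg0: !pdl.operation):
  // Match the payload function containing the scf.foreach_thread loops.
  %funcop = transform.structured.match ops{["func.func"]} in %arg0
  // Map the top-level scf.foreach_thread to GPU blocks; generate_gpu_launch
  // also creates the enclosing gpu.launch op.
  %gpu_launch = transform.gpu.map_foreach_to_blocks %funcop {generate_gpu_launch}
  // Map the scf.foreach_thread ops nested in the launch to GPU threads.
  transform.gpu.map_nested_foreach_to_threads %gpu_launch {blockDim = [32, 4, 1]}
}
```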
1 parent: 491ac8f

13 files changed: +851 −572 lines changed
Lines changed: 1 addition & 0 deletions
@@ -1,2 +1,3 @@
 add_subdirectory(IR)
 add_subdirectory(Transforms)
+add_subdirectory(TransformOps)
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+set(LLVM_TARGET_DEFINITIONS GPUTransformOps.td)
+mlir_tablegen(GPUTransformOps.h.inc -gen-op-decls)
+mlir_tablegen(GPUTransformOps.cpp.inc -gen-op-defs)
+add_public_tablegen_target(MLIRGPUTransformOpsIncGen)
+
+add_mlir_doc(GPUTransformOps GPUTransformOps Dialects/ -gen-op-doc)
Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
+//===- GPUTransformOps.h - GPU transform ops --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_GPU_TRANSFORMOPS_GPUTRANSFORMOPS_H
+#define MLIR_DIALECT_GPU_TRANSFORMOPS_GPUTRANSFORMOPS_H
+
+#include "mlir/Dialect/PDL/IR/PDLTypes.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/IR/OpImplementation.h"
+#include "mlir/IR/PatternMatch.h"
+
+namespace mlir {
+namespace gpu {
+class GpuOp;
+} // namespace gpu
+} // namespace mlir
+
+//===----------------------------------------------------------------------===//
+// GPU Transform Operations
+//===----------------------------------------------------------------------===//
+
+#define GET_OP_CLASSES
+#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h.inc"
+
+namespace mlir {
+class DialectRegistry;
+namespace transform {
+namespace gpu {
+
+/// Searches `scf.foreach_thread` ops nested under `target` and maps each such
+/// op to GPU threads. Mapping is one-to-one and the induction variables of
+/// `scf.foreach_thread` are rewritten to `gpu.thread_id` according to the
+/// `thread_dim_mapping` attribute. Sibling `scf.foreach_thread` ops are
+/// supported, in which case the union of the number of threads is computed
+/// and may result in predication. Dynamic `scf.foreach_thread` trip counts
+/// are currently not supported. Dynamic block dim sizes are currently not
+/// supported.
+DiagnosedSilenceableFailure
+mapNestedForeachToThreadsImp(RewriterBase &rewriter, Operation *target,
+                             const SmallVectorImpl<int64_t> &blockDim,
+                             bool syncAfterDistribute,
+                             llvm::Optional<TransformOpInterface> transformOp);
+
+/// Maps the top-level `scf.foreach_thread` op to GPU thread blocks. Mapping
+/// is one-to-one and the induction variables of `scf.foreach_thread` are
+/// rewritten to `gpu.block_id` according to the `thread_dim_mapping`
+/// attribute. Dynamic `scf.foreach_thread` trip counts are currently not
+/// supported. Dynamic block dim sizes are currently not supported.
+DiagnosedSilenceableFailure mapForeachToBlocksImp(
+    RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp,
+    function_ref<void(RewriterBase &, scf::ForeachThreadOp,
+                      SmallVectorImpl<Value> &)>
+        blockIdGenerator,
+    SmallVectorImpl<int64_t> &gridDims, TransformOpInterface transformOp);
+
+/// Finds the top-level scf::ForeachThreadOp of the given target.
+DiagnosedSilenceableFailure
+findTopLevelForeachThreadOp(Operation *target,
+                            scf::ForeachThreadOp &topLevelForeachThreadOp,
+                            TransformOpInterface transformOp);
+
+} // namespace gpu
+} // namespace transform
+
+namespace gpu {
+void registerTransformDialectExtension(DialectRegistry &registry);
+} // namespace gpu
+} // namespace mlir
+
+#endif // MLIR_DIALECT_GPU_TRANSFORMOPS_GPUTRANSFORMOPS_H
Lines changed: 175 additions & 0 deletions
@@ -0,0 +1,175 @@
+//===- GPUTransformOps.td - GPU transform ops --------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef GPU_TRANSFORM_OPS
+#define GPU_TRANSFORM_OPS
+
+include "mlir/Dialect/Transform/IR/TransformDialect.td"
+include "mlir/Dialect/Transform/IR/TransformEffects.td"
+include "mlir/Dialect/Transform/IR/TransformInterfaces.td"
+include "mlir/Dialect/PDL/IR/PDLTypes.td"
+include "mlir/Interfaces/SideEffectInterfaces.td"
+include "mlir/IR/OpBase.td"
+
+def MapNestedForeachToThreads :
+  Op<Transform_Dialect, "gpu.map_nested_foreach_to_threads",
+    [FunctionalStyleTransformOpTrait,
+     MemoryEffectsOpInterface,
+     TransformEachOpTrait,
+     TransformOpInterface]> {
+  let description = [{
+    Targets the `gpu.launch` op and rewrites all `scf.foreach_thread` ops
+    nested in it to distributed `gpu.thread_id` values.
+
+    The operation searches for `scf.foreach_thread` ops nested under `target`
+    and maps each such op to GPU threads. Mapping is one-to-one and the
+    induction variables of `scf.foreach_thread` are rewritten to
+    `gpu.thread_id` according to the `thread_dim_mapping` attribute.
+
+    Sibling `scf.foreach_thread` ops are supported, in which case the union
+    of the number of threads is computed and may result in predication.
+
+    Multiple `scf.foreach_thread` ops are supported per `gpu.launch`, in
+    which case the maximum of all the thread counts is taken for the global
+    `gpu.thread_id`. If necessary, `scf.foreach_thread` ops that do not use
+    the whole thread range result in predicated computations.
+
+    Dynamic `scf.foreach_thread` trip counts are currently not supported.
+    Dynamic block dim sizes are currently not supported.
+
+    Only **bufferized** `scf.foreach_thread` ops are currently supported.
+    Only `scf.foreach_thread` ops distributed to **at most 3 dimensions** are
+    currently supported.
+
+    Barriers are inserted after each `scf.foreach_thread` op for now.
+
+    The operation alters the block size of the given `gpu.launch` using the
+    `blockDim` argument.
+
+    #### Return modes:
+
+    This operation ignores non-`gpu.launch` ops and drops them from the
+    return.
+
+    If any `scf.foreach_thread` with tensors is found, the transform
+    definitely fails.
+
+    If all the `scf.foreach_thread` operations contained within the LaunchOp
+    referred to by the `target` PDLOperation lower to GPU properly, the
+    transform succeeds. Otherwise the transform definitely fails.
+
+    The returned handle points to the same LaunchOp operand, consuming it and
+    producing a new SSA value to satisfy chaining and linearity of the IR
+    properties.
+
+    #### Example:
+
+    ```
+    gpu.launch blocks(%bx, %by, %bz) in (%x = %0, %y = %1, %z = %2)
+               threads(%tx, %ty, %tz) in (%tx = %3, %ty = %4, %tz = %5) {
+      scf.foreach_thread (%i, %j) in (7, 9) {
+        ... // body 1
+      } {thread_dim_mapping = [1, 0, 2]}
+      scf.foreach_thread (%i) in (12) {
+        ... // body 2
+      }
+      gpu.terminator
+    }
+    ```
+
+    is translated to:
+
+    ```
+    %bdimX = arith.constant 12 : index
+    %bdimY = arith.constant 9 : index
+    gpu.launch blocks(%bx, %by, %bz) in (%x = %0, %y = %1, %z = %2)
+               threads(%tx, %ty, %tz) in (%tx = %bdimX, %ty = %bdimY, %tz = %5) {
+      if (threadIdx.x < 9 && threadIdx.y < 7) {
+        ... // body 1
+      }
+      gpu.barrier
+      if (threadIdx.y < 1) {
+        ... // body 2
+      }
+      gpu.barrier
+      gpu.terminator
+    }
+    ```
+  }];
+
+  let arguments = (ins PDL_Operation:$target,
+                   DefaultValuedAttr<I64ArrayAttr, "{}">:$blockDim,
+                   DefaultValuedAttr<BoolAttr, "true">:$syncAfterDistribute);
+  let results = (outs PDL_Operation:$result);
+
+  let assemblyFormat = "$target attr-dict";
+  let extraClassDeclaration = [{
+    ::mlir::DiagnosedSilenceableFailure applyToOne(
+        ::mlir::Operation *target,
+        ::llvm::SmallVectorImpl<::mlir::Operation *> &results,
+        ::mlir::transform::TransformState &state);
+  }];
+}
+
+
+def MapForeachToBlocks :
+  Op<Transform_Dialect, "gpu.map_foreach_to_blocks",
+    [FunctionalStyleTransformOpTrait,
+     MemoryEffectsOpInterface,
+     TransformOpInterface,
+     TransformEachOpTrait]> {
+  let description = [{
+    Targets the `gpu.launch` op and rewrites the top-level
+    `scf.foreach_thread` to distributed `gpu.block_id` values. If the
+    `generate_gpu_launch` attribute is set, the op first generates a
+    `gpu.launch` and moves the top-level `scf.foreach_thread` inside it.
+
+    The operation searches for top-level `scf.foreach_thread` ops under
+    `gpu.launch` and maps each such op to GPU blocks. Mapping is one-to-one
+    and the induction variables of `scf.foreach_thread` are rewritten to
+    `gpu.block_id` according to the `thread_dim_mapping` attribute.
+
+    Dynamic `scf.foreach_thread` trip counts are currently not supported.
+    Dynamic block dim sizes are currently not supported.
+
+    Only **bufferized** `scf.foreach_thread` ops are currently supported.
+    Only `scf.foreach_thread` ops distributed to **at most 3 dimensions** are
+    currently supported.
+
+    The operation alters the grid size of the given `gpu.launch` using the
+    `gridDim` argument.
+
+    #### Return modes:
+
+    This operation ignores non-`gpu.launch` ops and drops them from the
+    return.
+
+    If any `scf.foreach_thread` with tensors is found, the transform
+    definitely fails.
+
+    If all the `scf.foreach_thread` operations contained within the LaunchOp
+    referred to by the `target` PDLOperation lower to GPU properly, the
+    transform succeeds. Otherwise the transform definitely fails.
+
+    The returned handle points to the same LaunchOp operand, consuming it and
+    producing a new SSA value to satisfy chaining and linearity of the IR
+    properties.
+  }];
+
+  let arguments = (ins PDL_Operation:$target,
+                   DefaultValuedAttr<I64ArrayAttr, "{}">:$gridDim,
+                   UnitAttr:$generate_gpu_launch);
+  let results = (outs PDL_Operation:$result);
+
+  let assemblyFormat = "$target attr-dict";
+  let extraClassDeclaration = [{
+    ::mlir::DiagnosedSilenceableFailure applyToOne(
+        ::mlir::Operation *target,
+        ::llvm::SmallVectorImpl<::mlir::Operation *> &results,
+        ::mlir::transform::TransformState &state);
+  }];
+}
+
+#endif // GPU_TRANSFORM_OPS
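As a hypothetical usage sketch for `MapNestedForeachToThreads` (not part of the diff), matching the IR example in its description; the handle names and the `transform.structured.match` step are illustrative assumptions:

```mlir
// Inside a transform.sequence with block argument %arg0: match the payload
// gpu.launch, then set its block size to 12x9x1 and distribute both
// scf.foreach_thread bodies onto threads, with barriers in between.
%launch = transform.structured.match ops{["gpu.launch"]} in %arg0
%threaded = transform.gpu.map_nested_foreach_to_threads %launch
    {blockDim = [12, 9, 1], syncAfterDistribute = true}
```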
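`MapForeachToBlocks` has no IR example of its own, so here is a schematic before/after sketch in the same style as the threads example above; the constants, the thread dimensions of the generated launch, and the loop bodies are elided:

```mlir
// Before: a top-level, bufferized scf.foreach_thread.
scf.foreach_thread (%i, %j) in (7, 9) {
  ... // body
}

// After transform.gpu.map_foreach_to_blocks with generate_gpu_launch: the
// body moves into a generated gpu.launch and the induction variables are
// rewritten to gpu.block_id values.
gpu.launch blocks(%bx, %by, %bz) in (%x = %c7, %y = %c9, %z = %c1) ... {
  %i = gpu.block_id x
  %j = gpu.block_id y
  ... // body
  gpu.terminator
}
```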
