Skip to content

Commit 1db01a2

Browse files
[mlir][python] Add tests for gpu.launch(_func) ops
These are the tests I wish I could have referred to during development. Also corrected some small documentation mistakes.
1 parent d4b1ab7 commit 1db01a2

File tree

3 files changed: +100 additions, −3 deletions

mlir/docs/Dialects/GPU.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ func.func @main() {
121121
gpu.launch
122122
blocks(%0, %1, %2) in (%3 = %c1, %4 = %c1, %5 = %c1)
123123
threads(%6, %7, %8) in (%9 = %c2, %10 = %c1, %11 = %c1) {
124-
gpu.printf "Hello from %d\n" %6 : index
124+
gpu.printf "Hello from %d\n", %6 : index
125125
gpu.terminator
126126
}
127127
return

mlir/include/mlir/Dialect/GPU/IR/GPUOps.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -584,7 +584,7 @@ def GPU_DynamicSharedMemoryOp : GPU_Op<"dynamic_shared_memory", [Pure]>
584584
This operation provides a memref pointer to the start of dynamic shared
585585
memory, often referred to as workgroup memory. It's important to note that
586586
this dynamic shared memory needs to be allocated at kernel launch. One can
587-
conveniently utilize `the dynamic_shared_memory_size` parameter of
587+
conveniently utilize the `dynamic_shared_memory_size` parameter of
588588
`gpu.launch` for this purpose.
589589

590590
Examples:

mlir/test/python/dialects/gpu/dialect.py

Lines changed: 98 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22

33
from mlir.ir import *
44
import mlir.ir as ir
5-
import mlir.dialects.gpu as gpu
5+
from mlir.dialects import gpu, func, arith, math
6+
from mlir.extras import types as T
67
import mlir.dialects.gpu.passes
78
from mlir.passmanager import *
89

@@ -157,3 +158,99 @@ def builder(func: gpu.GPUFuncOp) -> None:
157158
# CHECK: %[[VAL_0:.*]] = gpu.global_id x
158159
# CHECK: gpu.return
159160
# CHECK: }
161+
162+
# CHECK-LABEL: testGPULaunchFuncOp
163+
@run
164+
def testGPULaunchFuncOp():
165+
module = Module.create()
166+
167+
module.operation.attributes["gpu.container_module"] = UnitAttr.get()
168+
with InsertionPoint(module.body):
169+
gpu_module = gpu.GPUModuleOp("gpu_module")
170+
block = gpu_module.bodyRegion.blocks.append()
171+
172+
with InsertionPoint(block):
173+
gpu_func = gpu.GPUFuncOp(
174+
FunctionType.get([], []),
175+
"kernel",
176+
body_builder=lambda func: gpu.return_([]),
177+
kernel=True,
178+
)
179+
180+
with InsertionPoint(module.body):
181+
host = func.FuncOp(type=FunctionType.get([], []), name="host")
182+
183+
with InsertionPoint(host.add_entry_block()):
184+
c1 = arith.constant(T.index(), 1)
185+
grid_sizes = [c1] * 3
186+
block_sizes = [c1] * 3
187+
sym_ref = SymbolRefAttr.get([gpu_module.sym_name.value, gpu_func.name.value])
188+
token_type = Type.parse("!gpu.async.token")
189+
token = gpu.wait(async_token=token_type, async_dependencies=[])
190+
token = gpu.launch_func(
191+
async_token=token_type,
192+
async_dependencies=[token],
193+
kernel=sym_ref,
194+
grid_size_x=grid_sizes[0],
195+
grid_size_y=grid_sizes[1],
196+
grid_size_z=grid_sizes[2],
197+
block_size_x=block_sizes[0],
198+
block_size_y=block_sizes[1],
199+
block_size_z=block_sizes[2],
200+
kernel_operands=[],
201+
)
202+
gpu.wait(async_token=None, async_dependencies=[token])
203+
func.ReturnOp([])
204+
205+
print(module)
206+
207+
# CHECK-LABEL: gpu.module @gpu_module {
208+
# CHECK: gpu.func @kernel() kernel {
209+
# CHECK: gpu.return
210+
# CHECK: }
211+
# CHECK: }
212+
213+
# CHECK-LABEL: func.func @host() {
214+
# CHECK: %[[CONSTANT_0:.*]] = arith.constant 1 : index
215+
# CHECK: %[[WAIT_0:.*]] = gpu.wait async
216+
# CHECK: %[[LAUNCH_FUNC_0:.*]] = gpu.launch_func async {{\[}}%[[WAIT_0]]] @gpu_module::@kernel blocks in (%[[CONSTANT_0]], %[[CONSTANT_0]], %[[CONSTANT_0]]) threads in (%[[CONSTANT_0]], %[[CONSTANT_0]], %[[CONSTANT_0]])
217+
# CHECK: gpu.wait {{\[}}%[[LAUNCH_FUNC_0]]]
218+
# CHECK: return
219+
# CHECK: }
220+
221+
222+
# CHECK-LABEL: testGPULaunchOp
223+
@run
224+
def testGPULaunchOp():
225+
module = Module.create()
226+
227+
with InsertionPoint(module.body):
228+
host = func.FuncOp(type=FunctionType.get([T.f32()], []), name="gpu_printf")
229+
230+
entry_block = host.add_entry_block()
231+
with InsertionPoint(entry_block):
232+
c1 = arith.constant(T.index(), 1)
233+
234+
launch = gpu.launch(None, [], c1, c1, c1, c1, c1, c1)
235+
launch_block = launch.regions[0].blocks.append()
236+
for _ in range(12):
237+
launch_block.add_argument(T.index(), Location.unknown())
238+
239+
with InsertionPoint(launch_block):
240+
gpu.printf("%f", [entry_block.arguments[0]])
241+
gpu.terminator()
242+
243+
with InsertionPoint(entry_block):
244+
func.ReturnOp([])
245+
246+
print(module)
247+
248+
# CHECK-LABEL: func.func @gpu_printf(
249+
# CHECK-SAME: %[[ARG0:.*]]: f32) {
250+
# CHECK: %[[CONSTANT_0:.*]] = arith.constant 1 : index
251+
# CHECK: gpu.launch blocks(%[[VAL_0:.*]], %[[VAL_1:.*]], %[[VAL_2:.*]]) in (%[[VAL_3:.*]] = %[[CONSTANT_0]], %[[VAL_4:.*]] = %[[CONSTANT_0]], %[[VAL_5:.*]] = %[[CONSTANT_0]]) threads(%[[VAL_6:.*]], %[[VAL_7:.*]], %[[VAL_8:.*]]) in (%[[VAL_9:.*]] = %[[CONSTANT_0]], %[[VAL_10:.*]] = %[[CONSTANT_0]], %[[VAL_11:.*]] = %[[CONSTANT_0]]) {
252+
# CHECK: gpu.printf "%[[VAL_12:.*]]", %[[ARG0]] : f32
253+
# CHECK: gpu.terminator
254+
# CHECK: }
255+
# CHECK: return
256+
# CHECK: }

Comments (0)