
Commit 3159802

ashermancinelli authored and aokblast committed
[mlir][python] Add Pythonic wrappers for gpu ops (llvm#163883)
Add builders on the Python side that match the builders on the C++ side, add tests for launching GPU kernels and regions, and correct some small documentation mistakes. This reflects the API decisions already made in the func dialect's Python bindings and makes working with the GPU dialect's bindings more similar to the C++ interface.
1 parent 53d0c4c commit 3159802
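
For orientation, a minimal usage sketch of the new wrappers, mirroring the test added in this commit; the surrounding MLIR context, `gpu.module`, and host function setup are assumed:

    from mlir.dialects import gpu

    # Inside a host function body (insertion point assumed):
    token = gpu.wait()                    # gpu.wait async, returns a !gpu.async.token
    token = gpu.launch_func(
        async_dependencies=[token],
        kernel=["gpu_module", "kernel"],  # symbol path to the gpu.func kernel
        grid_size=(1, 1, 1),              # plain ints are wrapped in arith.constant index values
        block_size=(1, 1, 1),
        kernel_operands=[],
    )
    gpu.wait(async_dependencies=[token])  # synchronize on the returned async token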

File tree

4 files changed: +280, -4 lines


mlir/docs/Dialects/GPU.md

Lines changed: 1 addition & 1 deletion
@@ -121,7 +121,7 @@ func.func @main() {
   gpu.launch
       blocks(%0, %1, %2) in (%3 = %c1, %4 = %c1, %5 = %c1)
       threads(%6, %7, %8) in (%9 = %c2, %10 = %c1, %11 = %c1) {
-    gpu.printf "Hello from %d\n" %6 : index
+    gpu.printf "Hello from %d\n", %6 : index
     gpu.terminator
   }
   return
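
The corrected syntax matches what the new Python `printf` wrapper added in this commit emits; a minimal sketch, assuming an index-typed value `tid` (e.g. a launch region argument) is in scope:

    # Hypothetical `tid` is some index-typed Value.
    gpu.printf("Hello from %d\n", tid)
    # emits, e.g.: gpu.printf "Hello from %d\n", %tid : index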

mlir/include/mlir/Dialect/GPU/IR/GPUOps.td

Lines changed: 1 addition & 1 deletion
@@ -584,7 +584,7 @@ def GPU_DynamicSharedMemoryOp : GPU_Op<"dynamic_shared_memory", [Pure]>
     This operation provides a memref pointer to the start of dynamic shared
     memory, often referred to as workgroup memory. It's important to note that
     this dynamic shared memory needs to be allocated at kernel launch. One can
-    conveniently utilize `the dynamic_shared_memory_size` parameter of
+    conveniently utilize the `dynamic_shared_memory_size` parameter of
     `gpu.launch` for this purpose.

     Examples:
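
The new Python `LaunchFuncOp` wrapper (below) exposes this parameter as `dynamic_shared_memory_size`; a minimal sketch, assuming an i32 size value and the same module/kernel setup as in the tests:

    from mlir.dialects import gpu, arith
    from mlir.extras import types as T

    # Assumed: the dynamic shared memory size is an i32 SSA value (bytes).
    shmem_size = arith.constant(T.i32(), 4096)
    gpu.launch_func(
        kernel=["gpu_module", "kernel"],
        grid_size=(1, 1, 1),
        block_size=(1, 1, 1),
        kernel_operands=[],
        dynamic_shared_memory_size=shmem_size,
    )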

mlir/python/mlir/dialects/gpu/__init__.py

Lines changed: 183 additions & 1 deletion
@@ -6,7 +6,7 @@
 from .._gpu_ops_gen import _Dialect
 from .._gpu_enum_gen import *
 from ..._mlir_libs._mlirDialectsGPU import *
-from typing import Callable, Sequence, Union, Optional, List
+from typing import Any, Callable, Sequence, Tuple, Union, Optional, List
 
 try:
     from ...ir import (
@@ -21,15 +21,24 @@
         DictAttr,
         Attribute,
         DenseI32ArrayAttr,
+        Value,
     )
+    from ...extras.meta import region_op
+    from ...extras import types as T
+    from ..arith import constant, ConstantOp
     from .._ods_common import (
         get_default_loc_context as _get_default_loc_context,
         _cext as _ods_cext,
+        get_op_result_or_op_results,
     )
 except ImportError as e:
     raise RuntimeError("Error loading imports from extension module") from e
 
 
+def gpu_async_token():
+    return Type.parse("!gpu.async.token")
+
+
 @_ods_cext.register_operation(_Dialect, replace=True)
 class GPUFuncOp(GPUFuncOp):
     __doc__ = GPUFuncOp.__doc__
@@ -151,3 +160,176 @@ def entry_block(self) -> Block:
     @property
     def arguments(self) -> Sequence[Type]:
         return self.function_type.value.inputs
+
+
+def _convert_literal_to_constant(value: Union[int, ConstantOp, Value]) -> Value:
+    if isinstance(value, int):
+        return constant(T.index(), value)
+    elif isinstance(value, (ConstantOp, Value)):
+        return value
+    else:
+        raise ValueError(f"Invalid value: {value}")
+
+
+@_ods_cext.register_operation(_Dialect, replace=True)
+class LaunchFuncOp(LaunchFuncOp):
+    __doc__ = LaunchFuncOp.__doc__
+
+    def __init__(
+        self,
+        kernel: List[str],
+        grid_size: Tuple[Any, Any, Any],
+        block_size: Tuple[Any, Any, Any],
+        kernel_operands: Optional[List[Value]] = None,
+        async_dependencies: Optional[List[Value]] = None,
+        dynamic_shared_memory_size: Optional[Value] = None,
+        async_object=None,
+        *,
+        loc=None,
+        ip=None,
+    ):
+        if async_dependencies is None:
+            async_dependencies = []
+        async_token = None
+        if len(async_dependencies):
+            async_token = gpu_async_token()
+
+        grid_size_x, grid_size_y, grid_size_z = map(
+            _convert_literal_to_constant, grid_size
+        )
+        block_size_x, block_size_y, block_size_z = map(
+            _convert_literal_to_constant, block_size
+        )
+
+        super().__init__(
+            async_token,
+            async_dependencies,
+            kernel,
+            grid_size_x,
+            grid_size_y,
+            grid_size_z,
+            block_size_x,
+            block_size_y,
+            block_size_z,
+            kernel_operands,
+            dynamicSharedMemorySize=dynamic_shared_memory_size,
+            asyncObject=async_object,
+            loc=loc,
+            ip=ip,
+        )
+
+
+def launch_func(
+    kernel: List[str],
+    grid_size: Tuple[Any, Any, Any],
+    block_size: Tuple[Any, Any, Any],
+    kernel_operands: Optional[List[Value]] = None,
+    async_dependencies: Optional[List[Value]] = None,
+    dynamic_shared_memory_size: Optional[Value] = None,
+    async_object=None,
+    *,
+    loc=None,
+    ip=None,
+) -> Union[Value, List[Value], LaunchFuncOp]:
+    op = LaunchFuncOp(
+        kernel=kernel,
+        grid_size=grid_size,
+        block_size=block_size,
+        kernel_operands=kernel_operands,
+        async_dependencies=async_dependencies,
+        dynamic_shared_memory_size=dynamic_shared_memory_size,
+        async_object=async_object,
+        loc=loc,
+        ip=ip,
+    )
+    results = op.results
+    if len(results) == 1:
+        return results[0]
+    elif len(results) > 1:
+        return results
+    else:
+        return op
+
+
+def wait(
+    async_dependencies: Optional[List[Value]] = None, *, loc=None, ip=None
+) -> Union[Value, List[Value], WaitOp]:
+    if async_dependencies is None:
+        async_dependencies = []
+    return get_op_result_or_op_results(
+        WaitOp(gpu_async_token(), async_dependencies, loc=loc, ip=ip)
+    )
+
+
+@_ods_cext.register_operation(_Dialect, replace=True)
+class LaunchOp(LaunchOp):
+    __doc__ = LaunchOp.__doc__
+
+    def __init__(
+        self,
+        grid_size: Tuple[Any, Any, Any],
+        block_size: Tuple[Any, Any, Any],
+        async_dependencies=None,
+        dynamic_shared_memory_size: Optional[Value] = None,
+        *,
+        loc=None,
+        ip=None,
+    ):
+        if async_dependencies is None:
+            async_dependencies = []
+        async_token = None
+        if len(async_dependencies):
+            async_token = gpu_async_token()
+        grid_size_x, grid_size_y, grid_size_z = map(
+            _convert_literal_to_constant, grid_size
+        )
+        block_size_x, block_size_y, block_size_z = map(
+            _convert_literal_to_constant, block_size
+        )
+
+        super().__init__(
+            async_token,
+            async_dependencies,
+            grid_size_x,
+            grid_size_y,
+            grid_size_z,
+            block_size_x,
+            block_size_y,
+            block_size_z,
+            dynamicSharedMemorySize=dynamic_shared_memory_size,
+            loc=loc,
+            ip=ip,
+        )
+        self.regions[0].blocks.append(*[T.index() for _ in range(12)])
+
+
+def launch_(
+    grid_size: Tuple[Any, Any, Any],
+    block_size: Tuple[Any, Any, Any],
+    async_dependencies=None,
+    dynamic_shared_memory_size: Optional[Value] = None,
+    *,
+    loc=None,
+    ip=None,
+):
+    grid_size = tuple(map(_convert_literal_to_constant, grid_size))
+    block_size = tuple(map(_convert_literal_to_constant, block_size))
+    launch_op = LaunchOp(
+        grid_size,
+        block_size,
+        async_dependencies,
+        dynamic_shared_memory_size,
+        loc=loc,
+        ip=ip,
+    )
+    return launch_op
+
+
+launch = region_op(launch_, terminator=lambda *_args: terminator())
+
+
+_printf = printf
+
+
+def printf(format, *args, loc=None, ip=None):
+    return _printf(format=format, args=args, loc=loc, ip=ip)
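
Because `launch` wraps `launch_` with `region_op`, the launch body can be supplied as a callable over the twelve index-typed block arguments of the launch region, and the `terminator` argument appends `gpu.terminator` automatically. A minimal sketch, assuming index-typed values `c1` for the grid and block sizes (as in the test below):

    launch = gpu.launch((c1, c1, c1), (c1, c1, c1))
    launch(lambda *args: gpu.printf("%f", args[0]))  # args[0] is the first region argument (block id x)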

mlir/test/python/dialects/gpu/dialect.py

Lines changed: 95 additions & 1 deletion
@@ -2,7 +2,8 @@
 
 from mlir.ir import *
 import mlir.ir as ir
-import mlir.dialects.gpu as gpu
+from mlir.dialects import gpu, func, arith, math
+from mlir.extras import types as T
 import mlir.dialects.gpu.passes
 from mlir.passmanager import *
 
@@ -157,3 +158,96 @@ def builder(func: gpu.GPUFuncOp) -> None:
     # CHECK: %[[VAL_0:.*]] = gpu.global_id x
     # CHECK: gpu.return
     # CHECK: }
+
+
+# CHECK-LABEL: testGPULaunchFuncOp
+@run
+def testGPULaunchFuncOp():
+    module = Module.create()
+
+    module.operation.attributes["gpu.container_module"] = UnitAttr.get()
+    with InsertionPoint(module.body):
+        gpu_module = gpu.GPUModuleOp("gpu_module")
+        block = gpu_module.bodyRegion.blocks.append()
+
+    with InsertionPoint(block):
+        gpu_func = gpu.GPUFuncOp(
+            FunctionType.get([], []),
+            "kernel",
+            body_builder=lambda func: gpu.return_([]),
+            kernel=True,
+        )
+
+    with InsertionPoint(module.body):
+        host = func.FuncOp(type=FunctionType.get([], []), name="host")
+
+    with InsertionPoint(host.add_entry_block()):
+        c1 = arith.constant(T.index(), 1)
+        grid_sizes = (1, 1, 1)
+        block_sizes = (1, 1, 1)
+        token = gpu.wait()
+        token = gpu.launch_func(
+            async_dependencies=[token],
+            kernel=[gpu_module.sym_name.value, gpu_func.name.value],
+            grid_size=grid_sizes,
+            block_size=block_sizes,
+            kernel_operands=[],
+        )
+        gpu.wait(async_dependencies=[token])
+        func.ReturnOp([])
+
+    print(module)
+
+    # CHECK-LABEL: gpu.module @gpu_module {
+    # CHECK: gpu.func @kernel() kernel {
+    # CHECK: gpu.return
+    # CHECK: }
+    # CHECK: }
+
+    # CHECK-LABEL: func.func @host() {
+    # CHECK: %[[CONSTANT_0:.*]] = arith.constant 1 : index
+    # CHECK: %[[WAIT_0:.*]] = gpu.wait async
+    # CHECK: %[[CONSTANT_1:.*]] = arith.constant 1 : index
+    # CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index
+    # CHECK: %[[CONSTANT_3:.*]] = arith.constant 1 : index
+    # CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index
+    # CHECK: %[[CONSTANT_5:.*]] = arith.constant 1 : index
+    # CHECK: %[[CONSTANT_6:.*]] = arith.constant 1 : index
+    # CHECK: %[[LAUNCH_FUNC_0:.*]] = gpu.launch_func async {{\[}}%[[WAIT_0]]] @gpu_module::@kernel blocks in (%[[CONSTANT_1]], %[[CONSTANT_2]], %[[CONSTANT_3]]) threads in (%[[CONSTANT_4]], %[[CONSTANT_5]], %[[CONSTANT_6]])
+    # CHECK: %[[WAIT_1:.*]] = gpu.wait async {{\[}}%[[LAUNCH_FUNC_0]]]
+    # CHECK: return
+    # CHECK: }
+
+
+# CHECK-LABEL: testGPULaunchOp
+@run
+def testGPULaunchOp():
+    module = Module.create()
+
+    with InsertionPoint(module.body):
+        host = func.FuncOp(type=FunctionType.get([T.f32()], []), name="gpu_printf")
+
+    entry_block = host.add_entry_block()
+    with InsertionPoint(entry_block):
+        c1 = arith.constant(T.index(), 1)
+        grid_sizes = (c1, c1, c1)
+        block_sizes = (c1, c1, c1)
+
+        launch = gpu.launch(grid_sizes, block_sizes)
+
+        op = launch(lambda *args: gpu.printf("%f", args[0]))
+
+    with InsertionPoint(entry_block):
+        func.ReturnOp([])
+
+    print(module)
+
+    # CHECK-LABEL: func.func @gpu_printf(
+    # CHECK-SAME: %[[ARG0:.*]]: f32) {
+    # CHECK: %[[CONSTANT_0:.*]] = arith.constant 1 : index
+    # CHECK: gpu.launch blocks(%[[VAL_0:.*]], %[[VAL_1:.*]], %[[VAL_2:.*]]) in (%[[VAL_3:.*]] = %[[CONSTANT_0]], %[[VAL_4:.*]] = %[[CONSTANT_0]], %[[VAL_5:.*]] = %[[CONSTANT_0]]) threads(%[[VAL_6:.*]], %[[VAL_7:.*]], %[[VAL_8:.*]]) in (%[[VAL_9:.*]] = %[[CONSTANT_0]], %[[VAL_10:.*]] = %[[CONSTANT_0]], %[[VAL_11:.*]] = %[[CONSTANT_0]]) {
+    # CHECK: gpu.printf "%[[VAL_12:.*]]", %[[VAL_0]] : index
+    # CHECK: gpu.terminator
+    # CHECK: }
+    # CHECK: return
+    # CHECK: }
