
Commit 3159802

ashermancinelli authored and aokblast committed
[mlir][python] Add Pythonic wrappers for gpu ops (llvm#163883)
Add builders on the Python side that match the builders on the C++ side, add tests for launching GPU kernels and regions, and correct some small documentation mistakes. This reflects the API decisions already made in the func dialect's Python bindings and makes working with the GPU dialect's bindings more similar to the C++ interface.
1 parent 53d0c4c commit 3159802
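
For orientation, a minimal usage sketch of the new wrappers, mirroring the test added in this commit; the surrounding MLIR context, `gpu.module`, and host function setup are assumed:

    from mlir.dialects import gpu

    # Inside a host function body (insertion point assumed):
    token = gpu.wait()                    # gpu.wait async, returns a !gpu.async.token
    token = gpu.launch_func(
        async_dependencies=[token],
        kernel=["gpu_module", "kernel"],  # symbol path to the gpu.func kernel
        grid_size=(1, 1, 1),              # plain ints are wrapped in arith.constant index values
        block_size=(1, 1, 1),
        kernel_operands=[],
    )
    gpu.wait(async_dependencies=[token])  # synchronize on the returned async token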

File tree

4 files changed: +280, -4 lines


mlir/docs/Dialects/GPU.md

Lines changed: 1 addition & 1 deletion
@@ -121,7 +121,7 @@ func.func @main() {
   gpu.launch
       blocks(%0, %1, %2) in (%3 = %c1, %4 = %c1, %5 = %c1)
       threads(%6, %7, %8) in (%9 = %c2, %10 = %c1, %11 = %c1) {
-    gpu.printf "Hello from %d\n" %6 : index
+    gpu.printf "Hello from %d\n", %6 : index
     gpu.terminator
   }
   return
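
The corrected syntax matches what the new Python `printf` wrapper added in this commit emits; a minimal sketch, assuming an index-typed value `tid` (e.g. a launch region argument) is in scope:

    # Hypothetical `tid` is some index-typed Value.
    gpu.printf("Hello from %d\n", tid)
    # emits, e.g.: gpu.printf "Hello from %d\n", %tid : index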

mlir/include/mlir/Dialect/GPU/IR/GPUOps.td

Lines changed: 1 addition & 1 deletion
@@ -584,7 +584,7 @@ def GPU_DynamicSharedMemoryOp : GPU_Op<"dynamic_shared_memory", [Pure]>
     This operation provides a memref pointer to the start of dynamic shared
     memory, often referred to as workgroup memory. It's important to note that
     this dynamic shared memory needs to be allocated at kernel launch. One can
-    conveniently utilize `the dynamic_shared_memory_size` parameter of
+    conveniently utilize the `dynamic_shared_memory_size` parameter of
     `gpu.launch` for this purpose.

     Examples:
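
The new Python `LaunchFuncOp` wrapper (below) exposes this parameter as `dynamic_shared_memory_size`; a minimal sketch, assuming an i32 size value and the same module/kernel setup as in the tests:

    from mlir.dialects import gpu, arith
    from mlir.extras import types as T

    # Assumed: the dynamic shared memory size is an i32 SSA value (bytes).
    shmem_size = arith.constant(T.i32(), 4096)
    gpu.launch_func(
        kernel=["gpu_module", "kernel"],
        grid_size=(1, 1, 1),
        block_size=(1, 1, 1),
        kernel_operands=[],
        dynamic_shared_memory_size=shmem_size,
    )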

mlir/python/mlir/dialects/gpu/__init__.py

Lines changed: 183 additions & 1 deletion
@@ -6,7 +6,7 @@
 from .._gpu_ops_gen import _Dialect
 from .._gpu_enum_gen import *
 from ..._mlir_libs._mlirDialectsGPU import *
-from typing import Callable, Sequence, Union, Optional, List
+from typing import Any, Callable, Sequence, Tuple, Union, Optional, List
 
 try:
     from ...ir import (
@@ -21,15 +21,24 @@
         DictAttr,
         Attribute,
         DenseI32ArrayAttr,
+        Value,
     )
+    from ...extras.meta import region_op
+    from ...extras import types as T
+    from ..arith import constant, ConstantOp
     from .._ods_common import (
         get_default_loc_context as _get_default_loc_context,
         _cext as _ods_cext,
+        get_op_result_or_op_results,
     )
 except ImportError as e:
     raise RuntimeError("Error loading imports from extension module") from e
 
 
+def gpu_async_token():
+    return Type.parse("!gpu.async.token")
+
+
 @_ods_cext.register_operation(_Dialect, replace=True)
 class GPUFuncOp(GPUFuncOp):
     __doc__ = GPUFuncOp.__doc__
@@ -151,3 +160,176 @@ def entry_block(self) -> Block:
     @property
     def arguments(self) -> Sequence[Type]:
         return self.function_type.value.inputs
+
+
+def _convert_literal_to_constant(value: Union[int, ConstantOp, Value]) -> Value:
+    if isinstance(value, int):
+        return constant(T.index(), value)
+    elif isinstance(value, (ConstantOp, Value)):
+        return value
+    else:
+        raise ValueError(f"Invalid value: {value}")
+
+
+@_ods_cext.register_operation(_Dialect, replace=True)
+class LaunchFuncOp(LaunchFuncOp):
+    __doc__ = LaunchFuncOp.__doc__
+
+    def __init__(
+        self,
+        kernel: List[str],
+        grid_size: Tuple[Any, Any, Any],
+        block_size: Tuple[Any, Any, Any],
+        kernel_operands: Optional[List[Value]] = None,
+        async_dependencies: Optional[List[Value]] = None,
+        dynamic_shared_memory_size: Optional[Value] = None,
+        async_object=None,
+        *,
+        loc=None,
+        ip=None,
+    ):
+        if async_dependencies is None:
+            async_dependencies = []
+        async_token = None
+        if len(async_dependencies):
+            async_token = gpu_async_token()
+
+        grid_size_x, grid_size_y, grid_size_z = map(
+            _convert_literal_to_constant, grid_size
+        )
+        block_size_x, block_size_y, block_size_z = map(
+            _convert_literal_to_constant, block_size
+        )
+
+        super().__init__(
+            async_token,
+            async_dependencies,
+            kernel,
+            grid_size_x,
+            grid_size_y,
+            grid_size_z,
+            block_size_x,
+            block_size_y,
+            block_size_z,
+            kernel_operands,
+            dynamicSharedMemorySize=dynamic_shared_memory_size,
+            asyncObject=async_object,
+            loc=loc,
+            ip=ip,
+        )
+
+
+def launch_func(
+    kernel: List[str],
+    grid_size: Tuple[Any, Any, Any],
+    block_size: Tuple[Any, Any, Any],
+    kernel_operands: Optional[List[Value]] = None,
+    async_dependencies: Optional[List[Value]] = None,
+    dynamic_shared_memory_size: Optional[Value] = None,
+    async_object=None,
+    *,
+    loc=None,
+    ip=None,
+) -> Union[Value, List[Value], LaunchFuncOp]:
+    op = LaunchFuncOp(
+        kernel=kernel,
+        grid_size=grid_size,
+        block_size=block_size,
+        kernel_operands=kernel_operands,
+        async_dependencies=async_dependencies,
+        dynamic_shared_memory_size=dynamic_shared_memory_size,
+        async_object=async_object,
+        loc=loc,
+        ip=ip,
+    )
+    results = op.results
+    if len(results) == 1:
+        return results[0]
+    elif len(results) > 1:
+        return results
+    else:
+        return op
+
+
+def wait(
+    async_dependencies: Optional[List[Value]] = None, *, loc=None, ip=None
+) -> Union[Value, List[Value], WaitOp]:
+    if async_dependencies is None:
+        async_dependencies = []
+    return get_op_result_or_op_results(
+        WaitOp(gpu_async_token(), async_dependencies, loc=loc, ip=ip)
+    )
+
+
+@_ods_cext.register_operation(_Dialect, replace=True)
+class LaunchOp(LaunchOp):
+    __doc__ = LaunchOp.__doc__
+
+    def __init__(
+        self,
+        grid_size: Tuple[Any, Any, Any],
+        block_size: Tuple[Any, Any, Any],
+        async_dependencies=None,
+        dynamic_shared_memory_size: Optional[Value] = None,
+        *,
+        loc=None,
+        ip=None,
+    ):
+        if async_dependencies is None:
+            async_dependencies = []
+        async_token = None
+        if len(async_dependencies):
+            async_token = gpu_async_token()
+        grid_size_x, grid_size_y, grid_size_z = map(
+            _convert_literal_to_constant, grid_size
+        )
+        block_size_x, block_size_y, block_size_z = map(
+            _convert_literal_to_constant, block_size
+        )
+
+        super().__init__(
+            async_token,
+            async_dependencies,
+            grid_size_x,
+            grid_size_y,
+            grid_size_z,
+            block_size_x,
+            block_size_y,
+            block_size_z,
+            dynamicSharedMemorySize=dynamic_shared_memory_size,
+            loc=loc,
+            ip=ip,
+        )
+        self.regions[0].blocks.append(*[T.index() for _ in range(12)])
+
+
+def launch_(
+    grid_size: Tuple[Any, Any, Any],
+    block_size: Tuple[Any, Any, Any],
+    async_dependencies=None,
+    dynamic_shared_memory_size: Optional[Value] = None,
+    *,
+    loc=None,
+    ip=None,
+):
+    grid_size = tuple(map(_convert_literal_to_constant, grid_size))
+    block_size = tuple(map(_convert_literal_to_constant, block_size))
+    launch_op = LaunchOp(
+        grid_size,
+        block_size,
+        async_dependencies,
+        dynamic_shared_memory_size,
+        loc=loc,
+        ip=ip,
+    )
+    return launch_op
+
+
+launch = region_op(launch_, terminator=lambda *_args: terminator())
+
+
+_printf = printf
+
+
+def printf(format, *args, loc=None, ip=None):
+    return _printf(format=format, args=args, loc=loc, ip=ip)
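
Because `launch` wraps `launch_` with `region_op`, the launch body can be supplied as a callable over the twelve index-typed block arguments of the launch region, and the `terminator` argument appends `gpu.terminator` automatically. A minimal sketch, assuming index-typed values `c1` for the grid and block sizes (as in the test below):

    launch = gpu.launch((c1, c1, c1), (c1, c1, c1))
    launch(lambda *args: gpu.printf("%f", args[0]))  # args[0] is the first region argument (block id x)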

mlir/test/python/dialects/gpu/dialect.py

Lines changed: 95 additions & 1 deletion
@@ -2,7 +2,8 @@
 
 from mlir.ir import *
 import mlir.ir as ir
-import mlir.dialects.gpu as gpu
+from mlir.dialects import gpu, func, arith, math
+from mlir.extras import types as T
 import mlir.dialects.gpu.passes
 from mlir.passmanager import *
 
@@ -157,3 +158,96 @@ def builder(func: gpu.GPUFuncOp) -> None:
     # CHECK: %[[VAL_0:.*]] = gpu.global_id x
     # CHECK: gpu.return
     # CHECK: }
+
+
+# CHECK-LABEL: testGPULaunchFuncOp
+@run
+def testGPULaunchFuncOp():
+    module = Module.create()
+
+    module.operation.attributes["gpu.container_module"] = UnitAttr.get()
+    with InsertionPoint(module.body):
+        gpu_module = gpu.GPUModuleOp("gpu_module")
+        block = gpu_module.bodyRegion.blocks.append()
+
+    with InsertionPoint(block):
+        gpu_func = gpu.GPUFuncOp(
+            FunctionType.get([], []),
+            "kernel",
+            body_builder=lambda func: gpu.return_([]),
+            kernel=True,
+        )
+
+    with InsertionPoint(module.body):
+        host = func.FuncOp(type=FunctionType.get([], []), name="host")
+
+    with InsertionPoint(host.add_entry_block()):
+        c1 = arith.constant(T.index(), 1)
+        grid_sizes = (1, 1, 1)
+        block_sizes = (1, 1, 1)
+        token = gpu.wait()
+        token = gpu.launch_func(
+            async_dependencies=[token],
+            kernel=[gpu_module.sym_name.value, gpu_func.name.value],
+            grid_size=grid_sizes,
+            block_size=block_sizes,
+            kernel_operands=[],
+        )
+        gpu.wait(async_dependencies=[token])
+        func.ReturnOp([])
+
+    print(module)
+
+    # CHECK-LABEL: gpu.module @gpu_module {
+    # CHECK: gpu.func @kernel() kernel {
+    # CHECK: gpu.return
+    # CHECK: }
+    # CHECK: }
+
+    # CHECK-LABEL: func.func @host() {
+    # CHECK: %[[CONSTANT_0:.*]] = arith.constant 1 : index
+    # CHECK: %[[WAIT_0:.*]] = gpu.wait async
+    # CHECK: %[[CONSTANT_1:.*]] = arith.constant 1 : index
+    # CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index
+    # CHECK: %[[CONSTANT_3:.*]] = arith.constant 1 : index
+    # CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index
+    # CHECK: %[[CONSTANT_5:.*]] = arith.constant 1 : index
+    # CHECK: %[[CONSTANT_6:.*]] = arith.constant 1 : index
+    # CHECK: %[[LAUNCH_FUNC_0:.*]] = gpu.launch_func async {{\[}}%[[WAIT_0]]] @gpu_module::@kernel blocks in (%[[CONSTANT_1]], %[[CONSTANT_2]], %[[CONSTANT_3]]) threads in (%[[CONSTANT_4]], %[[CONSTANT_5]], %[[CONSTANT_6]])
+    # CHECK: %[[WAIT_1:.*]] = gpu.wait async {{\[}}%[[LAUNCH_FUNC_0]]]
+    # CHECK: return
+    # CHECK: }
+
+
+# CHECK-LABEL: testGPULaunchOp
+@run
+def testGPULaunchOp():
+    module = Module.create()
+
+    with InsertionPoint(module.body):
+        host = func.FuncOp(type=FunctionType.get([T.f32()], []), name="gpu_printf")
+
+    entry_block = host.add_entry_block()
+    with InsertionPoint(entry_block):
+        c1 = arith.constant(T.index(), 1)
+        grid_sizes = (c1, c1, c1)
+        block_sizes = (c1, c1, c1)
+
+        launch = gpu.launch(grid_sizes, block_sizes)
+
+        op = launch(lambda *args: gpu.printf("%f", args[0]))
+
+    with InsertionPoint(entry_block):
+        func.ReturnOp([])
+
+    print(module)
+
+    # CHECK-LABEL: func.func @gpu_printf(
+    # CHECK-SAME: %[[ARG0:.*]]: f32) {
+    # CHECK: %[[CONSTANT_0:.*]] = arith.constant 1 : index
+    # CHECK: gpu.launch blocks(%[[VAL_0:.*]], %[[VAL_1:.*]], %[[VAL_2:.*]]) in (%[[VAL_3:.*]] = %[[CONSTANT_0]], %[[VAL_4:.*]] = %[[CONSTANT_0]], %[[VAL_5:.*]] = %[[CONSTANT_0]]) threads(%[[VAL_6:.*]], %[[VAL_7:.*]], %[[VAL_8:.*]]) in (%[[VAL_9:.*]] = %[[CONSTANT_0]], %[[VAL_10:.*]] = %[[CONSTANT_0]], %[[VAL_11:.*]] = %[[CONSTANT_0]]) {
+    # CHECK: gpu.printf "%[[VAL_12:.*]]", %[[VAL_0]] : index
+    # CHECK: gpu.terminator
+    # CHECK: }
+    # CHECK: return
+    # CHECK: }
