mlir/docs/Dialects/GPU.md (2 changes: 1 addition & 1 deletion)
@@ -121,7 +121,7 @@ func.func @main() {
gpu.launch
blocks(%0, %1, %2) in (%3 = %c1, %4 = %c1, %5 = %c1)
threads(%6, %7, %8) in (%9 = %c2, %10 = %c1, %11 = %c1) {
gpu.printf "Hello from %d\n" %6 : index
gpu.printf "Hello from %d\n", %6 : index
gpu.terminator
}
return
mlir/include/mlir/Dialect/GPU/IR/GPUOps.td (2 changes: 1 addition & 1 deletion)
@@ -584,7 +584,7 @@ def GPU_DynamicSharedMemoryOp : GPU_Op<"dynamic_shared_memory", [Pure]>
This operation provides a memref pointer to the start of dynamic shared
memory, often referred to as workgroup memory. It's important to note that
this dynamic shared memory needs to be allocated at kernel launch. One can
-conveniently utilize `the dynamic_shared_memory_size` parameter of
+conveniently utilize the `dynamic_shared_memory_size` parameter of
`gpu.launch` for this purpose.

Examples:
mlir/test/python/dialects/gpu/dialect.py (99 changes: 98 additions & 1 deletion)
@@ -2,7 +2,8 @@

from mlir.ir import *
import mlir.ir as ir
-import mlir.dialects.gpu as gpu
+from mlir.dialects import gpu, func, arith, math
+from mlir.extras import types as T
import mlir.dialects.gpu.passes
from mlir.passmanager import *

@@ -157,3 +158,99 @@ def builder(func: gpu.GPUFuncOp) -> None:
# CHECK: %[[VAL_0:.*]] = gpu.global_id x
# CHECK: gpu.return
# CHECK: }

# CHECK-LABEL: testGPULaunchFuncOp
@run
def testGPULaunchFuncOp():
module = Module.create()

module.operation.attributes["gpu.container_module"] = UnitAttr.get()
with InsertionPoint(module.body):
gpu_module = gpu.GPUModuleOp("gpu_module")
block = gpu_module.bodyRegion.blocks.append()

with InsertionPoint(block):
gpu_func = gpu.GPUFuncOp(
FunctionType.get([], []),
"kernel",
body_builder=lambda func: gpu.return_([]),
kernel=True,
)

with InsertionPoint(module.body):
host = func.FuncOp(type=FunctionType.get([], []), name="host")

with InsertionPoint(host.add_entry_block()):
c1 = arith.constant(T.index(), 1)
grid_sizes = [c1] * 3
block_sizes = [c1] * 3
sym_ref = SymbolRefAttr.get([gpu_module.sym_name.value, gpu_func.name.value])
token_type = Type.parse("!gpu.async.token")
token = gpu.wait(async_token=token_type, async_dependencies=[])
token = gpu.launch_func(
makslevental (Contributor), Oct 16, 2025:
you can just grab this https://github.com/makslevental/mlir-python-extras/blob/main/mlir/extras/dialects/ext/gpu.py#L339-L379 (which supports exactly what you're saying - 3-tuples), put it in mlir/dialects/gpu.py (along with the standard register_operation thing) and then just below wrap it in launch_func (thereby shadowing auto-generated launch_func).
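
A minimal sketch of that shadowing approach, for reference (the wrapper name, its positional order, and the `_gpu_ops_gen` import are assumptions; the keywords mirror the auto-generated builder used in the test below):

```python
# Hypothetical contents for mlir/dialects/gpu.py: a launch_func wrapper that
# accepts 3-tuples for grid/block sizes and shadows the generated builder.
from ._gpu_ops_gen import *  # re-export the auto-generated ops and builders
from ._gpu_ops_gen import launch_func as _launch_func_gen  # assumed generated name


def launch_func(
    kernel,
    grid_size,             # 3-tuple (x, y, z)
    block_size,            # 3-tuple (x, y, z)
    kernel_operands=(),
    async_token=None,
    async_dependencies=(),
):
    gx, gy, gz = grid_size
    bx, by, bz = block_size
    # Forward to the generated builder, which takes one keyword per axis.
    return _launch_func_gen(
        async_token=async_token,
        async_dependencies=list(async_dependencies),
        kernel=kernel,
        grid_size_x=gx,
        grid_size_y=gy,
        grid_size_z=gz,
        block_size_x=bx,
        block_size_y=by,
        block_size_z=bz,
        kernel_operands=list(kernel_operands),
    )
```

With a wrapper along these lines, the call in the test could pass `grid_sizes` and `block_sizes` directly as 3-tuples instead of unpacking each axis by hand.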

Contributor Author:
That all looks great to me, but are we worried about forcing folks to update their code if it depends on the Python bindings?

Contributor:
what do you mean? oh you're saying since this meaningfully changes the signature of both these existing APIs (LaunchFuncOp and launch_func) relative to the auto-generated ones?

Contributor Author:
Right - anyone using either api will need to update their code. That's annoying! But maybe worth it. Just making sure 😄

makslevental (Contributor), Oct 17, 2025:
The python APIs aren't stable (ie we make no stability guarantees). So basically this same "breakage" occurs whenever we add one of these nicer builders. Also there's a simple "migration path": people can just import the generated original APIs directly from _gpu_ops_gen if they really want to keep using the old (auto-generated) APIs.
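
For example, a one-line opt-out (assuming the builder keeps its generated name in that module):

```python
# Keep using the auto-generated builder instead of the new wrapper.
from mlir.dialects._gpu_ops_gen import launch_func
```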

Contributor Author:
Since users always have an easy way to get their old bindings back, I feel better about adding the Python-extras builders. I'll pull in the builder you linked, and I'll be ready to merge if I get an OK from Mehdi or Guray. Thanks!

Member:
I agree with @makslevental about the stability guarantees. MLIR is more progressive compared to LLVM — there’s no API stability guarantee as long as you can migrate to something better.

Contributor Author:
Good to know 😃

async_token=token_type,
async_dependencies=[token],
kernel=sym_ref,
grid_size_x=grid_sizes[0],
grid_size_y=grid_sizes[1],
grid_size_z=grid_sizes[2],
block_size_x=block_sizes[0],
block_size_y=block_sizes[1],
block_size_z=block_sizes[2],
kernel_operands=[],
)
gpu.wait(async_token=None, async_dependencies=[token])
func.ReturnOp([])

print(module)

# CHECK-LABEL: gpu.module @gpu_module {
# CHECK: gpu.func @kernel() kernel {
# CHECK: gpu.return
# CHECK: }
# CHECK: }

# CHECK-LABEL: func.func @host() {
# CHECK: %[[CONSTANT_0:.*]] = arith.constant 1 : index
# CHECK: %[[WAIT_0:.*]] = gpu.wait async
# CHECK: %[[LAUNCH_FUNC_0:.*]] = gpu.launch_func async {{\[}}%[[WAIT_0]]] @gpu_module::@kernel blocks in (%[[CONSTANT_0]], %[[CONSTANT_0]], %[[CONSTANT_0]]) threads in (%[[CONSTANT_0]], %[[CONSTANT_0]], %[[CONSTANT_0]])
# CHECK: gpu.wait {{\[}}%[[LAUNCH_FUNC_0]]]
# CHECK: return
# CHECK: }


# CHECK-LABEL: testGPULaunchOp
@run
def testGPULaunchOp():
module = Module.create()

with InsertionPoint(module.body):
host = func.FuncOp(type=FunctionType.get([T.f32()], []), name="gpu_printf")

entry_block = host.add_entry_block()
with InsertionPoint(entry_block):
c1 = arith.constant(T.index(), 1)

launch = gpu.launch(None, [], c1, c1, c1, c1, c1, c1)
launch_block = launch.regions[0].blocks.append()
for _ in range(12):
launch_block.add_argument(T.index(), Location.unknown())

with InsertionPoint(launch_block):
gpu.printf("%f", [entry_block.arguments[0]])
gpu.terminator()

with InsertionPoint(entry_block):
func.ReturnOp([])

print(module)

# CHECK-LABEL: func.func @gpu_printf(
# CHECK-SAME: %[[ARG0:.*]]: f32) {
# CHECK: %[[CONSTANT_0:.*]] = arith.constant 1 : index
# CHECK: gpu.launch blocks(%[[VAL_0:.*]], %[[VAL_1:.*]], %[[VAL_2:.*]]) in (%[[VAL_3:.*]] = %[[CONSTANT_0]], %[[VAL_4:.*]] = %[[CONSTANT_0]], %[[VAL_5:.*]] = %[[CONSTANT_0]]) threads(%[[VAL_6:.*]], %[[VAL_7:.*]], %[[VAL_8:.*]]) in (%[[VAL_9:.*]] = %[[CONSTANT_0]], %[[VAL_10:.*]] = %[[CONSTANT_0]], %[[VAL_11:.*]] = %[[CONSTANT_0]]) {
# CHECK: gpu.printf "%[[VAL_12:.*]]", %[[ARG0]] : f32
# CHECK: gpu.terminator
# CHECK: }
# CHECK: return
# CHECK: }