[Gluon] Expose inline_asm_elementwise (#7172)

Mogball · web-flow · commit 343bd8e711f0 · 2025-06-12T15:53:27.000-04:00
I needed this to implement a performance workaround.
diff --git a/python/test/gluon/test_frontend.py b/python/test/gluon/test_frontend.py
@@ -925,3 +925,12 @@ def test_fence_async_shared():
 
     # CHECK-NEXT: ttng.fence_async_shared {bCluster = true}
     blackwell.fence_async_shared(cluster=True)
+
+
+@filecheck_test
+@gluon.jit
+def test_inline_asm_elementwise():
+    layout: ttgl.constexpr = ttgl.BlockedLayout([1], [32], [4], [0])
+    x = ttgl.arange(0, 16, layout)
+    # CHECK: elementwise_inline_asm {{.*}} : tensor<16xi32, [[BLOCKED:#.*]]> -> tensor<16xi32, [[BLOCKED]]>
+    ttgl.inline_asm_elementwise("mov $0, $0;", "=r,r", [x], dtype=x.dtype, is_pure=True, pack=1)
diff --git a/python/triton/experimental/gluon/language/_core.py b/python/triton/experimental/gluon/language/_core.py
@@ -57,6 +57,7 @@
     "store",
     "to_tensor",
     "where",
+    "inline_asm_elementwise",
 ]
 
 __all__ = [
diff --git a/python/triton/language/core.py b/python/triton/language/core.py
@@ -3077,7 +3077,7 @@ def kernel(A, B, C, D, BLOCK: tl.constexpr):
             # Change the shape of each argument based on the broadcast shape
             for i, item in enumerate(dispatch_args):
                 dispatch_args[i], _ = bin_op_type_checking(item, broadcast_arg)
-            res_tys = [block_type(dt, broadcast_arg.shape) for dt in dtype]
+            res_tys = [broadcast_arg.type.with_element_ty(dt) for dt in dtype]
     handles = [t.handle for t in dispatch_args]
     builder = _semantic.builder
     call = builder.create_inline_asm(asm, constraints, handles, [ty.to_ir(builder) for ty in res_tys], is_pure, pack)

Original file line number	Diff line number	Diff line change
`@@ -57,6 +57,7 @@`
`57`	`57`	`"store",`
`58`	`58`	`"to_tensor",`
`59`	`59`	`"where",`
	`60`	`+ "inline_asm_elementwise",`
`60`	`61`	`]`
`61`	`62`
`62`	`63`	`__all__ = [`