Skip to content

Commit 343bd8e

Browse files
authored
[Gluon] Expose inline_asm_elementwise (#7172)
I needed this to implement a performance workaround.
1 parent 811437e commit 343bd8e

File tree

3 files changed

+11
-1
lines changed

3 files changed

+11
-1
lines changed

python/test/gluon/test_frontend.py

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -925,3 +925,12 @@ def test_fence_async_shared():
925 925

926 926
# CHECK-NEXT: ttng.fence_async_shared {bCluster = true}
927 927
blackwell.fence_async_shared(cluster=True)
928+
929+
930+
@filecheck_test
931+
@gluon.jit
932+
def test_inline_asm_elementwise():
933+
layout: ttgl.constexpr = ttgl.BlockedLayout([1], [32], [4], [0])
934+
x = ttgl.arange(0, 16, layout)
935+
# CHECK: elementwise_inline_asm {{.*}} : tensor<16xi32, [[BLOCKED:#.*]]> -> tensor<16xi32, [[BLOCKED]]>
936+
ttgl.inline_asm_elementwise("mov $0, $0;", "=r,r", [x], dtype=x.dtype, is_pure=True, pack=1)

python/triton/experimental/gluon/language/_core.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -57,6 +57,7 @@
57 57
"store",
58 58
"to_tensor",
59 59
"where",
60+
"inline_asm_elementwise",
60 61
]
61 62

62 63
__all__ = [

python/triton/language/core.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3077,7 +3077,7 @@ def kernel(A, B, C, D, BLOCK: tl.constexpr):
3077 3077
# Change the shape of each argument based on the broadcast shape
3078 3078
for i, item in enumerate(dispatch_args):
3079 3079
dispatch_args[i], _ = bin_op_type_checking(item, broadcast_arg)
3080-
res_tys = [block_type(dt, broadcast_arg.shape) for dt in dtype]
3080+
res_tys = [broadcast_arg.type.with_element_ty(dt) for dt in dtype]
3081 3081
handles = [t.handle for t in dispatch_args]
3082 3082
builder = _semantic.builder
3083 3083
call = builder.create_inline_asm(asm, constraints, handles, [ty.to_ir(builder) for ty in res_tys], is_pure, pack)

0 commit comments

Comments
 (0)