
Commit aaa3d82

[gluon] fix some AMD compilation issues + skip tests on AMD for now (#7215)
Fixes some minor AMD compilation issues. Some tests in test_frontend hardcode tpw=32 (and some use NVIDIA-specific layouts), so they are skipped on AMD for now; where possible, they should be re-enabled for AMD in the future.
1 parent c8a711d commit aaa3d82
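
For context, the skip pattern the commit applies in the test files looks roughly like the sketch below. The is_cuda() helper shown here is a hypothetical stand-in modeled on Triton's test utilities (the real tests import their own), and the warp widths reflect NVIDIA's 32-thread warps versus AMD CDNA's 64-thread wavefronts.

import pytest
import triton


def is_cuda():
    # Hypothetical helper: the active driver's target reports the backend
    # name ("cuda" on NVIDIA, "hip" on AMD).
    return triton.runtime.driver.active.get_current_target().backend == "cuda"


# Layouts that hardcode a 32-wide warp only make sense on CUDA;
# AMD wavefronts are 64 threads wide.
copy_kernel_tpw = [32] if is_cuda() else [64]


@pytest.mark.skipif(not is_cuda(), reason="Requires CUDA")
def test_cuda_only_example():
    # Placeholder body standing in for the real Gluon frontend tests.
    pass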

File tree

3 files changed (+31, -10 lines)

python/test/gluon/test_core.py

Lines changed: 11 additions & 8 deletions
@@ -17,15 +17,18 @@ def copy_kernel(Out, In, numel, XBLOCK: ttgl.constexpr, layout: ttgl.constexpr):
     ttgl.store(Out + xoffset, data, xmask)
 
 
+copy_kernel_tpw = [32] if is_cuda() else [64]
+
+
 @pytest.mark.parametrize("layout", [
-    ttgl.BlockedLayout(size_per_thread=[1], threads_per_warp=[32], warps_per_cta=[4], order=[0]),
-    ttgl.BlockedLayout(size_per_thread=[2], threads_per_warp=[32], warps_per_cta=[4], order=[0]),
-    ttgl.BlockedLayout(size_per_thread=[4], threads_per_warp=[32], warps_per_cta=[4], order=[0]),
-    ttgl.BlockedLayout(size_per_thread=[8], threads_per_warp=[32], warps_per_cta=[4], order=[0]),
-    ttgl.BlockedLayout(size_per_thread=[1], threads_per_warp=[32], warps_per_cta=[8], order=[0]),
-    ttgl.BlockedLayout(size_per_thread=[2], threads_per_warp=[32], warps_per_cta=[8], order=[0]),
-    ttgl.BlockedLayout(size_per_thread=[4], threads_per_warp=[32], warps_per_cta=[8], order=[0]),
-    ttgl.BlockedLayout(size_per_thread=[8], threads_per_warp=[32], warps_per_cta=[8], order=[0]),
+    ttgl.BlockedLayout(size_per_thread=[1], threads_per_warp=copy_kernel_tpw, warps_per_cta=[4], order=[0]),
+    ttgl.BlockedLayout(size_per_thread=[2], threads_per_warp=copy_kernel_tpw, warps_per_cta=[4], order=[0]),
+    ttgl.BlockedLayout(size_per_thread=[4], threads_per_warp=copy_kernel_tpw, warps_per_cta=[4], order=[0]),
+    ttgl.BlockedLayout(size_per_thread=[8], threads_per_warp=copy_kernel_tpw, warps_per_cta=[4], order=[0]),
+    ttgl.BlockedLayout(size_per_thread=[1], threads_per_warp=copy_kernel_tpw, warps_per_cta=[8], order=[0]),
+    ttgl.BlockedLayout(size_per_thread=[2], threads_per_warp=copy_kernel_tpw, warps_per_cta=[8], order=[0]),
+    ttgl.BlockedLayout(size_per_thread=[4], threads_per_warp=copy_kernel_tpw, warps_per_cta=[8], order=[0]),
+    ttgl.BlockedLayout(size_per_thread=[8], threads_per_warp=copy_kernel_tpw, warps_per_cta=[8], order=[0]),
 ])
 @pytest.mark.parametrize("XBLOCK", [128, 256, 512, 1024, 2048])
 def test_copy_kernel(layout, XBLOCK):
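
For reference, with copy_kernel_tpw == [64] on an AMD target, the first entry of the parametrize list above expands to a 64-thread-wide blocked layout. A sketch, assuming the gluon language module is importable as ttgl exactly as in the test file:

from triton.experimental.gluon import language as ttgl

# The blocked layout the first parametrize case produces on AMD:
# one element per thread, a 64-thread wavefront, four warps per CTA.
amd_layout = ttgl.BlockedLayout(size_per_thread=[1], threads_per_warp=[64],
                                warps_per_cta=[4], order=[0])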

python/test/gluon/test_frontend.py

Lines changed: 10 additions & 0 deletions
@@ -28,6 +28,7 @@ def convert_layout_kernel(XBLOCK: ttgl.constexpr, layout_a: ttgl.constexpr, layo
     res = ttgl.convert_layout(x, layout_b)  # noqa: F841
 
 
+@pytest.mark.skipif(not is_cuda(), reason="Requires CUDA")
 def test_convert_layout(fresh_knobs):
     knobs.compilation.disable_line_info = True

@@ -70,6 +71,7 @@ def shared_memory_kernel(XBLOCK: ttgl.constexpr, YBLOCK: ttgl.constexpr, layout_
     unused._keep_alive()
 
 
+@pytest.mark.skipif(not is_cuda(), reason="Requires CUDA")
 def test_shared_memory(fresh_knobs):
     knobs.compilation.disable_line_info = True

@@ -170,6 +172,7 @@ def shared_memory_subview_kernel(XBLOCK: ttgl.constexpr, layout: ttgl.constexpr,
     view.store(value.trans())
 
 
+@pytest.mark.skipif(not is_cuda(), reason="Requires CUDA")
 def test_shared_memory_subview(fresh_knobs):
     knobs.compilation.disable_line_info = True

@@ -208,6 +211,7 @@ def shared_memory_index_kernel(XBLOCK: ttgl.constexpr, layout: ttgl.constexpr, s
     smem.index(i).load(layout)
 
 
+@pytest.mark.skipif(not is_cuda(), reason="Requires CUDA")
 def test_shared_memory_index(fresh_knobs):
     knobs.compilation.disable_line_info = True

@@ -263,6 +267,7 @@ def shared_memory_cast_kernel():
     smem._reinterpret(ttgl.int8, [1024], ttgl.SwizzledSharedLayout(1, 1, 1, [0, 1]))
 
 
+@pytest.mark.skipif(not is_cuda(), reason="Requires CUDA")
 def test_shared_memory_cast(fresh_knobs):
     expecttest.assert_expected_inline(
         anonymize_ir(run_parser(shared_memory_cast_kernel).str_nodebug()), """\

@@ -630,6 +635,7 @@ def broadcast_kernel():
     0 + a + b
 
 
+@pytest.mark.skipif(not is_cuda(), reason="Requires CUDA")
 def test_broadcast(fresh_knobs):
     knobs.compilation.disable_line_info = True

@@ -684,6 +690,7 @@ def math_kernel():
     ttgl.fma(a, b, c)
 
 
+@pytest.mark.skipif(not is_cuda(), reason="Requires CUDA")
 def test_math(fresh_knobs):
     knobs.compilation.disable_line_info = True

@@ -754,6 +761,7 @@ def reduce_kernel(out):
     tl.store(out + ttgl.arange(0, 16, s0.type.layout), result)
 
 
+@pytest.mark.skipif(not is_cuda(), reason="Requires CUDA")
 def test_reduce(fresh_knobs):
     knobs.compilation.disable_line_info = True

@@ -802,6 +810,7 @@ def test_reduce(fresh_knobs):
 """)
 
 
+@pytest.mark.skipif(not is_cuda(), reason="Requires CUDA")
 @filecheck_test
 @gluon.jit
 def test_elementwise_core():

@@ -829,6 +838,7 @@ def linear_layout_kernel():
     ttgl.arange(0, 256, layout=ll)
 
 
+@pytest.mark.skipif(not is_cuda(), reason="Requires CUDA")
 def test_linear_layout(fresh_knobs):
     knobs.compilation.disable_line_info = True
     h = linear_layout_kernel.warmup(grid=(1, ))

python/triton/experimental/gluon/_runtime.py

Lines changed: 10 additions & 2 deletions
@@ -27,11 +27,19 @@ def make_ir(self, options, codegen_fns, module_map, context):
         target = triton.runtime.driver.active.get_current_target()
         backend = make_backend(target)
         target = backend.get_target_name(options)
+
         module.set_attr("ttg.target", builder.get_string_attr(target))
         module.set_attr("ttg.num-warps", builder.get_int32_attr(options.num_warps))
         module.set_attr("ttg.num-ctas", builder.get_int32_attr(options.num_ctas))
-        module.set_attr("ttg.threads-per-warp", builder.get_int32_attr(32))
-        if options.maxnreg is not None:
+
+        is_cuda = options.backend_name == "cuda"
+
+        if is_cuda:
+            module.set_attr("ttg.threads-per-warp", builder.get_int32_attr(32))
+        else:
+            module.set_attr("ttg.threads-per-warp", builder.get_int32_attr(64))
+
+        if is_cuda and options.maxnreg is not None:
             module.set_attr("ttg.maxnreg", builder.get_int32_attr(options.maxnreg))
 
         module = ast_to_ttir(self.fn, self, context=context, options=options, codegen_fns=codegen_fns,
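
Rather than branching on options.backend_name, a hypothetical variant could read the warp width from the active target, since Triton's GPUTarget carries a warp_size field (32 on NVIDIA, 64 on AMD CDNA). A sketch:

import triton


def threads_per_warp() -> int:
    # Hypothetical alternative: derive the width from the target instead of
    # the backend name; warp_size is 32 for "cuda" targets and 64 for "hip".
    return triton.runtime.driver.active.get_current_target().warp_size

The committed version branches explicitly, which keeps the CUDA path unchanged and also restricts ttg.maxnreg to CUDA, since maxnreg is an NVIDIA-specific register-count option.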
