
Commit a9c3322

[Gluon] Fix a few things in the translator (#8569)
* Add missing scatter conversion
* Add mma_v2 path
* Fix TMEM scales register layout
* Change `convert_triton_to_gluon` to accept multiple root kernels, allowing a single source to be generated from them (which reuses functions across them)
* Add missing `fence_async_shared` in TMA store
* Add missing APIs for scales layout class
* Fix ttgl.store broadcasting of scalars
* Fix CTA layout canonicalization in Gluon
1 parent: 1514139

8 files changed: +127 -18 lines

python/src/gluon_ir.cc

Lines changed: 17 additions & 0 deletions
@@ -104,6 +104,8 @@ struct GluonLayouts {
   py::handle DistributedLinearLayout;
   py::handle DotOperandLayout;
   py::handle NVMMADistributedLayout;
+  py::handle TensorMemoryScalesLayout;
+  py::handle TensorMemoryLayout;
   py::handle NVMMASharedLayout;
   py::handle SwizzledSharedLayout;
   py::handle SharedLinearLayout;
@@ -116,6 +118,8 @@ struct GluonLayouts {
         py::module::import("triton.experimental.gluon.language._layouts");
     auto amdLayouts =
         py::module::import("triton.experimental.gluon.language.amd._layouts");
+    auto blackwellLayouts = py::module::import(
+        "triton.experimental.gluon.language.nvidia.blackwell");
     AutoLayout = py::object(layouts.attr("AutoLayout")).release();
     BlockedLayout = py::object(layouts.attr("BlockedLayout")).release();
     SliceLayout = py::object(layouts.attr("SliceLayout")).release();
@@ -124,6 +128,10 @@ struct GluonLayouts {
     DotOperandLayout = py::object(layouts.attr("DotOperandLayout")).release();
     NVMMADistributedLayout =
         py::object(layouts.attr("NVMMADistributedLayout")).release();
+    TensorMemoryScalesLayout =
+        py::object(blackwellLayouts.attr("TensorMemoryScalesLayout")).release();
+    TensorMemoryLayout =
+        py::object(blackwellLayouts.attr("TensorMemoryLayout")).release();
     NVMMASharedLayout = py::object(layouts.attr("NVMMASharedLayout")).release();
     SwizzledSharedLayout =
         py::object(layouts.attr("SwizzledSharedLayout")).release();
@@ -256,6 +264,15 @@ py::object layoutToGluon(Attribute layout) {
     return layouts.PaddedSharedLayout(intervalPaddingPairs,
                                       ll.getBases().lookup(kOffset),
                                       ll.getBases().lookup(kBlock), shape);
+  } else if (auto tmemScales =
+                 dyn_cast<ttng::TensorMemoryScalesEncodingAttr>(layout)) {
+    return layouts.TensorMemoryScalesLayout(std::vector<unsigned>{
+        tmemScales.getCTASplitM(), tmemScales.getCTASplitN()});
+  } else if (auto tmem = dyn_cast<ttng::TensorMemoryEncodingAttr>(layout)) {
+    return layouts.TensorMemoryLayout(
+        std::vector<unsigned>{tmem.getBlockM(), tmem.getBlockN()},
+        tmem.getColStride(),
+        std::vector<unsigned>{tmem.getCTASplitM(), tmem.getCTASplitN()});
   }
 
   throw py::value_error("Unhandled encoding encountered");
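
For reference, a minimal Python-side sketch of what the two new layoutToGluon branches hand back; the positional fields (block, col_stride, cta_split_num) are inferred from the constructor calls above and the __hash__ additions further down, so treat the exact signature as an assumption rather than the definitive API:

# Illustrative only: the Gluon layout classes that Blackwell TMEM encodings
# now map onto, constructed with the same argument order as the C++ calls.
from triton.experimental.gluon.language.nvidia.blackwell import (
    TensorMemoryLayout,
    TensorMemoryScalesLayout,
)

tmem = TensorMemoryLayout((128, 64), 1, (1, 1))   # block, col_stride, cta_split_num
scales = TensorMemoryScalesLayout((1, 1))         # cta_split_num
print(tmem.mangle())    # mangles block/stride/CTA-split, per mangle() shown below
print(scales.mangle())  # "TLSCS1x1TLS" per the f-string in the scales class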

python/test/unit/tools/test_triton_to_gluon.py

Lines changed: 28 additions & 9 deletions
@@ -13,7 +13,7 @@
 
 
 def convert_kernel(kernel, kernel_name, tmp_path):
-    converted = convert_triton_to_gluon(kernel)
+    converted = convert_triton_to_gluon([kernel])
 
     # Write converted kernel to a file so @gluon.jit can retrieve source
     mod_path = tmp_path / "converted_kernel.py"
@@ -52,7 +52,7 @@ def test_simple_kernel(tmp_path):
     ref = torch.empty_like(x)
     add_kernel[grid](x, y, ref, n, BLOCK)
 
-    torch.testing.assert_close(out, ref)
+    torch.testing.assert_close(out, ref, atol=0, rtol=0)
 
 
 @triton.jit
@@ -85,7 +85,7 @@ def test_triton_to_gluon_dot_minimal(tmp_path):
 
     ref = torch.empty_like(c)
     matmul_tile_kernel[grid](a, b, ref, M, N, K, num_warps=8)
-    torch.testing.assert_close(c, ref)
+    torch.testing.assert_close(c, ref, atol=0, rtol=0)
 
 
 @triton.jit
@@ -153,7 +153,7 @@ def test_simple_matmul(dtype_src_str, dtype_dst_str, BLOCK_M, BLOCK_N, BLOCK_K,
     ref = torch.empty_like(output)
     matmul_kernel[grid](a, b, ref, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), output.stride(0),
                         output.stride(1), BLOCK_M, BLOCK_N, BLOCK_K)
-    torch.testing.assert_close(output, ref)
+    torch.testing.assert_close(output, ref, atol=0, rtol=0)
 
 
 @triton.jit
@@ -177,7 +177,7 @@ def test_triton_to_gluon_descriptor_roundtrip(tmp_path):
     y_ref = torch.zeros((M, N), device="cuda", dtype=torch.float16)
     desc_ref = TensorDescriptor(y_ref, y_ref.shape, y_ref.stride(), block_shape)
     descriptor_store_kernel[grid](desc_ref, M, N, 1.0)
-    torch.testing.assert_close(y, y_ref)
+    torch.testing.assert_close(y, y_ref, atol=0, rtol=0)
 
 
 @triton.jit
@@ -204,7 +204,7 @@ def test_triton_to_gluon_descriptor_load_roundtrip(tmp_path):
     y_ref = torch.zeros((M, N), device="cuda", dtype=torch.float16)
     desc_ref = TensorDescriptor(y_ref, y_ref.shape, y_ref.stride(), block_shape)
     descriptor_copy_kernel[grid](in_desc, desc_ref, M, N)
-    torch.testing.assert_close(y, y_ref)
+    torch.testing.assert_close(y, y_ref, atol=0, rtol=0)
 
 
 @triton.jit
@@ -232,7 +232,7 @@ def test_triton_reshape_trans(tmp_path):
     kernel[grid](x, y, out, n, BLOCK)
     ref = torch.empty_like(x)
     reshape_trans_kernel[grid](x, y, ref, n, BLOCK)
-    torch.testing.assert_close(out, ref)
+    torch.testing.assert_close(out, ref, atol=0, rtol=0)
 
 
 BLOCK_SPLIT = tl.constexpr(256)
@@ -262,7 +262,7 @@ def test_split(tmp_path):
     kernel[grid](x, out)
     ref = torch.empty_like(x[:n])
     split_kernel[grid](x, ref)
-    torch.testing.assert_close(out, ref)
+    torch.testing.assert_close(out, ref, atol=0, rtol=0)
 
 
 @triton.jit
@@ -281,4 +281,23 @@ def test_reduce_to_scalar(tmp_path):
     kernel[grid](out)
     ref = torch.empty_like(out)
     reduce_to_scalar_kernel[grid](ref)
-    torch.testing.assert_close(out, ref)
+    torch.testing.assert_close(out, ref, atol=0, rtol=0)
+
+
+@triton.jit
+def num_threads_kernel(out_ptr):
+    num_threads: tl.constexpr = tl.extra.cuda.num_threads()
+    offs = tl.arange(0, num_threads)
+    tl.store(out_ptr + offs, 1)
+
+
+@pytest.mark.skipif(not (is_blackwell()), reason="Requires Blackwell")
+def test_num_threads(tmp_path):
+    kernel = convert_kernel(num_threads_kernel, "num_threads_kernel", tmp_path)
+
+    num_threads = 256
+    out = torch.empty(num_threads, dtype=torch.int32, device="cuda")
+    kernel[(1, )](out, num_warps=num_threads // 32)
+    ref = torch.empty_like(out)
+    num_threads_kernel[(1, )](ref, num_warps=num_threads // 32)
+    torch.testing.assert_close(out, ref, atol=0, rtol=0)

python/triton/experimental/gluon/language/_layouts.py

Lines changed: 3 additions & 0 deletions
@@ -9,6 +9,9 @@ def _realize_cta_layout(layout, rank):
     ctas_per_cga = layout.ctas_per_cga or [1] * rank
     cta_split_num = layout.cta_split_num or [1] * rank
     cta_order = layout.cta_order or list(reversed(range(rank)))
+    # Canonicalize CTA order to [n,n-1,...,0] if CTAsPerCGA is [1...1]. This matches logic in C++.
+    if all(num_cta == 1 for num_cta in ctas_per_cga):
+        cta_order = list(range(rank - 1, -1, -1))
     object.__setattr__(layout, "ctas_per_cga", ctas_per_cga)
     object.__setattr__(layout, "cta_split_num", cta_split_num)
     object.__setattr__(layout, "cta_order", cta_order)

python/triton/experimental/gluon/language/_semantic.py

Lines changed: 5 additions & 0 deletions
@@ -416,6 +416,11 @@ def _check_same_layout(xs):
         _check(all(l == l0 for l in layouts[1:]),
                lambda: f"Expected inputs to have matching layouts, but got: {layouts}")
 
+    def _store_legacy(self, ptr, val, mask, boundary_check, cache, eviction):
+        if ptr.type.is_block() and not val.type.is_block():
+            val = self.splat(val, ptr.type.get_block_shapes(), ptr.type.layout)
+        return super()._store_legacy(ptr, val, mask, boundary_check, cache, eviction)
+
     def associative_scan(self, inputs: Sequence[TensorTy], axis: int, region_builder_fn,
                          reverse: bool) -> Tuple[TensorTy, ...]:
         shape = inputs[0].type.shape
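
To show what the _store_legacy override enables, here is a hedged Gluon kernel sketch (layout, shapes, and kernel name are illustrative): a scalar value stored through a block of pointers is now splatted to the pointer block's shape and layout instead of being rejected:

# Sketch: ttgl.store with a block of pointers and a scalar RHS.
import torch
from triton.experimental import gluon
from triton.experimental.gluon import language as ttgl

@gluon.jit
def fill_ones_kernel(out_ptr, BLOCK: ttgl.constexpr, layout: ttgl.constexpr):
    offs = ttgl.arange(0, BLOCK, layout=layout)
    ttgl.store(out_ptr + offs, 1)  # scalar value, block of pointers: splatted by the override above

out = torch.zeros(128, dtype=torch.int32, device="cuda")
blocked = ttgl.BlockedLayout(size_per_thread=[1], threads_per_warp=[32], warps_per_cta=[4], order=[0])
fill_ones_kernel[(1, )](out, BLOCK=128, layout=blocked)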

python/triton/experimental/gluon/language/nvidia/blackwell/__init__.py

Lines changed: 6 additions & 0 deletions
@@ -68,6 +68,9 @@ def mangle(self) -> str:
         cta_split_str = (f"CS{self.cta_split_num[0]}x{self.cta_split_num[1]}" if self.cta_split_num else "")
         return f"TL{block_str}{stride_str}{cta_split_str}TL"
 
+    def __hash__(self):
+        return hash((self.block, self.col_stride, self.cta_split_num))
+
 
 @dataclass(frozen=True, eq=True)
 class TensorMemoryScalesLayout:
@@ -91,6 +94,9 @@ def mangle(self) -> str:
         cta_split_str = f"CS{self.cta_split_num[0]}x{self.cta_split_num[1]}" if self.cta_split_num else ""
         return f"TLS{cta_split_str}TLS"
 
+    def __hash__(self):
+        return hash(self.cta_split_num)
+
 
 @constexpr_function
 def get_tmem_reg_layout(
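
A small usage sketch of the added __hash__ methods (the constructor signatures are assumed from the hash fields above): being hashable lets the TMEM layouts serve as set members or dict keys, e.g. for deduplicating layouts during translation:

# Illustrative only: equal layouts now hash equally, so a lookup with a
# fresh-but-equal instance succeeds.
from triton.experimental.gluon.language.nvidia.blackwell import (
    TensorMemoryLayout,
    TensorMemoryScalesLayout,
)

cache = {
    TensorMemoryLayout((128, 128), 1, (1, 1)): "acc layout",
    TensorMemoryScalesLayout((1, 1)): "scales layout",
}
assert cache[TensorMemoryScalesLayout((1, 1))] == "scales layout"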

python/triton/experimental/gluon/language/nvidia/hopper/tma.py

Lines changed: 1 addition & 0 deletions
@@ -116,6 +116,7 @@ def make_tensor_descriptor(
     _semantic=None,
 ) -> tensor_descriptor:
     padding_option = _unwrap_if_constexpr(padding_option)
+    block_shape = _unwrap_if_constexpr(block_shape)
 
     ndim = len(shape)
     if not (1 <= ndim <= 5):
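
For context, a hedged sketch of the call pattern the extra unwrap supports: inside a @gluon.jit kernel the block shape is typically a ttgl.constexpr value, and make_tensor_descriptor now accepts it directly. The kernel body and layout choice are illustrative, with get_default_for used as in the translator helpers of this commit:

# Sketch: device-side descriptor creation with a constexpr block_shape.
from triton.experimental import gluon
from triton.experimental.gluon import language as ttgl
from triton.experimental.gluon.language.nvidia.hopper import tma

@gluon.jit
def make_desc_kernel(ptr, M, N):
    block_shape: ttgl.constexpr = [64, 64]
    layout: ttgl.constexpr = ttgl.NVMMASharedLayout.get_default_for(block_shape, ttgl.float16)
    desc = tma.make_tensor_descriptor(ptr, [M, N], [N, 1], block_shape, layout)
    # ... use desc with TMA copies as usual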

python/triton/tools/triton_to_gluon_translater/translator.py

Lines changed: 3 additions & 3 deletions
@@ -159,7 +159,7 @@ def visit_Call(self, node: ast.Call) -> ast.AST:
         if resolved_callable is triton.language.core.static_range:
             return self.forward_call(node, self.ttgl_attr("static_range"))
         else:
-            if isinstance(node.func, ast.Attribute) and node.func.attr in ["store", "load", "gather"]:
+            if isinstance(node.func, ast.Attribute) and node.func.attr in ["store", "load", "gather", "scatter"]:
                 helper_name = "tl_obj_" + node.func.attr
                 return ast.Call(
                     func=ast.Name(id=helper_name, ctx=ast.Load()),
@@ -378,10 +378,10 @@ def visit_Call(self, call_node: ast.Call) -> ast.AST:
         return results
 
 
-def convert_triton_to_gluon(src: triton.runtime.jit.JITCallable) -> str:
+def convert_triton_to_gluon(src: list[triton.runtime.jit.JITCallable]) -> str:
     """Convert a Triton JIT entry point into a Gluon source string."""
     shared_jit_set: set = set()
-    function_queue: list = [src]
+    function_queue: list = list(src)
     constexpr_globals: dict = {}
     out = ""
     # Process discovered callee JITFunctions, converting and appending them
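
A short usage sketch of the updated entry point (the import path follows the file location above, and the call mirrors the updated tests): several root kernels can now be converted into one Gluon source string in which shared callees are emitted only once:

# Sketch: convert two Triton JIT kernels into a single Gluon source string.
import triton
import triton.language as tl
from triton.tools.triton_to_gluon_translater.translator import convert_triton_to_gluon

@triton.jit
def add_one_kernel(x_ptr, n, BLOCK: tl.constexpr):
    offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n
    tl.store(x_ptr + offs, tl.load(x_ptr + offs, mask=mask) + 1, mask=mask)

@triton.jit
def zero_kernel(x_ptr, n, BLOCK: tl.constexpr):
    offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    tl.store(x_ptr + offs, 0, mask=offs < n)

src = convert_triton_to_gluon([add_one_kernel, zero_kernel])
print(src)  # write this to a file so @gluon.jit can retrieve the source, as in the tests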

python/triton/tools/triton_to_gluon_translater/translator_helpers.py

Lines changed: 64 additions & 6 deletions
@@ -10,10 +10,32 @@
     tcgen05_mma_scaled,
     tcgen05_commit,
 )
-from triton.experimental.gluon.language.nvidia.hopper import tma
+from triton.experimental.gluon.language.nvidia.ampere import mma_v2
+from triton.experimental.gluon.language.nvidia.hopper import tma, fence_async_shared
 from triton.experimental.gluon.language.nvidia.blackwell import tma as tma_blackwell
 
 
+@gluon.jit
+def tl_dot_mma_sync(a, b, acc_init=None, input_precision=None):
+    mma_layout: ttgl.constexpr = ttgl.NVMMADistributedLayout(
+        version=[2, 0],
+        warps_per_cta=[ttgl.num_warps(), 1],
+        instr_shape=[16, 8],
+    )
+    a_layout: ttgl.constexpr = ttgl.DotOperandLayout(parent=mma_layout, operand_index=0, k_width=2)
+    b_layout: ttgl.constexpr = ttgl.DotOperandLayout(parent=mma_layout, operand_index=1, k_width=2)
+    a = ttgl.convert_layout(a, a_layout)
+    b = ttgl.convert_layout(b, b_layout)
+    if acc_init is not None:
+        acc = ttgl.convert_layout(acc_init, mma_layout)
+    else:
+        acc = ttgl.full([a.shape[0], a.shape[1], b.shape[2]], 0.0, ttgl.float32, layout=mma_layout)
+    result = mma_v2(a, b, acc, input_precision)
+    if acc is not None:
+        result = ttgl.convert_layout(result, acc_init.type.layout)
+    return result
+
+
 @gluon.constexpr_function
 def get_swizzle_byte_width(bitwidth):
     swizzle = min(bitwidth, 128)
@@ -22,8 +44,8 @@ def get_swizzle_byte_width(bitwidth):
 
 
 @gluon.jit
-def tl_dot(a, b, acc=None, input_precision=None, allow_tf32=None, max_num_imprecise_acc=None, out_dtype=ttgl.float32):
-    # TODO: check if MMAv5 cannot be used and fallback to mmav2
+def tl_dot_blackwell(a, b, acc=None, input_precision=None, allow_tf32=None, max_num_imprecise_acc=None,
+                     out_dtype=ttgl.float32):
     M: ttgl.constexpr = a.type.shape[0]
     N: ttgl.constexpr = b.type.shape[1]
     K: ttgl.constexpr = a.type.shape[1]
@@ -59,6 +81,19 @@ def tl_dot(a, b, acc=None, input_precision=None, allow_tf32=None, max_num_imprec
     return out
 
 
+@gluon.jit
+def tl_dot(a, b, acc=None, input_precision=None, allow_tf32=None, max_num_imprecise_acc=None, out_dtype=ttgl.float32):
+    if ttgl.num_warps() < 4:
+        return tl_dot_mma_sync(a, b, acc, input_precision)
+    else:
+        return tl_dot_blackwell(a, b, acc, input_precision, allow_tf32, max_num_imprecise_acc, out_dtype)
+
+
+@gluon.constexpr_function
+def _constexpr_min(a, b):
+    return min(a, b)
+
+
 @gluon.jit
 def tl_dot_scaled(lhs, lhs_scale, lhs_format, rhs, rhs_scale, rhs_format, acc=None, fast_math=False, lhs_k_pack=True,
                   rhs_k_pack=True, out_dtype=ttgl.float32):
@@ -114,9 +149,9 @@ def get_num_threads_per_warp() -> ttgl.constexpr:
     return ttgl.constexpr(32)
 
 
-@gluon.constexpr_function
-def get_num_threads_per_program():
-    return ttgl.num_warps() * get_num_threads_per_warp()
+@ttgl._core.builtin
+def get_num_threads_per_program(_semantic=None, _generator=None):
+    return ttgl.num_warps(_semantic=_semantic, _generator=_generator) * get_num_threads_per_warp(_semantic=_semantic)
 
 
 @gluon.constexpr_function
@@ -180,9 +215,32 @@ def tl_obj_gather(obj, x_offsets, y_offset):
     return obj.gather(x_offsets, y_offset)
 
 
+@gluon.jit
+def tl_obj_scatter(obj, value, x_offsets, y_offset):
+    if isinstance(obj, ttgl.nvidia.hopper.tma.tensor_descriptor):
+        desc = obj
+        desc_shape: ttgl.constexpr = [x_offsets.shape[0], desc.block_shape[1]]
+        alloc = ttgl.allocate_shared_memory(desc.dtype, desc_shape, desc.layout, value)
+        fence_async_shared()
+        x_offsets_layout: ttgl.constexpr = ttgl.SliceLayout(
+            0, ttgl.BlockedLayout([1, 4], [get_num_threads_per_warp(), 1], [1, ttgl.num_warps()], [1, 0]))
+        x_offsets = ttgl.convert_layout(x_offsets, x_offsets_layout)
+        tma_blackwell.async_scatter(desc, x_offsets, y_offset, alloc)
+        tma.store_wait(0)
+    else:
+        obj.scatter(value, x_offsets, y_offset)
+
+
+@ttgl._core.builtin
+def tl_make_tensor_descriptor(base, shape, strides, block_shape, padding_option="zero", _semantic=None):
+    layout = ttgl.NVMMASharedLayout.get_default_for(block_shape, base.dtype.element_ty)
+    return tma.make_tensor_descriptor(base, shape, strides, block_shape, layout, padding_option, _semantic=_semantic)
+
+
 @gluon.jit
 def tl_store_tensor_descriptor(desc, offsets, value):
     alloc = ttgl.allocate_shared_memory(desc.dtype, desc.block_shape, desc.layout, value)
+    fence_async_shared()
     tma.async_copy_shared_to_global(desc, offsets, alloc)
     tma.store_wait(0)
     alloc._keep_alive()
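
Finally, a hedged sketch of one of the reworked helpers in use: because get_num_threads_per_program is now a builtin, it can size a range at compile time inside a Gluon kernel. The kernel, layout, and launch parameters below are illustrative only:

# Sketch: compile-time thread count via the builtin helper above.
import torch
from triton.experimental import gluon
from triton.experimental.gluon import language as ttgl
from triton.tools.triton_to_gluon_translater.translator_helpers import get_num_threads_per_program

@gluon.jit
def thread_count_kernel(out_ptr):
    n: ttgl.constexpr = get_num_threads_per_program()
    layout: ttgl.constexpr = ttgl.BlockedLayout([1], [32], [ttgl.num_warps()], [0])
    offs = ttgl.arange(0, n, layout=layout)
    ttgl.store(out_ptr + offs, offs)

out = torch.zeros(256, dtype=torch.int32, device="cuda")
thread_count_kernel[(1, )](out, num_warps=8)  # 8 warps * 32 threads = 256 lanes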
