
Commit 29009f1

[GLUON] Allow TensorMemory layouts in to_linear_layout in the context of printing. (#8682)
1 parent 4b184cc commit 29009f1

File tree (5 files changed, +100 −47 lines):

  python/src/gluon_ir.cc
  python/test/gluon/test_frontend.py
  python/triton/experimental/gluon/language/_layouts.py
  python/triton/experimental/gluon/language/_semantic.py
  python/triton/experimental/gluon/language/nvidia/blackwell/__init__.py
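In effect, the commit lets ttgl.to_linear_layout accept Blackwell TensorMemory layouts so their linear form can be inspected with ttgl.static_print. Below is a minimal sketch of the new usage, assuming a Triton build that includes this change; the kernel body is illustrative (modeled on the test added below), not code from the commit:

from triton.experimental import gluon
from triton.experimental.gluon import language as ttgl
from triton.experimental.gluon.language.nvidia.blackwell import TensorMemoryLayout

@gluon.jit
def kernel():
    # TMEM layouts were previously rejected by to_linear_layout; they are now
    # converted and returned as a print-only linear-layout object.
    layout: ttgl.constexpr = TensorMemoryLayout((64, 64), col_stride=2)
    ll: ttgl.constexpr = ttgl.to_linear_layout(layout, (64, 64))
    ttgl.static_print(ll)  # shows rows=..., cols=..., shape=...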

python/src/gluon_ir.cc

Lines changed: 41 additions & 2 deletions
@@ -375,8 +375,47 @@ void init_gluon_ir(py::module &&m) {
              std::vector<int64_t> &shape) -> py::object {
             auto ctx = self.getContext();
             auto linearLayout = ttg::toLinearLayout(shape, layout);
-            auto attr = ttg::LinearEncodingAttr::get(ctx, linearLayout);
-            return layoutToGluon(attr);
+
+            if (isa<ttg::DistributedEncodingTrait>(layout)) {
+              auto attr = ttg::LinearEncodingAttr::get(ctx, linearLayout);
+              return layoutToGluon(attr);
+            }
+            if (isa<ttg::SharedEncodingTrait>(layout)) {
+              auto alignment =
+                  cast<ttg::SharedEncodingTrait>(layout).getAlignment();
+              auto attr = ttg::SharedLinearEncodingAttr::get(ctx, linearLayout,
+                                                             alignment);
+              return layoutToGluon(attr);
+            }
+
+            // TensorMemory encodings: keep the LinearLayout but wrap as
+            // print-only Python object carrying row/col bases -> dim0/dim1.
+            auto inNamesRange = linearLayout.getInDimNames();
+            auto inNames = llvm::to_vector(inNamesRange);
+            bool isTmemLayout =
+                (inNames.size() == 2 && inNames[0].str() == "row" &&
+                 inNames[1].str() == "col");
+            if (!isTmemLayout)
+              throw std::invalid_argument(
+                  "Unsupported layout in to_linear_layout");
+
+            // Build Py _TensorMemoryLinearLayout(row_bases, col_bases, shape,
+            // repr)
+            py::object tmemCls =
+                py::module::import(
+                    "triton.experimental.gluon.language.nvidia.blackwell")
+                    .attr("_TensorMemoryLinearLayout");
+            auto bases = linearLayout.getBases();
+            auto rowBases = bases[mlir::StringAttr::get(ctx, "row")];
+            auto colBases = bases[mlir::StringAttr::get(ctx, "col")];
+            auto outDims = linearLayout.getOutDims();
+            std::vector<int> shapeVec;
+            for (auto &od : outDims)
+              shapeVec.push_back(od.second);
+
+            py::object pyObj = tmemCls(py::cast(rowBases), py::cast(colBases),
+                                       py::cast(shapeVec));
+            return pyObj;
           })
       .def("get_dot_operand_layout",
            [](GluonOpBuilder &self, unsigned opIdx, Attribute parent,
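For reference, Triton's LinearLayout is linear over GF(2): an input index contributes the XOR of the bases at its set bits. The standalone sketch below (plain Python, not Triton API; the helper name is invented for illustration) shows how the row/col bases extracted above map a (row, col) TMEM coordinate to (dim0, dim1):

def apply_row_col_layout(row_bases, col_bases, row, col):
    """XOR together the bases selected by the set bits of `row` and `col`."""
    out = [0, 0]  # (dim0, dim1)
    for index, bases in ((row, row_bases), (col, col_bases)):
        for bit, basis in enumerate(bases):
            if (index >> bit) & 1:
                out = [o ^ b for o, b in zip(out, basis)]
    return out

# Identity-like bases for a 4x4 tile: row bit k -> dim0 bit k, col bit k -> dim1 bit k.
row_bases = [[1, 0], [2, 0]]
col_bases = [[0, 1], [0, 2]]
assert apply_row_col_layout(row_bases, col_bases, 3, 2) == [3, 2]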

python/test/gluon/test_frontend.py

Lines changed: 17 additions & 36 deletions
@@ -1461,48 +1461,29 @@ def kernel(reg_type: ttgl.constexpr, shared_type: ttgl.constexpr, ref_conflicts:
 
 
 @pytest.mark.parametrize(
-    "layout, expected",
+    "layout, shape",
     [
-        (
-            ttgl.BlockedLayout([1], [4], [4], [0]),
-            ttgl.DistributedLinearLayout(
-                reg_bases=[],
-                lane_bases=[[1], [2]],
-                warp_bases=[[4], [8]],
-                block_bases=[],
-                shape=[16],
-            ),
-        ),
-        (
-            ttgl.BlockedLayout([1], [4], [4], [0], [[1], [0]]),
-            ttgl.DistributedLinearLayout(
-                reg_bases=[],
-                lane_bases=[[1], [2]],
-                warp_bases=[[4], [8]],
-                block_bases=[[16], [0]],
-                shape=[32],
-            ),
-        ),
-        (
-            ttgl.BlockedLayout([8, 1], [8, 4], [1, 4], [0, 1], [[0, 1]]),
-            ttgl.DistributedLinearLayout(
-                reg_bases=[[1, 0], [2, 0], [4, 0], [0, 16], [0, 32]],
-                lane_bases=[[8, 0], [16, 0], [32, 0], [0, 1], [0, 2]],
-                warp_bases=[[0, 4], [0, 8]],
-                block_bases=[[0, 64]],
-                shape=[64, 128],
-            ),
-        ),
+        (ttgl.BlockedLayout([1], [4], [4], [0]), [16]),
+        (ttgl.BlockedLayout([1], [4], [4], [0], [[1], [0]]), [32]),
+        (ttgl.BlockedLayout([8, 1], [8, 4], [1, 4], [0, 1], [[0, 1]]), [64, 128]),
+        (ttgl.NVMMASharedLayout(swizzle_byte_width=128, element_bitwidth=16, rank=2), [64, 64]),
+        (TensorMemoryLayout((64, 64), col_stride=2), [64, 64]),
     ],
 )
-def test_to_linear_layout(layout, expected):
+def test_to_linear_layout(layout, shape, capsys):
 
     @gluon.jit
-    def kernel(layout: ttgl.constexpr, expected: ttgl.constexpr, shape: ttgl.constexpr):
+    def kernel(layout: ttgl.constexpr, shape: ttgl.constexpr):
         computed: ttgl.constexpr = ttgl.to_linear_layout(layout, shape)
-        ttgl.static_assert(computed == expected)
-
-    run_parser(kernel, args=(layout, expected, tuple(expected.shape)), target=AMPERE_TARGET)
+        ttgl.static_print(computed)
+
+    run_parser(kernel, args=(layout, tuple(shape)), target=AMPERE_TARGET)
+    out = capsys.readouterr().out
+    if isinstance(layout, TensorMemoryLayout):
+        assert "rows=" in out
+        assert "cols=" in out
+    else:
+        assert "DistributedLinearLayout" in out or "SharedLinearLayout" in out
 
 
 @filecheck_test

python/triton/experimental/gluon/language/_layouts.py

Lines changed: 10 additions & 0 deletions
@@ -1,4 +1,5 @@
 from dataclasses import dataclass, field
+import itertools
 from typing import List
 
 from triton.language.core import _unwrap_if_constexpr, _unwrap_shape, constexpr_type
@@ -636,6 +637,15 @@ def _to_ir(self, builder):
     def mangle(self) -> str:
         return f"SharedLinear_{self.offset_bases}_{self.block_bases}_{self.alignment}_SharedLinear"
 
+    @property
+    def shape(self):
+        rank = len(self.offset_bases[0])
+        max_stride = [1] * rank
+        for b in itertools.chain(self.offset_bases, self.block_bases):
+            for i, bi in enumerate(b):
+                max_stride[i] = max(max_stride[i], bi)
+        return [2 * s for s in max_stride]
+
     def __hash__(self):
         return hash((
             tuple(map(tuple, self.offset_bases)),
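As a worked example of the new shape property: for the usual power-of-two bases, the largest basis component seen in a dimension is half that dimension's extent, so doubling it recovers the shape. A standalone sketch with the same logic as the diff above:

import itertools

def infer_shape(offset_bases, block_bases):
    # Mirror of SharedLinearLayout.shape: largest stride per dimension
    # over all bases, then doubled.
    rank = len(offset_bases[0])
    max_stride = [1] * rank
    for b in itertools.chain(offset_bases, block_bases):
        for i, bi in enumerate(b):
            max_stride[i] = max(max_stride[i], bi)
    return [2 * s for s in max_stride]

# Bases reaching offsets 1, 2, 4 along dim0 and 1, 2 along dim1:
assert infer_shape([[1, 0], [2, 0], [4, 0], [0, 1], [0, 2]], []) == [8, 4]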

python/triton/experimental/gluon/language/_semantic.py

Lines changed: 10 additions & 9 deletions
@@ -2,7 +2,7 @@
 import math
 from triton.language.semantic import TritonSemantic
 from . import _core as ttgl
-from ._layouts import AutoLayout, DistributedLayout, DistributedLinearLayout, SliceLayout, SharedLayout, CoalescedLayout
+from ._layouts import AutoLayout, DistributedLayout, DistributedLinearLayout, SliceLayout, SharedLayout, CoalescedLayout, SharedLinearLayout
 from triton._C.libtriton.gluon_ir import GluonOpBuilder, compute_tmem_reg_layout
 from triton.compiler.code_generator import flatten_values_to_ir, unflatten_ir_values
 
@@ -301,15 +301,16 @@ def bank_conflicts(self, distr_ty, shared_ty):
                distr_ty.element_ty.primitive_bitwidth)
 
     def to_linear_layout(self, layout, shape):
-        _check(isinstance(layout, (DistributedLayout, SharedLayout)),
-               lambda: f"Expected a DistributedLayout or SharedLayout, got {type(layout)}")
-
-        if not isinstance(shape, list):
-            shape = list(shape)
-
-        layout = ttgl._unwrap_if_constexpr(layout)
+        from triton.experimental.gluon.language.nvidia.blackwell import (
+            TensorMemoryLayout,
+            TensorMemoryScalesLayout,
+        )
+        _check(
+            isinstance(layout, (DistributedLayout, SharedLayout, TensorMemoryLayout, TensorMemoryScalesLayout)), lambda:
+            f"Expected a DistributedLayout, SharedLayout, or TensorMemoryLayout or TensorMemoryScalesLayout, got {type(layout)}"
+        )
 
-        if isinstance(layout, (AutoLayout, DistributedLinearLayout)):
+        if isinstance(layout, (AutoLayout, DistributedLinearLayout, SharedLinearLayout)):
            return ttgl.constexpr(layout)
 
         return ttgl.constexpr(self.builder.to_linear_layout(layout._to_ir(self.builder), shape))

python/triton/experimental/gluon/language/nvidia/blackwell/__init__.py

Lines changed: 22 additions & 0 deletions
@@ -2,6 +2,7 @@
 from typing import Optional, Tuple, List, TYPE_CHECKING
 
 from dataclasses import dataclass
+import itertools
 from triton.runtime.jit import constexpr_function
 from triton.experimental.gluon.language import _core as ttgl
 from triton.experimental.gluon.language._core import builtin, base_type, base_value, _unwrap_if_constexpr
@@ -26,7 +27,9 @@
     "mma_v2",
     "tensor_memory_descriptor",
     "TensorMemoryLayout",
+    "TensorMemoryScalesLayout",
     "tma",
+    "_TensorMemoryLinearLayout",
 ]
 
 
@@ -104,6 +107,25 @@ def __hash__(self):
         return hash(self.cta_split_num)
 
 
+@dataclass(frozen=True)
+class _TensorMemoryLinearLayout:
+    """
+    Print-only linear layout for TMEM (row/col -> dim0/dim1).
+    """
+    rows: List[List[int]]
+    cols: List[List[int]]
+    shape: List[int]
+
+    def _to_ir(self, builder):
+        raise RuntimeError("TensorMemoryLinearLayout is print-only; IR materialization is unsupported")
+
+    def mangle(self):
+        return f"TMLL_{self.shape}_TMLL"
+
+    def __hash__(self):
+        return hash((tuple(map(tuple, self.rows)), tuple(map(tuple, self.cols)), tuple(self.shape)))
+
+
 @constexpr_function
 def get_tmem_reg_layout(
     element_ty,
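What static_print ends up showing for a TMEM layout is essentially this frozen dataclass's default repr, which is what the test above checks for via the "rows=" and "cols=" substrings. A quick sketch, assuming a Triton build that includes this change (the basis values are made up):

from triton.experimental.gluon.language.nvidia.blackwell import _TensorMemoryLinearLayout

ll = _TensorMemoryLinearLayout(rows=[[1, 0], [2, 0]], cols=[[0, 1], [0, 2]], shape=[4, 4])
print(ll)
# _TensorMemoryLinearLayout(rows=[[1, 0], [2, 0]], cols=[[0, 1], [0, 2]], shape=[4, 4])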
