
Commit 40c9b1c

[Gluon] Fix linear layout MLIR->Python; fix CTA layout equality (#7230)
This PR makes layouts always materialize their CTA layouts, so that `BlockedLayout([1], [32], [4], [0]) == BlockedLayout([1], [32], [4], [0], [1], [1], [0])`. This is especially important because layouts raised from MLIR always have CTA layouts attached.
1 parent 3c893cf commit 40c9b1c
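
For a concrete picture of what "materialize" means here: the Gluon layout classes appear to be frozen dataclasses whose optional CTA fields previously stayed `None` until lowering, so the two spellings above compared unequal. A minimal standalone sketch of the fix (field names follow the diff; the real class lives in `python/triton/experimental/gluon/language/_layouts.py`):

```python
from dataclasses import dataclass
from typing import List, Optional

@dataclass(frozen=True)
class BlockedLayout:
    size_per_thread: List[int]
    threads_per_warp: List[int]
    warps_per_cta: List[int]
    order: List[int]
    ctas_per_cga: Optional[List[int]] = None
    cta_split_num: Optional[List[int]] = None
    cta_order: Optional[List[int]] = None

    def __post_init__(self):
        # Materialize the default CTA layout in place, so the auto-generated
        # dataclass __eq__ compares identical fields for both spellings.
        # object.__setattr__ is needed because the dataclass is frozen.
        rank = len(self.size_per_thread)
        object.__setattr__(self, "ctas_per_cga", self.ctas_per_cga or [1] * rank)
        object.__setattr__(self, "cta_split_num", self.cta_split_num or [1] * rank)
        object.__setattr__(self, "cta_order", self.cta_order or list(reversed(range(rank))))

# Both spellings now compare equal:
assert BlockedLayout([1], [32], [4], [0]) == BlockedLayout([1], [32], [4], [0], [1], [1], [0])
```

With the defaults materialized eagerly, both spellings carry identical field values, including layouts raised from MLIR, which always arrive with explicit CTA fields.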

File tree

- python/src/gluon_ir.cc
- python/test/gluon/test_frontend.py
- python/triton/experimental/gluon/language/_layouts.py
- python/triton/language/__init__.py

4 files changed: +51 -45 lines changed

python/src/gluon_ir.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ py::object layoutToGluon(Attribute layout) {
130130
return layouts.DistributedLinearLayout(
131131
ll.getBases().lookup(kReg), ll.getBases().lookup(kLane),
132132
ll.getBases().lookup(kWarp), ll.getBases().lookup(kBlock),
133-
ll.getOutDimSizes());
133+
toStdVector(ArrayRef(llvm::to_vector(ll.getOutDimSizes()))));
134134
} else if (auto nvmma = dyn_cast<ttg::NVMMASharedEncodingAttr>(layout)) {
135135
auto ctaLayout = nvmma.getCTALayout();
136136
return layouts.NVMMASharedLayout(
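
Presumably `getOutDimSizes()` returns a lazy range rather than a container here; materializing it with `llvm::to_vector` and converting through `toStdVector` hands the binding layer a plain `std::vector` it can turn into a Python list, which is what the Python-side `DistributedLinearLayout` constructor expects.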

python/test/gluon/test_frontend.py

Lines changed: 17 additions & 8 deletions
@@ -246,8 +246,7 @@ def shared_memory_cast_kernel():
     layout_a: ttgl.constexpr = ttgl.NVMMASharedLayout(swizzle_byte_width=64, transposed=False, element_bitwidth=8,
                                                       rank=2)
     layout_T: ttgl.constexpr = ttgl.NVMMASharedLayout(swizzle_byte_width=64, transposed=True, element_bitwidth=8,
-                                                      rank=2, ctas_per_cga=[1, 1], cta_split_num=[1,
-                                                                                                  1], cta_order=[1, 0])
+                                                      rank=2)
     smem = ttgl.allocate_shared_memory(ttgl.int8, [2, 256, 128], layout_a)
     perm = smem.index(0).permute((1, 0))
     ttgl.static_assert(perm.type.layout == layout_T)

@@ -613,10 +612,10 @@ def kernel():
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 32 : i32} {
   tt.func public @kernel() attributes {noinline = false} {
     %0 = ttg.local_alloc : () -> !ttg.memdesc<32x32xi32, #shared, #smem, mutable>
-    tt.call @"test_frontend.smem_and_layout_user__MDi32S32_32SLSSS_1_1_1_1_0____SSSLAS[32, 32]ASMD__(1,)cconstexpr_SwizzledSharedLayout(vec=1, per_phase=1, max_phase=1, order=(1 ,0), ctas_per_cga=None, cta_split_num=None, cta_order=None)_"(%0) : (!ttg.memdesc<32x32xi32, #shared, #smem, mutable>) -> ()
+    tt.call @"test_frontend.smem_and_layout_user__MDi32S32_32SLSSS_1_1_1_1_0_1_1_1_1_1_0_SSSLAS[32, 32]ASMD__(1,)cconstexpr_SwizzledSharedLayout(vec=1, per_phase=1, max_phase=1, order=(1 ,0), ctas_per_cga=_1, 1_, cta_split_num=_1, 1_, cta_order=_1, 0_)_"(%0) : (!ttg.memdesc<32x32xi32, #shared, #smem, mutable>) -> ()
     tt.return
   }
-  tt.func private @"test_frontend.smem_and_layout_user__MDi32S32_32SLSSS_1_1_1_1_0____SSSLAS[32, 32]ASMD__(1,)cconstexpr_SwizzledSharedLayout(vec=1, per_phase=1, max_phase=1, order=(1 ,0), ctas_per_cga=None, cta_split_num=None, cta_order=None)_"(%arg0: !ttg.memdesc<32x32xi32, #shared, #smem, mutable>) attributes {noinline = false} {
+  tt.func private @"test_frontend.smem_and_layout_user__MDi32S32_32SLSSS_1_1_1_1_0_1_1_1_1_1_0_SSSLAS[32, 32]ASMD__(1,)cconstexpr_SwizzledSharedLayout(vec=1, per_phase=1, max_phase=1, order=(1 ,0), ctas_per_cga=_1, 1_, cta_split_num=_1, 1_, cta_order=_1, 0_)_"(%arg0: !ttg.memdesc<32x32xi32, #shared, #smem, mutable>) attributes {noinline = false} {
     tt.return
   }
 }
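
Note the knock-on effect on the mangled names in the expected IR above: because `SwizzledSharedLayout` now always carries realized CTA fields, its repr mangles to `ctas_per_cga=_1, 1_` and so on instead of `None`, and the realized CTA dims (`_1_1_1_1_1_0_`) show up in the memdesc portion of the name in place of the empty `____` run.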
@@ -855,7 +854,7 @@ def test_tensor_permute():
     a = ttgl.full([32, 16], 0, ttgl.int32, layout=layout)
     # CHECK: tt.trans{{.*}} : tensor<32x16xi32, [[BLOCKED]]> -> tensor<16x32xi32, [[BLOCKED1]]>
     res = ttgl.permute(a, [1, 0])
-    permuted_layout: ttgl.constexpr = ttgl.BlockedLayout([2, 1], [8, 4], [1, 4], [0, 1], [1, 1], [1, 1], [1, 0])
+    permuted_layout: ttgl.constexpr = ttgl.BlockedLayout([2, 1], [8, 4], [1, 4], [0, 1])
     ttgl.static_assert(permuted_layout == res.type.layout)


@@ -869,7 +868,7 @@ def test_split_join():
     b = ttgl.full([128], 2, ttgl.int32, layout)
     # CHECK: tt.join {{.*}} : tensor<128xi32, [[BLOCKED]]> -> tensor<128x2xi32, [[BLOCKED1]]>
     res = ttgl.join(a, b)
-    expect_layout: ttgl.constexpr = ttgl.BlockedLayout([2, 2], [32, 1], [4, 1], [1, 0], [1, 1], [1, 1], [1, 0])
+    expect_layout: ttgl.constexpr = ttgl.BlockedLayout([2, 2], [32, 1], [4, 1], [1, 0])
     ttgl.static_assert(res.type.layout == expect_layout)

     # CHECK: tt.split {{.*}} : tensor<128x2xi32, [[BLOCKED1]]> -> tensor<128xi32, [[BLOCKED]]>
@@ -878,6 +877,17 @@ def test_split_join():
     ttgl.static_assert(d.type.layout == layout)


+@filecheck_test
+@gluon.jit
+def test_reshape_linear_layout():
+    # CHECK: [[BLOCKED:#.*]] = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}>
+    # CHECK: [[LINEAR:#.*]] = #ttg.linear
+    layout: ttgl.constexpr = ttgl.BlockedLayout([1, 1], [32, 1], [4, 1], [0, 1])
+    x = ttgl.full([128, 1], 1, ttgl.int32, layout=layout)
+    # CHECK: tt.reshape %{{.*}} : tensor<128x1xi32, [[BLOCKED]]> -> tensor<128xi32, [[LINEAR]]>
+    x.reshape([128])
+
+
 @filecheck_test
 @gluon.jit
 def test_tensor_reshape():
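
The new `test_reshape_linear_layout` above exercises the other half of this PR: reshaping a `128x1` blocked tensor to `128` yields a `#ttg.linear` encoding, which must then be raised from MLIR back into a Python `DistributedLinearLayout` through the `gluon_ir.cc` path fixed earlier.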
@@ -887,8 +897,7 @@ def test_tensor_reshape():
     a = ttgl.full([256], 1, ttgl.int32, layout)
     # CHECK: tt.reshape {{.*}} : tensor<256xi32, [[BLOCKED]]> -> tensor<8x4x8xi32, [[BLOCKED1]]>
     v = a.reshape([8, 4, 8])
-    expect_layout: ttgl.constexpr = ttgl.BlockedLayout([1, 1, 2], [2, 4, 4], [4, 1, 1], [2, 1, 0], [1, 1, 1], [1, 1, 1],
-                                                       [2, 1, 0])
+    expect_layout: ttgl.constexpr = ttgl.BlockedLayout([1, 1, 2], [2, 4, 4], [4, 1, 1], [2, 1, 0])
     ttgl.static_assert(v.type.layout == expect_layout)

python/triton/experimental/gluon/language/_layouts.py

Lines changed: 31 additions & 34 deletions
@@ -11,11 +11,13 @@
 ]


-def _realize_cta_layout(rank, ctas_per_cga, cta_split_num, cta_order):
-    ctas_per_cga = ctas_per_cga or [1] * rank
-    cta_split_num = cta_split_num or [1] * rank
-    cta_order = cta_order or list(reversed(range(rank)))
-    return ctas_per_cga, cta_split_num, cta_order
+def _realize_cta_layout(layout, rank):
+    ctas_per_cga = layout.ctas_per_cga or [1] * rank
+    cta_split_num = layout.cta_split_num or [1] * rank
+    cta_order = layout.cta_order or list(reversed(range(rank)))
+    object.__setattr__(layout, "ctas_per_cga", ctas_per_cga)
+    object.__setattr__(layout, "cta_split_num", cta_split_num)
+    object.__setattr__(layout, "cta_order", cta_order)


 class DistributedLayout:
@@ -42,25 +44,23 @@ def __post_init__(self):
         super().__setattr__("cta_order", _unwrap_if_constexpr(self.cta_order))

         rank = len(self.size_per_thread)
+        _realize_cta_layout(self, rank)
         assert len(self.threads_per_warp) == rank
         assert len(self.warps_per_cta) == rank
         assert len(self.order) == rank
-        assert self.ctas_per_cga is None or len(self.ctas_per_cga) == rank
-        assert self.cta_split_num is None or len(self.cta_split_num) == rank
-        assert self.cta_order is None or len(self.cta_order) == rank
+        assert len(self.ctas_per_cga) == rank
+        assert len(self.cta_split_num) == rank
+        assert len(self.cta_order) == rank

     def _to_ir(self, builder):
-        rank = len(self.size_per_thread)
-        ctas_per_cga, cta_split_num, cta_order = _realize_cta_layout(rank, self.ctas_per_cga, self.cta_split_num,
-                                                                     self.cta_order)
         return builder.get_blocked_layout(
             self.size_per_thread,
             self.threads_per_warp,
             self.warps_per_cta,
             self.order,
-            ctas_per_cga,
-            cta_split_num,
-            cta_order,
+            self.ctas_per_cga,
+            self.cta_split_num,
+            self.cta_order,
         )

     def mangle(self) -> str:
@@ -161,21 +161,20 @@ def __post_init__(self):
         assert self.element_bitwidth in [8, 16, 32, 64]
         assert self.swizzle_byte_width in [0, 32, 64, 128]
         rank = self.rank
-        assert self.ctas_per_cga is None or len(self.ctas_per_cga) == rank
-        assert self.cta_split_num is None or len(self.cta_split_num) == rank
-        assert self.cta_order is None or len(self.cta_order) == rank
+        _realize_cta_layout(self, rank)
+        assert len(self.ctas_per_cga) == rank
+        assert len(self.cta_split_num) == rank
+        assert len(self.cta_order) == rank

     def _to_ir(self, builder):
-        ctas_per_cga, cta_split_num, cta_order = _realize_cta_layout(self.rank, self.ctas_per_cga, self.cta_split_num,
-                                                                     self.cta_order)
         return builder.get_nvmma_shared_layout(
             self.swizzle_byte_width,
             self.element_bitwidth,
             self.transposed,
             self.fp4_padded,
-            ctas_per_cga,
-            cta_split_num,
-            cta_order,
+            self.ctas_per_cga,
+            self.cta_split_num,
+            self.cta_order,
         )

     def mangle(self) -> str:
@@ -202,22 +201,20 @@ def __post_init__(self):
         super().__setattr__("cta_order", _unwrap_if_constexpr(self.cta_order))

         rank = len(self.order)
-        assert self.ctas_per_cga is None or len(self.ctas_per_cga) == rank
-        assert self.cta_split_num is None or len(self.cta_split_num) == rank
-        assert self.cta_order is None or len(self.cta_order) == rank
+        _realize_cta_layout(self, rank)
+        assert len(self.ctas_per_cga) == rank
+        assert len(self.cta_split_num) == rank
+        assert len(self.cta_order) == rank

     def _to_ir(self, builder):
-        rank = len(self.order)
-        ctas_per_cga, cta_split_num, cta_order = _realize_cta_layout(rank, self.ctas_per_cga, self.cta_split_num,
-                                                                     self.cta_order)
         return builder.get_swizzled_shared_layout(
-            _unwrap_if_constexpr(self.vec),
-            _unwrap_if_constexpr(self.per_phase),
-            _unwrap_if_constexpr(self.max_phase),
+            self.vec,
+            self.per_phase,
+            self.max_phase,
             self.order,
-            ctas_per_cga,
-            cta_split_num,
-            cta_order,
+            self.ctas_per_cga,
+            self.cta_split_num,
+            self.cta_order,
         )

     def mangle(self) -> str:
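
Design-wise, realizing the CTA fields once in `__post_init__` rather than at `_to_ir` time means equality and mangling observe the same canonical values, and `_to_ir` can pass `self.ctas_per_cga` and friends straight through. The `object.__setattr__` calls are needed because these layout classes are presumably frozen dataclasses, as the surrounding `super().__setattr__` usage suggests.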

python/triton/language/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -287,8 +287,8 @@ def str_to_ty(name):

     if name.startswith("tensordesc"):
         inner = name.split("<")[1].rstrip(">")
-        dtype, rest = inner.split("[", maxsplit=2)
-        block_shape, rest = rest.split("]", maxsplit=2)
+        dtype, rest = inner.split("[", maxsplit=1)
+        block_shape, rest = rest.split("]", maxsplit=1)
         block_shape = [int(s.strip()) for s in block_shape.rstrip("]").split(",")]
         layout = rest.lstrip(",")
         is_gluon = len(layout)
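
The `maxsplit` fix matters once the layout portion of a tensordesc type string contains brackets of its own, which is what mangled Gluon layouts introduce. A minimal sketch; the descriptor payload below is made up for illustration and is not the exact mangling format:

```python
# Hypothetical tensordesc payload: dtype, block shape, then a layout that
# itself contains brackets (illustrative, not the real mangling format).
inner = "fp16[16, 16], [1, 0]"

# Old behavior: maxsplit=2 can yield three fragments when the layout also
# contains "[", so two-name tuple unpacking raises ValueError.
try:
    dtype, rest = inner.split("[", maxsplit=2)
except ValueError as exc:
    print("maxsplit=2 fails:", exc)  # too many values to unpack

# New behavior: maxsplit=1 always yields exactly two fragments.
dtype, rest = inner.split("[", maxsplit=1)        # "fp16", "16, 16], [1, 0]"
block_shape, rest = rest.split("]", maxsplit=1)   # "16, 16", ", [1, 0]"
block_shape = [int(s.strip()) for s in block_shape.split(",")]
layout = rest.lstrip(",")                         # " [1, 0]"
print(dtype, block_shape, layout)
```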
