[Gluon] Add NVMMASharedLayout constructor with default swizzle choice (#7534)

peterbell10 · web-flow · commit b64e85b64d1e · 2025-07-16T16:13:01.000Z
This mirrors the attribute builder here: https://github.com/triton-lang/triton/blob/1031dc78060fc5f63c3fbcdd04d495d2428bc862/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td#L480
diff --git a/python/triton/experimental/gluon/language/_layouts.py b/python/triton/experimental/gluon/language/_layouts.py
@@ -233,6 +233,15 @@ def type(self):
         return constexpr_type(self)
 
 
+def _get_shape_per_cta(shape, cta_split_num):
+    """Return the per-CTA shape as a new list: `shape[i] // cta_split_num[i]`, or a copy of `shape` if no split."""
+    if cta_split_num is None:
+        return list(shape)
+    assert len(cta_split_num) == len(shape)
+    # Build a fresh list with `//` so the caller's shape is never mutated and the sizes stay integers.
+    return [size // split for size, split in zip(shape, cta_split_num)]
+
+
 @dataclass(frozen=True)
 class NVMMASharedLayout(SharedLayout):
     """
@@ -286,6 +295,47 @@ def _to_ir(self, builder):
             self.cta_order,
         )
 
+    @staticmethod
+    def get_default_for(block_shape, dtype, transposed=False, fp4_padded=False, ctas_per_cga=None, cta_split_num=None,
+                        cta_order=None):
+        """Build an NVMMASharedLayout using the default swizzle width for `block_shape`.
+
+        The widest swizzle (128, 64 or 32 bytes) that evenly divides the byte size
+        of the contiguous dimension is picked, so the fewest TMA or MMA messages
+        are emitted. Swizzling is disabled (0) if none divides it, if the shape has
+        fewer than two dimensions, or if the other dimensions multiply to under 8.
+        """
+        per_cta = _get_shape_per_cta(block_shape, cta_split_num)
+        rank = len(block_shape)
+        if transposed:
+            # Rotate the leading dimension to the end, where the contiguous size is read.
+            per_cta = per_cta[1:] + per_cta[:1]
+        inner_elems = per_cta[-1] * (2 if fp4_padded else 1)
+        inner_bytes = inner_elems * dtype.primitive_bitwidth // 8
+
+        swizzle = 0
+        for candidate in (128, 64, 32):
+            if inner_bytes >= candidate and inner_bytes % candidate == 0:
+                swizzle = candidate
+                break
+
+        outer_size = 1
+        for extent in per_cta[:-1]:
+            outer_size *= extent
+        if rank < 2 or outer_size < 8:
+            swizzle = 0
+
+        return NVMMASharedLayout(
+            swizzle_byte_width=swizzle,
+            element_bitwidth=dtype.primitive_bitwidth,
+            rank=rank,
+            transposed=transposed,
+            fp4_padded=fp4_padded,
+            ctas_per_cga=ctas_per_cga,
+            cta_split_num=cta_split_num,
+            cta_order=cta_order,
+        )
+
     def mangle(self) -> str:
         return f"NVMMA_{self.swizzle_byte_width}_{self.element_bitwidth}_{self.transposed}_{self.fp4_padded}_NVMMA"
 
diff --git a/python/tutorials/gluon/01-attention-forward.py b/python/tutorials/gluon/01-attention-forward.py
@@ -68,36 +68,6 @@ def get_mma_instr_shape(shape, element_ty):
     return (m, n, k)
 
 
-@gl.constexpr_function
-def get_nvmma_layout(shape, element_ty, order=[1, 0], fp4_padded=False):
-    packing_factor = 2 if fp4_padded else 1
-
-    contig_dim_size = shape[order[0]] * packing_factor * element_ty.primitive_bitwidth // 8
-    if contig_dim_size >= 128 and contig_dim_size % 128 == 0:
-        swizzle_byte_width = 128
-    elif contig_dim_size >= 64 and contig_dim_size % 64 == 0:
-        swizzle_byte_width = 64
-    elif contig_dim_size >= 32 and contig_dim_size % 32 == 0:
-        swizzle_byte_width = 32
-    else:
-        swizzle_byte_width = 0
-
-    flatten_outer_dim = 1
-    for i in range(1, len(shape)):
-        flatten_outer_dim *= shape[order[i]]
-    if len(shape) < 2 or flatten_outer_dim < 8:
-        swizzle_byte_width = 0
-    transposed = order[0] == 0
-
-    return gl.NVMMASharedLayout(
-        swizzle_byte_width=swizzle_byte_width,
-        element_bitwidth=element_ty.primitive_bitwidth,
-        rank=len(shape),
-        transposed=transposed,
-        fp4_padded=fp4_padded,
-    )
-
-
 @gl.constexpr_function
 def get_mma_reg_layout(shape, num_warps, dtype=gl.float32):
     instr_shape = get_mma_instr_shape(shape, dtype)
@@ -995,8 +965,8 @@ def torch_dtype_to_triton(dtype):
 
 
 def make_tensor_desc(x, shape, strides, block_shape):
-    layout = get_nvmma_layout(block_shape, torch_dtype_to_triton(x.dtype))
-    return TensorDescriptor(x, shape=shape, strides=strides, block_shape=block_shape, layout=layout.value)
+    layout = gl.NVMMASharedLayout.get_default_for(block_shape, torch_dtype_to_triton(x.dtype))
+    return TensorDescriptor(x, shape=shape, strides=strides, block_shape=block_shape, layout=layout)
 
 
 def attention_forward(q, k, v, causal, sm_scale):