
Commit 4f8712b

[AMD][GLUON] Turn select scale layout into constexpr function (#8673)
Following #8496, this PR changes `get_wmma_scale_layout` / `get_mfma_scale_layout` into `constexpr_function`.
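Because `get_mfma_scale_layout` is now a `constexpr_function`, it is evaluated while the kernel is being compiled instead of being traced through the builder, so it no longer takes a `_semantic` argument. A minimal sketch of how it might be used from a Gluon kernel (the kernel, the constexpr layout argument, and the 128x4 scale shape are illustrative assumptions, not part of this commit):

    from triton.experimental import gluon
    from triton.experimental.gluon import language as ttgl
    from triton.experimental.gluon.language.amd.cdna4 import get_mfma_scale_layout

    @gluon.jit
    def kernel(a_dot_layout: ttgl.constexpr):
        # Hypothetical usage: the helper runs at compile time, so the result
        # can be bound to a constexpr and used to materialize a scale tensor.
        scale_layout: ttgl.constexpr = get_mfma_scale_layout(a_dot_layout, [128, 4])
        a_scale = ttgl.full([128, 4], 0x7F, ttgl.uint8, scale_layout)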
1 parent 14b7d02 · commit 4f8712b

4 files changed: +68 −60

python/src/gluon_ir.cc

Lines changed: 34 additions & 20 deletions

@@ -406,16 +406,6 @@ void init_gluon_ir(py::module &&m) {
                  ctx, version, warpsPerCta, instrShape, transposed, ctaLayout,
                  tilesPerWarp, elementBitWidth);
            })
-      .def("get_amd_mfma_scale_layout",
-           [](GluonOpBuilder &self, unsigned opIdx, std::vector<int64_t> &shape,
-              unsigned mfmaMDim, std::vector<unsigned> &tilesPerWarp,
-              std::vector<unsigned> &warpsPerCTA) -> py::object {
-             auto ctx = self.getContext();
-             auto ll = ttg::chooseScaledMfmaScaleLayout(
-                 ctx, opIdx, shape, mfmaMDim, tilesPerWarp, warpsPerCTA);
-             auto attr = ttg::LinearEncodingAttr::get(ctx, ll);
-             return layoutToGluon(attr);
-           })
       .def("get_amd_wmma_layout",
            [](GluonOpBuilder &self, unsigned version, bool transposed,
               std::vector<unsigned> &warpsPerCta,
@@ -431,16 +421,6 @@ void init_gluon_ir(py::module &&m) {
                                  warpsPerCta, tilesPerWarp,
                                  ctaLayout, instrShape);
            })
-      .def("get_amd_wmma_scale_layout",
-           [](GluonOpBuilder &self, unsigned opIdx, std::vector<int64_t> &shape,
-              unsigned mfmaMDim, std::vector<unsigned> &tilesPerWarp,
-              std::vector<unsigned> &warpsPerCTA) -> py::object {
-             auto ctx = self.getContext();
-             auto ll = ttg::chooseScaledWmmaScaleLayout(
-                 ctx, opIdx, shape, mfmaMDim, tilesPerWarp, warpsPerCTA);
-             auto attr = ttg::LinearEncodingAttr::get(ctx, ll);
-             return layoutToGluon(attr);
-           })
       .def("get_padded_shared_layout",
            [](GluonOpBuilder &self, std::vector<unsigned> &intervals,
               std::vector<unsigned> &paddings,
@@ -913,6 +893,40 @@ void init_gluon_ir(py::module &&m) {
             return layoutToGluon(attr);
           });

+  m.def("get_amd_mfma_scale_layout",
+        [](unsigned opIdx, std::vector<int64_t> &shape, unsigned mfmaMDim,
+           std::vector<unsigned> &tilesPerWarp,
+           std::vector<unsigned> &warpsPerCTA) -> py::object {
+          DialectRegistry registry;
+          registry.insert<triton::TritonDialect, ttg::TritonGPUDialect,
+                          ttng::TritonNvidiaGPUDialect, gluon::GluonDialect>();
+          MLIRContext ctx(MLIRContext::Threading::DISABLED);
+          ctx.appendDialectRegistry(registry);
+          ctx.loadAllAvailableDialects();
+
+          auto ll = ttg::chooseScaledMfmaScaleLayout(
+              &ctx, opIdx, shape, mfmaMDim, tilesPerWarp, warpsPerCTA);
+          auto attr = ttg::LinearEncodingAttr::get(&ctx, ll);
+          return layoutToGluon(attr);
+        });
+
+  m.def("get_amd_wmma_scale_layout",
+        [](unsigned opIdx, std::vector<int64_t> &shape, unsigned wmmaMDim,
+           std::vector<unsigned> &tilesPerWarp,
+           std::vector<unsigned> &warpsPerCTA) -> py::object {
+          DialectRegistry registry;
+          registry.insert<triton::TritonDialect, ttg::TritonGPUDialect,
+                          ttng::TritonNvidiaGPUDialect, gluon::GluonDialect>();
+          MLIRContext ctx(MLIRContext::Threading::DISABLED);
+          ctx.appendDialectRegistry(registry);
+          ctx.loadAllAvailableDialects();
+
+          auto ll = ttg::chooseScaledWmmaScaleLayout(
+              &ctx, opIdx, shape, wmmaMDim, tilesPerWarp, warpsPerCTA);
+          auto attr = ttg::LinearEncodingAttr::get(&ctx, ll);
+          return layoutToGluon(attr);
+        });
+
   py::class_<ttg::WarpSpecializeOp, OpState>(m, "WarpSpecializeOp",
                                              py::module_local())
       .def("get_default_region", &ttg::WarpSpecializeOp::getDefaultRegion,

python/triton/experimental/gluon/language/amd/_ops.py

Lines changed: 6 additions & 4 deletions

@@ -1,3 +1,5 @@
+import math
+
 from triton import knobs
 from triton.experimental.gluon.language import _core as ttgl
 from triton.experimental.gluon.language._semantic import _check
@@ -57,13 +59,13 @@ def _create_and_broadcast_default_scale(op_idx, scale, format):
     operand = a if op_idx == 0 else b
 
     scale_shape = _get_scale_shape(op_idx, operand, format)
-    scale_layout = scale_fn(operand.type.layout, scale_shape, semantic)
-
     if isinstance(scale, ttgl.tensor) and scale.numel.value != 1:
-        assert scale.type.shape == scale_shape, \
-            f"Expect scale tensor to have shape {scale_shape}, but got {scale.type.shape}"
+        # In the case of scale pre-shuffling, the input shape is different from the default shape. We only check
+        # the number of elements here.
+        assert math.prod(scale_shape) == scale.numel.value, "Incompatible scale shape"
         return scale
 
+    scale_layout = scale_fn(operand.type.layout, scale_shape)
     scale_value = _unwrap_if_constexpr(scale)
     scale_value = 0x7F if scale_value is None else scale_value
     return semantic.full(scale_shape, scale_value, ttgl.uint8, scale_layout)
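The shape assertion is relaxed because pre-shuffled scale tensors preserve the element count but not the default 2-D shape. A small illustration of the invariant the new check enforces (both shapes below are hypothetical):

    import math

    default_scale_shape = [128, 4]  # hypothetical default scale shape
    preshuffled_shape = [1, 512]    # hypothetical pre-shuffled shape of the same data

    # The old check compared shapes and would reject the pre-shuffled tensor;
    # the new check only requires matching element counts.
    assert math.prod(preshuffled_shape) == math.prod(default_scale_shape)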

python/triton/experimental/gluon/language/amd/cdna4/__init__.py

Lines changed: 14 additions & 18 deletions

@@ -1,4 +1,7 @@
-from ..._core import builtin, _unwrap_if_constexpr
+from triton.runtime.jit import constexpr_function
+from triton._C.libtriton.gluon_ir import get_amd_mfma_scale_layout as _get_mfma_scale_layout
+
+from ..._core import builtin
 from ..._layouts import DotOperandLayout
 from .._layouts import AMDMFMALayout
 from .._ops import _mma_scaled
@@ -10,19 +13,6 @@
 __all__ = [*__cdna3_all, "async_copy", "mfma_scaled", "get_mfma_scale_layout"]
 
 
-def _get_mfma_scale_layout(dot_operand_layout, shape, semantic):
-    dot_operand_layout = _unwrap_if_constexpr(dot_operand_layout)
-    shape = _unwrap_if_constexpr(shape)
-
-    op_idx = dot_operand_layout.operand_index
-    parent = dot_operand_layout.parent
-    assert isinstance(parent, AMDMFMALayout), "Expected parent to be an instance of AMDMFMALayout"
-    mdim = parent.instr_shape[0]
-    tiles_per_warp = parent.tiles_per_warp
-    warps_per_cta = parent.warps_per_cta
-    return semantic.builder.get_amd_mfma_scale_layout(op_idx, shape, mdim, tiles_per_warp, warps_per_cta)
-
-
 @builtin
 def mfma_scaled(a, a_scale, a_format, b, b_scale, b_format, acc, _semantic=None):
     """
@@ -56,11 +46,11 @@ def mfma_scaled(a, a_scale, a_format, b, b_scale, b_format, acc, _semantic=None)
     assert a_format.value in {"e2m1", "e4m3", "e5m2"}, f"Unsupported lhs_format: {a_format.value}"
     assert b_format.value in {"e2m1", "e4m3", "e5m2"}, f"Unsupported rhs_format: {b_format.value}"
 
-    return _mma_scaled(a, a_scale, a_format, b, b_scale, b_format, acc, _get_mfma_scale_layout, _semantic)
+    return _mma_scaled(a, a_scale, a_format, b, b_scale, b_format, acc, get_mfma_scale_layout, _semantic)
 
 
-@builtin
-def get_mfma_scale_layout(dot_operand_layout, shape, _semantic=None):
+@constexpr_function
+def get_mfma_scale_layout(dot_operand_layout, shape):
     """ Get the scale layout for MFMA scaled operands.
 
     Args:
@@ -70,7 +60,13 @@ def get_mfma_scale_layout(dot_operand_layout, shape, _semantic=None):
     Return:
         layout (DistributedLinearLayout): The scale layout.
     """
-    return _get_mfma_scale_layout(dot_operand_layout, shape, _semantic)
+    op_idx = dot_operand_layout.operand_index
+    parent = dot_operand_layout.parent
+    assert isinstance(parent, AMDMFMALayout), "Expected parent to be an instance of AMDMFMALayout"
+    mdim = parent.instr_shape[0]
+    tiles_per_warp = parent.tiles_per_warp
+    warps_per_cta = parent.warps_per_cta
+    return _get_mfma_scale_layout(op_idx, shape, mdim, tiles_per_warp, warps_per_cta)
 
 
 """

python/triton/experimental/gluon/language/amd/gfx1250/__init__.py

Lines changed: 14 additions & 18 deletions

@@ -1,4 +1,7 @@
-from ..._core import builtin, _unwrap_if_constexpr
+from triton.runtime.jit import constexpr_function
+from triton._C.libtriton.gluon_ir import get_amd_wmma_scale_layout as _get_wmma_scale_layout
+
+from ..._core import builtin
 from .._ops import _wmma, _verify_wmma, _mma_scaled
 from .._layouts import AMDWMMALayout
 from ..cdna3 import buffer_load, buffer_store
@@ -8,19 +11,6 @@
 __all__ = ["async_copy", "tdm", "wmma", "wmma_scaled", "buffer_load", "buffer_store", "get_wmma_scale_layout"]
 
 
-def _get_wmma_scale_layout(dot_operand_layout, shape, semantic):
-    dot_operand_layout = _unwrap_if_constexpr(dot_operand_layout)
-    shape = _unwrap_if_constexpr(shape)
-
-    op_idx = dot_operand_layout.operand_index
-    parent = dot_operand_layout.parent
-    assert isinstance(parent, AMDWMMALayout), "Expected parent to be an instance of AMDMFMALayout"
-    mdim = parent.instr_shape[0]
-    tiles_per_warp = parent.tiles_per_warp
-    warps_per_cta = parent.warps_per_cta
-    return semantic.builder.get_amd_wmma_scale_layout(op_idx, shape, mdim, tiles_per_warp, warps_per_cta)
-
-
 @builtin
 def wmma(a, b, acc, _semantic=None):
     """
@@ -73,11 +63,11 @@ def wmma_scaled(a, a_scale, a_format, b, b_scale, b_format, acc, _semantic=None)
     assert a_format.value in {"e2m1", "e4m3", "e5m2"}, f"Unsupported lhs_format: {a_format.value}"
     assert b_format.value in {"e2m1", "e4m3", "e5m2"}, f"Unsupported rhs_format: {b_format.value}"
 
-    return _mma_scaled(a, a_scale, a_format, b, b_scale, b_format, acc, _get_wmma_scale_layout, _semantic)
+    return _mma_scaled(a, a_scale, a_format, b, b_scale, b_format, acc, get_wmma_scale_layout, _semantic)
 
 
-@builtin
-def get_wmma_scale_layout(dot_operand_layout, shape, _semantic=None):
+@constexpr_function
+def get_wmma_scale_layout(dot_operand_layout, shape):
     """ Get the scale layout for WMMA scaled operands.
 
     Args:
@@ -87,4 +77,10 @@ def get_wmma_scale_layout(dot_operand_layout, shape, _semantic=None):
     Return:
         layout (DistributedLinearLayout): The scale layout.
     """
-    return _get_wmma_scale_layout(dot_operand_layout, shape, _semantic)
+    op_idx = dot_operand_layout.operand_index
+    parent = dot_operand_layout.parent
+    assert isinstance(parent, AMDWMMALayout), "Expected parent to be an instance of AMDWMMALayout"
+    mdim = parent.instr_shape[0]
+    tiles_per_warp = parent.tiles_per_warp
+    warps_per_cta = parent.warps_per_cta
+    return _get_wmma_scale_layout(op_idx, shape, mdim, tiles_per_warp, warps_per_cta)
