
Commit d016691

[AMD][GLUON] Expose MFMA layout (#7653)
This PR exposes AMDMFMALayout in Gluon so kernel authors can use it for better performance on AMD.

Co-authored-by: peterbell10 <[email protected]>
1 parent 7d0efaa commit d016691
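
For orientation, here is a minimal sketch of how a kernel author might pick up the newly exposed layout. It mirrors the tests added in this commit; the kernel name, tensor shape, and layout parameters are illustrative rather than part of the change, and the top-level imports are assumed to follow the conventions of the existing Gluon tests.

from triton.experimental import gluon
from triton.experimental.gluon import language as ttgl
from triton.experimental.gluon.language.amd import AMDMFMALayout


@gluon.jit
def mfma_example_kernel():
    # Illustrative MFMA layout for a CDNA3-style 32x32 intrinsic with a transposed result.
    mfma_layout: ttgl.constexpr = AMDMFMALayout(version=3, instr_shape=[32, 32], transposed=True,
                                                warps_per_cta=[4, 1], tiles_per_warp=[4, 1],
                                                ctas_per_cga=[1, 1], cta_split_num=[1, 1], cta_order=[1, 0])
    blocked: ttgl.constexpr = ttgl.BlockedLayout([1, 1], [1, 64], [4, 1], [1, 0])
    # Materialize a tensor in a blocked layout, then convert it into the MFMA layout.
    x = ttgl.full([128, 32], 0, ttgl.float32, blocked)
    ttgl.convert_layout(x, mfma_layout)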

File tree

6 files changed: +212 -0 lines changed


.github/workflows/integration-tests-amd.yml

Lines changed: 4 additions & 0 deletions
@@ -115,6 +115,10 @@ jobs:
           if [ ! -d "${INSTRUMENTATION_LIB_DIR}" ]; then
             echo "Could not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
           fi
+
+          # Test gluon
+          pytest --capture=tee-sys -rfs -n 8 python/test/gluon/
+
           pytest --capture=tee-sys -rfs python/tutorials/06-fused-attention.py
           pytest --capture=tee-sys -rfs third_party/amd/python/test/test_extract_slice_concat_op.py
           TRITON_ALWAYS_COMPILE=1 pytest --capture=tee-sys -rfs third_party/amd/python/test/test_scalarize_packed_fops.py

python/src/gluon_ir.cc

Lines changed: 39 additions & 0 deletions
@@ -94,10 +94,14 @@ struct GluonLayouts {
   py::handle NVMMADistributedLayout;
   py::handle NVMMASharedLayout;
   py::handle SwizzledSharedLayout;
+  py::handle AMDMFMALayout;
+  py::handle GluonDType;

   GluonLayouts() {
     auto layouts =
         py::module::import("triton.experimental.gluon.language._layouts");
+    auto amdLayouts =
+        py::module::import("triton.experimental.gluon.language.amd._layouts");
     AutoLayout = py::object(layouts.attr("AutoLayout")).release();
     BlockedLayout = py::object(layouts.attr("BlockedLayout")).release();
     SliceLayout = py::object(layouts.attr("SliceLayout")).release();
@@ -109,6 +113,10 @@ struct GluonLayouts {
     NVMMASharedLayout = py::object(layouts.attr("NVMMASharedLayout")).release();
     SwizzledSharedLayout =
         py::object(layouts.attr("SwizzledSharedLayout")).release();
+    AMDMFMALayout = py::object(amdLayouts.attr("AMDMFMALayout")).release();
+
+    auto core = py::module::import("triton.language.core");
+    GluonDType = py::object(core.attr("dtype")).release();
   }
 };

@@ -186,7 +194,22 @@ py::object layoutToGluon(Attribute layout) {
                           toStdVector(ctaLayout.getCTAOrder()));
   } else if (auto autoEnc = dyn_cast<gluon::AutoEncodingAttr>(layout)) {
     return layouts.AutoLayout();
+  } else if (auto amdMfma = dyn_cast<ttg::AMDMfmaEncodingAttr>(layout)) {
+    auto ctaLayout = amdMfma.getCTALayout();
+    std::vector<unsigned> instrShape{amdMfma.getMDim(), amdMfma.getNDim()};
+    auto isFP32 = !amdMfma.getElementType().has_value() ||
+                  amdMfma.getElementType().value().isF32();
+
+    return layouts.AMDMFMALayout(amdMfma.getVersion(), instrShape,
+                                 amdMfma.getIsTransposed(),
+                                 toStdVector(amdMfma.getWarpsPerCTA()),
+                                 toStdVector(amdMfma.getTilesPerWarp()),
+                                 layouts.GluonDType(isFP32 ? "fp32" : "fp64"),
+                                 toStdVector(ctaLayout.getCTAsPerCGA()),
+                                 toStdVector(ctaLayout.getCTASplitNum()),
+                                 toStdVector(ctaLayout.getCTAOrder()));
   }
+
   throw py::value_error("Unhandled encoding encountered");
 }

@@ -284,6 +307,22 @@ void init_gluon_ir(py::module &&m) {
                  ctx, version[0], version[1], warpsPerCta, ctaLayout,
                  instrShape);
            })
+      .def("get_amd_mfma_layout",
+           [](GluonOpBuilder &self, unsigned version,
+              std::vector<unsigned> &tilesPerWarp,
+              std::vector<unsigned> &warpsPerCta,
+              std::vector<unsigned> &ctasPerCga,
+              std::vector<unsigned> &ctaSplitNum,
+              std::vector<unsigned> &ctaOrder,
+              std::vector<unsigned> &instrShape, bool transposed,
+              mlir::Type elemType) -> Attribute {
+             auto ctx = self.getContext();
+             auto ctaLayout = self.getChecked<ttg::CTALayoutAttr>(
+                 ctx, ctasPerCga, ctaSplitNum, ctaOrder);
+             return ttg::AMDMfmaEncodingAttr::get(
+                 ctx, version, warpsPerCta, tilesPerWarp, instrShape[0],
+                 instrShape[1], transposed, ctaLayout, elemType);
+           })
       .def("get_nvmma_shared_layout",
            [](GluonOpBuilder &self, unsigned swizzleByteWidth,
               unsigned elementBitwidth, bool transposed, bool fp4Padded,

python/test/gluon/test_frontend.py

Lines changed: 91 additions & 0 deletions
@@ -10,6 +10,7 @@
 from triton.experimental.gluon.language.nvidia import hopper
 from triton.experimental.gluon.language.nvidia.blackwell import mbarrier, tma, TensorMemoryLayout, async_copy
 from triton.experimental.gluon.nvidia.hopper import TensorDescriptor
+from triton.experimental.gluon.language.amd import _layouts as amd_layouts
 from triton._filecheck import filecheck_test, run_parser
 from triton.runtime.jit import MockTensor
 import triton.language as tl
@@ -23,6 +24,8 @@
 HOPPER_TARGET = GPUTarget("cuda", 90, 32)
 AMPERE_TARGET = GPUTarget("cuda", 80, 32)
 HIP_TARGET = GPUTarget("hip", "gfx1200", 32)
+HIP_TARGET_CDNA3 = GPUTarget("hip", "gfx942", 64)
+HIP_TARGET_CDNA4 = GPUTarget("hip", "gfx950", 64)

 ALL_TARGETS = [AMPERE_TARGET, HOPPER_TARGET, BLACKWELL_TARGET, HIP_TARGET]

@@ -1338,3 +1341,91 @@ def test_auto_layout_broadcast():
     # CHECK: [[XBCAST2:%.*]] = tt.broadcast [[XCVT2]]
     # CHECK: arith.muli [[YBCAST2]], [[XBCAST2]] : tensor<16x16xi32, [[BLOCKED]]>
     _ = y * x
+
+
+@gluon.jit
+def amd_mfma_layout_kernel():
+    mfma_layout_fp32: ttgl.constexpr = amd_layouts.AMDMFMALayout(version=3, instr_shape=[32, 32], transposed=True,
+                                                                 warps_per_cta=[4, 1], tiles_per_warp=[4, 1],
+                                                                 ctas_per_cga=[1, 1], cta_split_num=[1, 1],
+                                                                 cta_order=[1, 0])
+
+    mfma_layout_fp64: ttgl.constexpr = amd_layouts.AMDMFMALayout(version=3, instr_shape=[16, 16], transposed=True,
+                                                                 warps_per_cta=[4, 1], tiles_per_warp=[4, 1],
+                                                                 elem_type=ttgl.float64, ctas_per_cga=[1, 1],
+                                                                 cta_split_num=[1, 1], cta_order=[1, 0])
+
+    layout: ttgl.constexpr = ttgl.BlockedLayout([1, 1], [1, 64], [4, 1], [1, 0])
+
+    x_fp32 = ttgl.full([128, 32], 0, ttgl.float32, layout)
+    x_fp64 = ttgl.full([128, 32], 0, ttgl.float64, layout)
+
+    ttgl.convert_layout(x_fp32, mfma_layout_fp32)
+    ttgl.convert_layout(x_fp64, mfma_layout_fp64)
+
+
+@pytest.mark.parametrize("target", [HIP_TARGET_CDNA3, HIP_TARGET_CDNA4])
+def test_amd_mfma_layout(target):
+
+    module = run_parser(amd_mfma_layout_kernel, target=target)
+    expecttest.assert_expected_inline(
+        anonymize_ir(module.str_nodebug()), """\
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}>
+#mma = #ttg.amd_mfma<{version = 3, warpsPerCTA = [4, 1], tilesPerWarp = [4, 1], instrShape = [32, 32], isTransposed = true}>
+#mma1 = #ttg.amd_mfma<{version = 3, warpsPerCTA = [4, 1], tilesPerWarp = [4, 1], instrShape = [16, 16], isTransposed = true, elementType = f64}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @amd_mfma_layout_kernel() attributes {noinline = false} {
+    %cst = arith.constant 0.000000e+00 : f32
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x32xf32, #blocked>
+    %cst_1 = arith.constant 0.000000e+00 : f64
+    %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x32xf64, #blocked>
+    %0 = ttg.convert_layout %cst_0 : tensor<128x32xf32, #blocked> -> tensor<128x32xf32, #mma>
+    %1 = ttg.convert_layout %cst_2 : tensor<128x32xf64, #blocked> -> tensor<128x32xf64, #mma1>
+    tt.return
+  }
+}
+""")
+
+
+@gluon.jit
+def add_fp(a, b):
+    return a + b
+
+
+@gluon.jit
+def infer_layout_for_amd_mfma_kernel():
+    layout: ttgl.constexpr = amd_layouts.AMDMFMALayout(version=3, instr_shape=[32, 32], transposed=True,
+                                                       warps_per_cta=[4, 1], tiles_per_warp=[4, 1], ctas_per_cga=[1, 1],
+                                                       cta_split_num=[1, 1], cta_order=[1, 0])
+    a = ttgl.full([128, 32], 1.0, ttgl.float32, layout)
+    b = ttgl.reduce(a, 1, add_fp)
+    ttgl.static_assert(b.type.layout == ttgl.SliceLayout(1, layout))
+
+
+@pytest.mark.parametrize("target", [HIP_TARGET_CDNA3, HIP_TARGET_CDNA4])
+def test_infer_layout_for_amd_mfma(target):
+    module = run_parser(infer_layout_for_amd_mfma_kernel, target=target)
+    expecttest.assert_expected_inline(
+        anonymize_ir(module.str_nodebug()), """\
+#mma = #ttg.amd_mfma<{version = 3, warpsPerCTA = [4, 1], tilesPerWarp = [4, 1], instrShape = [32, 32], isTransposed = true}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @infer_layout_for_amd_mfma_kernel() attributes {noinline = false} {
+    %cst = arith.constant 1.000000e+00 : f32
+    %cst_0 = arith.constant dense<1.000000e+00> : tensor<128x32xf32, #mma>
+    %0 = "tt.reduce"(%cst_0) <{axis = 1 : i32}> ({
+    ^bb0(%arg0: f32, %arg1: f32):
+      %1 = tt.call @test_frontend.add_fp__fp32_fp32__(%arg0, %arg1) : (f32, f32) -> f32
+      tt.reduce.return %1 : f32
+    }) : (tensor<128x32xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>
+    tt.return
+  }
+  tt.func private @test_frontend.add_fp__fp32_fp32__(%arg0: f32, %arg1: f32) -> f32 attributes {noinline = false} {
+    %0 = arith.addf %arg0, %arg1 : f32
+    tt.return %0 : f32
+  ^bb1:  // no predecessors
+    %1 = ub.poison : f32
+    tt.return %1 : f32
+  }
+}
+""")

python/triton/experimental/gluon/language/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -8,11 +8,13 @@
 from ._standard import __all__ as __standard_all

 from . import nvidia
+from . import amd

 __all__ = [
     *__core_all,
     *__layouts_all,
     *__math_all,
     *__standard_all,
     "nvidia",
+    "amd",
 ]
python/triton/experimental/gluon/language/amd/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+from ._layouts import AMDMFMALayout
+
+__all__ = ["AMDMFMALayout"]
python/triton/experimental/gluon/language/amd/_layouts.py

Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import List
+from triton.language.core import _unwrap_if_constexpr
+
+from triton.experimental.gluon.language._layouts import _realize_cta_layout, DistributedLayout
+from triton.experimental.gluon import language as ttgl
+
+__all__ = [
+    "AMDMFMALayout",
+]
+
+
+@dataclass(frozen=True)
+class AMDMFMALayout(DistributedLayout):
+    """
+    Represents a layout for AMD MFMA (matrix core) operations.
+
+    Args:
+        version (int): Major and minor identifier for the MFMA instruction.
+        instr_shape (List[int]): (M, N) dimensions of the intrinsic shape.
+        transposed (bool): Indicates that the result tensor is transposed so that each thread holds consecutive elements in the same row instead of the same column, which benefits chained dots and global writes.
+        warps_per_cta (List[int]): Number of warps per CTA.
+        tiles_per_warp (List[int]): Number of tiles per warp.
+        elem_type (ttgl.dtype): Element type, fp32 or fp64.
+        ctas_per_cga (Optional[List[int]]): CTAs per CGA grouping.
+        cta_split_num (Optional[List[int]]): Split factors for CTAs.
+        cta_order (Optional[List[int]]): CTA ordering.
+    """
+    version: int
+    instr_shape: List[int]
+    transposed: bool
+    warps_per_cta: List[int]
+    tiles_per_warp: List[int]
+    elem_type: ttgl.dtype = ttgl.float32
+    ctas_per_cga: List[int] | None = None
+    cta_split_num: List[int] | None = None
+    cta_order: List[int] | None = None
+
+    def __post_init__(self):
+        super().__setattr__("version", _unwrap_if_constexpr(self.version))
+        super().__setattr__("instr_shape", _unwrap_if_constexpr(self.instr_shape))
+        super().__setattr__("transposed", _unwrap_if_constexpr(self.transposed))
+        super().__setattr__("warps_per_cta", _unwrap_if_constexpr(self.warps_per_cta))
+        super().__setattr__("tiles_per_warp", _unwrap_if_constexpr(self.tiles_per_warp))
+        super().__setattr__("elem_type", _unwrap_if_constexpr(self.elem_type))
+        super().__setattr__("ctas_per_cga", _unwrap_if_constexpr(self.ctas_per_cga))
+        super().__setattr__("cta_split_num", _unwrap_if_constexpr(self.cta_split_num))
+        super().__setattr__("cta_order", _unwrap_if_constexpr(self.cta_order))
+
+        assert self.elem_type.is_fp32() or self.elem_type.is_fp64(
+        ), "The element type in AMDMFMALayout should be float32 or float64"
+
+        rank = len(self.cta_order)
+        _realize_cta_layout(self, rank)
+        assert len(self.ctas_per_cga) == rank
+        assert len(self.cta_split_num) == rank
+        assert len(self.cta_order) == rank
+
+    def _to_ir(self, builder):
+        type = builder.get_float_ty() if self.elem_type is ttgl.float32 else builder.get_double_ty()
+        return builder.get_amd_mfma_layout(self.version, self.tiles_per_warp, self.warps_per_cta, self.ctas_per_cga,
+                                           self.cta_split_num, self.cta_order, self.instr_shape, self.transposed, type)
+
+    def mangle(self) -> str:
+
+        def stringify(x):
+            if x is None:
+                return ""
+            return "_".join(map(str, x))
+
+        return f"MFMA_{self.version}_{stringify(self.instr_shape)}_{self.transposed}_{stringify(self.warps_per_cta)}_{stringify(self.tiles_per_warp)}_{self.elem_type}_{stringify(self.ctas_per_cga)}_{stringify(self.cta_split_num)}_{stringify(self.cta_order)}_MFMA"
