
Commit 4e7dc91

[AMD][GLUON] Expose WMMA for RDNA3 and RDNA4 (triton-lang#8111)
This PR exposes the WMMA instruction for RDNA3 and RDNA4. Similar to `amd.cdna4.mfma`, both are wrappers around `tl.dot` with additional checks on the input layouts. It also adds a WMMA kernel test, which has been run on a Radeon RX 9070 XT (RDNA4) and a Radeon RX 7900 XTX (RDNA3).
1 parent 8a5d1ee commit 4e7dc91
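
For orientation, a minimal sketch of the new API, condensed from the frontend test added below (RDNA4 variant shown; on RDNA3 the call is `ttgl.amd.rdna3.wmma` with `version=1` and operand kWidth 16). The import paths follow the convention used in the Gluon tests.

from triton.experimental import gluon
from triton.experimental.gluon import language as ttgl


@gluon.jit
def kernel():
    # The accumulator carries the WMMA layout; each operand carries a
    # DotOperandLayout whose parent is that WMMA layout (kWidth 8 on RDNA4).
    wmma_layout: ttgl.constexpr = ttgl.amd.AMDWMMALayout(version=2, transposed=True, warps_per_cta=[4, 1])
    a = ttgl.full([64, 64], 1.0, ttgl.float16, layout=ttgl.DotOperandLayout(0, wmma_layout, 8))
    b = ttgl.full([64, 64], 2.0, ttgl.float16, layout=ttgl.DotOperandLayout(1, wmma_layout, 8))
    acc = ttgl.full([64, 64], 0.0, ttgl.float32, layout=wmma_layout)
    acc = ttgl.amd.rdna4.wmma(a, b, acc)  # thin wrapper around tl.dot with layout checks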

File tree: 5 files changed, +210 -1 lines changed


python/test/gluon/test_core.py

Lines changed: 63 additions & 0 deletions
@@ -8,6 +8,8 @@
 from triton._internal_testing import (
     is_ampere_or_newer,
     is_blackwell,
+    is_hip_gfx11,
+    is_hip_gfx12,
     is_hip_cdna3,
     is_hip_cdna4,
     is_hopper_or_newer,
@@ -202,6 +204,67 @@ def kernel(a_ptr, b_ptr, use_buffer_load: ttgl.constexpr):
     assert 'vmcnt(0)' in pgm.asm['amdgcn']


+@pytest.mark.skipif(not (is_hip_gfx11() or is_hip_gfx12()), reason="Requires RDNA3 or RDNA4")
+@pytest.mark.parametrize("M, N, K", [(64, 64, 64)])
+@pytest.mark.parametrize("in_dtype", ['float16', 'bfloat16'])
+def test_amd_wmma(M, N, K, in_dtype):
+
+    @gluon.jit
+    def kernel(a_ptr, b_ptr, c_ptr,  #
+               stride_am, stride_ak,  #
+               stride_bk, stride_bn,  #
+               stride_cm, stride_cn,  #
+               BLOCK_SIZE_M: ttgl.constexpr,  #
+               BLOCK_SIZE_N: ttgl.constexpr,  #
+               BLOCK_SIZE_K: ttgl.constexpr,  #
+               BLOCKED_LAYOUT: ttgl.constexpr,  #
+               WMMA_LAYOUT: ttgl.constexpr,  #
+               K_WIDTH: ttgl.constexpr):
+        offs_am = ttgl.arange(0, BLOCK_SIZE_M, layout=ttgl.SliceLayout(1, BLOCKED_LAYOUT))
+        offs_bn = ttgl.arange(0, BLOCK_SIZE_N, layout=ttgl.SliceLayout(0, BLOCKED_LAYOUT))
+
+        offs_ak = ttgl.arange(0, BLOCK_SIZE_K, layout=ttgl.SliceLayout(0, BLOCKED_LAYOUT))
+        offs_bk = ttgl.arange(0, BLOCK_SIZE_K, layout=ttgl.SliceLayout(1, BLOCKED_LAYOUT))
+
+        offs_a = offs_am[:, None] * stride_am + offs_ak[None, :] * stride_ak
+        offs_b = offs_bk[:, None] * stride_bk + offs_bn[None, :] * stride_bn
+
+        a = ttgl.load(a_ptr + offs_a)
+        b = ttgl.load(b_ptr + offs_b)
+
+        a = ttgl.convert_layout(a, layout=ttgl.DotOperandLayout(0, WMMA_LAYOUT, K_WIDTH))
+        b = ttgl.convert_layout(b, layout=ttgl.DotOperandLayout(1, WMMA_LAYOUT, K_WIDTH))
+
+        acc = ttgl.zeros([BLOCK_SIZE_M, BLOCK_SIZE_N], ttgl.float32, WMMA_LAYOUT)
+        if WMMA_LAYOUT.version == 1:
+            c = ttgl.amd.rdna3.wmma(a, b, acc)
+        else:
+            ttgl.static_assert(WMMA_LAYOUT.version == 2, "WMMA_LAYOUT.version must be 1 or 2")
+            c = ttgl.amd.rdna4.wmma(a, b, acc)
+        c = c.to(a_ptr.dtype.element_ty)
+
+        offs_cm = ttgl.arange(0, BLOCK_SIZE_M, layout=ttgl.SliceLayout(1, WMMA_LAYOUT))
+        offs_cn = ttgl.arange(0, BLOCK_SIZE_N, layout=ttgl.SliceLayout(0, WMMA_LAYOUT))
+        offs_c = offs_cm[:, None] * stride_cm + offs_cn[None, :] * stride_cn
+        ttgl.store(c_ptr + offs_c, c)
+
+    elem_type = torch.float16 if in_dtype == 'float16' else torch.bfloat16
+    a = torch.randn((M, K), device='cuda', dtype=elem_type)
+    b = torch.randn((K, N), device='cuda', dtype=elem_type)
+    c = torch.empty((M, N), device=a.device, dtype=elem_type)
+
+    blocked = ttgl.BlockedLayout([1, 8], [4, 8], [4, 1], [1, 0])
+    wmma_version = 1 if is_hip_gfx11() else 2
+    k_width = 16 if is_hip_gfx11() else 8
+    wmma = ttgl.amd.AMDWMMALayout(wmma_version, True, [2, 2])
+    kernel[1, 1](a, b, c, a.stride(0), a.stride(1), b.stride(0), b.stride(1), c.stride(0), c.stride(1), BLOCK_SIZE_M=M,
+                 BLOCK_SIZE_N=N, BLOCK_SIZE_K=K, BLOCKED_LAYOUT=blocked, WMMA_LAYOUT=wmma, K_WIDTH=k_width, num_warps=4)
+
+    ref = torch.matmul(a, b)
+    triton_output = c
+    torch.testing.assert_close(ref, triton_output)
+
+
 @pytest.mark.parametrize("M, N, K", [(32, 32, 16), (16, 16, 32)])
 @pytest.mark.parametrize("in_dtype", ['float16', 'bfloat16'])
 @pytest.mark.parametrize("num_warps", [4, 8])
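
If it helps when reproducing this on RDNA hardware, the new test can be selected on its own; a minimal sketch using pytest's Python entry point, with the file path taken from the header above:

# Run only the new WMMA test from this file; it requires an RDNA3/RDNA4 GPU,
# otherwise the skipif marker above skips it.
import pytest

pytest.main(["python/test/gluon/test_core.py", "-k", "test_amd_wmma", "-v"])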

python/test/gluon/test_frontend.py

Lines changed: 73 additions & 0 deletions
@@ -26,6 +26,7 @@
 BLACKWELL_TARGET = GPUTarget("cuda", 100, 32)
 HOPPER_TARGET = GPUTarget("cuda", 90, 32)
 AMPERE_TARGET = GPUTarget("cuda", 80, 32)
+HIP_TARGET_RDNA3 = GPUTarget("hip", "gfx1100", 32)
 HIP_TARGET_RDNA4 = GPUTarget("hip", "gfx1200", 32)
 HIP_TARGET_CDNA3 = GPUTarget("hip", "gfx942", 64)
 HIP_TARGET_CDNA4 = GPUTarget("hip", "gfx950", 64)
@@ -2129,6 +2130,78 @@ def test_buffer_load_store_with_broadcast(target):
     """)


+@pytest.mark.parametrize("target", [HIP_TARGET_RDNA3])
+def test_amd_rdna3_wmma(target):
+
+    @gluon.jit
+    def kernel():
+        wmma_layout: ttgl.constexpr = ttgl.amd.AMDWMMALayout(version=1, transposed=True, warps_per_cta=[4, 1])
+
+        a = ttgl.full([64, 64], 1.0, ttgl.float16, layout=ttgl.DotOperandLayout(0, wmma_layout, 16))
+        b = ttgl.full([64, 64], 2.0, ttgl.float16, layout=ttgl.DotOperandLayout(1, wmma_layout, 16))
+
+        acc = ttgl.full([64, 64], 0.0, ttgl.float32, layout=wmma_layout)
+        acc = ttgl.amd.rdna3.wmma(a, b, acc)
+
+        ttgl.static_assert(isinstance(acc, ttgl.tensor))
+        ttgl.static_assert(acc.type.layout == wmma_layout)
+
+    module = run_parser(kernel, target=target)
+    expecttest.assert_expected_inline(
+        anonymize_ir(module.str_nodebug()), """\
+#mma = #ttg.amd_wmma<{version = 1, isTranspose = true, warpsPerCTA = [4, 1]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @kernel() attributes {noinline = false} {
+    %cst = arith.constant 1.000000e+00 : f16
+    %cst_0 = arith.constant dense<1.000000e+00> : tensor<64x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 16}>>
+    %cst_1 = arith.constant 2.000000e+00 : f16
+    %cst_2 = arith.constant dense<2.000000e+00> : tensor<64x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 16}>>
+    %cst_3 = arith.constant 0.000000e+00 : f32
+    %cst_4 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #mma>
+    %cst_5 = arith.constant 0.000000e+00 : f32
+    %0 = tt.dot %cst_0, %cst_2, %cst_4 : tensor<64x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 16}>> * tensor<64x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 16}>> -> tensor<64x64xf32, #mma>
+    tt.return
+  }
+}
+""")
+
+
+@pytest.mark.parametrize("target", [HIP_TARGET_RDNA4])
+def test_amd_rdna4_wmma(target):
+
+    @gluon.jit
+    def kernel():
+        wmma_layout: ttgl.constexpr = ttgl.amd.AMDWMMALayout(version=2, transposed=True, warps_per_cta=[4, 1])
+
+        a = ttgl.full([64, 64], 1.0, ttgl.float16, layout=ttgl.DotOperandLayout(0, wmma_layout, 8))
+        b = ttgl.full([64, 64], 2.0, ttgl.float16, layout=ttgl.DotOperandLayout(1, wmma_layout, 8))
+
+        acc = ttgl.full([64, 64], 0.0, ttgl.float32, layout=wmma_layout)
+        acc = ttgl.amd.rdna4.wmma(a, b, acc)
+
+        ttgl.static_assert(isinstance(acc, ttgl.tensor))
+        ttgl.static_assert(acc.type.layout == wmma_layout)
+
+    module = run_parser(kernel, target=target)
+    expecttest.assert_expected_inline(
+        anonymize_ir(module.str_nodebug()), """\
+#mma = #ttg.amd_wmma<{version = 2, isTranspose = true, warpsPerCTA = [4, 1]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 32 : i32} {
+  tt.func public @kernel() attributes {noinline = false} {
+    %cst = arith.constant 1.000000e+00 : f16
+    %cst_0 = arith.constant dense<1.000000e+00> : tensor<64x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>
+    %cst_1 = arith.constant 2.000000e+00 : f16
+    %cst_2 = arith.constant dense<2.000000e+00> : tensor<64x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>
+    %cst_3 = arith.constant 0.000000e+00 : f32
+    %cst_4 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #mma>
+    %cst_5 = arith.constant 0.000000e+00 : f32
+    %0 = tt.dot %cst_0, %cst_2, %cst_4 : tensor<64x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>> * tensor<64x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>> -> tensor<64x64xf32, #mma>
+    tt.return
+  }
+}
+""")
+
+
 @pytest.mark.parametrize("target", [HIP_TARGET_CDNA3, HIP_TARGET_CDNA4])
 def test_amd_mfma(target):

Lines changed: 2 additions & 1 deletion
@@ -1,4 +1,5 @@
 from ._layouts import AMDMFMALayout, AMDWMMALayout
 from . import cdna3, cdna4
+from . import rdna3, rdna4

-__all__ = ["AMDMFMALayout", "AMDWMMALayout", "cdna3", "cdna4"]
+__all__ = ["AMDMFMALayout", "AMDWMMALayout", "cdna3", "cdna4", "rdna3", "rdna4"]
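
With these exports in place, the new wrappers sit next to the existing CDNA helpers in the Gluon language namespace. A quick sanity check (import path assumed to follow the convention used in the Gluon tests):

from triton.experimental.gluon import language as ttgl

print(ttgl.amd.rdna3.wmma)   # version-1 WMMA wrapper (RDNA3)
print(ttgl.amd.rdna4.wmma)   # version-2 WMMA wrapper (RDNA4)
print(ttgl.amd.cdna4.mfma)   # pre-existing MFMA wrapper mentioned in the commit message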
Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
+from triton import knobs
+from triton.experimental.gluon.language import _core as ttgl
+from triton.experimental.gluon.language._semantic import _check
+
+from .._layouts import AMDWMMALayout
+from ..._layouts import DotOperandLayout
+from ..._core import builtin
+
+__all__ = ["wmma"]
+
+
+@builtin
+def wmma(a, b, acc, _semantic=None):
+    """
+    Computes matrix-multiplication of a * b + acc using AMD WMMA instruction.
+
+    Args:
+        a (tensor): The operand a to be multiplied.
+        b (tensor): The operand b to be multiplied.
+        acc (tensor): The accumulator tensor.
+    """
+    _check(acc is not None, lambda: "acc is required")
+    layout = acc.type.layout
+    _check(
+        isinstance(layout, AMDWMMALayout) and layout.version == 1,
+        lambda: "Expected layout to be an instance of AMDWMMALayout with version 1")
+    _check(
+        isinstance(a.type.layout, DotOperandLayout) and a.type.layout.parent == layout,
+        lambda: "Expected a's layout to be a DotOperandLayout with parent matching AMDWMMALayout")
+    _check(
+        isinstance(b.type.layout, DotOperandLayout) and b.type.layout.parent == layout,
+        lambda: "Expected b's layout to be a DotOperandLayout with parent matching AMDWMMALayout")
+
+    handle = _semantic.dot(a, b, acc, input_precision=knobs.language.fp32_default, max_num_imprecise_acc=None,
+                           out_dtype=acc.dtype).handle
+    return ttgl.tensor(handle, acc.type)
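
For reference, a small sketch of a layout pairing that satisfies the checks above for the version-1 (RDNA3) path, matching the kWidth used by the frontend test; the values are illustrative, not the only valid configuration:

from triton.experimental.gluon import language as ttgl

# The accumulator carries the WMMA layout itself; each operand carries a
# DotOperandLayout whose parent is that same WMMA layout.
wmma_v1 = ttgl.amd.AMDWMMALayout(version=1, transposed=True, warps_per_cta=[4, 1])
a_layout = ttgl.DotOperandLayout(0, wmma_v1, 16)   # operand A, kWidth 16 on RDNA3
b_layout = ttgl.DotOperandLayout(1, wmma_v1, 16)   # operand B
acc_layout = wmma_v1                               # accumulator

The rdna4 module below is identical except that it checks for `layout.version == 2` (and the tests use kWidth 8 there).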
Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
+from triton import knobs
+from triton.experimental.gluon.language import _core as ttgl
+from triton.experimental.gluon.language._semantic import _check
+
+from .._layouts import AMDWMMALayout
+from ..._layouts import DotOperandLayout
+from ..._core import builtin
+
+__all__ = ["wmma"]
+
+
+@builtin
+def wmma(a, b, acc, _semantic=None):
+    """
+    Computes matrix-multiplication of a * b + acc using AMD WMMA instruction.
+
+    Args:
+        a (tensor): The operand a to be multiplied.
+        b (tensor): The operand b to be multiplied.
+        acc (tensor): The accumulator tensor.
+    """
+    _check(acc is not None, lambda: "acc is required")
+    layout = acc.type.layout
+    _check(
+        isinstance(layout, AMDWMMALayout) and layout.version == 2,
+        lambda: "Expected layout to be an instance of AMDWMMALayout with version 2")
+    _check(
+        isinstance(a.type.layout, DotOperandLayout) and a.type.layout.parent == layout,
+        lambda: "Expected a's layout to be a DotOperandLayout with parent matching AMDWMMALayout")
+    _check(
+        isinstance(b.type.layout, DotOperandLayout) and b.type.layout.parent == layout,
+        lambda: "Expected b's layout to be a DotOperandLayout with parent matching AMDWMMALayout")
+
+    handle = _semantic.dot(a, b, acc, input_precision=knobs.language.fp32_default, max_num_imprecise_acc=None,
+                           out_dtype=acc.dtype).handle
+    return ttgl.tensor(handle, acc.type)
