@@ -422,3 +422,192 @@ def test_runtime_tensor_copy(BLOCK_M, BLOCK_N):
    b_triton = b_device.cpu()
    assert torch.equal(b_triton, a)


@gluon.jit
def mxgemm_kernel(a_ptr, b_ptr, c_ptr, a_scale, b_scale, M, N, K, stride_am, stride_ak, stride_bk, stride_bn,
                  stride_cm, stride_cn, stride_scale, DTYPE_A: ttgl.constexpr, DTYPE_B: ttgl.constexpr,
                  SCALE_BLOCK: ttgl.constexpr, BLOCK_M: ttgl.constexpr, BLOCK_N: ttgl.constexpr,
                  BLOCK_K: ttgl.constexpr, GROUP_SIZE_M: ttgl.constexpr):
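    # e2m1 (mxfp4) packs two 4-bit values per byte, so the packed K extent of an
    # operand is half its logical K; the fp8 formats store one value per byte.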
    DIV_FACTOR_A: ttgl.constexpr = 2 if DTYPE_A == "e2m1" else 1
    DIV_FACTOR_B: ttgl.constexpr = 2 if DTYPE_B == "e2m1" else 1
    BLOCK_K_SCALE: ttgl.constexpr = BLOCK_K // SCALE_BLOCK
    BLOCK_K_PACKED_A: ttgl.constexpr = BLOCK_K // DIV_FACTOR_A
    BLOCK_K_PACKED_B: ttgl.constexpr = BLOCK_K // DIV_FACTOR_B

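    # Layouts: blocked layouts for the global loads, WMMA layouts for the scaled
    # matrix-core op (the "packed" variant halves the K instr shape for mxfp4),
    # and linear layouts describing how scale elements map to registers/lanes/warps.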
    BLOCKED_LAYOUT: ttgl.constexpr = ttgl.BlockedLayout([1, 1], [8, 4], [4, 1], [1, 0])
    A_BLOCKED_LAYOUT: ttgl.constexpr = ttgl.BlockedLayout([1, 16], [8, 4], [4, 1], [1, 0])
    B_BLOCKED_LAYOUT: ttgl.constexpr = ttgl.BlockedLayout([1, 16], [16, 2], [4, 1], [1, 0])

    WMMA_LAYOUT: ttgl.constexpr = ttgl.amd.AMDWMMALayout(3, transposed=True, warps_per_cta=[2, 2],
                                                         instr_shape=[16, 16, 128])
    WMMA_LAYOUT_PACKED: ttgl.constexpr = ttgl.amd.AMDWMMALayout(3, transposed=True, warps_per_cta=[2, 2],
                                                                instr_shape=[16, 16, 64])
    A_SCALE_LINEAR_LAYOUT: ttgl.constexpr = ttgl.DistributedLinearLayout(
        reg_bases=[[0, 1], [0, 2]], lane_bases=[[1, 0], [2, 0], [4, 0], [8, 0], [0, 0]], warp_bases=[[0, 0], [16, 0]],
        block_bases=[], shape=[32, 4])
    B_SCALE_LINEAR_LAYOUT: ttgl.constexpr = ttgl.DistributedLinearLayout(
        reg_bases=[[0, 1], [0, 2]], lane_bases=[[1, 0], [2, 0], [4, 0], [8, 0], [0, 0]], warp_bases=[[16, 0], [0, 0]],
        block_bases=[], shape=[32, 4])

    DOT_LAYOUT_A: ttgl.constexpr = ttgl.DotOperandLayout(
        operand_index=0, parent=WMMA_LAYOUT_PACKED if DTYPE_A == "e2m1" else WMMA_LAYOUT, k_width=16)
    DOT_LAYOUT_B: ttgl.constexpr = ttgl.DotOperandLayout(
        operand_index=1, parent=WMMA_LAYOUT_PACKED if DTYPE_B == "e2m1" else WMMA_LAYOUT, k_width=16)

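    # Standard grouped-ordering pid swizzle, as in the Triton matmul tutorial.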
    pid = ttgl.program_id(axis=0)
    num_pid_m = ttgl.cdiv(M, BLOCK_M)
    num_pid_n = ttgl.cdiv(N, BLOCK_N)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

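    # A/B offsets index the packed K extent; scale offsets index K / SCALE_BLOCK.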
    offs_am = (pid_m * BLOCK_M + ttgl.arange(0, BLOCK_M, layout=ttgl.SliceLayout(1, A_BLOCKED_LAYOUT))) % M
    offs_ak = ttgl.arange(0, BLOCK_K_PACKED_A, layout=ttgl.SliceLayout(0, A_BLOCKED_LAYOUT))
    offs_bk = ttgl.arange(0, BLOCK_K_PACKED_B, layout=ttgl.SliceLayout(1, B_BLOCKED_LAYOUT))
    offs_bn = (pid_n * BLOCK_N + ttgl.arange(0, BLOCK_N, layout=ttgl.SliceLayout(0, B_BLOCKED_LAYOUT))) % N

    offs_scale_am = (pid_m * BLOCK_M + ttgl.arange(0, BLOCK_M, layout=ttgl.SliceLayout(1, BLOCKED_LAYOUT))) % M
    offs_scale_ak = ttgl.arange(0, BLOCK_K_SCALE, layout=ttgl.SliceLayout(0, BLOCKED_LAYOUT))
    offs_scale_bn = (pid_n * BLOCK_N + ttgl.arange(0, BLOCK_N, layout=ttgl.SliceLayout(1, BLOCKED_LAYOUT))) % N
    offs_scale_bk = ttgl.arange(0, BLOCK_K_SCALE, layout=ttgl.SliceLayout(0, BLOCKED_LAYOUT))

    a_scale_ptr = a_scale + offs_scale_am[:, None] * stride_scale + offs_scale_ak[None, :]
    b_scale_ptr = b_scale + offs_scale_bn[:, None] * stride_scale + offs_scale_bk[None, :]
    a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_ak[None, :] * stride_ak)
    b_ptrs = b_ptr + (offs_bk[:, None] * stride_bk + offs_bn[None, :] * stride_bn)

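    # Main K loop: load tiles and their scales, convert them to the operand
    # layouts expected by the scaled WMMA op, and accumulate in fp32.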
    accumulator = ttgl.zeros((BLOCK_M, BLOCK_N), dtype=ttgl.float32, layout=WMMA_LAYOUT)
    for k in range(0, ttgl.cdiv(K, BLOCK_K)):
        k_remaining_a = K - k * BLOCK_K_PACKED_A
        k_remaining_b = K - k * BLOCK_K_PACKED_B
        valid_k_a = offs_ak < k_remaining_a
        valid_k_b = offs_bk < k_remaining_b

        scale_a = ttgl.load(a_scale_ptr)
        scale_b = ttgl.load(b_scale_ptr)
        scale_a = ttgl.convert_layout(scale_a, A_SCALE_LINEAR_LAYOUT)
        scale_b = ttgl.convert_layout(scale_b, B_SCALE_LINEAR_LAYOUT)

        a = ttgl.load(a_ptrs, mask=valid_k_a[None, :], other=0.0)
        b = ttgl.load(b_ptrs, mask=valid_k_b[:, None], other=0.0)
        a = ttgl.convert_layout(a, DOT_LAYOUT_A)
        b = ttgl.convert_layout(b, DOT_LAYOUT_B)

        accumulator = ttgl.amd.gfx1250.wmma_scaled(a, scale_a, DTYPE_A, b, scale_b, DTYPE_B, accumulator)

        a_ptrs += BLOCK_K_PACKED_A * stride_ak
        b_ptrs += BLOCK_K_PACKED_B * stride_bk

        a_scale_ptr += BLOCK_K_SCALE
        b_scale_ptr += BLOCK_K_SCALE

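    # Epilogue: write the fp32 accumulator back with bounds masking.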
    offs_cm = pid_m * BLOCK_M + ttgl.arange(0, BLOCK_M, layout=ttgl.SliceLayout(1, WMMA_LAYOUT))
    offs_cn = pid_n * BLOCK_N + ttgl.arange(0, BLOCK_N, layout=ttgl.SliceLayout(0, WMMA_LAYOUT))
    c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
    ttgl.store(c_ptrs, accumulator, mask=c_mask)


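# Compile-only test: check that the scaled WMMA instruction is emitted for gfx1250.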
@pytest.mark.parametrize("BLOCK_M, BLOCK_N, BLOCK_K", [(32, 32, 64), (32, 32, 128)])
@pytest.mark.parametrize("DTYPE_A", ["float8_e5m2", "float8_e4m3", "float4"])
@pytest.mark.parametrize("DTYPE_B", ["float8_e5m2", "float8_e4m3", "float4"])
def test_compile_mxgemm(BLOCK_M, BLOCK_N, BLOCK_K, DTYPE_A, DTYPE_B):
    scale_block = 32

    if BLOCK_K < 128:
        pytest.skip("NYI: don't support block shape smaller than instr shape")

    triton_dtype_converter = {"float8_e5m2": "fp8e5", "float8_e4m3": "fp8e4nv", "float4": "u8"}
    dot_scaled_dtype_converter = {"float8_e5m2": "e5m2", "float8_e4m3": "e4m3", "float4": "e2m1"}

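    # Compile ahead of time against an explicit gfx1250 target; no kernel launch.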
    k = triton.compile(
        gluon._runtime.GluonASTSource(
            fn=mxgemm_kernel, signature={
                "a_ptr": f"*{triton_dtype_converter[DTYPE_A]}", "b_ptr": f"*{triton_dtype_converter[DTYPE_B]}",
                "c_ptr": "*fp32", "a_scale": "*u8", "b_scale": "*u8", "M": "i32", "N": "i32", "K": "i32",
                "stride_am": "i32", "stride_ak": "i32", "stride_bk": "i32", "stride_bn": "i32", "stride_cm": "i32",
                "stride_cn": "i32", "stride_scale": "i32", "DTYPE_A": "constexpr", "DTYPE_B": "constexpr",
                "SCALE_BLOCK": "constexpr", "BLOCK_M": "constexpr", "BLOCK_N": "constexpr", "BLOCK_K": "constexpr",
                "GROUP_SIZE_M": "constexpr"
            }, constexprs={
                "DTYPE_A": dot_scaled_dtype_converter[DTYPE_A], "DTYPE_B": dot_scaled_dtype_converter[DTYPE_B],
                "SCALE_BLOCK": scale_block, "BLOCK_M": BLOCK_M, "BLOCK_N": BLOCK_N, "BLOCK_K": BLOCK_K,
                "GROUP_SIZE_M": 1
            }), target=GPUTarget("hip", "gfx1250", 32))

    amdgcn = k.asm["amdgcn"]
    pattern = "v_wmma_scale_f32_16x16x128_f8f6f4"
    assert re.search(pattern, amdgcn), f"Can't find instruction {pattern} in AMDGCN assembly"


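# End-to-end test: run the kernel and compare against a float32 torch reference.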
@pytest.mark.parametrize("M, N, K", [(32, 32, 128), (128, 128, 512), (1, 8192, 512)])
@pytest.mark.parametrize("BLOCK_M, BLOCK_N, BLOCK_K", [(32, 32, 128), (64, 64, 128)])
@pytest.mark.parametrize("DTYPE_A", ["float8_e5m2", "float8_e4m3", "float4"])
@pytest.mark.parametrize("DTYPE_B", ["float8_e5m2", "float8_e4m3", "float4"])
def test_runtime_mxgemm(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, DTYPE_A, DTYPE_B):
    scale_block = 32

    torch.manual_seed(0)

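    # Reference: broadcast each per-block scale across its SCALE_BLOCK elements
    # along K, then compute the whole product in float32.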
    def torch_gemm_mxfp(a, b, a_scale, b_scale, scale_block, M, N, K):
        a_scale_f32 = a_scale.to(torch.float32).repeat_interleave(scale_block, dim=1)[:M, :K]
        b_scale_f32 = b_scale.to(torch.float32).repeat_interleave(scale_block, dim=1).T.contiguous()[:K, :N]

        a_f32 = a.to(torch.float32)
        b_f32 = b.to(torch.float32)

        return torch.matmul(a_f32 * a_scale_f32, b_f32 * b_scale_f32).to(torch.float32)

    def init_data(dtype, d0: int, d1: int):
        if dtype == "float4":
            return MXFP4Tensor(size=(d0, d1)).random()
        elif dtype == "float8_e5m2":
            return torch.randint(20, 40, (d0, d1), dtype=torch.uint8).view(torch.float8_e5m2)
        elif dtype == "float8_e4m3":
            return torch.randint(20, 40, (d0, d1), dtype=torch.uint8).view(torch.float8_e4m3fn)
        else:
            raise NotImplementedError(f"NYI: unsupported dtype: {dtype}")

    a = init_data(DTYPE_A, M, K)
    b = init_data(DTYPE_B, K, N)
    a_size = (M, (K + scale_block - 1) // scale_block)
    b_size = (N, (K + scale_block - 1) // scale_block)
    a_scale = MXScaleTensor(size=a_size).random(low=1.0, high=32.0)
    b_scale = MXScaleTensor(size=b_size).random(low=1.0, high=32.0)

    c_ref = torch_gemm_mxfp(a, b, a_scale, b_scale, scale_block, M, N, K)

    a_scale = a_scale.data
    b_scale = b_scale.data

    # mxfp4 inputs need to be packed along the K dim, i.e. two mxfp4 values per uint8.
    if DTYPE_A in ["float4", "float6_e2m3", "float6_e3m2"]:
        a = a.to_packed_tensor(dim=1)
    if DTYPE_B in ["float4", "float6_e2m3", "float6_e3m2"]:
        b = b.to_packed_tensor(dim=0)

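    # Move operands to the GPU and record the strides the kernel expects.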
    c_d = torch.zeros(M, N, dtype=torch.float32).cuda()
    a_d = a.data.contiguous().cuda()
    b_d = b.data.contiguous().cuda()
    a_scale_d = a_scale.cuda()
    b_scale_d = b_scale.cuda()

    stride_am, stride_ak = a_d.stride(0), a_d.stride(1)
    stride_bk, stride_bn = b_d.stride(0), b_d.stride(1)
    stride_cm, stride_cn = c_d.stride(0), c_d.stride(1)
    stride_scale = a_scale_d.stride(0)

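    # One program per output tile; group_size_m = 1 makes the grouped swizzle trivial.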
    num_blocks = triton.cdiv(M, BLOCK_M) * triton.cdiv(N, BLOCK_N)
    grid = [num_blocks, 1, 1]
    group_size_m = 1

    dtype_converter = {"float8_e5m2": "e5m2", "float8_e4m3": "e4m3", "float4": "e2m1"}

    mxgemm_kernel[grid](a_d, b_d, c_d, a_scale_d, b_scale_d, M, N, K, stride_am, stride_ak, stride_bk, stride_bn,
                        stride_cm, stride_cn, stride_scale, dtype_converter[DTYPE_A], dtype_converter[DTYPE_B],
                        scale_block, BLOCK_M, BLOCK_N, BLOCK_K, group_size_m, num_warps=4, num_ctas=1)

    torch.testing.assert_close(c_d.cpu(), c_ref.cpu(), rtol=1e-5, atol=1e-8)