add rocdl.wmma_f16_16x16x16_f16 (#140)

makslevental · web-flow · commit 6f9f150faf67 · 2025-04-11T02:39:10.000Z
diff --git a/mlir/extras/dialects/ext/rocdl.py b/mlir/extras/dialects/ext/rocdl.py
@@ -0,0 +1,60 @@
+from . import arith
+from ...util import get_user_code_loc
+
+from ....dialects._ods_common import (
+    _dispatch_mixed_values,
+    _cext,
+    get_op_results_or_values,
+    get_default_loc_context,
+    get_op_result_or_op_results,
+    get_default_loc_context,
+    segmented_accessor,
+)
+
+# noinspection PyUnresolvedReferences
+from ....dialects.rocdl import *
+from ....dialects._rocdl_ops_gen import _Dialect
+from .... import ir
+
+
+@_cext.register_operation(_Dialect, replace=True)
+class WMMA_F16_16X16X16_F16(ir.OpView):
+    """Op view for `rocdl.wmma.f16.16x16x16.f16`, built with a single result."""
+
+    OPERATION_NAME = "rocdl.wmma.f16.16x16x16.f16"
+    _ODS_REGIONS = (0, True)
+
+    def __init__(self, res, args, *, loc=None, ip=None):
+        """Create the op with result entry `res` and operands taken from `args`."""
+        operand_values = list(get_op_results_or_values(args))
+        # NOTE(review): the returned context is not used below; the lookup is
+        # retained in case callers rely on it failing early -- confirm.
+        _ods_context = get_default_loc_context(loc)
+        super().__init__(
+            self.OPERATION_NAME,
+            self._ODS_REGIONS,
+            self._ODS_OPERAND_SEGMENTS,
+            self._ODS_RESULT_SEGMENTS,
+            attributes={},
+            results=[res],
+            operands=operand_values,
+            successors=None,
+            regions=None,
+            loc=loc,
+            ip=ip,
+        )
+
+    @property
+    def args(self):
+        """Every operand of the op, returned as one slice of the operand list."""
+        all_operands = self.operation.operands
+        return all_operands[0 : len(all_operands)]
+
+    @property
+    def res(self):
+        """The op result at index 0."""
+        return self.operation.results[0]
+
+
+def wmma_f16_16x16x16_f16(res, args, *, loc=None, ip=None) -> ir.Value:
+    return WMMA_F16_16X16X16_F16(res, args, loc=loc, ip=ip).result  # the new op's result
diff --git a/tests/test_gpu.py b/tests/test_gpu.py
@@ -15,7 +15,7 @@
 from mlir.dialects.memref import cast
 
 from mlir.extras.ast.canonicalize import canonicalize
-from mlir.extras.dialects.ext import arith, scf, memref
+from mlir.extras.dialects.ext import arith, scf, memref, rocdl
 from mlir.extras.dialects.ext.func import func
 
 # noinspection PyUnresolvedReferences
@@ -36,7 +36,7 @@
 )
 from mlir.extras.dialects.ext.llvm import llvm_ptr_t
 from mlir.extras.dialects.ext.scf import forall, in_parallel_
-from mlir.extras.dialects.ext.vector import outer, load, shuffle
+from mlir.extras.dialects.ext.vector import outer, load, shuffle, print_
 from mlir.extras.runtime.passes import run_pipeline, Pipeline
 
 # noinspection PyUnresolvedReferences
@@ -1193,3 +1193,135 @@ def gpu_module():
     times[all_bank_conflicts.__name__] /= runs
     for k, v in times.items():
         print(f"{k}: {v:.3e}ms")
+
+
+# End-to-end check of rocdl.wmma_f16_16x16x16_f16; see https://gpuopen.com/learn/wmma_on_rdna3/
+@pytest.mark.skipif(hip_bindings_not_installed(), reason="hip not installed")
+def test_amdgpu_vector_wmma(ctx: MLIRContext):
+    from hip import hip
+
+    set_container_module(ctx.module)
+
+    v_len = 16  # fragment vector length; also the tile edge, since M = K = N = v_len
+    M, K, N = v_len, v_len, v_len
+    v16f16 = T.vector(v_len, T.f16())
+
+    @gpu_func
+    @canonicalize(using=scf.canonicalizer)
+    def smol_matmul(
+        a: T.memref(M, K, T.f16()),
+        b: T.memref(K, N, T.f16()),
+        c: T.memref(M, N, T.f16()),
+    ):
+        lIdx = thread_idx.x
+        # a and b fragments are stored in 8 VGPRs each, in packed format, so 16 elements each for a and b
+        # a_frag is filled from a[lane, :], i.e. row `lane` of the 16x16 matrix A tile
+        # b_frag is filled from b[:, lane], i.e. column `lane` of the 16x16 matrix B tile
+        a_frag = arith.constant(np.full([v_len], 0.0, np.float16), v16f16)
+        b_frag = arith.constant(np.full([v_len], 0.0, np.float16), v16f16)
+        c_frag = arith.constant(np.full([v_len], 0.0, np.float16), v16f16)
+
+        # lane is (0-31) mod 16 instead of 0-31 due to matrix replication in RDNA 3
+        lane = lIdx % v_len
+        for ele, [a_frag, b_frag], _ in scf.range_(v_len, iter_args=[a_frag, b_frag]):
+            b_frag[ele] = b[ele, lane]
+            a_frag[ele] = a[lane, ele]
+            a_frag, b_frag = yield a_frag, b_frag
+
+        # call the WMMA intrinsic on a_frag, b_frag, c_frag and a constant-false flag
+        false = arith.constant(False, T.bool())
+        c_frag = rocdl.wmma_f16_16x16x16_f16(v16f16, [a_frag, b_frag, c_frag, false])
+
+        for ele in scf.range_(v_len // 2):
+            r = ele * 2 + (lIdx // v_len)  # lIdx < v_len writes even rows; the next v_len threads write odd rows
+            # store results from unpacked c_frag output (only even-indexed elements are read)
+            c[r, lane] = c_frag[ele * 2]
+
+    props = hip.hipDeviceProp_t()
+    hip_check(hip.hipGetDeviceProperties(props, 0))
+    arch = props.gcnArchName.decode()  # used below as the rocdl target chip
+
+    @module("naive", [f'#rocdl.target<chip = "{arch}", abi = "500">'])
+    def gpu_module():
+        smol_matmul.emit()
+
+    print(gpu_module)  # dump the module before it is lowered
+
+    lowered_module = run_pipeline(
+        gpu_module,
+        Pipeline()
+        .Gpu(Pipeline().convert_gpu_to_rocdl(use_bare_ptr_memref_call_conv=True))
+        .rocdl_attach_target(chip=arch, abi="500")
+        .gpu_to_llvm()
+        .lower_to_llvm()
+        .gpu_module_to_binary(),
+    )
+
+    hsaco = get_compile_object_bytes(lowered_module)
+    hip_module = hip_check(hip.hipModuleLoadData(hsaco))
+    function = hip_check(
+        hip.hipModuleGetFunction(hip_module, smol_matmul.__name__.encode())
+    )
+
+    a_h = np.random.randint(0, 10, (M, K)).astype(dtype=np.float16)
+    b_h = np.random.randint(0, 10, (K, N)).astype(dtype=np.float16)
+    c_h = -3 * np.ones((M, N), dtype=np.float16)  # sentinel fill, re-checked before the copy back
+
+    a_num_bytes = a_h.size * a_h.itemsize
+    b_num_bytes = b_h.size * b_h.itemsize
+    c_num_bytes = c_h.size * c_h.itemsize
+
+    a_d = hip_check(hip.hipMalloc(a_num_bytes))
+    b_d = hip_check(hip.hipMalloc(b_num_bytes))
+    c_d = hip_check(hip.hipMalloc(c_num_bytes))
+
+    hip_check(
+        hip.hipMemcpy(a_d, a_h, a_num_bytes, hip.hipMemcpyKind.hipMemcpyHostToDevice)
+    )
+    hip_check(
+        hip.hipMemcpy(b_d, b_h, b_num_bytes, hip.hipMemcpyKind.hipMemcpyHostToDevice)
+    )
+    hip_check(
+        hip.hipMemcpy(c_d, c_h, c_num_bytes, hip.hipMemcpyKind.hipMemcpyHostToDevice)
+    )
+
+    gridX = 1
+    gridY = 1
+    gridZ = 1
+    warp_size = 32  # presumably the block size (follows the grid sizes in launch_kernel) -- confirm
+    num_warps = 1
+    stream = 0
+    shared_memory = 0
+
+    launch_kernel(
+        function.as_c_void_p(),
+        gridX,
+        gridY,
+        gridZ,
+        warp_size,
+        num_warps,
+        stream,
+        shared_memory,
+        a_d,
+        b_d,
+        c_d,
+    )
+
+    correct = a_h @ b_h  # host reference result
+    assert np.allclose(c_h, -3.0)  # host buffer still holds the sentinel: nothing copied back yet
+    assert not np.allclose(correct, c_h)  # so a later match can only come from the device copy
+    hip_check(
+        hip.hipMemcpy(c_h, c_d, c_num_bytes, hip.hipMemcpyKind.hipMemcpyDeviceToHost)
+    )
+
+    if not np.allclose(c_h, correct):  # on mismatch, print both matrices in full, then fail
+        with np.printoptions(threshold=np.inf, linewidth=200):
+            print(correct)
+            print(c_h)
+            assert False
+
+    hip_check(hip.hipFree(a_d))  # NOTE: cleanup from here on is skipped if the check above raised
+    hip_check(hip.hipFree(b_d))
+    hip_check(hip.hipFree(c_d))
+
+    hip_check(hip.hipModuleUnload(hip_module))