
Commit 355dc47

fsx950223, xinyazhang, and antiagainst authored
[AMD] Add HIP AOT support to compile.py tool (#7007)
This commit adds HIP AOT compilation support to the `compile.py` tool. It allows compiling Triton kernels into a `.h` and a `.cpp` file that can be integrated into applications. Linking via `link.py` is not yet enabled on HIP and is left as a follow-up task.

Co-authored-by: Xinya Zhang <[email protected]>
Co-authored-by: Lei Zhang <[email protected]>
1 parent 09649e2 commit 355dc47
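
As a quick orientation for the new flow, the sketch below drives compile.py as a subprocess, the same way the AOT tests do, and points it at a HIP target via the --target flag introduced by this commit. The kernel file matmul_kernel.py, the kernel name, the signature, and the grid values are hypothetical placeholders, not part of this commit; adjust the tool path to your checkout.

# Hedged sketch (not part of the commit): invoking the AOT compiler for a HIP target.
# "matmul_kernel.py", "matmul", the signature, and the grid below are illustrative only.
import subprocess
import sys

subprocess.run(
    [
        sys.executable, "python/triton/tools/compile.py", "matmul_kernel.py",
        "--kernel-name", "matmul",
        "--signature", "*fp16:16, *fp16:16, *fp16:16, i32, i32, i32",
        "--grid", "1024,1024,1",
        "--target", "hip:gfx942:64",  # new flag; omit it to target the current machine's GPU
        "--out-name", "matmul_fp16",
    ],
    check=True,
)
# On the HIP path this emits matmul_fp16.<hash>_<suffix>.h and .cpp files;
# linking them via link.py is not supported yet (see the commit message).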

File tree: 8 files changed (+200, -22 lines)


.github/workflows/integration-tests-amd.yml

Lines changed: 1 addition & 1 deletion
@@ -101,7 +101,7 @@ jobs:
     pytest --capture=tee-sys -rfs third_party/amd/python/test/test_extract_slice_concat_op.py
     TRITON_ALWAYS_COMPILE=1 pytest --capture=tee-sys -rfs third_party/amd/python/test/test_scalarize_packed_fops.py
     cd python/test/unit
-    pytest --capture=tee-sys -rfs -n 12 language runtime \
+    pytest --capture=tee-sys -rfs -n 12 language runtime tools \
       --ignore=language/test_line_info.py \
       --ignore=test_debug.py
   # TODO: uncomment

python/test/unit/tools/test_aot.py

Lines changed: 45 additions & 10 deletions
@@ -1,5 +1,7 @@
 import glob
 import os
+import pytest
+import re
 import subprocess
 import sys
 import tempfile
@@ -9,6 +11,7 @@
 import triton
 from triton.backends.compiler import GPUTarget
 from triton.backends.nvidia.driver import include_dirs, library_dirs
+from triton._internal_testing import is_cuda, is_hip

 kernel_utils_src = """
 import triton
@@ -273,6 +276,20 @@ def generate_matmul_test_data(dir, M, N, K):
     return a, b, a_path, b_path, c_path


+def check_hasco_binary_str(tmp_dir: str, dtype: str):
+    # Linking is not yet enabled on HIP backend so just check compilation for now.
+    h_files = glob.glob(f"matmul_{dtype}.*.h", root_dir=tmp_dir)
+    cpp_files = glob.glob(f"matmul_{dtype}.*.cpp", root_dir=tmp_dir)
+    assert len(h_files) == 1, "Expected one .h file"
+    assert len(cpp_files) == 1, "Expected one .cpp file"
+    pattern = re.compile(r'HSACO_NAME\[(\d+)\]')
+    with open(os.path.join(tmp_dir, cpp_files[0]), "r") as cpp_file:
+        content = cpp_file.read()
+    matches = pattern.findall(content)
+    assert len(matches) == 1, "Expected one HSACO_NAME definition"
+    assert int(matches[0]) > 16, "Expected valid HSACO object binary string"
+
+
 # Test edge case where the provided kernel signature has no specializations
 def test_compile_link_matmul_no_specialization():
     np.random.seed(3)
@@ -283,6 +300,10 @@ def test_compile_link_matmul_no_specialization():

         kernel_path = write_triton_kernels(tmp_dir, kernel_src, kernel_utils_src)
         compile_aot_kernel_no_specialization(tmp_dir, kernel_path, dtype, BM, BN, BK)
+        if is_hip():
+            check_hasco_binary_str(tmp_dir, dtype)
+            return
+
         link_aot_kernels(tmp_dir)

         # compile test case
@@ -314,6 +335,9 @@ def test_compile_link_matmul():

         kernel_path = write_triton_kernels(tmp_dir, kernel_src, kernel_utils_src)
         compile_aot_kernels(tmp_dir, kernel_path, dtype, BM, BN, BK, ha_hb_hints=[(":16", ":16")])
+        if is_hip():
+            check_hasco_binary_str(tmp_dir, dtype)
+            return
         link_aot_kernels(tmp_dir)

         # compile test case
@@ -345,6 +369,10 @@ def test_launcher_has_no_available_kernel():

         kernel_path = write_triton_kernels(tmp_dir, kernel_src, kernel_utils_src)
         compile_aot_kernels(tmp_dir, kernel_path, dtype, BM, BN, BK, ha_hb_hints=[(":1", ":1")])
+        if is_hip():
+            check_hasco_binary_str(tmp_dir, dtype)
+            return
+
         link_aot_kernels(tmp_dir)

         # compile test case
@@ -371,6 +399,7 @@ def test_launcher_has_no_available_kernel():
         assert "kernel launch failed" in result.stderr


+@pytest.mark.skipif(not is_cuda(), reason="Requires CUDA")
 def test_compile_link_autotune_matmul():
     np.random.seed(3)

@@ -419,19 +448,25 @@ def test_compile_link_autotune_matmul():
         np.testing.assert_allclose(c_tri, c_ref * c_ref, atol=1e-4, rtol=1e-4)


-def test_ttgir_to_ptx():
+def test_ttgir_to_asm():
     src = """
-module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32, "ttg.num-ctas" = 1 : i32} {
-  tt.func public @sum_kernel_0d1d(%arg0: !tt.ptr<i32>, %arg1: !tt.ptr<i32>) {
+module attributes {{"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = {warp_size} : i32, "ttg.num-ctas" = 1 : i32}} {{
+  tt.func public @sum_kernel_0d1d(%arg0: !tt.ptr<i32>, %arg1: !tt.ptr<i32>) {{
     tt.return
-  }
-}
+  }}
+}}
 """
+    target = GPUTarget("hip", "gfx942", 64) if is_hip() else GPUTarget("cuda", 80, 32)
     with tempfile.TemporaryDirectory() as tmp_dir:
         kernel_path = os.path.join(tmp_dir, "empty_kernel.ttgir")
         with open(kernel_path, "w") as fp:
-            fp.write(src)
-        k = triton.compile(kernel_path, target=GPUTarget("cuda", 80, 32))
-        ptx = k.asm["ptx"]
-        assert ".target sm_80" in ptx
-        assert ".address_size 64" in ptx
+            fp.write(src.format(warp_size=target.warp_size))
+        k = triton.compile(kernel_path, target=target)
+        if is_cuda():
+            ptx = k.asm["ptx"]
+            assert ".target sm_80" in ptx
+            assert ".address_size 64" in ptx
+        elif is_hip():
+            amdgcn = k.asm["amdgcn"]
+            assert '.amdgcn_target "amdgcn-amd-amdhsa--gfx942"' in amdgcn
+            assert '.wavefront_size: 64' in amdgcn

python/triton/backends/driver.py

Lines changed: 13 additions & 0 deletions
@@ -15,6 +15,19 @@ class DriverBase(metaclass=ABCMeta):
     def is_active(self):
         pass

+    @abstractmethod
+    def map_python_to_cpp_type(self, ty: str) -> str:
+        """
+        Converts a Triton type string to its corresponding C++ type string for this backend.
+
+        Args:
+            ty (str): The Triton type string. e.g., 'i32', '*fp16', 'fp32'.
+
+        Returns:
+            str: The C++ type string.
+        """
+        pass
+
     @abstractmethod
     def get_current_target(self):
         pass
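
For orientation, here is a minimal sketch of what a backend implementation of this hook might look like. The concrete C++ strings below are assumptions for illustration only; in this commit the AMD and NVIDIA drivers simply forward to their existing ty_to_cpp helpers (see the driver.py changes further down).

# Minimal sketch, assuming illustrative type mappings (not taken from this commit).
class HypotheticalHIPDriver:

    def map_python_to_cpp_type(self, ty: str) -> str:
        # Pointer arguments ("*fp16", "*i32", ...) map to an opaque device pointer type.
        if ty.startswith("*"):
            return "hipDeviceptr_t"  # assumed pointer type for a HIP-flavoured backend
        # Scalar arguments map to fixed-width C++ types; this table is illustrative only.
        scalar_to_cpp = {"i32": "int32_t", "i64": "int64_t", "fp32": "float", "fp64": "double"}
        return scalar_to_cpp[ty]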

python/triton/tools/compile.py

Lines changed: 57 additions & 11 deletions
@@ -3,12 +3,29 @@
 import importlib.util
 import sys
 from argparse import ArgumentParser
+from dataclasses import dataclass
 from pathlib import Path
 from typing import List

 import triton
 import triton.backends
-from triton.backends.nvidia.driver import ty_to_cpp
+
+
+@dataclass
+class CompileArgs:
+    '''
+    A class to contain arguments from command-line parser.
+    '''
+    path: str = ''
+    kernel_name: str = ''
+    signature: str = ''
+    grid: str = ''
+    target: str | None = None
+    num_warps: int = 1
+    num_stages: int = 3
+    out_name: str | None = None
+    out_path: Path | None = None
+

 desc = """
 Triton ahead-of-time compiler:
@@ -36,23 +53,31 @@
 used to run this `compile.py` script
 """

-if __name__ == "__main__":

+def main():
     # command-line arguments
     parser = ArgumentParser(description=desc)
     parser.add_argument("path",
                         help="Path to Python source containing desired kernel in its scope. File will be executed.")
     parser.add_argument("--kernel-name", "-n", type=str, default="", help="Name of the kernel to compile",
                         required=True)
+    parser.add_argument(
+        "--target", "-t", type=str, default=None,
+        help="The target to compile towards, in format of '<backend>:<arch>:<warp-size>'; "
+        "e.g., 'cuda:80:32', 'hip:gfx942:64'. Default to None, which means using current machine's GPU target")
     parser.add_argument("--num-warps", "-w", type=int, default=1, help="Number of warps to launch the kernel")
     parser.add_argument("--num-stages", "-ns", type=int, default=3,
                         help="Number of stages (meta-parameter of the kernel)")
     parser.add_argument("--out-name", "-on", type=str, default=None, help="Out name for the compiled kernel")
     parser.add_argument("--out-path", "-o", type=Path, default=None, help="Out filename")
     parser.add_argument("--signature", "-s", type=str, help="Signature of the kernel", required=True)
     parser.add_argument("--grid", "-g", type=str, help="Launch grid of the kernel", required=True)
-    args = parser.parse_args()
+    cli_args = parser.parse_args()
+    args = CompileArgs(**vars(cli_args))  # A sanity check to ensure class CompileArgs is updated as well.
+    compile_kernel(args)

+
+def compile_kernel(args: CompileArgs):
     out_name = args.out_name if args.out_name else args.kernel_name
     out_path = args.out_path if args.out_path else Path(out_name)

@@ -108,9 +133,15 @@ def constexpr(s):
     assert h in [1, 16], f"Only 1 and 16 are valid hints, got {h}"
     attrs = {k: [["tt.divisibility", 16]] for k, v in hints.items() if v == 16}
     src = triton.compiler.ASTSource(fn=kernel, constexprs=constants, signature=signature, attrs=attrs)
-    opts = {"num_warps": args.num_warps, "num_stages": args.num_stages}
-    ccinfo = triton.compile(src, options=opts)
-    if ccinfo.metadata.global_scratch_size > 0:
+
+    target = triton.backends.compiler.GPUTarget(*args.target.split(":")) \
+        if args.target else triton.runtime.driver.active.get_current_target()
+    backend = triton.compiler.make_backend(target)
+    kwargs = {"num_warps": args.num_warps, "num_stages": args.num_stages}
+    options = backend.parse_options(kwargs)
+    ccinfo = triton.compile(src, target=target, options=options.__dict__)
+
+    if getattr(ccinfo.metadata, "global_scratch_size", 0) > 0:
         raise RuntimeError("AOT compiling kernels with global scratch requirements is not yet implemented")

     arg_names = []
@@ -136,8 +167,12 @@ def constexpr(s):
         if hints.get((i, ), None) == 16:
             suffix += 'd'
     func_name = '_'.join([out_name, sig_hash, suffix])
-    asm = ccinfo.asm["cubin"]  # store binary data once
+    asm = ccinfo.asm[backend.binary_ext]  # store binary data once
+
     hex_ = str(binascii.hexlify(asm))[2:-1]
+
+    ty_to_cpp = triton.runtime.driver.active.map_python_to_cpp_type
+
     params = {
         "kernel_name": func_name,
         "triton_kernel_name": args.kernel_name,
@@ -156,7 +191,18 @@ def constexpr(s):
         "gridZ": grid[2],
         "_placeholder": "",
     }
-    for ext in ['h', 'c']:
-        template_path = Path(__file__).parent / "extra" / "cuda" / f"compile.{ext}"
-        with out_path.with_suffix(f".{sig_hash}_{suffix}.{ext}").open("w") as fp:
-            fp.write(Path(template_path).read_text().format(**params))
+    output_files = []
+    backend_name = target.backend
+    template_dir = Path(__file__).parent / "extra" / backend_name
+    for template_path in template_dir.glob('compile.*'):
+        ext = template_path.suffix
+        output_file = out_path.with_suffix(f".{sig_hash}_{suffix}{ext}")
+        with output_file.open("w") as fp:
+            fp.write(template_path.read_text().format(**params))
+        output_files.append(output_file)
+
+    return func_name, output_files
+
+
+if __name__ == "__main__":
+    main()
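
Because the command-line handling is now factored into CompileArgs and compile_kernel(), the tool can also be driven programmatically. A hedged sketch follows, assuming a hypothetical add_kernel.py that defines a kernel named add_kernel; the signature, grid, and target values are illustrative.

# Hedged sketch (not part of the commit): calling the refactored entry point directly.
# "add_kernel.py", the kernel name, signature, and grid values are hypothetical.
from triton.tools.compile import CompileArgs, compile_kernel

args = CompileArgs(
    path="add_kernel.py",
    kernel_name="add_kernel",
    signature="*fp32:16, *fp32:16, *fp32:16, i32",
    grid="1024,1,1",
    target="hip:gfx942:64",  # or None to compile for the current machine's GPU
    num_warps=4,
)
func_name, output_files = compile_kernel(args)
# On the HIP backend, output_files holds the generated .h and .cpp sources; they still
# have to be built into the application by hand, since link.py support for HIP is a follow-up.
print(func_name, [str(f) for f in output_files])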

third_party/amd/backend/driver.py

Lines changed: 3 additions & 0 deletions
@@ -585,6 +585,9 @@ def is_active():
         except ImportError:
             return False

+    def map_python_to_cpp_type(self, ty: str) -> str:
+        return ty_to_cpp(ty)
+
     def get_current_target(self):
         device = self.get_current_device()
         device_properties = self.utils.get_device_properties(device)

third_party/amd/tools/hip/compile.cpp

Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+/* clang-format off */
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <string.h>
+#include <hip/hip_runtime.h>
+
+// helpers to check for hip errors
+#define HIP_CHECK(ans) {{\
+    gpuAssert((ans), __FILE__, __LINE__);\
+  }}\
+
+static inline void gpuAssert(hipError_t code, const char *file, int line) {{
+  if (code != hipSuccess) {{
+    const char *prefix = "Triton Error [HIP]: ";
+    const char *str;
+    hipDrvGetErrorString(code, &str);
+    char err[1024] = {{0}};
+    strcat(err, prefix);
+    strcat(err, str);
+    printf("%s\\n", err);
+    exit(code);
+  }}
+}}
+
+// globals
+#define HSACO_NAME {kernel_name}_hsaco
+hipModule_t {kernel_name}_mod = nullptr;
+hipFunction_t {kernel_name}_func = nullptr;
+unsigned char HSACO_NAME[{bin_size}] = {{ {bin_data} }};
+
+
+void unload_{kernel_name}(void) {{
+    HIP_CHECK(hipModuleUnload({kernel_name}_mod));
+}}
+
+
+void load_{kernel_name}() {{
+    int dev = 0;
+    void *bin = (void *)&HSACO_NAME;
+    int shared = {shared};
+    HIP_CHECK(hipModuleLoadData(&{kernel_name}_mod, bin));
+    HIP_CHECK(hipModuleGetFunction(&{kernel_name}_func, {kernel_name}_mod, "{triton_kernel_name}"));
+}}
+
+/*
+{kernel_docstring}
+*/
+hipError_t {kernel_name}(hipStream_t stream, {signature}) {{
+    if ({kernel_name}_func == nullptr)
+        load_{kernel_name}();
+    unsigned int gX = {gridX};
+    unsigned int gY = {gridY};
+    unsigned int gZ = {gridZ};
+    hipDeviceptr_t global_scratch = 0;
+    void *args[{num_args}] = {{ {arg_pointers} }};
+    // TODO: shared memory
+    if(gX * gY * gZ > 0)
+        return hipModuleLaunchKernel({kernel_name}_func, gX, gY, gZ, {num_warps} * warpSize, 1, 1, {shared}, stream, args, nullptr);
+    else
+        return hipErrorInvalidValue;
+}}

third_party/amd/tools/hip/compile.h

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <hip/hip_runtime.h>
+#include <inttypes.h>
+#include <stdint.h>
+#include <stdio.h>
+
+void unload_{kernel_name}(void);
+void load_{kernel_name}(void);
+hipError_t{_placeholder} {kernel_name}(hipStream_t stream, {signature});

third_party/nvidia/backend/driver.py

Lines changed: 3 additions & 0 deletions
@@ -683,6 +683,9 @@ def is_active():
         except ImportError:
             return False

+    def map_python_to_cpp_type(self, ty: str) -> str:
+        return ty_to_cpp(ty)
+
     def get_benchmarker(self):
         from triton.testing import do_bench
         return do_bench
