
Commit 6bc0661

danzimm authored and zwu-2025 committed
[typehint][python] Unify compile_module_from_src from nvidia + amd backends, make more defensive (triton-lang#6775)
I started looking at adding typehints to python/triton/backends and noticed there is a fair amount of duplicated code across the nvidia and amd implementations. To start, I think we can unify `compile_module_from_src`, since the two implementations appear to be identical. Additionally, I added some extra defensive checks in case the cache returns a faulty artifact. Ideally this should never happen, but I figure a cache miss is better than a crash; we can remove the checks if they seem superfluous. I plan to add some tests, but wanted to open the PR ahead of time for visibility (the tests will verify that compilation/loading succeeds, and that falling back to recompiling from a bad cached artifact also succeeds).
1 parent f28654f commit 6bc0661
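
For reference, the defensive behaviour described in the commit message boils down to "try the cached artifact, fall back to a rebuild". The sketch below is an illustrative restatement of that pattern, not the verbatim diff: the helper name load_or_rebuild and its rebuild callback are made up here, while _load_module_from_path is the loader this commit adds to python/triton/runtime/build.py.

import logging

from triton.runtime.build import _load_module_from_path  # added by this commit


def load_or_rebuild(name, cache_path, rebuild):
    """Hypothetical helper: cache_path comes from the cache manager (or is None),
    rebuild is a callable that recompiles the module from source."""
    if cache_path is not None:
        try:
            # Happy path: the cached .so loads cleanly.
            return _load_module_from_path(name, cache_path)
        except (RuntimeError, ImportError):
            # A faulty artifact costs a recompile, not a crash.
            logging.getLogger(__name__).warning("Triton cache error: compiled module %s could not be loaded", name)
    return rebuild()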

File tree

7 files changed: +168 -69 lines


pyproject.toml

Lines changed: 1 addition & 0 deletions

@@ -9,6 +9,7 @@ files = [
     "python/triton/runtime/build.py",
     "python/triton/_utils.py",
     "python/test/unit/test_knobs.py",
+    "python/test/unit/runtime/test_build.py",
     "python/test/unit/runtime/test_compilation_listener.py",
 ]
 exclude = ["/build/"]
python/test/unit/runtime/test_build.py

Lines changed: 91 additions & 0 deletions

@@ -0,0 +1,91 @@
+from __future__ import annotations
+
+import pytest
+import tempfile
+
+from pathlib import Path
+
+import triton
+
+from triton.runtime.build import compile_module_from_src
+
+TEST_MODULE_C = """
+#include <Python.h>
+#include <string.h>
+
+static PyObject* go(PyObject* self, PyObject* args) {
+  const char *command;
+  if (!PyArg_ParseTuple(args, "s", &command))
+    return NULL;
+
+  const char* res;
+  if (strcmp(command, "hello") == 0) {
+    res = "hiya";
+  } else {
+    res = "huh";
+  }
+  return PyUnicode_FromString(res);
+}
+
+static PyMethodDef ModuleMethods[] = {
+  {"go", go, METH_VARARGS, "test_module.go for testing"},
+  {NULL, NULL, 0, NULL}
+};
+
+static struct PyModuleDef ModuleDef = {
+  PyModuleDef_HEAD_INIT,
+  "test_module",
+  NULL, //documentation
+  -1, //size
+  ModuleMethods
+};
+
+PyMODINIT_FUNC PyInit_test_module(void) {
+  PyObject *m = PyModule_Create(&ModuleDef);
+  if(m == NULL) {
+    return NULL;
+  }
+  PyModule_AddFunctions(m, ModuleMethods);
+  return m;
+}
+"""
+
+
+def test_compile_module(fresh_triton_cache):
+    mod = compile_module_from_src(TEST_MODULE_C, "test_module")
+
+    with pytest.raises(Exception):
+        mod.go()
+
+    assert mod.go("huh") == "huh"
+    assert mod.go("hello") == "hiya"
+
+    # Make sure the module is cached
+    mod2 = compile_module_from_src(TEST_MODULE_C, "test_module")
+    assert mod2.__file__ == mod.__file__
+
+
+def test_compile_module_bad_cache(fresh_knobs_except_libraries):
+    with tempfile.TemporaryDirectory() as tmpd:
+        tmp = Path(tmpd)
+        called_get_file = False
+
+        class InvalidFileCacheManager(triton.runtime.cache.FileCacheManager):
+
+            def get_file(self, filename: str) -> str | None:
+                nonlocal called_get_file
+                called_get_file = True
+                (tmp / filename).write_text("not an so")
+                return str(tmp / filename)
+
+        # First corrupt the cache
+        fresh_knobs_except_libraries.cache.manager_class = InvalidFileCacheManager
+
+        mod = compile_module_from_src(TEST_MODULE_C, "test_module")
+        assert called_get_file
+
+        with pytest.raises(Exception):
+            mod.go()
+
+        assert mod.go("huh") == "huh"
+        assert mod.go("hello") == "hiya"
python/test/unit/tools/test_aot.py

Lines changed: 3 additions & 3 deletions

@@ -8,7 +8,7 @@
 
 import triton
 from triton.backends.compiler import GPUTarget
-from triton.backends.nvidia.driver import include_dir, library_dirs
+from triton.backends.nvidia.driver import include_dirs, library_dirs
 
 kernel_utils_src = """
 import triton
@@ -100,7 +100,7 @@ def kernel(C, A, B, M, N, K,
 def gen_kernel_library(dir, libname):
     c_files = glob.glob(os.path.join(dir, "*.c"))
     subprocess.run(
-        ["gcc"] + c_files + ["-I", include_dir[0], "-c", "-fPIC"],
+        ["gcc"] + c_files + ["-I", include_dirs[0], "-c", "-fPIC"],
         check=True,
         cwd=dir,
     )
@@ -175,7 +175,7 @@ def gen_test_bin(dir, M, N, K, exe="test", algo_id=0):
         file.write(src)
 
     command = ["gcc", "test.c"]
-    for inc_dir in include_dir:
+    for inc_dir in include_dirs:
         command.extend(["-I", inc_dir])
     for lib_dir in library_dirs():
         command.extend(["-L", lib_dir])

python/triton/backends/driver.py

Lines changed: 0 additions & 7 deletions

@@ -1,14 +1,7 @@
-import functools
 from abc import ABCMeta, abstractmethod
 from typing import Callable, List, Protocol, Sequence
 
 
-@functools.lru_cache()
-def platform_key():
-    from platform import machine, system, architecture
-    return ",".join([machine(), system(), *architecture()])
-
-
 class Benchmarker(Protocol):
 
     def __call__(self, kernel_call: Callable, *, quantiles: List[float], **kwargs) -> Sequence[float]:

python/triton/runtime/build.py

Lines changed: 51 additions & 1 deletion

@@ -1,8 +1,18 @@
-import sysconfig
+from __future__ import annotations
+
+import functools
+import hashlib
+import importlib.util
+import logging
 import os
 import shutil
 import subprocess
+import sysconfig
+import tempfile
+
+from types import ModuleType
 
+from .cache import get_cache_manager
 from .. import knobs
 
 
@@ -40,3 +50,43 @@ def _build(name: str, src: str, srcdir: str, library_dirs: list[str], include_di
     cc_cmd += [f"-I{dir}" for dir in include_dirs if dir is not None]
     subprocess.check_call(cc_cmd, stdout=subprocess.DEVNULL)
     return so
+
+
+@functools.lru_cache
+def platform_key() -> str:
+    from platform import machine, system, architecture
+    return ",".join([machine(), system(), *architecture()])
+
+
+def _load_module_from_path(name: str, path: str) -> ModuleType:
+    spec = importlib.util.spec_from_file_location(name, path)
+    if not spec or not spec.loader:
+        raise RuntimeError(f"Failed to load newly compiled {name} from {path}")
+    mod = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(mod)
+    return mod
+
+
+def compile_module_from_src(src: str, name: str, library_dirs: list[str] | None = None,
+                            include_dirs: list[str] | None = None, libraries: list[str] | None = None) -> ModuleType:
+    key = hashlib.sha256((src + platform_key()).encode("utf-8")).hexdigest()
+    cache = get_cache_manager(key)
+    suffix = sysconfig.get_config_var("EXT_SUFFIX")
+    cache_path = cache.get_file(f"{name}{suffix}")
+
+    if cache_path is not None:
+        try:
+            return _load_module_from_path(name, cache_path)
+        except (RuntimeError, ImportError):
+            log = logging.getLogger(__name__)
+            log.warning(f"Triton cache error: compiled module {name}.so could not be loaded")
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        src_path = os.path.join(tmpdir, name + ".c")
+        with open(src_path, "w") as f:
+            f.write(src)
+        so = _build(name, src_path, tmpdir, library_dirs or [], include_dirs or [], libraries or [])
+        with open(so, "rb") as f:
+            cache_path = cache.put(f.read(), f"{name}{suffix}", binary=True)
+
+    return _load_module_from_path(name, cache_path)
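
To illustrate how a backend now consumes the unified helper (the real call sites are in the amd and nvidia driver.py hunks below), here is a hypothetical end-to-end use; the module name ping_ext and its C source are invented for this example, and it assumes a working C toolchain and Triton cache, as in the new test.

from triton.runtime.build import compile_module_from_src

# Invented minimal CPython extension source; the PyInit_* name must match the module name.
PING_SRC = """
#include <Python.h>

static PyObject* ping(PyObject* self, PyObject* args) {
  return PyUnicode_FromString("pong");
}

static PyMethodDef Methods[] = {{"ping", ping, METH_NOARGS, "returns pong"}, {NULL, NULL, 0, NULL}};
static struct PyModuleDef Def = {PyModuleDef_HEAD_INIT, "ping_ext", NULL, -1, Methods};

PyMODINIT_FUNC PyInit_ping_ext(void) { return PyModule_Create(&Def); }
"""

# Keyword form mirrors the updated driver.py call sites; include_dirs, library_dirs and
# libraries effectively default to empty lists when omitted.
mod = compile_module_from_src(src=PING_SRC, name="ping_ext")
assert mod.ping() == "pong"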

third_party/amd/backend/driver.py

Lines changed: 5 additions & 29 deletions

@@ -1,20 +1,16 @@
 import functools
 import os
-import hashlib
 import subprocess
-import sysconfig
-import tempfile
 import re
 from pathlib import Path
-from triton.runtime.build import _build
 from triton import knobs
-from triton.runtime.cache import get_cache_manager
 from triton.backends.compiler import GPUTarget
-from triton.backends.driver import GPUDriver, platform_key
+from triton.backends.driver import GPUDriver
+from triton.runtime.build import compile_module_from_src
 from triton.tools.tensor_descriptor import TensorDescriptor
 
 dirname = os.path.dirname(os.path.realpath(__file__))
-include_dir = [os.path.join(dirname, "include")]
+include_dirs = [os.path.join(dirname, "include")]
 
 
 def _find_already_mmapped_dylib_on_linux(lib_name):
@@ -133,26 +129,6 @@ def _get_path_to_hip_runtime_dylib():
     raise RuntimeError(f"cannot locate {lib_name} after attempted paths {paths}")
 
 
-def compile_module_from_src(src, name):
-    key = hashlib.sha256((src + platform_key()).encode("utf-8")).hexdigest()
-    cache = get_cache_manager(key)
-    suffix = sysconfig.get_config_var("EXT_SUFFIX")
-    cache_path = cache.get_file(f"{name}{suffix}")
-    if cache_path is None:
-        with tempfile.TemporaryDirectory() as tmpdir:
-            src_path = os.path.join(tmpdir, "main.c")
-            with open(src_path, "w") as f:
-                f.write(src)
-            so = _build(name, src_path, tmpdir, [], include_dir, [])
-            with open(so, "rb") as f:
-                cache_path = cache.put(f.read(), f"{name}{suffix}", binary=True)
-    import importlib.util
-    spec = importlib.util.spec_from_file_location(name, cache_path)
-    mod = importlib.util.module_from_spec(spec)
-    spec.loader.exec_module(mod)
-    return mod
-
-
 class HIPUtils(object):
 
     def __new__(cls):
@@ -167,7 +143,7 @@ def __init__(self):
         # This way we don't need to escape-quote C code curly brackets and we can replace
         # exactly once.
         src = src.replace('/*py_libhip_search_path*/', libhip_path, 1)
-        mod = compile_module_from_src(src, "hip_utils")
+        mod = compile_module_from_src(src=src, name="hip_utils", include_dirs=include_dirs)
        self.load_binary = mod.load_binary
        self.get_device_properties = mod.get_device_properties
 
@@ -560,7 +536,7 @@ def __init__(self, src, metadata):
         constants = {arg_idx(idx): value for idx, value in constants.items()}
         signature = {idx: value for idx, value in src.signature.items()}
         src = make_launcher(constants, signature, metadata.warp_size)
-        mod = compile_module_from_src(src, "__triton_launcher")
+        mod = compile_module_from_src(src=src, name="__triton_launcher", include_dirs=include_dirs)
         has_tensor_desc_arg = any(isinstance(sig, str) and sig.startswith("tensordesc") for sig in signature.values())
 
         self.launch = wrap_handle_tensor_descriptor(mod.launch) if has_tensor_desc_arg else mod.launch

third_party/nvidia/backend/driver.py

Lines changed: 17 additions & 29 deletions

@@ -1,24 +1,20 @@
 import functools
 import operator
 import os
-import sysconfig
-import hashlib
 import subprocess
-import tempfile
 import triton
 import re
 from pathlib import Path
 from triton import knobs
-from triton.runtime.build import _build
-from triton.runtime.cache import get_cache_manager
+from triton.runtime.build import compile_module_from_src
 from triton.runtime import _allocation
 from triton.backends.compiler import GPUTarget
-from triton.backends.driver import GPUDriver, platform_key
+from triton.backends.driver import GPUDriver
 
 from triton.tools.tensor_descriptor import TensorDescriptor
 
 dirname = os.path.dirname(os.path.realpath(__file__))
-include_dir = [os.path.join(dirname, "include")]
+include_dirs = [os.path.join(dirname, "include")]
 libdevice_dir = os.path.join(dirname, "lib")
 libraries = ['cuda']
 
@@ -52,26 +48,6 @@ def library_dirs():
     return [libdevice_dir, *libcuda_dirs()]
 
 
-def compile_module_from_src(src, name):
-    key = hashlib.sha256((src + platform_key()).encode("utf-8")).hexdigest()
-    cache = get_cache_manager(key)
-    suffix = sysconfig.get_config_var("EXT_SUFFIX")
-    cache_path = cache.get_file(f"{name}{suffix}")
-    if cache_path is None:
-        with tempfile.TemporaryDirectory() as tmpdir:
-            src_path = os.path.join(tmpdir, "main.c")
-            with open(src_path, "w") as f:
-                f.write(src)
-            so = _build(name, src_path, tmpdir, library_dirs(), include_dir, libraries)
-            with open(so, "rb") as f:
-                cache_path = cache.put(f.read(), f"{name}{suffix}", binary=True)
-    import importlib.util
-    spec = importlib.util.spec_from_file_location(name, cache_path)
-    mod = importlib.util.module_from_spec(spec)
-    spec.loader.exec_module(mod)
-    return mod
-
-
 # ------------------------
 # Utils
 # ------------------------
@@ -85,7 +61,13 @@ def __new__(cls):
         return cls.instance
 
     def __init__(self):
-        mod = compile_module_from_src(Path(os.path.join(dirname, "driver.c")).read_text(), "cuda_utils")
+        mod = compile_module_from_src(
+            src=Path(os.path.join(dirname, "driver.c")).read_text(),
+            name="cuda_utils",
+            library_dirs=library_dirs(),
+            include_dirs=include_dirs,
+            libraries=libraries,
+        )
         self.load_binary = mod.load_binary
         self.get_device_properties = mod.get_device_properties
         self.cuOccupancyMaxActiveClusters = mod.cuOccupancyMaxActiveClusters
@@ -643,7 +625,13 @@ def __init__(self, src, metadata):
         signature = {idx: value for idx, value in src.signature.items()}
         tensordesc_meta = getattr(metadata, "tensordesc_meta", None)
         src = make_launcher(constants, signature, tensordesc_meta)
-        mod = compile_module_from_src(src, "__triton_launcher")
+        mod = compile_module_from_src(
+            src=src,
+            name="__triton_launcher",
+            library_dirs=library_dirs(),
+            include_dirs=include_dirs,
+            libraries=libraries,
+        )
         has_tensor_desc_arg = any(isinstance(sig, str) and sig.startswith("tensordesc") for sig in signature.values())
 
         self.num_ctas = functools.reduce(operator.mul, metadata.cluster_dims, 1)
