[NFI]: Update driver.py (#4819)

etiotto · web-flow · commit 21933fbff79a · 2025-08-01T15:30:25.000-04:00
Making this file implementation closer to the upstream one.

Signed-off-by: Tiotto, Ettore &lt;ettore.tiotto@intel.com&gt;
diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py
@@ -114,7 +114,7 @@ def __init__(self, target: tuple) -> None:
         if not isinstance(target.arch, dict):
             raise TypeError("target.arch is not a dict")
         dirname = os.path.dirname(os.path.realpath(__file__))
-        mod = compile_module_from_src(Path(os.path.join(dirname, "arch_parser.c")).read_text(), "arch_utils")
+        mod = compile_module_from_src(src=Path(os.path.join(dirname, "arch_parser.c")).read_text(), name="arch_utils")
         self.device_arch = knobs.intel.device_arch or mod.parse_device_arch(target.arch.get('architecture', 0))
         self.properties = self.parse_target(target.arch)
         self.binary_ext = "spv"
diff --git a/third_party/intel/backend/driver.py b/third_party/intel/backend/driver.py
@@ -9,7 +9,7 @@
 from functools import cached_property
 
 from triton import knobs
-from triton.runtime.build import _build, platform_key
+from triton.runtime.build import _build, platform_key, _load_module_from_path
 from triton.runtime.cache import get_cache_manager
 from triton.backends.compiler import GPUTarget
 from triton.backends.driver import DriverBase
@@ -252,7 +252,7 @@ def __del__(self):
             ctypes.windll.kernel32.FreeLibrary(handle)
 
 
-def compile_module_from_src(src, name):
+def compile_module_from_src(src: str, name: str):
     hasher = hashlib.sha256(__CACHE_VERSION.encode("utf-8"))
     hasher.update((src + platform_key()).encode("utf-8"))
     key = hasher.hexdigest()
@@ -278,18 +278,14 @@ def compile_module_from_src(src, name):
 
     if name == 'arch_utils':
         return ArchParser(cache_path)
-    elif name == 'spirv_utils':
+    if name == 'spirv_utils':
         return SpirvUtils(cache_path)
-    elif name == '__triton_launcher':
+    if name == '__triton_launcher':
         return TritonLauncher(cache_path)
-    elif name == 'proton_utils':
+    if name == 'proton_utils':
         return cache_path
 
-    import importlib.util
-    spec = importlib.util.spec_from_file_location(name, cache_path)
-    mod = importlib.util.module_from_spec(spec)
-    spec.loader.exec_module(mod)
-    return mod
+    return _load_module_from_path(name, cache_path)
 
 
 # ------------------------
@@ -308,12 +304,12 @@ def __init__(self):
         dirname = os.path.dirname(os.path.realpath(__file__))
         # we save `spirv_utils` module so that the destructor is not called prematurely, which will unload the dll
         # and can cause `Fatal Python error: Segmentation fault`
-        self.mod = compile_module_from_src(Path(os.path.join(dirname, "driver.c")).read_text(), "spirv_utils")
-        self.load_binary = self.mod.load_binary
-        self.get_device_properties = self.mod.get_device_properties
-        self.device_count = self.mod.init_devices(self.get_sycl_queue())
-        self.wait_on_sycl_queue = self.mod.wait_on_sycl_queue
-        self.has_opencl_extension = self.mod.has_opencl_extension
+        mod = compile_module_from_src(src=Path(os.path.join(dirname, "driver.c")).read_text(), name="spirv_utils")
+        self.load_binary = mod.load_binary
+        self.get_device_properties = mod.get_device_properties
+        self.device_count = mod.init_devices(self.get_sycl_queue())
+        self.wait_on_sycl_queue = mod.wait_on_sycl_queue
+        self.has_opencl_extension = mod.has_opencl_extension
 
     def get_current_device(self):
         import torch
@@ -369,6 +365,8 @@ def ty_to_cpp(ty):
     "fp64": "pack_fp64",
 }
 
+_BASE_ARGS_FORMAT = "iiiOOOOOO"
+
 
 def make_launcher(constants, signature):
 
@@ -411,12 +409,11 @@ def format_of(ty):
         }[ty_to_cpp(ty)]
 
     args_format = ''.join([format_of(ty) for ty in signature.values()])
-    format = "iiiOOOOOO" + args_format
+    format = _BASE_ARGS_FORMAT + args_format
     signature = ','.join(map(_serialize_signature, signature.values()))
     signature = list(filter(bool, signature.split(',')))
     signature = {i: s for i, s in enumerate(signature)}
     args_list = ', ' + ', '.join(f"&_arg{i}" for i, ty in signature.items()) if len(signature) > 0 else ''
-
     # Record the end of regular arguments;
     # subsequent arguments are architecture-specific descriptors.
     arg_decl_list = []
@@ -706,15 +703,6 @@ def format_of(ty):
     return src
 
 
-def serialize_kernel_metadata(arg, args_dict):
-    args_dict['num_warps'] = arg.num_warps
-    args_dict['threads_per_warp'] = arg.threads_per_warp
-    args_dict['shared_memory'] = arg.shared
-    args_dict['kernel_name'] = arg.name
-    args_dict['spv_name'] = f"{arg.name}.spv"
-    args_dict['build_flags'] = arg.build_flags
-
-
 def serialize_args(args, constants, signature):
     import torch
     import numbers
@@ -723,6 +711,14 @@ def serialize_args(args, constants, signature):
         os.makedirs(dir_path)
         print(f"Path to directory consisting of SPIR-V Runner data: {dir_path}")
 
+    def serialize_kernel_metadata(arg, args_dict):
+        args_dict['num_warps'] = arg.num_warps
+        args_dict['threads_per_warp'] = arg.threads_per_warp
+        args_dict['shared_memory'] = arg.shared
+        args_dict['kernel_name'] = arg.name
+        args_dict['spv_name'] = f"{arg.name}.spv"
+        args_dict['build_flags'] = arg.build_flags
+
     cnt = 0
     args_dict = {"gridX": int(args[cnt]), "gridY": int(args[cnt + 1]), "gridZ": int(args[cnt + 2])}
     # 3: stream
@@ -774,7 +770,7 @@ def __init__(self, src, metadata):
         self.constants = {arg_idx(idx): value for idx, value in constants.items()}
         self.signature = {idx: value for idx, value in src.signature.items()}
         src = make_launcher(self.constants, self.signature)
-        self.mod = compile_module_from_src(src, "__triton_launcher")
+        self.mod = compile_module_from_src(src=src, name="__triton_launcher")
         # Serialize KernelArguments for SPIR-V Runner
         self.serialize_kernel_args = knobs.intel.dump_spirv_kernel_args
 
@@ -788,6 +784,7 @@ class XPUDriver(DriverBase):
 
     def __init__(self):
         self.launcher_cls = XPULauncher
+        super().__init__()
 
     def __getattr__(self, name):
         # Lazily initialize utils to avoid unnecessary XPU runtime invocations.
@@ -805,45 +802,46 @@ def get_current_stream(self, device):
         import torch
         return torch.xpu.current_stream().sycl_queue
 
-    def update_advanced_features(self, device, dev_property):
-        if knobs.intel.device_extensions:
-            # May be useful when using the `TRITON INTEL_DEVICE_ARCH` environment variable
-            # to be able to flexibly turn on/off the advanced feature.
-            supported_extensions = set()
-            supported_extensions.update(knobs.intel.device_extensions.split(" "))
-            dev_property[
-                "has_subgroup_matrix_multiply_accumulate"] = "cl_intel_subgroup_matrix_multiply_accumulate" in supported_extensions
-            dev_property[
-                "has_subgroup_matrix_multiply_accumulate_tensor_float32"] = "cl_intel_subgroup_matrix_multiply_accumulate_tensor_float32" in supported_extensions
-            dev_property["has_subgroup_2d_block_io"] = "cl_intel_subgroup_2d_block_io" in supported_extensions
-            dev_property["has_bfloat16_conversions"] = "cl_intel_bfloat16_conversions" in supported_extensions
-        else:
-            check = self.utils.has_opencl_extension
-            # FIXME: eventually even LTS driver will support OpenCL extensions.
-            # Please remove this after upgrading to a new version.
-            # https://github.com/intel/intel-xpu-backend-for-triton/issues/4708
-            is_lts = "1.3" in dev_property["driver_version"]
-            dev_property["has_subgroup_matrix_multiply_accumulate"] = check(
-                device, b"cl_intel_subgroup_matrix_multiply_accumulate") if not is_lts else False
-            dev_property["has_subgroup_matrix_multiply_accumulate_tensor_float32"] = check(
-                device, b"cl_intel_subgroup_matrix_multiply_accumulate_tensor_float32") if not is_lts else False
-            dev_property["has_subgroup_2d_block_io"] = check(device,
-                                                             b"cl_intel_subgroup_2d_block_io") if not is_lts else False
-            dev_property["has_bfloat16_conversions"] = check(device,
-                                                             b"cl_intel_bfloat16_conversions") if not is_lts else False
-
     def get_current_target(self):
         import torch
         device = self.get_current_device()
         dev_property = torch.xpu.get_device_capability(device)
-        self.update_advanced_features(device, dev_property)
+
+        def update_advanced_features(device, dev_property):
+            if knobs.intel.device_extensions:
+                # May be useful when using the `TRITON INTEL_DEVICE_ARCH` environment variable
+                # to be able to flexibly turn on/off the advanced feature.
+                supported_extensions = set()
+                supported_extensions.update(knobs.intel.device_extensions.split(" "))
+                dev_property[
+                    "has_subgroup_matrix_multiply_accumulate"] = "cl_intel_subgroup_matrix_multiply_accumulate" in supported_extensions
+                dev_property[
+                    "has_subgroup_matrix_multiply_accumulate_tensor_float32"] = "cl_intel_subgroup_matrix_multiply_accumulate_tensor_float32" in supported_extensions
+                dev_property["has_subgroup_2d_block_io"] = "cl_intel_subgroup_2d_block_io" in supported_extensions
+                dev_property["has_bfloat16_conversions"] = "cl_intel_bfloat16_conversions" in supported_extensions
+            else:
+                check = self.utils.has_opencl_extension
+                # FIXME: eventually even LTS driver will support OpenCL extensions.
+                # Please remove this after upgrading to a new version.
+                # https://github.com/intel/intel-xpu-backend-for-triton/issues/4708
+                is_lts = "1.3" in dev_property["driver_version"]
+                dev_property["has_subgroup_matrix_multiply_accumulate"] = check(
+                    device, b"cl_intel_subgroup_matrix_multiply_accumulate") if not is_lts else False
+                dev_property["has_subgroup_matrix_multiply_accumulate_tensor_float32"] = check(
+                    device, b"cl_intel_subgroup_matrix_multiply_accumulate_tensor_float32") if not is_lts else False
+                dev_property["has_subgroup_2d_block_io"] = check(
+                    device, b"cl_intel_subgroup_2d_block_io") if not is_lts else False
+                dev_property["has_bfloat16_conversions"] = check(
+                    device, b"cl_intel_bfloat16_conversions") if not is_lts else False
+
+        update_advanced_features(device, dev_property)
         return GPUTarget("xpu", dev_property, warp_size=32)
 
     def build_proton_help_lib(self):
         from triton.backends.intel.driver import compile_module_from_src
 
         dirname = os.path.dirname(os.path.realpath(__file__))
-        return compile_module_from_src(Path(dirname).joinpath("proton_utils.cpp").read_text(), "proton_utils")
+        return compile_module_from_src(src=Path(dirname).joinpath("proton_utils.cpp").read_text(), name="proton_utils")
 
     def get_active_torch_device(self):
         import torch