[#5153] This is a workaround for the feature requirement in #5153. The IGC build flags are updated when large GRF mode is used while loading a SPIR-V kernel whose register spill size is > 1000.

chengjunlu · chengjunlu · commit b3419c1fd859 · 2025-10-29T12:29:40.000Z
Signed-off-by: etaf <xinan.lin@intel.com>
Co-authored-by: Lu, Chengjun <chengjun.lu@intel.com>
diff --git a/python/test/unit/intel/test_regressions.py b/python/test/unit/intel/test_regressions.py
@@ -50,7 +50,7 @@ def test_regression_4441(device, tmp_path: pathlib.Path):
     # L0 build module failed. Log: IGC: Internal Compiler Error: Segmentation violation
     # Error during Intel loadBinary: Triton Error [ZE]: 0x70000004
     # RuntimeError: Triton Error [ZE]: 0x70000004
-    module, function, n_regs, n_spills, n_max_threads = driver.active.utils.load_binary(
+    module, function, n_regs, n_spills, n_max_threads, _ = driver.active.utils.load_binary(
         kernel.name, kernel.kernel, kernel.metadata.shared, kernel.metadata.build_flags,
         not kernel.metadata.generate_native_code, device)
 
@@ -1911,6 +1911,6 @@ def test_regression_5374(device, tmp_path: pathlib.Path):
     # L0 build module failed. Log: IGC: Internal Compiler Error: Segmentation violation
     # Error during Intel loadBinary: Triton Error [ZE]: 0x70000004
     # RuntimeError: Triton Error [ZE]: 0x70000004
-    module, function, n_regs, n_spills, n_max_threads = driver.active.utils.load_binary(
+    module, function, n_regs, n_spills, n_max_threads, _ = driver.active.utils.load_binary(
         kernel.name, kernel.kernel, kernel.metadata.shared, kernel.metadata.build_flags,
         not kernel.metadata.generate_native_code, device)
diff --git a/python/triton/compiler/compiler.py b/python/triton/compiler/compiler.py
@@ -465,9 +465,14 @@ def raise_(err):
         if knobs.runtime.kernel_load_start_hook is not None:
             knobs.runtime.kernel_load_start_hook(self.module, self.function, self.name, self.metadata_group, self.hash)
         # TODO: n_regs, n_spills should be metadata generated when calling `ptxas`
-        self.module, self.function, self.n_regs, self.n_spills, self.n_max_threads = driver.active.utils.load_binary(
+        # FIXME: remove this workaround, which updates the build flags while loading the binary
+        self.module, self.function, self.n_regs, self.n_spills, self.n_max_threads, new_build_flags = driver.active.utils.load_binary(
             self.name, self.kernel, self.metadata.shared, self.metadata.build_flags,
             not self.metadata.generate_native_code, device)
+
+        if new_build_flags != self.metadata.build_flags:
+            self.metadata = self.metadata._replace(build_flags=new_build_flags)
+
         if hasattr(self.metadata, "threads_per_warp"):
             warp_size = self.metadata.threads_per_warp
         else:
diff --git a/third_party/intel/backend/driver.c b/third_party/intel/backend/driver.c
@@ -310,8 +310,8 @@ extern "C" EXPORT_FUNC PyObject *load_binary(PyObject *args) {
     auto kernel_bundle_py = PyCapsule_New(reinterpret_cast<void *>(mod),
                                           "kernel_bundle", freeKernelBundle);
 
-    return Py_BuildValue("(OOiii)", kernel_bundle_py, kernel_py, n_regs,
-                         n_spills, n_max_threads);
+    return Py_BuildValue("(OOiiis)", kernel_bundle_py, kernel_py, n_regs,
+                         n_spills, n_max_threads, build_flags().data());
 
   } catch (const std::exception &e) {
     PyGILState_STATE gil_state;