From 897b125d40da5da49e3a9680eecee61037e463f9 Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Thu, 26 Sep 2024 19:15:42 -0700
Subject: [PATCH 1/5] Refactor loadBinary to support n_regs

---
 third_party/intel/backend/driver.c | 223 +++++++++++++++--------------
 1 file changed, 114 insertions(+), 109 deletions(-)
diff --git a/third_party/intel/backend/driver.c b/third_party/intel/backend/driver.c
index aaabd95fc6..7b12e03a3f 100644
--- a/third_party/intel/backend/driver.c
+++ b/third_party/intel/backend/driver.c
@@ -26,25 +26,14 @@ static std::vector<ze_device_handle_t> g_devices;
 static std::vector<std::pair<sycl::device, ze_device_handle_t>>
     g_sycl_l0_device_list;
 
-static inline void gpuAssert(ze_result_t code) {
-  if (code != ZE_RESULT_SUCCESS) {
-    auto str = parseZeResultCode(code);
-    char err[1024] = {0};
-    strncat(err, str.c_str(), std::min(str.size(), size_t(1024)));
-    PyGILState_STATE gil_state;
-    gil_state = PyGILState_Ensure();
-    PyErr_SetString(PyExc_RuntimeError, err);
-    PyGILState_Release(gil_state);
-  }
-}
-
 template <typename T>
 static inline T checkSyclErrors(const std::tuple<T, ze_result_t> tuple) {
-  gpuAssert(std::get<1>(tuple));
-  if (PyErr_Occurred())
-    return nullptr;
-  else
+  const auto code = std::get<1>(tuple);
+  if (code != ZE_RESULT_SUCCESS) {
+    throw std::runtime_error(parseZeResultCode(code));
+  } else {
     return std::get<0>(tuple);
+  }
 }
 
 static PyObject *getDeviceProperties(PyObject *self, PyObject *args) {
@@ -113,6 +102,31 @@ void freeKernelBundle(PyObject *p) {
       PyCapsule_GetPointer(p, "kernel_bundle"));
 }
 
+template <typename L0_DEVICE, typename L0_CONTEXT>
+std::tuple<ze_module_handle_t, ze_kernel_handle_t, int32_t, int32_t>
+compileLevelZeroObjects(uint8_t *binary_ptr, const size_t binary_size,
+                        const std::string &kernel_name, L0_DEVICE l0_device,
+                        L0_CONTEXT l0_context, const std::string &build_flags,
+                        const bool is_spv) {
+  auto l0_module =
+      checkSyclErrors(create_module(l0_context, l0_device, binary_ptr,
+                                    binary_size, build_flags.c_str(), is_spv));
+
+  // Retrieve the kernel properties (e.g. register spills).
+  auto l0_kernel = checkSyclErrors(create_function(l0_module, kernel_name));
+
+  ze_kernel_properties_t props;
+  props.stype = ZE_STRUCTURE_TYPE_KERNEL_PROPERTIES;
+  props.pNext = nullptr;
+  checkSyclErrors(
+      std::make_tuple(NULL, zeKernelGetProperties(l0_kernel, &props)));
+
+  int32_t n_spills = props.spillMemSize;
+  const int32_t n_regs = 0;
+
+  return std::make_tuple(l0_module, l0_kernel, n_regs, n_spills);
+}
+
 static PyObject *loadBinary(PyObject *self, PyObject *args) {
   const char *name, *build_flags;
   int shared;
@@ -130,106 +144,97 @@ static PyObject *loadBinary(PyObject *self, PyObject *args) {
     return NULL;
   }
 
-  const auto &sycl_l0_device_pair = g_sycl_l0_device_list[devId];
-  const sycl::device sycl_device = sycl_l0_device_pair.first;
-
-  std::string kernel_name = name;
-  const size_t binary_size = PyBytes_Size(py_bytes);
-
-  uint8_t *binary_ptr = (uint8_t *)PyBytes_AsString(py_bytes);
-  const auto ctx = sycl_device.get_platform().ext_oneapi_get_default_context();
-  const auto l0_device =
-      sycl::get_native<sycl::backend::ext_oneapi_level_zero>(sycl_device);
-  const auto l0_context =
-      sycl::get_native<sycl::backend::ext_oneapi_level_zero>(ctx);
-
-  const auto use_native_code =
-      isEnvValueBool(getStrEnv("TRITON_XPU_GEN_NATIVE_CODE"));
-  const bool is_spv = use_native_code ? !(*use_native_code) : true;
-
-  auto l0_module = checkSyclErrors(create_module(
-      l0_context, l0_device, binary_ptr, binary_size, build_flags, is_spv));
-
-  auto checkL0Errors = [&](auto l0_module) -> ze_kernel_handle_t {
-    if (PyErr_Occurred()) {
-      // check for errors from module creation
-      return NULL;
-    }
-    ze_kernel_handle_t l0_kernel =
-        checkSyclErrors(create_function(l0_module, kernel_name));
-    if (PyErr_Occurred()) {
-      // check for errors from kernel creation
-      return NULL;
-    }
-    return l0_kernel;
-  };
-
-  // Retrieve the kernel properties (e.g. register spills).
-  ze_kernel_handle_t l0_kernel = checkL0Errors(l0_module);
-  ze_kernel_properties_t props;
-  props.stype = ZE_STRUCTURE_TYPE_KERNEL_PROPERTIES;
-  props.pNext = nullptr;
-  gpuAssert(zeKernelGetProperties(l0_kernel, &props));
-
-  int32_t n_spills = props.spillMemSize;
-  const int32_t n_regs = 0;
-
-  if (is_spv) {
-    constexpr int32_t max_reg_spill = 1000;
-    std::string build_flags_str(build_flags);
-    bool is_GRF_mode_specified = false;
-
-    // Check whether the GRF mode is specified by the build flags.
-    if (build_flags_str.find("-cl-intel-256-GRF-per-thread") !=
-            std::string::npos ||
-        build_flags_str.find("-cl-intel-128-GRF-per-thread") !=
-            std::string::npos ||
-        build_flags_str.find("-cl-intel-enable-auto-large-GRF-mode") !=
-            std::string::npos) {
-      is_GRF_mode_specified = true;
-    }
+  try {
+
+    const auto &sycl_l0_device_pair = g_sycl_l0_device_list[devId];
+    const sycl::device sycl_device = sycl_l0_device_pair.first;
+
+    const std::string kernel_name = name;
+    const size_t binary_size = PyBytes_Size(py_bytes);
+
+    uint8_t *binary_ptr = (uint8_t *)PyBytes_AsString(py_bytes);
+    const auto ctx =
+        sycl_device.get_platform().ext_oneapi_get_default_context();
+    const auto l0_device =
+        sycl::get_native<sycl::backend::ext_oneapi_level_zero>(sycl_device);
+    const auto l0_context =
+        sycl::get_native<sycl::backend::ext_oneapi_level_zero>(ctx);
+
+    const auto use_native_code =
+        isEnvValueBool(getStrEnv("TRITON_XPU_GEN_NATIVE_CODE"));
+    const bool is_spv = use_native_code ? !(*use_native_code) : true;
+
+    auto [l0_module, l0_kernel, n_regs, n_spills] =
+        compileLevelZeroObjects(binary_ptr, binary_size, kernel_name, l0_device,
+                                l0_context, build_flags, is_spv);
+
+    if (is_spv) {
+      constexpr int32_t max_reg_spill = 1000;
+      std::string build_flags_str(build_flags);
+      bool is_GRF_mode_specified = false;
+
+      // Check whether the GRF mode is specified by the build flags.
+      if (build_flags_str.find("-cl-intel-256-GRF-per-thread") !=
+              std::string::npos ||
+          build_flags_str.find("-cl-intel-128-GRF-per-thread") !=
+              std::string::npos ||
+          build_flags_str.find("-cl-intel-enable-auto-large-GRF-mode") !=
+              std::string::npos) {
+        is_GRF_mode_specified = true;
+      }
 
-    // If the register mode isn't set, and the number of spills is greater
-    // than the threshold, recompile the kernel using large GRF mode.
-    if (!is_GRF_mode_specified && n_spills > max_reg_spill) {
-      const std::optional<bool> debugEnabled =
+      // If the register mode isn't set, and the number of spills is greater
+      // than the threshold, recompile the kernel using large GRF mode.
+      if (!is_GRF_mode_specified && n_spills > max_reg_spill) {
+        const std::optional<bool> debugEnabled =
           isEnvValueBool(getStrEnv("TRITON_DEBUG"));
-      if (debugEnabled)
-        std::cout << "(I): Detected " << n_spills
-                  << " spills, recompiling kernel \"" << kernel_name
-                  << "\" using large GRF mode" << std::endl;
-
-      const std::string new_build_flags =
-          build_flags_str.append(" -cl-intel-256-GRF-per-thread");
-      l0_module = checkSyclErrors(
-          create_module(l0_context, l0_device, binary_ptr, binary_size,
-                        new_build_flags.c_str(), is_spv));
-
-      l0_kernel = checkL0Errors(l0_module);
-      gpuAssert(zeKernelGetProperties(l0_kernel, &props));
-      n_spills = props.spillMemSize;
-
+        if (debugEnabled)
+          std::cout << "(I): Detected " << n_spills
+                    << " spills, recompiling the kernel using large GRF mode"
+                    << std::endl;
+
+        const std::string new_build_flags =
+            build_flags_str.append(" -cl-intel-256-GRF-per-thread");
+
+        auto [l0_module, l0_kernel, n_regs, n_spills] = compileLevelZeroObjects(
+            binary_ptr, binary_size, kernel_name, l0_device, l0_context,
+            new_build_flags, is_spv);
+      
       if (debugEnabled)
         std::cout << "(I): Kernel has now " << n_spills << " spills"
                   << std::endl;
+      }
     }
-  }
 
-  auto mod = new sycl::kernel_bundle<sycl::bundle_state::executable>(
-      sycl::make_kernel_bundle<sycl::backend::ext_oneapi_level_zero,
-                               sycl::bundle_state::executable>(
-          {l0_module, sycl::ext::oneapi::level_zero::ownership::transfer},
-          ctx));
-  sycl::kernel *fun =
-      new sycl::kernel(sycl::make_kernel<sycl::backend::ext_oneapi_level_zero>(
-          {*mod, l0_kernel, sycl::ext::oneapi::level_zero::ownership::transfer},
-          ctx));
-  auto kernel_py =
-      PyCapsule_New(reinterpret_cast<void *>(fun), "kernel", freeKernel);
-  auto kernel_bundle_py = PyCapsule_New(reinterpret_cast<void *>(mod),
-                                        "kernel_bundle", freeKernelBundle);
-
-  return Py_BuildValue("(OOii)", kernel_bundle_py, kernel_py, n_regs, n_spills);
+    auto mod = new sycl::kernel_bundle<sycl::bundle_state::executable>(
+        sycl::make_kernel_bundle<sycl::backend::ext_oneapi_level_zero,
+                                 sycl::bundle_state::executable>(
+            {l0_module, sycl::ext::oneapi::level_zero::ownership::transfer},
+            ctx));
+    sycl::kernel *fun = new sycl::kernel(
+        sycl::make_kernel<sycl::backend::ext_oneapi_level_zero>(
+            {*mod, l0_kernel,
+             sycl::ext::oneapi::level_zero::ownership::transfer},
+            ctx));
+    auto kernel_py =
+        PyCapsule_New(reinterpret_cast<void *>(fun), "kernel", freeKernel);
+    auto kernel_bundle_py = PyCapsule_New(reinterpret_cast<void *>(mod),
+                                          "kernel_bundle", freeKernelBundle);
+
+    return Py_BuildValue("(OOii)", kernel_bundle_py, kernel_py, n_regs,
+                         n_spills);
+
+  } catch (const std::exception &e) {
+    char err[1024] = {0};
+    std::string_view error_str(e.what());
+    strncat(err, error_str.data(), std::min(error_str.size(), size_t(1024)));
+    PyGILState_STATE gil_state;
+    gil_state = PyGILState_Ensure();
+    PyErr_SetString(PyExc_RuntimeError, err);
+    std::cerr << "Error during Intel loadBinary: " << err << std::endl;
+    PyGILState_Release(gil_state);
+    return NULL;
+  }
 }
 
 static PyObject *initContext(PyObject *self, PyObject *args) {

From 390eaae6bc8d9f8304c709c176282e5a5e14c22e Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Mon, 30 Sep 2024 18:26:41 -0700
Subject: [PATCH 2/5] Return n_regs using grf size build str flag

---
 third_party/intel/backend/driver.c | 90 ++++++++++++++++++++----------
 1 file changed, 61 insertions(+), 29 deletions(-)

diff --git a/third_party/intel/backend/driver.c b/third_party/intel/backend/driver.c
index 7b12e03a3f..88c6be9e9c 100644
--- a/third_party/intel/backend/driver.c
+++ b/third_party/intel/backend/driver.c
@@ -102,15 +102,17 @@ void freeKernelBundle(PyObject *p) {
       PyCapsule_GetPointer(p, "kernel_bundle"));
 }
 
+using Spills = int32_t;
+
 template <typename L0_DEVICE, typename L0_CONTEXT>
-std::tuple<ze_module_handle_t, ze_kernel_handle_t, int32_t, int32_t>
+std::tuple<ze_module_handle_t, ze_kernel_handle_t, Spills>
 compileLevelZeroObjects(uint8_t *binary_ptr, const size_t binary_size,
                         const std::string &kernel_name, L0_DEVICE l0_device,
-                        L0_CONTEXT l0_context, const std::string &build_flags,
-                        const bool is_spv) {
+                        L0_CONTEXT l0_context,
+                        const std::string& build_flags, const bool is_spv) {
   auto l0_module =
       checkSyclErrors(create_module(l0_context, l0_device, binary_ptr,
-                                    binary_size, build_flags.c_str(), is_spv));
+                                    binary_size, build_flags.data(), is_spv));
 
   // Retrieve the kernel properties (e.g. register spills).
   auto l0_kernel = checkSyclErrors(create_function(l0_module, kernel_name));
@@ -121,20 +123,58 @@ compileLevelZeroObjects(uint8_t *binary_ptr, const size_t binary_size,
   checkSyclErrors(
       std::make_tuple(NULL, zeKernelGetProperties(l0_kernel, &props)));
 
-  int32_t n_spills = props.spillMemSize;
-  const int32_t n_regs = 0;
+  const int32_t n_spills = props.spillMemSize;
 
-  return std::make_tuple(l0_module, l0_kernel, n_regs, n_spills);
+  return std::make_tuple(l0_module, l0_kernel, n_spills);
 }
 
+struct BuildFlags {
+  std::string build_flags_str;
+
+  const std::string LARGE_GRF_FLAG{"-cl-intel-256-GRF-per-thread"};
+  const std::string SMALL_GRF_FLAG{"-cl-intel-128-GRF-per-thread"};
+  const std::string AUTO_GRF_FLAG{"-cl-intel-enable-auto-large-GRF-mode"};
+
+  BuildFlags(const char *build_flags) : build_flags_str(build_flags) {}
+
+  const std::string& operator()() const {
+    return build_flags_str;
+  }
+
+  int32_t n_regs() {
+    if (build_flags_str.find(LARGE_GRF_FLAG) != std::string::npos) {
+      return 256;
+    }
+    if (build_flags_str.find(SMALL_GRF_FLAG) != std::string::npos) {
+      return 128;
+    }
+    // TODO: arguably we could return 128 if we find no flag instead of 0. For
+    // now, stick with the conservative choice and alert the user only if a
+    // specific GRF mode is specified.
+    return 0;
+  }
+
+  const bool hasGRFSizeFlag() {
+    if (build_flags_str.find(LARGE_GRF_FLAG) != std::string::npos ||
+        build_flags_str.find(SMALL_GRF_FLAG) != std::string::npos ||
+        build_flags_str.find(AUTO_GRF_FLAG) != std::string::npos) {
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  void addLargeGRFSizeFlag() { build_flags_str = build_flags_str.append(" " + LARGE_GRF_FLAG); }
+};
+
 static PyObject *loadBinary(PyObject *self, PyObject *args) {
-  const char *name, *build_flags;
+  const char *name, *build_flags_ptr;
   int shared;
   PyObject *py_bytes;
   int devId;
 
-  if (!PyArg_ParseTuple(args, "sSisi", &name, &py_bytes, &shared, &build_flags,
-                        &devId)) {
+  if (!PyArg_ParseTuple(args, "sSisi", &name, &py_bytes, &shared,
+                        &build_flags_ptr, &devId)) {
     std::cerr << "loadBinary arg parse failed" << std::endl;
     return NULL;
   }
@@ -144,6 +184,8 @@ static PyObject *loadBinary(PyObject *self, PyObject *args) {
     return NULL;
   }
 
+  BuildFlags build_flags(build_flags_ptr);
+
   try {
 
     const auto &sycl_l0_device_pair = g_sycl_l0_device_list[devId];
@@ -164,24 +206,13 @@ static PyObject *loadBinary(PyObject *self, PyObject *args) {
         isEnvValueBool(getStrEnv("TRITON_XPU_GEN_NATIVE_CODE"));
     const bool is_spv = use_native_code ? !(*use_native_code) : true;
 
-    auto [l0_module, l0_kernel, n_regs, n_spills] =
+    auto [l0_module, l0_kernel, n_spills] =
         compileLevelZeroObjects(binary_ptr, binary_size, kernel_name, l0_device,
-                                l0_context, build_flags, is_spv);
+                                l0_context, build_flags(), is_spv);
 
     if (is_spv) {
       constexpr int32_t max_reg_spill = 1000;
-      std::string build_flags_str(build_flags);
-      bool is_GRF_mode_specified = false;
-
-      // Check whether the GRF mode is specified by the build flags.
-      if (build_flags_str.find("-cl-intel-256-GRF-per-thread") !=
-              std::string::npos ||
-          build_flags_str.find("-cl-intel-128-GRF-per-thread") !=
-              std::string::npos ||
-          build_flags_str.find("-cl-intel-enable-auto-large-GRF-mode") !=
-              std::string::npos) {
-        is_GRF_mode_specified = true;
-      }
+      const bool is_GRF_mode_specified = build_flags.hasGRFSizeFlag();
 
       // If the register mode isn't set, and the number of spills is greater
       // than the threshold, recompile the kernel using large GRF mode.
@@ -193,18 +224,19 @@ static PyObject *loadBinary(PyObject *self, PyObject *args) {
                     << " spills, recompiling the kernel using large GRF mode"
                     << std::endl;
 
-        const std::string new_build_flags =
-            build_flags_str.append(" -cl-intel-256-GRF-per-thread");
+        build_flags.addLargeGRFSizeFlag();
 
-        auto [l0_module, l0_kernel, n_regs, n_spills] = compileLevelZeroObjects(
+        auto [l0_module, l0_kernel, n_spills] = compileLevelZeroObjects(
             binary_ptr, binary_size, kernel_name, l0_device, l0_context,
-            new_build_flags, is_spv);
-      
+            build_flags(), is_spv);
+
       if (debugEnabled)
         std::cout << "(I): Kernel has now " << n_spills << " spills"
                   << std::endl;
       }
     }
+    
+    auto n_regs = build_flags.n_regs();
 
     auto mod = new sycl::kernel_bundle<sycl::bundle_state::executable>(
         sycl::make_kernel_bundle<sycl::backend::ext_oneapi_level_zero,

From d7b0d98524df31aceba1c5fb2b8f96fdc6a65fc9 Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Mon, 30 Sep 2024 18:49:48 -0700
Subject: [PATCH 3/5] small fixups

---
 third_party/intel/backend/compiler.py |  2 +-
 third_party/intel/backend/driver.c    | 22 +++++++++++-----------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py
index 5d1a9fdc75..8ad131d413 100644
--- a/third_party/intel/backend/compiler.py
+++ b/third_party/intel/backend/compiler.py
@@ -338,7 +338,7 @@ def make_spv(src, metadata, options):
                     if os.path.exists(flog.name):
                         with open(flog.name) as log_file:
                             log = log_file.read().strip()
-                            if 'spilled' in log:
+                            if 'spilled' in log and metadata["build_flags"].find("-cl-intel-256-GRF-per-thread") is -1:
                                 """
                                 The exact message is something like:
                                     warning: kernel matmul_kernel  compiled SIMD16 allocated 128 regs and spilled around 217
diff --git a/third_party/intel/backend/driver.c b/third_party/intel/backend/driver.c
index 88c6be9e9c..2796222408 100644
--- a/third_party/intel/backend/driver.c
+++ b/third_party/intel/backend/driver.c
@@ -108,8 +108,8 @@ template <typename L0_DEVICE, typename L0_CONTEXT>
 std::tuple<ze_module_handle_t, ze_kernel_handle_t, Spills>
 compileLevelZeroObjects(uint8_t *binary_ptr, const size_t binary_size,
                         const std::string &kernel_name, L0_DEVICE l0_device,
-                        L0_CONTEXT l0_context,
-                        const std::string& build_flags, const bool is_spv) {
+                        L0_CONTEXT l0_context, const std::string &build_flags,
+                        const bool is_spv) {
   auto l0_module =
       checkSyclErrors(create_module(l0_context, l0_device, binary_ptr,
                                     binary_size, build_flags.data(), is_spv));
@@ -137,9 +137,7 @@ struct BuildFlags {
 
   BuildFlags(const char *build_flags) : build_flags_str(build_flags) {}
 
-  const std::string& operator()() const {
-    return build_flags_str;
-  }
+  const std::string &operator()() const { return build_flags_str; }
 
   int32_t n_regs() {
     if (build_flags_str.find(LARGE_GRF_FLAG) != std::string::npos) {
@@ -164,7 +162,9 @@ struct BuildFlags {
     }
   }
 
-  void addLargeGRFSizeFlag() { build_flags_str = build_flags_str.append(" " + LARGE_GRF_FLAG); }
+  void addLargeGRFSizeFlag() {
+    build_flags_str = build_flags_str.append(" " + LARGE_GRF_FLAG);
+  }
 };
 
 static PyObject *loadBinary(PyObject *self, PyObject *args) {
@@ -218,7 +218,7 @@ static PyObject *loadBinary(PyObject *self, PyObject *args) {
       // than the threshold, recompile the kernel using large GRF mode.
       if (!is_GRF_mode_specified && n_spills > max_reg_spill) {
         const std::optional<bool> debugEnabled =
-          isEnvValueBool(getStrEnv("TRITON_DEBUG"));
+            isEnvValueBool(getStrEnv("TRITON_DEBUG"));
         if (debugEnabled)
           std::cout << "(I): Detected " << n_spills
                     << " spills, recompiling the kernel using large GRF mode"
@@ -230,12 +230,12 @@ static PyObject *loadBinary(PyObject *self, PyObject *args) {
             binary_ptr, binary_size, kernel_name, l0_device, l0_context,
             build_flags(), is_spv);
 
-      if (debugEnabled)
-        std::cout << "(I): Kernel has now " << n_spills << " spills"
-                  << std::endl;
+        if (debugEnabled)
+          std::cout << "(I): Kernel has now " << n_spills << " spills"
+                    << std::endl;
       }
     }
-    
+
     auto n_regs = build_flags.n_regs();
 
     auto mod = new sycl::kernel_bundle<sycl::bundle_state::executable>(

From eeedeeb0061ca602596e852eb0d4456203990dea Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Wed, 16 Oct 2024 17:54:53 -0700
Subject: [PATCH 4/5] Address review comments

---
 third_party/intel/backend/driver.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/third_party/intel/backend/driver.c b/third_party/intel/backend/driver.c
index 2796222408..56e9ac4149 100644
--- a/third_party/intel/backend/driver.c
+++ b/third_party/intel/backend/driver.c
@@ -31,9 +31,8 @@ static inline T checkSyclErrors(const std::tuple<T, ze_result_t> tuple) {
   const auto code = std::get<1>(tuple);
   if (code != ZE_RESULT_SUCCESS) {
     throw std::runtime_error(parseZeResultCode(code));
-  } else {
-    return std::get<0>(tuple);
   }
+  return std::get<0>(tuple);
 }
 
 static PyObject *getDeviceProperties(PyObject *self, PyObject *args) {
@@ -131,15 +130,15 @@ compileLevelZeroObjects(uint8_t *binary_ptr, const size_t binary_size,
 struct BuildFlags {
   std::string build_flags_str;
 
-  const std::string LARGE_GRF_FLAG{"-cl-intel-256-GRF-per-thread"};
-  const std::string SMALL_GRF_FLAG{"-cl-intel-128-GRF-per-thread"};
-  const std::string AUTO_GRF_FLAG{"-cl-intel-enable-auto-large-GRF-mode"};
+  const char *LARGE_GRF_FLAG{"-cl-intel-256-GRF-per-thread"};
+  const char *SMALL_GRF_FLAG{"-cl-intel-128-GRF-per-thread"};
+  const char *AUTO_GRF_FLAG{"-cl-intel-enable-auto-large-GRF-mode"};
 
   BuildFlags(const char *build_flags) : build_flags_str(build_flags) {}
 
   const std::string &operator()() const { return build_flags_str; }
 
-  int32_t n_regs() {
+  int32_t n_regs() const {
     if (build_flags_str.find(LARGE_GRF_FLAG) != std::string::npos) {
       return 256;
     }
@@ -152,18 +151,18 @@ struct BuildFlags {
     return 0;
   }
 
-  const bool hasGRFSizeFlag() {
+  const bool hasGRFSizeFlag() const {
     if (build_flags_str.find(LARGE_GRF_FLAG) != std::string::npos ||
         build_flags_str.find(SMALL_GRF_FLAG) != std::string::npos ||
         build_flags_str.find(AUTO_GRF_FLAG) != std::string::npos) {
       return true;
-    } else {
-      return false;
     }
+
+    return false;
   }
 
   void addLargeGRFSizeFlag() {
-    build_flags_str = build_flags_str.append(" " + LARGE_GRF_FLAG);
+    build_flags_str = build_flags_str.append(" ").append(LARGE_GRF_FLAG);
   }
 };
 

From 6a606b54fa65979037a351c82ff3548b10a9dc78 Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Thu, 17 Oct 2024 02:03:58 +0000
Subject: [PATCH 5/5] small fixup

---
 third_party/intel/backend/compiler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py
index 8ad131d413..da853d0d09 100644
--- a/third_party/intel/backend/compiler.py
+++ b/third_party/intel/backend/compiler.py
@@ -338,7 +338,7 @@ def make_spv(src, metadata, options):
                     if os.path.exists(flog.name):
                         with open(flog.name) as log_file:
                             log = log_file.read().strip()
-                            if 'spilled' in log and metadata["build_flags"].find("-cl-intel-256-GRF-per-thread") is -1:
+                            if 'spilled' in log and metadata["build_flags"].find("-cl-intel-256-GRF-per-thread") == -1:
                                 """
                                 The exact message is something like:
                                     warning: kernel matmul_kernel  compiled SIMD16 allocated 128 regs and spilled around 217