From 897b125d40da5da49e3a9680eecee61037e463f9 Mon Sep 17 00:00:00 2001 From: Alex Baden Date: Thu, 26 Sep 2024 19:15:42 -0700 Subject: [PATCH 1/5] Refactor loadBinary to support n_regs --- third_party/intel/backend/driver.c | 223 +++++++++++++++-------------- 1 file changed, 114 insertions(+), 109 deletions(-) diff --git a/third_party/intel/backend/driver.c b/third_party/intel/backend/driver.c index aaabd95fc6..7b12e03a3f 100644 --- a/third_party/intel/backend/driver.c +++ b/third_party/intel/backend/driver.c @@ -26,25 +26,14 @@ static std::vector g_devices; static std::vector> g_sycl_l0_device_list; -static inline void gpuAssert(ze_result_t code) { - if (code != ZE_RESULT_SUCCESS) { - auto str = parseZeResultCode(code); - char err[1024] = {0}; - strncat(err, str.c_str(), std::min(str.size(), size_t(1024))); - PyGILState_STATE gil_state; - gil_state = PyGILState_Ensure(); - PyErr_SetString(PyExc_RuntimeError, err); - PyGILState_Release(gil_state); - } -} - template static inline T checkSyclErrors(const std::tuple tuple) { - gpuAssert(std::get<1>(tuple)); - if (PyErr_Occurred()) - return nullptr; - else + const auto code = std::get<1>(tuple); + if (code != ZE_RESULT_SUCCESS) { + throw std::runtime_error(parseZeResultCode(code)); + } else { return std::get<0>(tuple); + } } static PyObject *getDeviceProperties(PyObject *self, PyObject *args) { @@ -113,6 +102,31 @@ void freeKernelBundle(PyObject *p) { PyCapsule_GetPointer(p, "kernel_bundle")); } +template +std::tuple +compileLevelZeroObjects(uint8_t *binary_ptr, const size_t binary_size, + const std::string &kernel_name, L0_DEVICE l0_device, + L0_CONTEXT l0_context, const std::string &build_flags, + const bool is_spv) { + auto l0_module = + checkSyclErrors(create_module(l0_context, l0_device, binary_ptr, + binary_size, build_flags.c_str(), is_spv)); + + // Retrieve the kernel properties (e.g. register spills). + auto l0_kernel = checkSyclErrors(create_function(l0_module, kernel_name)); + + ze_kernel_properties_t props; + props.stype = ZE_STRUCTURE_TYPE_KERNEL_PROPERTIES; + props.pNext = nullptr; + checkSyclErrors( + std::make_tuple(NULL, zeKernelGetProperties(l0_kernel, &props))); + + int32_t n_spills = props.spillMemSize; + const int32_t n_regs = 0; + + return std::make_tuple(l0_module, l0_kernel, n_regs, n_spills); +} + static PyObject *loadBinary(PyObject *self, PyObject *args) { const char *name, *build_flags; int shared; @@ -130,106 +144,97 @@ static PyObject *loadBinary(PyObject *self, PyObject *args) { return NULL; } - const auto &sycl_l0_device_pair = g_sycl_l0_device_list[devId]; - const sycl::device sycl_device = sycl_l0_device_pair.first; - - std::string kernel_name = name; - const size_t binary_size = PyBytes_Size(py_bytes); - - uint8_t *binary_ptr = (uint8_t *)PyBytes_AsString(py_bytes); - const auto ctx = sycl_device.get_platform().ext_oneapi_get_default_context(); - const auto l0_device = - sycl::get_native(sycl_device); - const auto l0_context = - sycl::get_native(ctx); - - const auto use_native_code = - isEnvValueBool(getStrEnv("TRITON_XPU_GEN_NATIVE_CODE")); - const bool is_spv = use_native_code ? !(*use_native_code) : true; - - auto l0_module = checkSyclErrors(create_module( - l0_context, l0_device, binary_ptr, binary_size, build_flags, is_spv)); - - auto checkL0Errors = [&](auto l0_module) -> ze_kernel_handle_t { - if (PyErr_Occurred()) { - // check for errors from module creation - return NULL; - } - ze_kernel_handle_t l0_kernel = - checkSyclErrors(create_function(l0_module, kernel_name)); - if (PyErr_Occurred()) { - // check for errors from kernel creation - return NULL; - } - return l0_kernel; - }; - - // Retrieve the kernel properties (e.g. register spills). - ze_kernel_handle_t l0_kernel = checkL0Errors(l0_module); - ze_kernel_properties_t props; - props.stype = ZE_STRUCTURE_TYPE_KERNEL_PROPERTIES; - props.pNext = nullptr; - gpuAssert(zeKernelGetProperties(l0_kernel, &props)); - - int32_t n_spills = props.spillMemSize; - const int32_t n_regs = 0; - - if (is_spv) { - constexpr int32_t max_reg_spill = 1000; - std::string build_flags_str(build_flags); - bool is_GRF_mode_specified = false; - - // Check whether the GRF mode is specified by the build flags. - if (build_flags_str.find("-cl-intel-256-GRF-per-thread") != - std::string::npos || - build_flags_str.find("-cl-intel-128-GRF-per-thread") != - std::string::npos || - build_flags_str.find("-cl-intel-enable-auto-large-GRF-mode") != - std::string::npos) { - is_GRF_mode_specified = true; - } + try { + + const auto &sycl_l0_device_pair = g_sycl_l0_device_list[devId]; + const sycl::device sycl_device = sycl_l0_device_pair.first; + + const std::string kernel_name = name; + const size_t binary_size = PyBytes_Size(py_bytes); + + uint8_t *binary_ptr = (uint8_t *)PyBytes_AsString(py_bytes); + const auto ctx = + sycl_device.get_platform().ext_oneapi_get_default_context(); + const auto l0_device = + sycl::get_native(sycl_device); + const auto l0_context = + sycl::get_native(ctx); + + const auto use_native_code = + isEnvValueBool(getStrEnv("TRITON_XPU_GEN_NATIVE_CODE")); + const bool is_spv = use_native_code ? !(*use_native_code) : true; + + auto [l0_module, l0_kernel, n_regs, n_spills] = + compileLevelZeroObjects(binary_ptr, binary_size, kernel_name, l0_device, + l0_context, build_flags, is_spv); + + if (is_spv) { + constexpr int32_t max_reg_spill = 1000; + std::string build_flags_str(build_flags); + bool is_GRF_mode_specified = false; + + // Check whether the GRF mode is specified by the build flags. + if (build_flags_str.find("-cl-intel-256-GRF-per-thread") != + std::string::npos || + build_flags_str.find("-cl-intel-128-GRF-per-thread") != + std::string::npos || + build_flags_str.find("-cl-intel-enable-auto-large-GRF-mode") != + std::string::npos) { + is_GRF_mode_specified = true; + } - // If the register mode isn't set, and the number of spills is greater - // than the threshold, recompile the kernel using large GRF mode. - if (!is_GRF_mode_specified && n_spills > max_reg_spill) { - const std::optional debugEnabled = + // If the register mode isn't set, and the number of spills is greater + // than the threshold, recompile the kernel using large GRF mode. + if (!is_GRF_mode_specified && n_spills > max_reg_spill) { + const std::optional debugEnabled = isEnvValueBool(getStrEnv("TRITON_DEBUG")); - if (debugEnabled) - std::cout << "(I): Detected " << n_spills - << " spills, recompiling kernel \"" << kernel_name - << "\" using large GRF mode" << std::endl; - - const std::string new_build_flags = - build_flags_str.append(" -cl-intel-256-GRF-per-thread"); - l0_module = checkSyclErrors( - create_module(l0_context, l0_device, binary_ptr, binary_size, - new_build_flags.c_str(), is_spv)); - - l0_kernel = checkL0Errors(l0_module); - gpuAssert(zeKernelGetProperties(l0_kernel, &props)); - n_spills = props.spillMemSize; - + if (debugEnabled) + std::cout << "(I): Detected " << n_spills + << " spills, recompiling the kernel using large GRF mode" + << std::endl; + + const std::string new_build_flags = + build_flags_str.append(" -cl-intel-256-GRF-per-thread"); + + auto [l0_module, l0_kernel, n_regs, n_spills] = compileLevelZeroObjects( + binary_ptr, binary_size, kernel_name, l0_device, l0_context, + new_build_flags, is_spv); + if (debugEnabled) std::cout << "(I): Kernel has now " << n_spills << " spills" << std::endl; + } } - } - auto mod = new sycl::kernel_bundle( - sycl::make_kernel_bundle( - {l0_module, sycl::ext::oneapi::level_zero::ownership::transfer}, - ctx)); - sycl::kernel *fun = - new sycl::kernel(sycl::make_kernel( - {*mod, l0_kernel, sycl::ext::oneapi::level_zero::ownership::transfer}, - ctx)); - auto kernel_py = - PyCapsule_New(reinterpret_cast(fun), "kernel", freeKernel); - auto kernel_bundle_py = PyCapsule_New(reinterpret_cast(mod), - "kernel_bundle", freeKernelBundle); - - return Py_BuildValue("(OOii)", kernel_bundle_py, kernel_py, n_regs, n_spills); + auto mod = new sycl::kernel_bundle( + sycl::make_kernel_bundle( + {l0_module, sycl::ext::oneapi::level_zero::ownership::transfer}, + ctx)); + sycl::kernel *fun = new sycl::kernel( + sycl::make_kernel( + {*mod, l0_kernel, + sycl::ext::oneapi::level_zero::ownership::transfer}, + ctx)); + auto kernel_py = + PyCapsule_New(reinterpret_cast(fun), "kernel", freeKernel); + auto kernel_bundle_py = PyCapsule_New(reinterpret_cast(mod), + "kernel_bundle", freeKernelBundle); + + return Py_BuildValue("(OOii)", kernel_bundle_py, kernel_py, n_regs, + n_spills); + + } catch (const std::exception &e) { + char err[1024] = {0}; + std::string_view error_str(e.what()); + strncat(err, error_str.data(), std::min(error_str.size(), size_t(1024))); + PyGILState_STATE gil_state; + gil_state = PyGILState_Ensure(); + PyErr_SetString(PyExc_RuntimeError, err); + std::cerr << "Error during Intel loadBinary: " << err << std::endl; + PyGILState_Release(gil_state); + return NULL; + } } static PyObject *initContext(PyObject *self, PyObject *args) { From 390eaae6bc8d9f8304c709c176282e5a5e14c22e Mon Sep 17 00:00:00 2001 From: Alex Baden Date: Mon, 30 Sep 2024 18:26:41 -0700 Subject: [PATCH 2/5] Return n_regs using grf size build str flag --- third_party/intel/backend/driver.c | 90 ++++++++++++++++++++---------- 1 file changed, 61 insertions(+), 29 deletions(-) diff --git a/third_party/intel/backend/driver.c b/third_party/intel/backend/driver.c index 7b12e03a3f..88c6be9e9c 100644 --- a/third_party/intel/backend/driver.c +++ b/third_party/intel/backend/driver.c @@ -102,15 +102,17 @@ void freeKernelBundle(PyObject *p) { PyCapsule_GetPointer(p, "kernel_bundle")); } +using Spills = int32_t; + template -std::tuple +std::tuple compileLevelZeroObjects(uint8_t *binary_ptr, const size_t binary_size, const std::string &kernel_name, L0_DEVICE l0_device, - L0_CONTEXT l0_context, const std::string &build_flags, - const bool is_spv) { + L0_CONTEXT l0_context, + const std::string& build_flags, const bool is_spv) { auto l0_module = checkSyclErrors(create_module(l0_context, l0_device, binary_ptr, - binary_size, build_flags.c_str(), is_spv)); + binary_size, build_flags.data(), is_spv)); // Retrieve the kernel properties (e.g. register spills). auto l0_kernel = checkSyclErrors(create_function(l0_module, kernel_name)); @@ -121,20 +123,58 @@ compileLevelZeroObjects(uint8_t *binary_ptr, const size_t binary_size, checkSyclErrors( std::make_tuple(NULL, zeKernelGetProperties(l0_kernel, &props))); - int32_t n_spills = props.spillMemSize; - const int32_t n_regs = 0; + const int32_t n_spills = props.spillMemSize; - return std::make_tuple(l0_module, l0_kernel, n_regs, n_spills); + return std::make_tuple(l0_module, l0_kernel, n_spills); } +struct BuildFlags { + std::string build_flags_str; + + const std::string LARGE_GRF_FLAG{"-cl-intel-256-GRF-per-thread"}; + const std::string SMALL_GRF_FLAG{"-cl-intel-128-GRF-per-thread"}; + const std::string AUTO_GRF_FLAG{"-cl-intel-enable-auto-large-GRF-mode"}; + + BuildFlags(const char *build_flags) : build_flags_str(build_flags) {} + + const std::string& operator()() const { + return build_flags_str; + } + + int32_t n_regs() { + if (build_flags_str.find(LARGE_GRF_FLAG) != std::string::npos) { + return 256; + } + if (build_flags_str.find(SMALL_GRF_FLAG) != std::string::npos) { + return 128; + } + // TODO: arguably we could return 128 if we find no flag instead of 0. For + // now, stick with the conservative choice and alert the user only if a + // specific GRF mode is specified. + return 0; + } + + const bool hasGRFSizeFlag() { + if (build_flags_str.find(LARGE_GRF_FLAG) != std::string::npos || + build_flags_str.find(SMALL_GRF_FLAG) != std::string::npos || + build_flags_str.find(AUTO_GRF_FLAG) != std::string::npos) { + return true; + } else { + return false; + } + } + + void addLargeGRFSizeFlag() { build_flags_str = build_flags_str.append(" " + LARGE_GRF_FLAG); } +}; + static PyObject *loadBinary(PyObject *self, PyObject *args) { - const char *name, *build_flags; + const char *name, *build_flags_ptr; int shared; PyObject *py_bytes; int devId; - if (!PyArg_ParseTuple(args, "sSisi", &name, &py_bytes, &shared, &build_flags, - &devId)) { + if (!PyArg_ParseTuple(args, "sSisi", &name, &py_bytes, &shared, + &build_flags_ptr, &devId)) { std::cerr << "loadBinary arg parse failed" << std::endl; return NULL; } @@ -144,6 +184,8 @@ static PyObject *loadBinary(PyObject *self, PyObject *args) { return NULL; } + BuildFlags build_flags(build_flags_ptr); + try { const auto &sycl_l0_device_pair = g_sycl_l0_device_list[devId]; @@ -164,24 +206,13 @@ static PyObject *loadBinary(PyObject *self, PyObject *args) { isEnvValueBool(getStrEnv("TRITON_XPU_GEN_NATIVE_CODE")); const bool is_spv = use_native_code ? !(*use_native_code) : true; - auto [l0_module, l0_kernel, n_regs, n_spills] = + auto [l0_module, l0_kernel, n_spills] = compileLevelZeroObjects(binary_ptr, binary_size, kernel_name, l0_device, - l0_context, build_flags, is_spv); + l0_context, build_flags(), is_spv); if (is_spv) { constexpr int32_t max_reg_spill = 1000; - std::string build_flags_str(build_flags); - bool is_GRF_mode_specified = false; - - // Check whether the GRF mode is specified by the build flags. - if (build_flags_str.find("-cl-intel-256-GRF-per-thread") != - std::string::npos || - build_flags_str.find("-cl-intel-128-GRF-per-thread") != - std::string::npos || - build_flags_str.find("-cl-intel-enable-auto-large-GRF-mode") != - std::string::npos) { - is_GRF_mode_specified = true; - } + const bool is_GRF_mode_specified = build_flags.hasGRFSizeFlag(); // If the register mode isn't set, and the number of spills is greater // than the threshold, recompile the kernel using large GRF mode. @@ -193,18 +224,19 @@ static PyObject *loadBinary(PyObject *self, PyObject *args) { << " spills, recompiling the kernel using large GRF mode" << std::endl; - const std::string new_build_flags = - build_flags_str.append(" -cl-intel-256-GRF-per-thread"); + build_flags.addLargeGRFSizeFlag(); - auto [l0_module, l0_kernel, n_regs, n_spills] = compileLevelZeroObjects( + auto [l0_module, l0_kernel, n_spills] = compileLevelZeroObjects( binary_ptr, binary_size, kernel_name, l0_device, l0_context, - new_build_flags, is_spv); - + build_flags(), is_spv); + if (debugEnabled) std::cout << "(I): Kernel has now " << n_spills << " spills" << std::endl; } } + + auto n_regs = build_flags.n_regs(); auto mod = new sycl::kernel_bundle( sycl::make_kernel_bundle Date: Mon, 30 Sep 2024 18:49:48 -0700 Subject: [PATCH 3/5] small fixups --- third_party/intel/backend/compiler.py | 2 +- third_party/intel/backend/driver.c | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py index 5d1a9fdc75..8ad131d413 100644 --- a/third_party/intel/backend/compiler.py +++ b/third_party/intel/backend/compiler.py @@ -338,7 +338,7 @@ def make_spv(src, metadata, options): if os.path.exists(flog.name): with open(flog.name) as log_file: log = log_file.read().strip() - if 'spilled' in log: + if 'spilled' in log and metadata["build_flags"].find("-cl-intel-256-GRF-per-thread") is -1: """ The exact message is something like: warning: kernel matmul_kernel compiled SIMD16 allocated 128 regs and spilled around 217 diff --git a/third_party/intel/backend/driver.c b/third_party/intel/backend/driver.c index 88c6be9e9c..2796222408 100644 --- a/third_party/intel/backend/driver.c +++ b/third_party/intel/backend/driver.c @@ -108,8 +108,8 @@ template std::tuple compileLevelZeroObjects(uint8_t *binary_ptr, const size_t binary_size, const std::string &kernel_name, L0_DEVICE l0_device, - L0_CONTEXT l0_context, - const std::string& build_flags, const bool is_spv) { + L0_CONTEXT l0_context, const std::string &build_flags, + const bool is_spv) { auto l0_module = checkSyclErrors(create_module(l0_context, l0_device, binary_ptr, binary_size, build_flags.data(), is_spv)); @@ -137,9 +137,7 @@ struct BuildFlags { BuildFlags(const char *build_flags) : build_flags_str(build_flags) {} - const std::string& operator()() const { - return build_flags_str; - } + const std::string &operator()() const { return build_flags_str; } int32_t n_regs() { if (build_flags_str.find(LARGE_GRF_FLAG) != std::string::npos) { @@ -164,7 +162,9 @@ struct BuildFlags { } } - void addLargeGRFSizeFlag() { build_flags_str = build_flags_str.append(" " + LARGE_GRF_FLAG); } + void addLargeGRFSizeFlag() { + build_flags_str = build_flags_str.append(" " + LARGE_GRF_FLAG); + } }; static PyObject *loadBinary(PyObject *self, PyObject *args) { @@ -218,7 +218,7 @@ static PyObject *loadBinary(PyObject *self, PyObject *args) { // than the threshold, recompile the kernel using large GRF mode. if (!is_GRF_mode_specified && n_spills > max_reg_spill) { const std::optional debugEnabled = - isEnvValueBool(getStrEnv("TRITON_DEBUG")); + isEnvValueBool(getStrEnv("TRITON_DEBUG")); if (debugEnabled) std::cout << "(I): Detected " << n_spills << " spills, recompiling the kernel using large GRF mode" @@ -230,12 +230,12 @@ static PyObject *loadBinary(PyObject *self, PyObject *args) { binary_ptr, binary_size, kernel_name, l0_device, l0_context, build_flags(), is_spv); - if (debugEnabled) - std::cout << "(I): Kernel has now " << n_spills << " spills" - << std::endl; + if (debugEnabled) + std::cout << "(I): Kernel has now " << n_spills << " spills" + << std::endl; } } - + auto n_regs = build_flags.n_regs(); auto mod = new sycl::kernel_bundle( From eeedeeb0061ca602596e852eb0d4456203990dea Mon Sep 17 00:00:00 2001 From: Alex Baden Date: Wed, 16 Oct 2024 17:54:53 -0700 Subject: [PATCH 4/5] Address review comments --- third_party/intel/backend/driver.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/third_party/intel/backend/driver.c b/third_party/intel/backend/driver.c index 2796222408..56e9ac4149 100644 --- a/third_party/intel/backend/driver.c +++ b/third_party/intel/backend/driver.c @@ -31,9 +31,8 @@ static inline T checkSyclErrors(const std::tuple tuple) { const auto code = std::get<1>(tuple); if (code != ZE_RESULT_SUCCESS) { throw std::runtime_error(parseZeResultCode(code)); - } else { - return std::get<0>(tuple); } + return std::get<0>(tuple); } static PyObject *getDeviceProperties(PyObject *self, PyObject *args) { @@ -131,15 +130,15 @@ compileLevelZeroObjects(uint8_t *binary_ptr, const size_t binary_size, struct BuildFlags { std::string build_flags_str; - const std::string LARGE_GRF_FLAG{"-cl-intel-256-GRF-per-thread"}; - const std::string SMALL_GRF_FLAG{"-cl-intel-128-GRF-per-thread"}; - const std::string AUTO_GRF_FLAG{"-cl-intel-enable-auto-large-GRF-mode"}; + const char *LARGE_GRF_FLAG{"-cl-intel-256-GRF-per-thread"}; + const char *SMALL_GRF_FLAG{"-cl-intel-128-GRF-per-thread"}; + const char *AUTO_GRF_FLAG{"-cl-intel-enable-auto-large-GRF-mode"}; BuildFlags(const char *build_flags) : build_flags_str(build_flags) {} const std::string &operator()() const { return build_flags_str; } - int32_t n_regs() { + int32_t n_regs() const { if (build_flags_str.find(LARGE_GRF_FLAG) != std::string::npos) { return 256; } @@ -152,18 +151,18 @@ struct BuildFlags { return 0; } - const bool hasGRFSizeFlag() { + const bool hasGRFSizeFlag() const { if (build_flags_str.find(LARGE_GRF_FLAG) != std::string::npos || build_flags_str.find(SMALL_GRF_FLAG) != std::string::npos || build_flags_str.find(AUTO_GRF_FLAG) != std::string::npos) { return true; - } else { - return false; } + + return false; } void addLargeGRFSizeFlag() { - build_flags_str = build_flags_str.append(" " + LARGE_GRF_FLAG); + build_flags_str = build_flags_str.append(" ").append(LARGE_GRF_FLAG); } }; From 6a606b54fa65979037a351c82ff3548b10a9dc78 Mon Sep 17 00:00:00 2001 From: Alex Baden Date: Thu, 17 Oct 2024 02:03:58 +0000 Subject: [PATCH 5/5] small fixup --- third_party/intel/backend/compiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py index 8ad131d413..da853d0d09 100644 --- a/third_party/intel/backend/compiler.py +++ b/third_party/intel/backend/compiler.py @@ -338,7 +338,7 @@ def make_spv(src, metadata, options): if os.path.exists(flog.name): with open(flog.name) as log_file: log = log_file.read().strip() - if 'spilled' in log and metadata["build_flags"].find("-cl-intel-256-GRF-per-thread") is -1: + if 'spilled' in log and metadata["build_flags"].find("-cl-intel-256-GRF-per-thread") == -1: """ The exact message is something like: warning: kernel matmul_kernel compiled SIMD16 allocated 128 regs and spilled around 217