Skip to content

Commit f674b76

Browse files
authored
[PROTON][XPU] Remove l0 build-time dependency (#5369)
Now code that depends on level-zero will be compiled at Triton runtime using `compile_module_from_src`. --------- Signed-off-by: Anatoly Myachev <[email protected]>
1 parent 7979638 commit f674b76

File tree

6 files changed

+110
-111
lines changed

6 files changed

+110
-111
lines changed

third_party/intel/backend/proton_utils.cpp

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
#include <cstring>
2+
#include <level_zero/ze_api.h>
13
#include <sycl/sycl.hpp>
24

35
extern "C" void waitOnSyclQueue(void *syclQueue) {
@@ -34,3 +36,74 @@ enumDeviceUUIDs(std::vector<std::array<uint8_t, 16>> deviceUUIDs_) {
3436
}
3537
}
3638
}
39+
40+
namespace proton {
41+
42+
namespace xpu {
43+
44+
void check(ze_result_t ret, const char *functionName) {
45+
if (ret != ZE_RESULT_SUCCESS) {
46+
throw std::runtime_error("Failed to execute " + std::string(functionName) +
47+
" with error " + std::to_string(ret));
48+
}
49+
}
50+
51+
// FIXME: for this initialization is needed
52+
// ref: initDevices
53+
// static std::vector<std::pair<sycl::device, ze_device_handle_t>>
54+
// g_sycl_l0_device_list;
55+
56+
// FIXME: rewrite with
57+
// sycl::device.get_info<sycl::ext::intel::info::device::architecture>; cache
58+
// the result
59+
extern "C" void getDeviceProperties(uint64_t index, uint32_t *clockRate,
60+
uint32_t *memoryClockRate,
61+
uint32_t *busWidth, uint32_t *numSms,
62+
char arch[256]) {
63+
// ref: getDeviceProperties
64+
65+
// FIXME: double check that initialization is needed
66+
// At the very least, it shouldn't be for every call
67+
check(zeInit(ZE_INIT_FLAG_GPU_ONLY), "zeInit");
68+
69+
// FIXME: For now I use the naive approach that the device index from PTI
70+
// record coincides with the default numbering of all devices
71+
uint32_t driverCount = 1;
72+
ze_driver_handle_t driverHandle;
73+
check(zeDriverGet(&driverCount, &driverHandle), "zeDriverGet");
74+
uint32_t deviceCount = 1;
75+
// Get device handle
76+
ze_device_handle_t phDevice;
77+
check(zeDeviceGet(driverHandle, &deviceCount, &phDevice), "zeDeviceGet");
78+
// create a struct to hold device properties
79+
ze_device_properties_t device_properties = {};
80+
device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
81+
check(zeDeviceGetProperties(phDevice, &device_properties),
82+
"zeDeviceGetProperties");
83+
*clockRate = device_properties.coreClockRate;
84+
*numSms =
85+
device_properties.numSlices * device_properties.numSubslicesPerSlice;
86+
// create a struct to hold device memory properties
87+
uint32_t memoryCount = 0;
88+
check(zeDeviceGetMemoryProperties(phDevice, &memoryCount, nullptr),
89+
"zeDeviceGetMemoryProperties");
90+
auto pMemoryProperties = new ze_device_memory_properties_t[memoryCount];
91+
for (uint32_t mem = 0; mem < memoryCount; ++mem) {
92+
pMemoryProperties[mem].stype = ZE_STRUCTURE_TYPE_DEVICE_MEMORY_PROPERTIES;
93+
pMemoryProperties[mem].pNext = nullptr;
94+
}
95+
check(zeDeviceGetMemoryProperties(phDevice, &memoryCount, pMemoryProperties),
96+
"zeDeviceGetMemoryProperties");
97+
98+
*memoryClockRate = pMemoryProperties[0].maxClockRate;
99+
*busWidth = pMemoryProperties[0].maxBusWidth;
100+
101+
delete[] pMemoryProperties;
102+
103+
// FIXME: there should be architecture, but not a name
104+
memcpy(arch, device_properties.name, 256);
105+
}
106+
107+
} // namespace xpu
108+
109+
} // namespace proton

third_party/proton/csrc/include/Driver/GPU/XpuApi.h

Lines changed: 1 addition & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,29 +2,12 @@
22
#define PROTON_DRIVER_GPU_SYCL_H_
33

44
#include "Device.h"
5-
#include <level_zero/ze_api.h>
65

76
namespace proton {
87

98
namespace xpu {
109

11-
template <bool CheckSuccess> ze_result_t init(ze_init_flags_t flags);
12-
13-
template <bool CheckSuccess>
14-
ze_result_t driverGet(uint32_t *pCount, ze_driver_handle_t *phDrivers);
15-
16-
template <bool CheckSuccess>
17-
ze_result_t deviceGet(ze_driver_handle_t hDriver, uint32_t *pCount,
18-
ze_device_handle_t *phDevices);
19-
20-
template <bool CheckSuccess>
21-
ze_result_t deviceGetProperties(ze_device_handle_t hDevice,
22-
ze_device_properties_t *pDeviceProperties);
23-
24-
template <bool CheckSuccess>
25-
ze_result_t
26-
deviceGetMemoryProperties(ze_device_handle_t hDevice, uint32_t *pCount,
27-
ze_device_memory_properties_t *pMemProperties);
10+
extern std::string PROTON_UTILS;
2811

2912
Device getDevice(uint64_t index);
3013

third_party/proton/csrc/include/Profiler/GPUProfiler.h

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,6 @@ class GPUProfiler : public Profiler,
3737
return dynamic_cast<ConcreteProfilerT &>(*this);
3838
}
3939

40-
ConcreteProfilerT &setUtilsCachePath(const std::string &utils_cache_path) {
41-
this->utils_cache_path = utils_cache_path;
42-
return dynamic_cast<ConcreteProfilerT &>(*this);
43-
}
44-
4540
protected:
4641
// OpInterface
4742
void startOp(const Scope &scope) override {
@@ -143,7 +138,6 @@ class GPUProfiler : public Profiler,
143138
static thread_local ThreadState threadState;
144139
Correlation correlation;
145140
void *syclQueue;
146-
std::string utils_cache_path;
147141

148142
// Use the pimpl idiom to hide the implementation details. This lets us avoid
149143
// including the cupti header from this header. The cupti header and the

third_party/proton/csrc/lib/Driver/GPU/XpuApi.cpp

Lines changed: 27 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -1,95 +1,44 @@
11
#include "Driver/GPU/XpuApi.h"
2-
#include "Driver/Dispatch.h"
32

4-
#include <level_zero/ze_api.h>
3+
#include <dlfcn.h>
4+
#include <stdexcept>
55
#include <string>
66

77
namespace proton {
88

99
namespace xpu {
1010

11-
struct ExternLibLevelZero : public ExternLibBase {
12-
using RetType = ze_result_t;
11+
std::string PROTON_UTILS;
1312

14-
// FIXME: removeme `/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so.1`
15-
static constexpr const char *name = "libze_intel_gpu.so.1";
16-
static constexpr const char *defaultDir = "";
17-
static constexpr RetType success = ZE_RESULT_SUCCESS;
18-
static void *lib;
19-
};
13+
typedef void (*GetDevicePropertiesFunc)(uint64_t, uint32_t *, uint32_t *,
14+
uint32_t *, uint32_t *, char[256]);
2015

21-
void *ExternLibLevelZero::lib = nullptr;
22-
23-
// https://oneapi-src.github.io/level-zero-spec/level-zero/latest/core/api.html#zeinit
24-
DEFINE_DISPATCH(ExternLibLevelZero, init, zeInit, ze_init_flags_t)
25-
// https://oneapi-src.github.io/level-zero-spec/level-zero/latest/core/api.html#zedriverget
26-
DEFINE_DISPATCH(ExternLibLevelZero, driverGet, zeDriverGet, uint32_t *,
27-
ze_driver_handle_t *)
28-
// https://oneapi-src.github.io/level-zero-spec/level-zero/latest/core/api.html#zedeviceget
29-
DEFINE_DISPATCH(ExternLibLevelZero, deviceGet, zeDeviceGet, ze_driver_handle_t,
30-
uint32_t *, ze_device_handle_t *)
31-
// https://oneapi-src.github.io/level-zero-spec/level-zero/latest/core/api.html#zedevicegetproperties
32-
DEFINE_DISPATCH(ExternLibLevelZero, deviceGetProperties, zeDeviceGetProperties,
33-
ze_device_handle_t, ze_device_properties_t *)
34-
// https://oneapi-src.github.io/level-zero-spec/level-zero/latest/core/api.html#zedevicegetmemoryproperties
35-
DEFINE_DISPATCH(ExternLibLevelZero, deviceGetMemoryProperties,
36-
zeDeviceGetMemoryProperties, ze_device_handle_t, uint32_t *,
37-
ze_device_memory_properties_t *)
38-
39-
// FIXME: for this initialization is needed
40-
// ref: initDevices
41-
// static std::vector<std::pair<sycl::device, ze_device_handle_t>>
42-
// g_sycl_l0_device_list;
43-
44-
// FIXME: rewrite with
45-
// sycl::device.get_info<sycl::ext::intel::info::device::architecture>; cache
46-
// the result
4716
Device getDevice(uint64_t index) {
48-
// ref: getDeviceProperties
49-
50-
// FIXME: double check that initialization is needed
51-
// At the very least, it shouldn't be for every call
52-
xpu::init<true>(ZE_INIT_FLAG_GPU_ONLY);
53-
54-
// FIXME: For now I use the naive approach that the device index from PTI
55-
// record coincides with the default numbering of all devices
56-
57-
uint32_t driverCount = 1;
58-
ze_driver_handle_t driverHandle;
59-
xpu::driverGet<true>(&driverCount, &driverHandle);
60-
uint32_t deviceCount = 1;
61-
62-
// Get device handle
63-
ze_device_handle_t phDevice;
64-
xpu::deviceGet<true>(driverHandle, &deviceCount, &phDevice);
65-
66-
// create a struct to hold device properties
67-
ze_device_properties_t device_properties = {};
68-
device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
69-
xpu::deviceGetProperties<true>(phDevice, &device_properties);
70-
71-
uint32_t clockRate = device_properties.coreClockRate;
72-
uint32_t numSms =
73-
device_properties.numSlices * device_properties.numSubslicesPerSlice;
74-
75-
// create a struct to hold device memory properties
76-
uint32_t memoryCount = 0;
77-
xpu::deviceGetMemoryProperties<true>(phDevice, &memoryCount, nullptr);
78-
auto pMemoryProperties = new ze_device_memory_properties_t[memoryCount];
79-
for (uint32_t mem = 0; mem < memoryCount; ++mem) {
80-
pMemoryProperties[mem].stype = ZE_STRUCTURE_TYPE_DEVICE_MEMORY_PROPERTIES;
81-
pMemoryProperties[mem].pNext = nullptr;
17+
void *handle = dlopen(PROTON_UTILS.data(), RTLD_LAZY);
18+
if (!handle) {
19+
const char *dlopen_error = dlerror();
20+
throw std::runtime_error(std::string("Failed to load library: ") +
21+
std::string(dlopen_error));
8222
}
83-
xpu::deviceGetMemoryProperties<true>(phDevice, &memoryCount,
84-
pMemoryProperties);
85-
86-
int memoryClockRate = pMemoryProperties[0].maxClockRate;
87-
int busWidth = pMemoryProperties[0].maxBusWidth;
8823

89-
delete[] pMemoryProperties;
24+
dlerror();
25+
GetDevicePropertiesFunc getDeviceProperties =
26+
(GetDevicePropertiesFunc)dlsym(handle, "getDeviceProperties");
27+
const char *dlsym_error = dlerror();
28+
if (dlsym_error) {
29+
dlclose(handle);
30+
throw std::runtime_error(std::string("Failed to load function: ") +
31+
std::string(dlsym_error));
32+
}
9033

91-
// FIXME: there should be architecture, but not a name
92-
std::string arch = device_properties.name;
34+
uint32_t clockRate = 0;
35+
uint32_t memoryClockRate = 0;
36+
uint32_t busWidth = 0;
37+
uint32_t numSms = 0;
38+
char arch[256];
39+
getDeviceProperties(index, &clockRate, &memoryClockRate, &busWidth, &numSms,
40+
arch);
41+
dlclose(handle);
9342

9443
return Device(DeviceType::XPU, index, clockRate, memoryClockRate, busWidth,
9544
numSms, arch);

third_party/proton/csrc/lib/Profiler/Xpupti/XpuptiProfiler.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,7 @@ void CallbackCommon(pti_callback_domain domain,
278278
typedef void (*EnumDeviceUUIDsFunc)(std::vector<std::array<uint8_t, 16>>);
279279

280280
int callEnumDeviceUUIDs(const std::string &utils_cache_path) {
281-
void *handle = dlopen(utils_cache_path.data(), RTLD_LAZY);
281+
void *handle = dlopen(xpu::PROTON_UTILS.data(), RTLD_LAZY);
282282
if (!handle) {
283283
std::cerr << "Failed to load library: " << dlerror() << std::endl;
284284
return 1;
@@ -302,8 +302,8 @@ int callEnumDeviceUUIDs(const std::string &utils_cache_path) {
302302

303303
typedef void (*WaitOnSyclQueueFunc)(void *);
304304

305-
int callWaitOnSyclQueue(const std::string &utils_cache_path, void *syclQueue) {
306-
void *handle = dlopen(utils_cache_path.data(), RTLD_LAZY);
305+
int callWaitOnSyclQueue(void *syclQueue) {
306+
void *handle = dlopen(xpu::PROTON_UTILS.data(), RTLD_LAZY);
307307
if (!handle) {
308308
std::cerr << "Failed to load library: " << dlerror() << std::endl;
309309
return 1;
@@ -328,8 +328,8 @@ int callWaitOnSyclQueue(const std::string &utils_cache_path, void *syclQueue) {
328328
void XpuptiProfiler::XpuptiProfilerPimpl::doStart() {
329329
// should be call to shared lib
330330
XpuptiProfiler &profiler = threadState.profiler;
331-
if (profiler.utils_cache_path != "") {
332-
callEnumDeviceUUIDs(profiler.utils_cache_path);
331+
if (xpu::PROTON_UTILS != "") {
332+
callEnumDeviceUUIDs(xpu::PROTON_UTILS);
333333
}
334334

335335
xpupti::viewSetCallbacks<true>(allocBuffer, completeBuffer);
@@ -349,7 +349,7 @@ void XpuptiProfiler::XpuptiProfilerPimpl::doStart() {
349349
void XpuptiProfiler::XpuptiProfilerPimpl::doFlush() {
350350
XpuptiProfiler &profiler = threadState.profiler;
351351
if (profiler.syclQueue != nullptr) {
352-
callWaitOnSyclQueue(profiler.utils_cache_path, profiler.syclQueue);
352+
callWaitOnSyclQueue(profiler.syclQueue);
353353
}
354354

355355
profiler.correlation.flush(

third_party/proton/csrc/lib/Session/Session.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include "Profiler/Instrumentation/InstrumentationProfiler.h"
88
#include "Profiler/Roctracer/RoctracerProfiler.h"
99
#ifdef TRITON_BUILD_PROTON_XPU
10+
#include "Driver/GPU/XpuApi.h"
1011
#include "Profiler/Xpupti/XpuptiProfiler.h"
1112
#endif
1213
#include "Utility/String.h"
@@ -26,9 +27,8 @@ Profiler *makeProfiler(const std::string &name, void *sycl_queue = nullptr,
2627
}
2728
#ifdef TRITON_BUILD_PROTON_XPU
2829
if (proton::toLower(name) == "xpupti") {
29-
return &XpuptiProfiler::instance()
30-
.setSyclQueue(sycl_queue)
31-
.setUtilsCachePath(utils_cache_path);
30+
xpu::PROTON_UTILS = utils_cache_path;
31+
return &XpuptiProfiler::instance().setSyclQueue(sycl_queue);
3232
}
3333
#endif
3434
throw std::runtime_error("Unknown profiler: " + name);

0 commit comments

Comments
 (0)