Skip to content

Commit 864e5e7

Browse files
committed
enable profiling for level-zero runtime leveraging level-zero timestamp events
1 parent c04dae2 commit 864e5e7

File tree

2 files changed

+189
-29
lines changed

2 files changed

+189
-29
lines changed

lib/ExecutionEngine/LEVELZERORUNTIME/LevelZeroRuntimeWrappers.cpp

Lines changed: 157 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,11 @@
1616
#include <array>
1717
#include <atomic>
1818
#include <cassert>
19+
#include <cfloat>
1920
#include <cstdint>
2021
#include <cstdio>
2122
#include <cstdlib>
23+
#include <memory>
2224
#include <stdexcept>
2325
#include <tuple>
2426
#include <vector>
@@ -109,6 +111,92 @@ getDriverAndDevice(ze_device_type_t deviceType = ZE_DEVICE_TYPE_GPU) {
109111
throw std::runtime_error("getDevice failed");
110112
}
111113

114+
#define _IMEX_PROFILING_TRAITS_SPEC(Desc) \
115+
struct Desc {};
116+
117+
namespace imex {
118+
namespace profiling {
119+
// defining two types representing kernel start and kernel end
120+
_IMEX_PROFILING_TRAITS_SPEC(command_start);
121+
_IMEX_PROFILING_TRAITS_SPEC(command_end);
122+
} // namespace profiling
123+
} // namespace imex
124+
125+
// A Timestamp event pool management class. It currently simply represents
126+
// a event pool with fixed 256 slots. Currently for each run we just need
127+
// one timing event, but we definity need a sophisticated event system in
128+
// the future for programs with multiple kernels.
129+
struct EventPool {
130+
ze_event_pool_handle_t zeEventPool;
131+
132+
EventPool(ze_context_handle_t zeContext_) {
133+
ze_event_pool_desc_t tsEventPoolDesc = {
134+
ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, nullptr,
135+
ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP, 256};
136+
CHECK_ZE_RESULT(zeEventPoolCreate(zeContext_, &tsEventPoolDesc, 0, nullptr,
137+
&zeEventPool));
138+
}
139+
140+
~EventPool() { CHECK_ZE_RESULT(zeEventPoolDestroy(zeEventPool)); }
141+
};
142+
143+
// A wrapper to ze_event_handle_t providing timestamp queries
144+
class Event {
145+
private:
146+
uint64_t zeTimestampMaxValue_;
147+
uint64_t zeTimerResolution_;
148+
149+
public:
150+
ze_event_handle_t zeEvent;
151+
152+
Event(ze_context_handle_t zeContext_, ze_device_handle_t zeDevice_) {
153+
static EventPool pool(zeContext_);
154+
155+
// timestamp and timer resolution is a device properties.
156+
// They are required to compute the final wall time.
157+
ze_device_properties_t deviceProperties{};
158+
deviceProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
159+
CHECK_ZE_RESULT(zeDeviceGetProperties(zeDevice_, &deviceProperties));
160+
zeTimestampMaxValue_ =
161+
((1ULL << deviceProperties.kernelTimestampValidBits) - 1ULL);
162+
zeTimerResolution_ = deviceProperties.timerResolution;
163+
164+
ze_event_desc_t eventDesc = {
165+
ZE_STRUCTURE_TYPE_EVENT_DESC, nullptr,
166+
0, // index
167+
0, // no additional memory/cache coherency required on signal
168+
0 // no additional memory/cache coherency required on wait
169+
};
170+
CHECK_ZE_RESULT(zeEventCreate(pool.zeEventPool, &eventDesc, &zeEvent));
171+
}
172+
173+
// query the kernel start or end (specified via Param) timestamp
174+
template <typename Param> uint64_t get_profiling_info() {
175+
ze_kernel_timestamp_result_t tsResult;
176+
CHECK_ZE_RESULT(zeEventQueryKernelTimestamp(zeEvent, &tsResult));
177+
178+
if constexpr (std::is_same_v<Param, imex::profiling::command_start>) {
179+
uint64_t startTime =
180+
(tsResult.global.kernelStart & zeTimestampMaxValue_) *
181+
zeTimerResolution_;
182+
return startTime;
183+
}
184+
185+
if constexpr (std::is_same_v<Param, imex::profiling::command_end>) {
186+
uint64_t startTime = tsResult.global.kernelStart & zeTimestampMaxValue_;
187+
uint64_t endTime = tsResult.global.kernelEnd & zeTimestampMaxValue_;
188+
189+
if (endTime < startTime)
190+
endTime += zeTimestampMaxValue_;
191+
192+
endTime *= zeTimerResolution_;
193+
return endTime;
194+
}
195+
}
196+
197+
~Event() { CHECK_ZE_RESULT(zeEventDestroy(zeEvent)); }
198+
};
199+
112200
struct GPUL0QUEUE {
113201

114202
ze_driver_handle_t zeDriver_ = nullptr;
@@ -130,6 +218,7 @@ struct GPUL0QUEUE {
130218
CHECK_ZE_RESULT(zeCommandListCreateImmediate(zeContext_, zeDevice_, &desc,
131219
&zeCommandList_));
132220
}
221+
133222
GPUL0QUEUE(ze_device_type_t *deviceType, ze_context_handle_t context) {
134223
auto driverAndDevice = getDriverAndDevice(*deviceType);
135224
zeDriver_ = driverAndDevice.first;
@@ -142,6 +231,7 @@ struct GPUL0QUEUE {
142231
CHECK_ZE_RESULT(zeCommandListCreateImmediate(zeContext_, zeDevice_, &desc,
143232
&zeCommandList_));
144233
}
234+
145235
GPUL0QUEUE(ze_device_type_t *deviceType) {
146236

147237
auto driverAndDevice = getDriverAndDevice(*deviceType);
@@ -157,6 +247,7 @@ struct GPUL0QUEUE {
157247
CHECK_ZE_RESULT(zeCommandListCreateImmediate(zeContext_, zeDevice_, &desc,
158248
&zeCommandList_));
159249
}
250+
160251
GPUL0QUEUE(ze_context_handle_t context) {
161252

162253
auto driverAndDevice = getDriverAndDevice();
@@ -231,26 +322,83 @@ getKernel(GPUL0QUEUE *queue, ze_module_handle_t module, const char *name) {
231322
return zeKernel;
232323
}
233324

325+
static void enqueueKernel(ze_command_list_handle_t zeCommandList,
326+
ze_kernel_handle_t kernel,
327+
const ze_group_count_t *pLaunchArgs,
328+
ParamDesc *params, ze_event_handle_t event = nullptr,
329+
uint32_t numWaitEvents = 0,
330+
ze_event_handle_t *phWaitEvents = nullptr) {
331+
auto paramsCount = countUntil(params, ParamDesc{nullptr, 0});
332+
for (size_t i = 0; i < paramsCount; ++i) {
333+
auto param = params[i];
334+
CHECK_ZE_RESULT(zeKernelSetArgumentValue(kernel, static_cast<uint32_t>(i),
335+
param.size, param.data));
336+
}
337+
338+
CHECK_ZE_RESULT(zeCommandListAppendLaunchKernel(
339+
zeCommandList, kernel, pLaunchArgs, event, numWaitEvents, phWaitEvents));
340+
}
341+
234342
static void launchKernel(GPUL0QUEUE *queue, ze_kernel_handle_t kernel,
235343
size_t gridX, size_t gridY, size_t gridZ,
236344
size_t blockX, size_t blockY, size_t blockZ,
237345
size_t sharedMemBytes, ParamDesc *params) {
238346
assert(kernel);
239-
auto paramsCount = countUntil(params, ParamDesc{nullptr, 0});
240347

241348
auto castSz = [](size_t val) { return static_cast<uint32_t>(val); };
242349

243350
CHECK_ZE_RESULT(zeKernelSetGroupSize(kernel, castSz(blockX), castSz(blockY),
244351
castSz(blockZ)));
245-
for (size_t i = 0; i < paramsCount; ++i) {
246-
auto param = params[i];
247-
CHECK_ZE_RESULT(zeKernelSetArgumentValue(kernel, static_cast<uint32_t>(i),
248-
param.size, param.data));
249-
}
250-
251352
ze_group_count_t launchArgs = {castSz(gridX), castSz(gridY), castSz(gridZ)};
252-
CHECK_ZE_RESULT(zeCommandListAppendLaunchKernel(
253-
queue->zeCommandList_, kernel, &launchArgs, nullptr, 0, nullptr));
353+
354+
if (getenv("IMEX_ENABLE_PROFILING")) {
355+
auto executionTime = 0.0f;
356+
auto maxTime = 0.0f;
357+
auto minTime = FLT_MAX;
358+
auto rounds = 1000;
359+
auto warmups = 3;
360+
361+
if (getenv("IMEX_PROFILING_RUNS")) {
362+
auto runs = strtol(getenv("IMEX_PROFILING_RUNS"), NULL, 10L);
363+
if (runs)
364+
rounds = runs;
365+
}
366+
367+
if (getenv("IMEX_PROFILING_WARMUPS")) {
368+
auto runs = strtol(getenv("IMEX_PROFILING_WARMUPS"), NULL, 10L);
369+
if (warmups)
370+
warmups = runs;
371+
}
372+
373+
// warmup
374+
for (int r = 0; r < warmups; r++)
375+
enqueueKernel(queue->zeCommandList_, kernel, &launchArgs, params, nullptr,
376+
0, nullptr);
377+
378+
// profiling using timestamp event privided by level-zero
379+
for (int r = 0; r < rounds; r++) {
380+
Event event(queue->zeContext_, queue->zeDevice_);
381+
enqueueKernel(queue->zeCommandList_, kernel, &launchArgs, params,
382+
event.zeEvent, 0, nullptr);
383+
384+
auto startTime =
385+
event.get_profiling_info<imex::profiling::command_start>();
386+
auto endTime = event.get_profiling_info<imex::profiling::command_end>();
387+
auto duration = float(endTime - startTime) / 1000000.0f;
388+
executionTime += duration;
389+
if (duration > maxTime)
390+
maxTime = duration;
391+
if (duration < minTime)
392+
minTime = duration;
393+
}
394+
fprintf(stdout,
395+
"the kernel execution time is (ms, on L0 runtime):"
396+
"avg: %.4f, min: %.4f, max: %.4f (over %d runs)\n",
397+
executionTime / rounds, minTime, maxTime, rounds);
398+
} else {
399+
enqueueKernel(queue->zeCommandList_, kernel, &launchArgs, params, nullptr,
400+
0, nullptr);
401+
}
254402
}
255403

256404
// Wrappers

lib/ExecutionEngine/SYCLRUNTIME/SyclRuntimeWrappers.cpp

Lines changed: 32 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include <array>
1717
#include <atomic>
1818
#include <cassert>
19+
#include <cfloat>
1920
#include <cstdint>
2021
#include <cstdio>
2122
#include <cstdlib>
@@ -189,6 +190,20 @@ static sycl::kernel *getKernel(GPUSYCLQUEUE *queue, ze_module_handle_t zeModule,
189190
return syclKernel;
190191
}
191192

193+
static sycl::event enqueueKernel(sycl::queue queue, sycl::kernel *kernel,
194+
sycl::nd_range<3> NdRange, ParamDesc *params) {
195+
auto paramsCount = countUntil(params, ParamDesc{nullptr, 0});
196+
sycl::event event = queue.submit([&](sycl::handler &cgh) {
197+
for (size_t i = 0; i < paramsCount; i++) {
198+
auto param = params[i];
199+
cgh.set_arg(static_cast<uint32_t>(i),
200+
*(static_cast<void **>(param.data)));
201+
}
202+
cgh.parallel_for(NdRange, *kernel);
203+
});
204+
return event;
205+
}
206+
192207
static void launchKernel(GPUSYCLQUEUE *queue, sycl::kernel *kernel,
193208
size_t gridX, size_t gridY, size_t gridZ,
194209
size_t blockX, size_t blockY, size_t blockZ,
@@ -200,29 +215,32 @@ static void launchKernel(GPUSYCLQUEUE *queue, sycl::kernel *kernel,
200215
sycl::nd_range<3> syclNdRange(
201216
sycl::nd_range<3>(syclGlobalRange, syclLocalRange));
202217

203-
auto paramsCount = countUntil(params, ParamDesc{nullptr, 0});
204-
205218
if (getenv("IMEX_ENABLE_PROFILING")) {
206219
auto executionTime = 0.0f;
207220
auto maxTime = 0.0f;
208-
auto minTime = 10000.0f;
209-
auto rounds = 1000;
221+
auto minTime = FLT_MAX;
222+
auto rounds = 100;
223+
auto warmups = 3;
210224

211225
if (getenv("IMEX_PROFILING_RUNS")) {
212226
auto runs = strtol(getenv("IMEX_PROFILING_RUNS"), NULL, 10L);
213227
if (runs)
214228
rounds = runs;
215229
}
216230

231+
if (getenv("IMEX_PROFILING_WARMUPS")) {
232+
auto runs = strtol(getenv("IMEX_PROFILING_WARMUPS"), NULL, 10L);
233+
if (warmups)
234+
warmups = runs;
235+
}
236+
237+
// warmups
238+
for (int r = 0; r < warmups; r++) {
239+
enqueueKernel(syclQueue, kernel, syclNdRange, params);
240+
}
241+
217242
for (int r = 0; r < rounds; r++) {
218-
sycl::event event = syclQueue.submit([&](sycl::handler &cgh) {
219-
for (size_t i = 0; i < paramsCount; i++) {
220-
auto param = params[i];
221-
cgh.set_arg(static_cast<uint32_t>(i),
222-
*(static_cast<void **>(param.data)));
223-
}
224-
cgh.parallel_for(syclNdRange, *kernel);
225-
});
243+
sycl::event event = enqueueKernel(syclQueue, kernel, syclNdRange, params);
226244

227245
auto startTime = event.get_profiling_info<
228246
cl::sycl::info::event_profiling::command_start>();
@@ -235,19 +253,13 @@ static void launchKernel(GPUSYCLQUEUE *queue, sycl::kernel *kernel,
235253
if (gap < minTime)
236254
minTime = gap;
237255
}
256+
238257
fprintf(stdout,
239258
"the kernel execution time is (ms):"
240259
"avg: %.4f, min: %.4f, max: %.4f (over %d runs)\n",
241260
executionTime / rounds, minTime, maxTime, rounds);
242261
} else {
243-
syclQueue.submit([&](sycl::handler &cgh) {
244-
for (size_t i = 0; i < paramsCount; i++) {
245-
auto param = params[i];
246-
cgh.set_arg(static_cast<uint32_t>(i),
247-
*(static_cast<void **>(param.data)));
248-
}
249-
cgh.parallel_for(syclNdRange, *kernel);
250-
});
262+
enqueueKernel(syclQueue, kernel, syclNdRange, params);
251263
}
252264
}
253265

0 commit comments

Comments
 (0)