16
16
#include < array>
17
17
#include < atomic>
18
18
#include < cassert>
19
+ #include < cfloat>
19
20
#include < cstdint>
20
21
#include < cstdio>
21
22
#include < cstdlib>
23
+ #include < memory>
22
24
#include < stdexcept>
23
25
#include < tuple>
24
26
#include < vector>
@@ -109,6 +111,92 @@ getDriverAndDevice(ze_device_type_t deviceType = ZE_DEVICE_TYPE_GPU) {
109
111
throw std::runtime_error (" getDevice failed" );
110
112
}
111
113
114
// Expands to the definition of an empty tag type called `Desc`.
#define _IMEX_PROFILING_TRAITS_SPEC(Desc)                                      \
  struct Desc {};

namespace imex::profiling {
// Two tag types: one stands for the start of a kernel, the other for its end.
_IMEX_PROFILING_TRAITS_SPEC(command_start);
_IMEX_PROFILING_TRAITS_SPEC(command_end);
} // namespace imex::profiling
124
+
125
+ // A Timestamp event pool management class. It currently simply represents
126
+ // a event pool with fixed 256 slots. Currently for each run we just need
127
+ // one timing event, but we definity need a sophisticated event system in
128
+ // the future for programs with multiple kernels.
129
+ struct EventPool {
130
+ ze_event_pool_handle_t zeEventPool;
131
+
132
+ EventPool (ze_context_handle_t zeContext_) {
133
+ ze_event_pool_desc_t tsEventPoolDesc = {
134
+ ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, nullptr ,
135
+ ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP, 256 };
136
+ CHECK_ZE_RESULT (zeEventPoolCreate (zeContext_, &tsEventPoolDesc, 0 , nullptr ,
137
+ &zeEventPool));
138
+ }
139
+
140
+ ~EventPool () { CHECK_ZE_RESULT (zeEventPoolDestroy (zeEventPool)); }
141
+ };
142
+
143
+ // A wrapper to ze_event_handle_t providing timestamp queries
144
+ class Event {
145
+ private:
146
+ uint64_t zeTimestampMaxValue_;
147
+ uint64_t zeTimerResolution_;
148
+
149
+ public:
150
+ ze_event_handle_t zeEvent;
151
+
152
+ Event (ze_context_handle_t zeContext_, ze_device_handle_t zeDevice_) {
153
+ static EventPool pool (zeContext_);
154
+
155
+ // timestamp and timer resolution is a device properties.
156
+ // They are required to compute the final wall time.
157
+ ze_device_properties_t deviceProperties{};
158
+ deviceProperties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
159
+ CHECK_ZE_RESULT (zeDeviceGetProperties (zeDevice_, &deviceProperties));
160
+ zeTimestampMaxValue_ =
161
+ ((1ULL << deviceProperties.kernelTimestampValidBits ) - 1ULL );
162
+ zeTimerResolution_ = deviceProperties.timerResolution ;
163
+
164
+ ze_event_desc_t eventDesc = {
165
+ ZE_STRUCTURE_TYPE_EVENT_DESC, nullptr ,
166
+ 0 , // index
167
+ 0 , // no additional memory/cache coherency required on signal
168
+ 0 // no additional memory/cache coherency required on wait
169
+ };
170
+ CHECK_ZE_RESULT (zeEventCreate (pool.zeEventPool , &eventDesc, &zeEvent));
171
+ }
172
+
173
+ // query the kernel start or end (specified via Param) timestamp
174
+ template <typename Param> uint64_t get_profiling_info () {
175
+ ze_kernel_timestamp_result_t tsResult;
176
+ CHECK_ZE_RESULT (zeEventQueryKernelTimestamp (zeEvent, &tsResult));
177
+
178
+ if constexpr (std::is_same_v<Param, imex::profiling::command_start>) {
179
+ uint64_t startTime =
180
+ (tsResult.global .kernelStart & zeTimestampMaxValue_) *
181
+ zeTimerResolution_;
182
+ return startTime;
183
+ }
184
+
185
+ if constexpr (std::is_same_v<Param, imex::profiling::command_end>) {
186
+ uint64_t startTime = tsResult.global .kernelStart & zeTimestampMaxValue_;
187
+ uint64_t endTime = tsResult.global .kernelEnd & zeTimestampMaxValue_;
188
+
189
+ if (endTime < startTime)
190
+ endTime += zeTimestampMaxValue_;
191
+
192
+ endTime *= zeTimerResolution_;
193
+ return endTime;
194
+ }
195
+ }
196
+
197
+ ~Event () { CHECK_ZE_RESULT (zeEventDestroy (zeEvent)); }
198
+ };
199
+
112
200
struct GPUL0QUEUE {
113
201
114
202
ze_driver_handle_t zeDriver_ = nullptr ;
@@ -130,6 +218,7 @@ struct GPUL0QUEUE {
130
218
CHECK_ZE_RESULT (zeCommandListCreateImmediate (zeContext_, zeDevice_, &desc,
131
219
&zeCommandList_));
132
220
}
221
+
133
222
GPUL0QUEUE (ze_device_type_t *deviceType, ze_context_handle_t context) {
134
223
auto driverAndDevice = getDriverAndDevice (*deviceType);
135
224
zeDriver_ = driverAndDevice.first ;
@@ -142,6 +231,7 @@ struct GPUL0QUEUE {
142
231
CHECK_ZE_RESULT (zeCommandListCreateImmediate (zeContext_, zeDevice_, &desc,
143
232
&zeCommandList_));
144
233
}
234
+
145
235
GPUL0QUEUE (ze_device_type_t *deviceType) {
146
236
147
237
auto driverAndDevice = getDriverAndDevice (*deviceType);
@@ -157,6 +247,7 @@ struct GPUL0QUEUE {
157
247
CHECK_ZE_RESULT (zeCommandListCreateImmediate (zeContext_, zeDevice_, &desc,
158
248
&zeCommandList_));
159
249
}
250
+
160
251
GPUL0QUEUE (ze_context_handle_t context) {
161
252
162
253
auto driverAndDevice = getDriverAndDevice ();
@@ -231,26 +322,83 @@ getKernel(GPUL0QUEUE *queue, ze_module_handle_t module, const char *name) {
231
322
return zeKernel;
232
323
}
233
324
325
+ static void enqueueKernel (ze_command_list_handle_t zeCommandList,
326
+ ze_kernel_handle_t kernel,
327
+ const ze_group_count_t *pLaunchArgs,
328
+ ParamDesc *params, ze_event_handle_t event = nullptr ,
329
+ uint32_t numWaitEvents = 0 ,
330
+ ze_event_handle_t *phWaitEvents = nullptr ) {
331
+ auto paramsCount = countUntil (params, ParamDesc{nullptr , 0 });
332
+ for (size_t i = 0 ; i < paramsCount; ++i) {
333
+ auto param = params[i];
334
+ CHECK_ZE_RESULT (zeKernelSetArgumentValue (kernel, static_cast <uint32_t >(i),
335
+ param.size , param.data ));
336
+ }
337
+
338
+ CHECK_ZE_RESULT (zeCommandListAppendLaunchKernel (
339
+ zeCommandList, kernel, pLaunchArgs, event, numWaitEvents, phWaitEvents));
340
+ }
341
+
234
342
static void launchKernel (GPUL0QUEUE *queue, ze_kernel_handle_t kernel,
235
343
size_t gridX, size_t gridY, size_t gridZ,
236
344
size_t blockX, size_t blockY, size_t blockZ,
237
345
size_t sharedMemBytes, ParamDesc *params) {
238
346
assert (kernel);
239
- auto paramsCount = countUntil (params, ParamDesc{nullptr , 0 });
240
347
241
348
auto castSz = [](size_t val) { return static_cast <uint32_t >(val); };
242
349
243
350
CHECK_ZE_RESULT (zeKernelSetGroupSize (kernel, castSz (blockX), castSz (blockY),
244
351
castSz (blockZ)));
245
- for (size_t i = 0 ; i < paramsCount; ++i) {
246
- auto param = params[i];
247
- CHECK_ZE_RESULT (zeKernelSetArgumentValue (kernel, static_cast <uint32_t >(i),
248
- param.size , param.data ));
249
- }
250
-
251
352
ze_group_count_t launchArgs = {castSz (gridX), castSz (gridY), castSz (gridZ)};
252
- CHECK_ZE_RESULT (zeCommandListAppendLaunchKernel (
253
- queue->zeCommandList_ , kernel, &launchArgs, nullptr , 0 , nullptr ));
353
+
354
+ if (getenv (" IMEX_ENABLE_PROFILING" )) {
355
+ auto executionTime = 0 .0f ;
356
+ auto maxTime = 0 .0f ;
357
+ auto minTime = FLT_MAX;
358
+ auto rounds = 1000 ;
359
+ auto warmups = 3 ;
360
+
361
+ if (getenv (" IMEX_PROFILING_RUNS" )) {
362
+ auto runs = strtol (getenv (" IMEX_PROFILING_RUNS" ), NULL , 10L );
363
+ if (runs)
364
+ rounds = runs;
365
+ }
366
+
367
+ if (getenv (" IMEX_PROFILING_WARMUPS" )) {
368
+ auto runs = strtol (getenv (" IMEX_PROFILING_WARMUPS" ), NULL , 10L );
369
+ if (warmups)
370
+ warmups = runs;
371
+ }
372
+
373
+ // warmup
374
+ for (int r = 0 ; r < warmups; r++)
375
+ enqueueKernel (queue->zeCommandList_ , kernel, &launchArgs, params, nullptr ,
376
+ 0 , nullptr );
377
+
378
+ // profiling using timestamp event privided by level-zero
379
+ for (int r = 0 ; r < rounds; r++) {
380
+ Event event (queue->zeContext_ , queue->zeDevice_ );
381
+ enqueueKernel (queue->zeCommandList_ , kernel, &launchArgs, params,
382
+ event.zeEvent , 0 , nullptr );
383
+
384
+ auto startTime =
385
+ event.get_profiling_info <imex::profiling::command_start>();
386
+ auto endTime = event.get_profiling_info <imex::profiling::command_end>();
387
+ auto duration = float (endTime - startTime) / 1000000 .0f ;
388
+ executionTime += duration;
389
+ if (duration > maxTime)
390
+ maxTime = duration;
391
+ if (duration < minTime)
392
+ minTime = duration;
393
+ }
394
+ fprintf (stdout,
395
+ " the kernel execution time is (ms, on L0 runtime):"
396
+ " avg: %.4f, min: %.4f, max: %.4f (over %d runs)\n " ,
397
+ executionTime / rounds, minTime, maxTime, rounds);
398
+ } else {
399
+ enqueueKernel (queue->zeCommandList_ , kernel, &launchArgs, params, nullptr ,
400
+ 0 , nullptr );
401
+ }
254
402
}
255
403
256
404
// Wrappers
0 commit comments