@@ -198,26 +198,52 @@ static void launchKernel(GPUSYCLQUEUE *queue, sycl::kernel *kernel,
198
198
199
199
auto paramsCount = countUntil (params, ParamDesc{nullptr , 0 });
200
200
201
- sycl::event event = syclQueue.submit ([&](sycl::handler &cgh) {
202
- for (size_t i = 0 ; i < paramsCount; i++) {
203
- auto param = params[i];
204
- cgh.set_arg (static_cast <uint32_t >(i),
205
- *(static_cast <void **>(param.data )));
206
- }
207
- cgh.parallel_for (syclNdRange, *kernel);
208
- });
209
201
if (getenv (" IMEX_ENABLE_PROFILING" )) {
210
- // auto submitTime = event.get_profiling_info<
211
- // cl::sycl::info::event_profiling::command_submit>();
212
- auto startTime = event.get_profiling_info <
213
- cl::sycl::info::event_profiling::command_start>();
214
- auto endTime =
215
- event
216
- .get_profiling_info <cl::sycl::info::event_profiling::command_end>();
217
- // auto submissionTime = float(startTime - submitTime) / 1000000.0f;
218
- // fprintf(stdout, "the kernel submission time is %f ms\n", submissionTime);
219
- auto executionTime = float (endTime - startTime) / 1000000 .0f ;
220
- fprintf (stdout, " the kernel execution time is %f ms\n " , executionTime);
202
+ auto executionTime = 0 .0f ;
203
+ auto maxTime = 0 .0f ;
204
+ auto minTime = 10000 .0f ;
205
+ auto rounds = 1000 ;
206
+
207
+ if (getenv (" IMEX_PROFILING_RUNS" )) {
208
+ auto runs = strtol (getenv (" IMEX_PROFILING_RUNS" ), NULL , 10L );
209
+ if (runs)
210
+ rounds = runs;
211
+ }
212
+
213
+ for (int r = 0 ; r < rounds; r++) {
214
+ sycl::event event = syclQueue.submit ([&](sycl::handler &cgh) {
215
+ for (size_t i = 0 ; i < paramsCount; i++) {
216
+ auto param = params[i];
217
+ cgh.set_arg (static_cast <uint32_t >(i),
218
+ *(static_cast <void **>(param.data )));
219
+ }
220
+ cgh.parallel_for (syclNdRange, *kernel);
221
+ });
222
+
223
+ auto startTime = event.get_profiling_info <
224
+ cl::sycl::info::event_profiling::command_start>();
225
+ auto endTime = event.get_profiling_info <
226
+ cl::sycl::info::event_profiling::command_end>();
227
+ auto gap = float (endTime - startTime) / 1000000 .0f ;
228
+ executionTime += gap;
229
+ if (gap > maxTime)
230
+ maxTime = gap;
231
+ if (gap < minTime)
232
+ minTime = gap;
233
+ }
234
+ fprintf (stdout,
235
+ " the kernel execution time is (ms):"
236
+ " avg: %.4f, min: %.4f, max: %.4f (over %ld runs)\n " ,
237
+ executionTime / rounds, minTime, maxTime, rounds);
238
+ } else {
239
+ syclQueue.submit ([&](sycl::handler &cgh) {
240
+ for (size_t i = 0 ; i < paramsCount; i++) {
241
+ auto param = params[i];
242
+ cgh.set_arg (static_cast <uint32_t >(i),
243
+ *(static_cast <void **>(param.data )));
244
+ }
245
+ cgh.parallel_for (syclNdRange, *kernel);
246
+ });
221
247
}
222
248
}
223
249
0 commit comments