@@ -198,26 +198,52 @@ static void launchKernel(GPUSYCLQUEUE *queue, sycl::kernel *kernel,
198198
199199 auto paramsCount = countUntil (params, ParamDesc{nullptr , 0 });
200200
201- sycl::event event = syclQueue.submit ([&](sycl::handler &cgh) {
202- for (size_t i = 0 ; i < paramsCount; i++) {
203- auto param = params[i];
204- cgh.set_arg (static_cast <uint32_t >(i),
205- *(static_cast <void **>(param.data )));
206- }
207- cgh.parallel_for (syclNdRange, *kernel);
208- });
209201 if (getenv (" IMEX_ENABLE_PROFILING" )) {
210- // auto submitTime = event.get_profiling_info<
211- // cl::sycl::info::event_profiling::command_submit>();
212- auto startTime = event.get_profiling_info <
213- cl::sycl::info::event_profiling::command_start>();
214- auto endTime =
215- event
216- .get_profiling_info <cl::sycl::info::event_profiling::command_end>();
217- // auto submissionTime = float(startTime - submitTime) / 1000000.0f;
218- // fprintf(stdout, "the kernel submission time is %f ms\n", submissionTime);
219- auto executionTime = float (endTime - startTime) / 1000000 .0f ;
220- fprintf (stdout, " the kernel execution time is %f ms\n " , executionTime);
202+ auto executionTime = 0 .0f ;
203+ auto maxTime = 0 .0f ;
204+ auto minTime = 10000 .0f ;
205+ auto rounds = 1000 ;
206+
207+ if (getenv (" IMEX_PROFILING_RUNS" )) {
208+ auto runs = strtol (getenv (" IMEX_PROFILING_RUNS" ), NULL , 10L );
209+ if (runs)
210+ rounds = runs;
211+ }
212+
213+ for (int r = 0 ; r < rounds; r++) {
214+ sycl::event event = syclQueue.submit ([&](sycl::handler &cgh) {
215+ for (size_t i = 0 ; i < paramsCount; i++) {
216+ auto param = params[i];
217+ cgh.set_arg (static_cast <uint32_t >(i),
218+ *(static_cast <void **>(param.data )));
219+ }
220+ cgh.parallel_for (syclNdRange, *kernel);
221+ });
222+
223+ auto startTime = event.get_profiling_info <
224+ cl::sycl::info::event_profiling::command_start>();
225+ auto endTime = event.get_profiling_info <
226+ cl::sycl::info::event_profiling::command_end>();
227+ auto gap = float (endTime - startTime) / 1000000 .0f ;
228+ executionTime += gap;
229+ if (gap > maxTime)
230+ maxTime = gap;
231+ if (gap < minTime)
232+ minTime = gap;
233+ }
234+ fprintf (stdout,
235+ " the kernel execution time is (ms):"
236+ " avg: %.4f, min: %.4f, max: %.4f (over %ld runs)\n " ,
237+ executionTime / rounds, minTime, maxTime, rounds);
238+ } else {
239+ syclQueue.submit ([&](sycl::handler &cgh) {
240+ for (size_t i = 0 ; i < paramsCount; i++) {
241+ auto param = params[i];
242+ cgh.set_arg (static_cast <uint32_t >(i),
243+ *(static_cast <void **>(param.data )));
244+ }
245+ cgh.parallel_for (syclNdRange, *kernel);
246+ });
221247 }
222248}
223249
0 commit comments