@@ -190,34 +190,51 @@ static void launchKernel(GPUSYCLQUEUE *queue, sycl::kernel *kernel,
190190 size_t blockX, size_t blockY, size_t blockZ,
191191 size_t sharedMemBytes, ParamDesc *params) {
192192 auto syclQueue = queue->syclQueue_ ;
193- auto syclGlobalRange =
194- ::sycl::range<3 >(blockZ * gridZ, blockY * gridY, blockX * gridX);
193+ auto syclGlobalRange = ::sycl::range<3 >(blockZ * gridZ, blockY * gridY, blockX * gridX);
195194 auto syclLocalRange = ::sycl::range<3 >(blockZ, blockY, blockX);
196- sycl::nd_range<3 > syclNdRange (
197- sycl::nd_range<3 >(syclGlobalRange, syclLocalRange));
198-
195+ sycl::nd_range<3 > syclNdRange (sycl::nd_range<3 >(syclGlobalRange, syclLocalRange));
196+
199197 auto paramsCount = countUntil (params, ParamDesc{nullptr , 0 });
200198
201- sycl::event event = syclQueue.submit ([&](sycl::handler &cgh) {
202- for (size_t i = 0 ; i < paramsCount; i++) {
203- auto param = params[i];
204- cgh.set_arg (static_cast <uint32_t >(i),
205- *(static_cast <void **>(param.data )));
206- }
207- cgh.parallel_for (syclNdRange, *kernel);
208- });
209199 if (getenv (" IMEX_ENABLE_PROFILING" )) {
210- // auto submitTime = event.get_profiling_info<
211- // cl::sycl::info::event_profiling::command_submit>();
212- auto startTime = event.get_profiling_info <
213- cl::sycl::info::event_profiling::command_start>();
214- auto endTime =
215- event
216- .get_profiling_info <cl::sycl::info::event_profiling::command_end>();
217- // auto submissionTime = float(startTime - submitTime) / 1000000.0f;
218- // fprintf(stdout, "the kernel submission time is %f ms\n", submissionTime);
219- auto executionTime = float (endTime - startTime) / 1000000 .0f ;
220- fprintf (stdout, " the kernel execution time is %f ms\n " , executionTime);
200+ auto executionTime = 0 .0f ;
201+ auto maxTime = 0 .0f ;
202+ auto minTime = 10000 .0f ;
203+ auto rounds = 1000 ;
204+
205+ if (getenv (" IMEX_PROFILING_RUNS" )) {
206+ auto runs = strtol (getenv (" IMEX_PROFILING_RUNS" ), NULL , 10L );
207+ if (runs) rounds = runs;
208+ }
209+
210+ for (int r = 0 ; r < rounds; r++) {
211+ sycl::event event = syclQueue.submit ([&](sycl::handler &cgh) {
212+ for (size_t i = 0 ; i < paramsCount; i++) {
213+ auto param = params[i];
214+ cgh.set_arg (static_cast <uint32_t >(i),
215+ *(static_cast <void **>(param.data )));
216+ }
217+ cgh.parallel_for (syclNdRange, *kernel);
218+ });
219+
220+ auto startTime = event.get_profiling_info <cl::sycl::info::event_profiling::command_start>();
221+ auto endTime = event.get_profiling_info <cl::sycl::info::event_profiling::command_end>();
222+ auto gap = float (endTime - startTime) / 1000000 .0f ;
223+ executionTime += gap;
224+ if (gap > maxTime) maxTime = gap;
225+ if (gap < minTime) minTime = gap;
226+ }
227+ fprintf (stdout, " the kernel execution time is (ms): avg: %.4f, min: %.4f, max: %.4f (over %ld runs)\n " , \
228+ executionTime/rounds, minTime, maxTime, rounds);
229+ } else {
230+ syclQueue.submit ([&](sycl::handler &cgh) {
231+ for (size_t i = 0 ; i < paramsCount; i++) {
232+ auto param = params[i];
233+ cgh.set_arg (static_cast <uint32_t >(i),
234+ *(static_cast <void **>(param.data )));
235+ }
236+ cgh.parallel_for (syclNdRange, *kernel);
237+ });
221238 }
222239}
223240
0 commit comments