@@ -190,34 +190,51 @@ static void launchKernel(GPUSYCLQUEUE *queue, sycl::kernel *kernel,
190
190
size_t blockX, size_t blockY, size_t blockZ,
191
191
size_t sharedMemBytes, ParamDesc *params) {
192
192
auto syclQueue = queue->syclQueue_ ;
193
- auto syclGlobalRange =
194
- ::sycl::range<3 >(blockZ * gridZ, blockY * gridY, blockX * gridX);
193
+ auto syclGlobalRange = ::sycl::range<3 >(blockZ * gridZ, blockY * gridY, blockX * gridX);
195
194
auto syclLocalRange = ::sycl::range<3 >(blockZ, blockY, blockX);
196
- sycl::nd_range<3 > syclNdRange (
197
- sycl::nd_range<3 >(syclGlobalRange, syclLocalRange));
198
-
195
+ sycl::nd_range<3 > syclNdRange (sycl::nd_range<3 >(syclGlobalRange, syclLocalRange));
196
+
199
197
auto paramsCount = countUntil (params, ParamDesc{nullptr , 0 });
200
198
201
- sycl::event event = syclQueue.submit ([&](sycl::handler &cgh) {
202
- for (size_t i = 0 ; i < paramsCount; i++) {
203
- auto param = params[i];
204
- cgh.set_arg (static_cast <uint32_t >(i),
205
- *(static_cast <void **>(param.data )));
206
- }
207
- cgh.parallel_for (syclNdRange, *kernel);
208
- });
209
199
if (getenv (" IMEX_ENABLE_PROFILING" )) {
210
- // auto submitTime = event.get_profiling_info<
211
- // cl::sycl::info::event_profiling::command_submit>();
212
- auto startTime = event.get_profiling_info <
213
- cl::sycl::info::event_profiling::command_start>();
214
- auto endTime =
215
- event
216
- .get_profiling_info <cl::sycl::info::event_profiling::command_end>();
217
- // auto submissionTime = float(startTime - submitTime) / 1000000.0f;
218
- // fprintf(stdout, "the kernel submission time is %f ms\n", submissionTime);
219
- auto executionTime = float (endTime - startTime) / 1000000 .0f ;
220
- fprintf (stdout, " the kernel execution time is %f ms\n " , executionTime);
200
+ auto executionTime = 0 .0f ;
201
+ auto maxTime = 0 .0f ;
202
+ auto minTime = 10000 .0f ;
203
+ auto rounds = 1000 ;
204
+
205
+ if (getenv (" IMEX_PROFILING_RUNS" )) {
206
+ auto runs = strtol (getenv (" IMEX_PROFILING_RUNS" ), NULL , 10L );
207
+ if (runs) rounds = runs;
208
+ }
209
+
210
+ for (int r = 0 ; r < rounds; r++) {
211
+ sycl::event event = syclQueue.submit ([&](sycl::handler &cgh) {
212
+ for (size_t i = 0 ; i < paramsCount; i++) {
213
+ auto param = params[i];
214
+ cgh.set_arg (static_cast <uint32_t >(i),
215
+ *(static_cast <void **>(param.data )));
216
+ }
217
+ cgh.parallel_for (syclNdRange, *kernel);
218
+ });
219
+
220
+ auto startTime = event.get_profiling_info <cl::sycl::info::event_profiling::command_start>();
221
+ auto endTime = event.get_profiling_info <cl::sycl::info::event_profiling::command_end>();
222
+ auto gap = float (endTime - startTime) / 1000000 .0f ;
223
+ executionTime += gap;
224
+ if (gap > maxTime) maxTime = gap;
225
+ if (gap < minTime) minTime = gap;
226
+ }
227
+ fprintf (stdout, " the kernel execution time is (ms): avg: %.4f, min: %.4f, max: %.4f (over %ld runs)\n " , \
228
+ executionTime/rounds, minTime, maxTime, rounds);
229
+ } else {
230
+ syclQueue.submit ([&](sycl::handler &cgh) {
231
+ for (size_t i = 0 ; i < paramsCount; i++) {
232
+ auto param = params[i];
233
+ cgh.set_arg (static_cast <uint32_t >(i),
234
+ *(static_cast <void **>(param.data )));
235
+ }
236
+ cgh.parallel_for (syclNdRange, *kernel);
237
+ });
221
238
}
222
239
}
223
240
0 commit comments