Skip to content

Commit a1c7870

Browse files
committed
run kernel multiple times in profiling mode
1 parent 3f26f14 commit a1c7870

File tree

1 file changed

+41
-24
lines changed

1 file changed

+41
-24
lines changed

lib/ExecutionEngine/SYCLRUNTIME/SyclRuntimeWrappers.cpp

Lines changed: 41 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -190,34 +190,51 @@ static void launchKernel(GPUSYCLQUEUE *queue, sycl::kernel *kernel,
190190
size_t blockX, size_t blockY, size_t blockZ,
191191
size_t sharedMemBytes, ParamDesc *params) {
192192
auto syclQueue = queue->syclQueue_;
193-
auto syclGlobalRange =
194-
::sycl::range<3>(blockZ * gridZ, blockY * gridY, blockX * gridX);
193+
auto syclGlobalRange = ::sycl::range<3>(blockZ * gridZ, blockY * gridY, blockX * gridX);
195194
auto syclLocalRange = ::sycl::range<3>(blockZ, blockY, blockX);
196-
sycl::nd_range<3> syclNdRange(
197-
sycl::nd_range<3>(syclGlobalRange, syclLocalRange));
198-
195+
sycl::nd_range<3> syclNdRange(sycl::nd_range<3>(syclGlobalRange, syclLocalRange));
196+
199197
auto paramsCount = countUntil(params, ParamDesc{nullptr, 0});
200198

201-
sycl::event event = syclQueue.submit([&](sycl::handler &cgh) {
202-
for (size_t i = 0; i < paramsCount; i++) {
203-
auto param = params[i];
204-
cgh.set_arg(static_cast<uint32_t>(i),
205-
*(static_cast<void **>(param.data)));
206-
}
207-
cgh.parallel_for(syclNdRange, *kernel);
208-
});
209199
if (getenv("IMEX_ENABLE_PROFILING")) {
210-
// auto submitTime = event.get_profiling_info<
211-
// cl::sycl::info::event_profiling::command_submit>();
212-
auto startTime = event.get_profiling_info<
213-
cl::sycl::info::event_profiling::command_start>();
214-
auto endTime =
215-
event
216-
.get_profiling_info<cl::sycl::info::event_profiling::command_end>();
217-
// auto submissionTime = float(startTime - submitTime) / 1000000.0f;
218-
// fprintf(stdout, "the kernel submission time is %f ms\n", submissionTime);
219-
auto executionTime = float(endTime - startTime) / 1000000.0f;
220-
fprintf(stdout, "the kernel execution time is %f ms\n", executionTime);
200+
auto executionTime = 0.0f;
201+
auto maxTime = 0.0f;
202+
auto minTime = 10000.0f;
203+
auto rounds = 1000;
204+
205+
if (getenv("IMEX_PROFILING_RUNS")) {
206+
auto runs = strtol(getenv("IMEX_PROFILING_RUNS"), NULL, 10L);
207+
if (runs) rounds = runs;
208+
}
209+
210+
for (int r = 0; r < rounds; r++) {
211+
sycl::event event = syclQueue.submit([&](sycl::handler &cgh) {
212+
for (size_t i = 0; i < paramsCount; i++) {
213+
auto param = params[i];
214+
cgh.set_arg(static_cast<uint32_t>(i),
215+
*(static_cast<void **>(param.data)));
216+
}
217+
cgh.parallel_for(syclNdRange, *kernel);
218+
});
219+
220+
auto startTime = event.get_profiling_info<cl::sycl::info::event_profiling::command_start>();
221+
auto endTime = event.get_profiling_info<cl::sycl::info::event_profiling::command_end>();
222+
auto gap = float(endTime - startTime) / 1000000.0f;
223+
executionTime += gap;
224+
if (gap > maxTime) maxTime = gap;
225+
if (gap < minTime) minTime = gap;
226+
}
227+
fprintf(stdout, "the kernel execution time is (ms): avg: %.4f, min: %.4f, max: %.4f (over %ld runs)\n", \
228+
executionTime/rounds, minTime, maxTime, rounds);
229+
} else {
230+
syclQueue.submit([&](sycl::handler &cgh) {
231+
for (size_t i = 0; i < paramsCount; i++) {
232+
auto param = params[i];
233+
cgh.set_arg(static_cast<uint32_t>(i),
234+
*(static_cast<void **>(param.data)));
235+
}
236+
cgh.parallel_for(syclNdRange, *kernel);
237+
});
221238
}
222239
}
223240

0 commit comments

Comments
 (0)