@@ -99,30 +99,31 @@ struct GPUSYCLQUEUE {
99
99
sycl::context syclContext_;
100
100
sycl::queue syclQueue_;
101
101
102
- GPUSYCLQUEUE () {
102
+ GPUSYCLQUEUE (sycl::property_list propList ) {
103
103
104
104
syclDevice_ = getDefaultDevice ();
105
105
syclContext_ = sycl::context (syclDevice_);
106
- syclQueue_ = sycl::queue (syclContext_, syclDevice_);
106
+ syclQueue_ = sycl::queue (syclContext_, syclDevice_, propList );
107
107
}
108
108
109
- GPUSYCLQUEUE (sycl::device *device, sycl::context *context) {
109
+ GPUSYCLQUEUE (sycl::device *device, sycl::context *context,
110
+ sycl::property_list propList) {
110
111
syclDevice_ = *device;
111
112
syclContext_ = *context;
112
- syclQueue_ = sycl::queue (syclContext_, syclDevice_);
113
+ syclQueue_ = sycl::queue (syclContext_, syclDevice_, propList );
113
114
}
114
- GPUSYCLQUEUE (sycl::device *device) {
115
+ GPUSYCLQUEUE (sycl::device *device, sycl::property_list propList ) {
115
116
116
117
syclDevice_ = *device;
117
118
syclContext_ = sycl::context (syclDevice_);
118
- syclQueue_ = sycl::queue (syclContext_, syclDevice_);
119
+ syclQueue_ = sycl::queue (syclContext_, syclDevice_, propList );
119
120
}
120
121
121
- GPUSYCLQUEUE (sycl::context *context) {
122
+ GPUSYCLQUEUE (sycl::context *context, sycl::property_list propList ) {
122
123
123
124
syclDevice_ = getDefaultDevice ();
124
125
syclContext_ = *context;
125
- syclQueue_ = sycl::queue (syclContext_, syclDevice_);
126
+ syclQueue_ = sycl::queue (syclContext_, syclDevice_, propList );
126
127
}
127
128
128
129
}; // end of GPUSYCLQUEUE
@@ -197,32 +198,49 @@ static void launchKernel(GPUSYCLQUEUE *queue, sycl::kernel *kernel,
197
198
198
199
auto paramsCount = countUntil (params, ParamDesc{nullptr , 0 });
199
200
200
- syclQueue.submit ([&](sycl::handler &cgh) {
201
+ sycl::event event = syclQueue.submit ([&](sycl::handler &cgh) {
201
202
for (size_t i = 0 ; i < paramsCount; i++) {
202
203
auto param = params[i];
203
204
cgh.set_arg (static_cast <uint32_t >(i),
204
205
*(static_cast <void **>(param.data )));
205
206
}
206
207
cgh.parallel_for (syclNdRange, *kernel);
207
208
});
209
+ if (getenv (" IMEX_ENABLE_PROFILING" )) {
210
+ // auto submitTime = event.get_profiling_info<
211
+ // cl::sycl::info::event_profiling::command_submit>();
212
+ auto startTime = event.get_profiling_info <
213
+ cl::sycl::info::event_profiling::command_start>();
214
+ auto endTime =
215
+ event
216
+ .get_profiling_info <cl::sycl::info::event_profiling::command_end>();
217
+ // auto submissionTime = float(startTime - submitTime) / 1000000.0f;
218
+ // fprintf(stdout, "the kernel submission time is %f ms\n", submissionTime);
219
+ auto executionTime = float (endTime - startTime) / 1000000 .0f ;
220
+ fprintf (stdout, " the kernel execution time is %f ms\n " , executionTime);
221
+ }
208
222
}
209
223
210
224
// Wrappers
211
225
212
226
/// C-linkage entry point that allocates a new GPUSYCLQUEUE.
///
/// Either argument may be null; the constructor is chosen from whichever of
/// the two is provided:
///   - neither:      default device, new context
///   - both:         supplied device and supplied context
///   - device only:  supplied device, new context
///   - context only: default device, supplied context
///
/// When the IMEX_ENABLE_PROFILING environment variable is present (with any
/// value, including an empty one), the queue is created with the
/// sycl::property::queue::enable_profiling property; otherwise an empty
/// property list is used.
///
/// @param device  Null, or a pointer that is cast to sycl::device *.
/// @param context Null, or a pointer that is cast to sycl::context *.
/// @return Whatever catchAll returns for the allocating lambda; on the
///         success path the lambda yields a queue allocated with `new`.
///         NOTE(review): catchAll's behavior on failure and the owner
///         responsible for deleting the queue are not visible here — confirm.
extern "C" SYCL_RUNTIME_EXPORT GPUSYCLQUEUE *gpuCreateStream(void *device,
                                                             void *context) {
  // Decide the queue properties once, before entering the guarded lambda.
  const sycl::property_list propList =
      getenv("IMEX_ENABLE_PROFILING")
          ? sycl::property_list{sycl::property::queue::enable_profiling()}
          : sycl::property_list{};

  return catchAll([&]() {
    if (!device && !context) {
      return new GPUSYCLQUEUE(propList);
    } else if (device && context) {
      // TODO: Check if the pointers/address is valid and holds the correct
      // device and context
      return new GPUSYCLQUEUE(static_cast<sycl::device *>(device),
                              static_cast<sycl::context *>(context), propList);
    } else if (device && !context) {
      return new GPUSYCLQUEUE(static_cast<sycl::device *>(device), propList);
    } else {
      return new GPUSYCLQUEUE(static_cast<sycl::context *>(context), propList);
    }
  });
}
0 commit comments