|
70 | 70 | "\n",
|
71 | 71 | "```cpp\n",
|
72 | 72 | "\n",
|
73 |
| - "#include <cuda.h>\n", |
74 |
| - "#include <iostream>\n", |
75 |
| - "#include <vector>\n", |
76 |
| - "#define N 16\n", |
77 |
| - "\n", |
78 |
| - "//# kernel code to perform VectorAdd on GPU\n", |
79 |
| - "__global__ void VectorAddKernel(float* A, float* B, float* C)\n", |
80 |
| - "{\n", |
81 |
| - " C[threadIdx.x] = A[threadIdx.x] + B[threadIdx.x];\n", |
82 |
| - "}\n", |
83 |
| - "\n", |
84 |
| - "int main()\n", |
85 |
| - "{\n", |
| 73 | + "#include <cuda.h>\r\n", |
| 74 | + "#include <iostream>\r\n", |
| 75 | + "#include <vector>\r\n", |
| 76 | + "#define N 16\r\n", |
| 77 | + "\r\n", |
| 78 | + "//# kernel code to perform VectorAdd on GPU\r\n", |
| 79 | + "__global__ void VectorAddKernel(float* A, float* B, float* C)\r\n", |
| 80 | + "{\r\n", |
| 81 | + " C[threadIdx.x] = A[threadIdx.x] + B[threadIdx.x];\r\n", |
| 82 | + "}\r\n", |
| 83 | + "\r\n", |
| 84 | + "int main()\r\n", |
| 85 | + "{\r\n", |
| 86 | + " //# Print device name\r\n", |
| 87 | + " cudaDeviceProp dev;\r\n", |
| 88 | + " cudaGetDeviceProperties(&dev, 0);\r\n", |
| 89 | + " std::cout << \"Device: \" << d\n", |
| 90 | + "ev.name << \"\\n\";\n", |
86 | 91 | " //# Initialize vectors on host\n",
|
87 | 92 | " float A[N] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};\n",
|
88 | 93 | " float B[N] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};\n",
|
|
210 | 215 | "int main()\n",
|
211 | 216 | "{\n",
|
212 | 217 | " dpct::device_ext &dev_ct1 = dpct::get_current_device();\n",
|
213 |
| - " sycl::queue &q_ct1 = dev_ct1.default_queue();\n", |
214 |
| - " std::cout << \"Device: \" << q_ct1.get_device().get_info<sycl::info::device::name>() << \"\\n\";\n", |
| 218 | + " sycl::queue &q_ct1 = dev_ct1.in_order_queue();\n", |
| 219 | + " //# Print device name\n", |
| 220 | + " dpct::device_info dev;\n", |
| 221 | + " dpct::get_device_info(dev, dpct::dev_mgr::instance().get_device(0));\n", |
| 222 | + " std::cout << \"Device: \" << dev.get_name() << \"\\n\";\n", |
215 | 223 | "\n",
|
216 | 224 | " //# Initialize vectors on host\n",
|
217 | 225 | " float A[N] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};\n",
|
|
226 | 234 | "\n",
|
227 | 235 | " //# copy vector data from host to device\n",
|
228 | 236 | " q_ct1.memcpy(d_A, A, N * sizeof(float));\n",
|
229 |
| - " q_ct1.memcpy(d_B, B, N * sizeof(float)).wait();\n", |
| 237 | + " q_ct1.memcpy(d_B, B, N * sizeof(float));\n", |
230 | 238 | "\n",
|
231 | 239 | " //# sumbit task to compute VectorAdd on device\n",
|
232 | 240 | " q_ct1.parallel_for(\n",
|
|
243 | 251 | " std::cout << \"\\n\";\n",
|
244 | 252 | "\n",
|
245 | 253 | " //# free allocation on device\n",
|
246 |
| - " sycl::free(d_A, q_ct1);\n", |
247 |
| - " sycl::free(d_B, q_ct1);\n", |
248 |
| - " sycl::free(d_C, q_ct1);\n", |
| 254 | + " dpct::dpct_free(d_A, q_ct1);\n", |
| 255 | + " dpct::dpct_free(d_B, q_ct1);\n", |
| 256 | + " dpct::dpct_free(d_C, q_ct1);\n", |
249 | 257 | " return 0;\n",
|
250 | 258 | "}\n",
|
| 259 | + "\n", |
251 | 260 | "```\n",
|
252 | 261 | "\n",
|
253 | 262 | "The migrated SYCL code can be compiled using the following command in terminal:\n",
|
|
274 | 283 | "metadata": {},
|
275 | 284 | "outputs": [],
|
276 | 285 | "source": [
|
277 |
| - "! ./q.sh run_vector_add.sh" |
| 286 | + "! ./q.sh run_sycl_migrated.sh" |
278 | 287 | ]
|
279 | 288 | },
|
280 | 289 | {
|
|
288 | 297 | "\n",
|
289 | 298 | "| Functionality|CUDA|SYCL\n",
|
290 | 299 | "|-|-|-\n",
|
291 |
| - "| header file|`#include <cuda.h>`|`#include <CL/sycl.hpp>`\n", |
| 300 | + "| header file|`#include <cuda.h>`|`#include <sycl/sycl.hpp>`<br>`#include <dpct/dpct.hpp>`\n", |
292 | 301 | "| Memory allocation on device| `cudaMalloc(&d_A, N*sizeof(float))`| `d_A = sycl::malloc_device<float>(N, q_ct1)`\n",
|
293 | 302 | "| Copy memory between host and device| `cudaMemcpy(d_A, A, N*sizeof(float), cudaMemcpyHostToDevice)`| `q.memcpy(d_A, A, N * sizeof(float))`\n",
|
294 |
| - " | Free device memory allocation| `cudaFree(d_A)` | `free(d_A, q)`\n", |
| 303 | + " | Free device memory allocation| `cudaFree(d_A)` | `dpct::dpct_free(d_A, q)`\n", |
295 | 304 | "\n",
|
296 | 305 | "The actual kernel function invocation is different. In CUDA, the kernel function is invoked with the execution configuration syntax `<<<1, N>>>>` as follows, specifying 1 block and N threads:\n",
|
297 | 306 | "\n",
|
|
313 | 322 | "Another difference is that the SYCL requires creating a SYCL queue with a device selector and other optional properties. The queue is used to submit the command group to execute on the device. The creation of a SYCL queue is necessary and is done as follows in the SYCL migrated code using some helper functions:\n",
|
314 | 323 | "\n",
|
315 | 324 | "```cpp\n",
|
316 |
| - "dpct::device_ext &dev_ct1 = dpct::get_current_device();\n", |
317 |
| - "sycl::queue &q_ct1 = dev_ct1.default_queue();\n", |
| 325 | + "dpct::device_ext &dev_ct1 = dpct::get_current_device();\r", |
| 326 | + " sycl::queue &q_ct1 = dev_ct1.in_order_queue();\n", |
318 | 327 | "```\n",
|
319 | 328 | "\n",
|
320 | 329 | "In CUDA, the equivalent is a CUDA stream; if no stream is created in the CUDA code, a default stream is implicitly created.\n"
|
|
336 | 345 | "Analyzing the migrated SYCL code, we can see that a SYCL queue is created using the following code:\n",
|
337 | 346 | "\n",
|
338 | 347 | "```cpp\n",
|
339 |
| - "dpct::device_ext &dev_ct1 = dpct::get_current_device();\n", |
340 |
| - "sycl::queue &q_ct1 = dev_ct1.default_queue();\n", |
| 348 | + "dpct::device_ext &dev_ct1 = dpct::get_current_device();\r", |
| 349 | + " sycl::queue &q_ct1 = dev_ct1.in_order_queue();\n", |
341 | 350 | "```\n",
|
342 | 351 | "\n",
|
343 | 352 | "The above code is creating a SYCL queue using dpct helper functions that can be unwrapped using the `dpct/dpct.hpp` header file.\n",
|
|
350 | 359 | "\n",
|
351 | 360 | "Using an `in_order` queue property will not allow kernels with no dependency to overlap execution. Therefore, we will remove the `in_order` queue property and add event-based dependency between kernels.\n",
|
352 | 361 | "\n",
|
353 |
| - "We can replace the SYCL queue creation with the following code:\n", |
| 362 | + "We can replace the SYCL queue creation with the following cod to make it out of order queue:\n", |
354 | 363 | "\n",
|
355 | 364 | "```cpp\n",
|
356 | 365 | "sycl::queue q_ct1;\n",
|
| 366 | + "\n", |
| 367 | + "OR\n", |
| 368 | + "\n", |
| 369 | + "dpct::device_ext &dev_ct1 = dpct::get_current_device();\n", |
| 370 | + "sycl::queue &q_ct1 = dev_ct1.out_of_order_queue();\n", |
357 | 371 | "```\n",
|
358 | 372 | "\n",
|
| 373 | + "\n", |
359 | 374 | "This will create a queue with default device selection and allow kernels to overlap.\n",
|
360 | 375 | "\n",
|
361 | 376 | "The next step is to add kernel dependency. From the code above we can enable the two `memcpy` kernel submissions to overlap and then add dependency for the actual kernel that does the vector add. We will also add a dependency to the final `memcpy` kernel to copy back the results.\n",
|
|
368 | 383 | "//\n",
|
369 | 384 | "// SPDX-License-Identifier: MIT\n",
|
370 | 385 | "// =============================================================\n",
|
371 |
| - "#include <sycl/sycl.hpp>\n", |
| 386 | + "#include <sycl/sycl.hpp\n", |
| 387 | + "#include <dpct/dpct.hpp>>\n", |
372 | 388 | "#include <iostream>\n",
|
373 | 389 | "#include <vector>\n",
|
374 | 390 | "#define N 16\n",
|
|
383 | 399 | "\n",
|
384 | 400 | "int main()\n",
|
385 | 401 | "{\n",
|
386 |
| - " // sycl queue with out of order execution allowed\n", |
387 |
| - " sycl::queue q_ct1;\n", |
388 |
| - " std::cout << \"Device: \" << q_ct1.get_device().get_info<sycl::info::device::name>() << \"\\n\";\n", |
| 402 | + " // sycl queue with out of order execution allowed dpct::device_ext &dev_ct1 = dpct::get_current_device();\r\n", |
| 403 | + " sycl::queue &q_ct1 = dev_ct1out_ofn_order_queue();\r\n", |
| 404 | + " //# Print device name\r\n", |
| 405 | + " dpct::device_info dev;\r\n", |
| 406 | + " dpct::get_device_info(dev, dpct::dev_mgr::instance().get_device(0));\r\n", |
| 407 | + " std::cout << \"Device: \" << dev.get_name() << \"\\n\";;\n", |
389 | 408 | "\n",
|
390 | 409 | " //# Initialize vectors on host\n",
|
391 | 410 | " float A[N] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};\n",
|
|
441 | 460 | "metadata": {},
|
442 | 461 | "outputs": [],
|
443 | 462 | "source": [
|
444 |
| - "! ./q.sh run_vector_add_optimized.sh" |
| 463 | + "! ./q.sh run_sycl_migrated_optimized.sh" |
445 | 464 | ]
|
446 | 465 | },
|
447 | 466 | {
|
|
0 commit comments