Skip to content

Commit 3385824

Browse files
authored
updated modules with latest SYCLomatic migration (#2324)
1 parent ee60662 commit 3385824

File tree

378 files changed

+287986
-16041
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

378 files changed

+287986
-16041
lines changed

DirectProgramming/C++SYCL/Jupyter/cuda-to-sycl-migration-training/00_SYCL_Migration_Introduction/00_SYCL_Migration_Introduction.ipynb

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,7 @@
204204
"\n",
205205
"#### dpct/dpct.hpp header file\n",
206206
"\n",
207-
"The migrated code will use the header file `dpct/dpct.hpp` that has helper functions, which is a wrapper for SYCL calls. The SYCLomatic (`c2s`) option `--use-custom-helper=api` used during migration will include all the helper functions used in migrated code in a folder called `include` under the output folder. The header files with all helper functions are also available in the `include` folder of the SYCLomatic installation.\n",
207+
"The migrated code will use the header file `dpct/dpct.hpp` that has helper functions, which is a wrapper for SYCL calls. The SYCLomatic (`c2s`) option `--gen-helper-function` used during migration will include all the helper functions in a folder called `include` under the output folder. The header files with all helper functions are also available in the `include` folder of the SYCLomatic installation.\n",
208208
"\n",
209209
"#### In order queue property\n",
210210
"\n",
@@ -214,7 +214,7 @@
214214
"\n",
215215
"```cpp\n",
216216
" dpct::device_ext &dev_ct1 = dpct::get_current_device();\n",
217-
" sycl::queue &q_ct1 = dev_ct1.default_queue();\n",
217+
" sycl::queue &q_ct1 = dev_ct1.in_order_queue();\n",
218218
"```\n",
219219
"\n",
220220
"The above code is using `dpct` helper functions to create a `sycl::queue` with the in_order queue property. The above code can be re-written without the `dpct` helper function, as show below:\n",
@@ -227,6 +227,12 @@
227227
"```cpp\n",
228228
" sycl::queue q_ct1;\n",
229229
"```\n",
230+
"OR \n",
231+
"\n",
232+
"```cpp\n",
233+
" dpct::device_ext &dev_ct1 = dpct::get_current_device();\n",
234+
" sycl::queue &q_ct1 = dev_ct1.out_of_order_queue();\n",
235+
"```\n",
230236
"\n",
231237
"Note that removing the `in_order` queue property may result in data race conditions if there are any data dependencies between the kernels. You may have to analyze the kernel code and add event-based dependencies where necessary.\n",
232238
"\n",
@@ -262,9 +268,9 @@
262268
],
263269
"metadata": {
264270
"kernelspec": {
265-
"display_name": "Python 3 (Intel® oneAPI 2023.2)",
271+
"display_name": "Python 3 (ipykernel)",
266272
"language": "python",
267-
"name": "c009-intel_distribution_of_python_3_oneapi-beta05-python"
273+
"name": "python3"
268274
},
269275
"language_info": {
270276
"codemirror_mode": {
@@ -276,7 +282,7 @@
276282
"name": "python",
277283
"nbconvert_exporter": "python",
278284
"pygments_lexer": "ipython3",
279-
"version": "3.9.16"
285+
"version": "3.11.5"
280286
}
281287
},
282288
"nbformat": 4,

DirectProgramming/C++SYCL/Jupyter/cuda-to-sycl-migration-training/01_SYCL_Migration_Simple_VectorAdd/01_SYCL_Migration_Simple_VectorAdd.ipynb

Lines changed: 51 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -70,19 +70,24 @@
7070
"\n",
7171
"```cpp\n",
7272
"\n",
73-
"#include <cuda.h>\n",
74-
"#include <iostream>\n",
75-
"#include <vector>\n",
76-
"#define N 16\n",
77-
"\n",
78-
"//# kernel code to perform VectorAdd on GPU\n",
79-
"__global__ void VectorAddKernel(float* A, float* B, float* C)\n",
80-
"{\n",
81-
" C[threadIdx.x] = A[threadIdx.x] + B[threadIdx.x];\n",
82-
"}\n",
83-
"\n",
84-
"int main()\n",
85-
"{\n",
73+
"#include <cuda.h>\r\n",
74+
"#include <iostream>\r\n",
75+
"#include <vector>\r\n",
76+
"#define N 16\r\n",
77+
"\r\n",
78+
"//# kernel code to perform VectorAdd on GPU\r\n",
79+
"__global__ void VectorAddKernel(float* A, float* B, float* C)\r\n",
80+
"{\r\n",
81+
" C[threadIdx.x] = A[threadIdx.x] + B[threadIdx.x];\r\n",
82+
"}\r\n",
83+
"\r\n",
84+
"int main()\r\n",
85+
"{\r\n",
86+
" //# Print device name\r\n",
87+
" cudaDeviceProp dev;\r\n",
88+
" cudaGetDeviceProperties(&dev, 0);\r\n",
89+
" std::cout << \"Device: \" << d\n",
90+
"ev.name << \"\\n\";\n",
8691
" //# Initialize vectors on host\n",
8792
" float A[N] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};\n",
8893
" float B[N] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};\n",
@@ -210,8 +215,11 @@
210215
"int main()\n",
211216
"{\n",
212217
" dpct::device_ext &dev_ct1 = dpct::get_current_device();\n",
213-
" sycl::queue &q_ct1 = dev_ct1.default_queue();\n",
214-
" std::cout << \"Device: \" << q_ct1.get_device().get_info<sycl::info::device::name>() << \"\\n\";\n",
218+
" sycl::queue &q_ct1 = dev_ct1.in_order_queue();\n",
219+
" //# Print device name\n",
220+
" dpct::device_info dev;\n",
221+
" dpct::get_device_info(dev, dpct::dev_mgr::instance().get_device(0));\n",
222+
" std::cout << \"Device: \" << dev.get_name() << \"\\n\";\n",
215223
"\n",
216224
" //# Initialize vectors on host\n",
217225
" float A[N] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};\n",
@@ -226,7 +234,7 @@
226234
"\n",
227235
" //# copy vector data from host to device\n",
228236
" q_ct1.memcpy(d_A, A, N * sizeof(float));\n",
229-
" q_ct1.memcpy(d_B, B, N * sizeof(float)).wait();\n",
237+
" q_ct1.memcpy(d_B, B, N * sizeof(float));\n",
230238
"\n",
231239
" //# sumbit task to compute VectorAdd on device\n",
232240
" q_ct1.parallel_for(\n",
@@ -243,11 +251,12 @@
243251
" std::cout << \"\\n\";\n",
244252
"\n",
245253
" //# free allocation on device\n",
246-
" sycl::free(d_A, q_ct1);\n",
247-
" sycl::free(d_B, q_ct1);\n",
248-
" sycl::free(d_C, q_ct1);\n",
254+
" dpct::dpct_free(d_A, q_ct1);\n",
255+
" dpct::dpct_free(d_B, q_ct1);\n",
256+
" dpct::dpct_free(d_C, q_ct1);\n",
249257
" return 0;\n",
250258
"}\n",
259+
"\n",
251260
"```\n",
252261
"\n",
253262
"The migrated SYCL code can be compiled using the following command in terminal:\n",
@@ -274,7 +283,7 @@
274283
"metadata": {},
275284
"outputs": [],
276285
"source": [
277-
"! ./q.sh run_vector_add.sh"
286+
"! ./q.sh run_sycl_migrated.sh"
278287
]
279288
},
280289
{
@@ -288,10 +297,10 @@
288297
"\n",
289298
"| Functionality|CUDA|SYCL\n",
290299
"|-|-|-\n",
291-
"| header file|`#include <cuda.h>`|`#include <CL/sycl.hpp>`\n",
300+
"| header file|`#include <cuda.h>`|`#include <sycl/sycl.hpp>`<br>`#include <dpct/dpct.hpp>`\n",
292301
"| Memory allocation on device| `cudaMalloc(&d_A, N*sizeof(float))`| `d_A = sycl::malloc_device<float>(N, q_ct1)`\n",
293302
"| Copy memory between host and device| `cudaMemcpy(d_A, A, N*sizeof(float), cudaMemcpyHostToDevice)`| `q.memcpy(d_A, A, N * sizeof(float))`\n",
294-
" | Free device memory allocation| `cudaFree(d_A)` | `free(d_A, q)`\n",
303+
" | Free device memory allocation| `cudaFree(d_A)` | `dpct::dpct_free(d_A, q)`\n",
295304
"\n",
296305
"The actual kernel function invocation is different. In CUDA, the kernel function is invoked with the execution configuration syntax `<<<1, N>>>>` as follows, specifying 1 block and N threads:\n",
297306
"\n",
@@ -313,8 +322,8 @@
313322
"Another difference is that the SYCL requires creating a SYCL queue with a device selector and other optional properties. The queue is used to submit the command group to execute on the device. The creation of a SYCL queue is necessary and is done as follows in the SYCL migrated code using some helper functions:\n",
314323
"\n",
315324
"```cpp\n",
316-
"dpct::device_ext &dev_ct1 = dpct::get_current_device();\n",
317-
"sycl::queue &q_ct1 = dev_ct1.default_queue();\n",
325+
"dpct::device_ext &dev_ct1 = dpct::get_current_device();\r",
326+
" sycl::queue &q_ct1 = dev_ct1.in_order_queue();\n",
318327
"```\n",
319328
"\n",
320329
"In CUDA, the equivalent is a CUDA stream; if no stream is created in the CUDA code, a default stream is implicitly created.\n"
@@ -336,8 +345,8 @@
336345
"Analyzing the migrated SYCL code, we can see that a SYCL queue is created using the following code:\n",
337346
"\n",
338347
"```cpp\n",
339-
"dpct::device_ext &dev_ct1 = dpct::get_current_device();\n",
340-
"sycl::queue &q_ct1 = dev_ct1.default_queue();\n",
348+
"dpct::device_ext &dev_ct1 = dpct::get_current_device();\r",
349+
" sycl::queue &q_ct1 = dev_ct1.in_order_queue();\n",
341350
"```\n",
342351
"\n",
343352
"The above code is creating a SYCL queue using dpct helper functions that can be unwrapped using the `dpct/dpct.hpp` header file.\n",
@@ -350,12 +359,18 @@
350359
"\n",
351360
"Using an `in_order` queue property will not allow kernels with no dependency to overlap execution. Therefore, we will remove the `in_order` queue property and add event-based dependency between kernels.\n",
352361
"\n",
353-
"We can replace the SYCL queue creation with the following code:\n",
362+
"We can replace the SYCL queue creation with the following cod to make it out of order queue:\n",
354363
"\n",
355364
"```cpp\n",
356365
"sycl::queue q_ct1;\n",
366+
"\n",
367+
"OR\n",
368+
"\n",
369+
"dpct::device_ext &dev_ct1 = dpct::get_current_device();\n",
370+
"sycl::queue &q_ct1 = dev_ct1.out_of_order_queue();\n",
357371
"```\n",
358372
"\n",
373+
"\n",
359374
"This will create a queue with default device selection and allow kernels to overlap.\n",
360375
"\n",
361376
"The next step is to add kernel dependency. From the code above we can enable the two `memcpy` kernel submissions to overlap and then add dependency for the actual kernel that does the vector add. We will also add a dependency to the final `memcpy` kernel to copy back the results.\n",
@@ -368,7 +383,8 @@
368383
"//\n",
369384
"// SPDX-License-Identifier: MIT\n",
370385
"// =============================================================\n",
371-
"#include <sycl/sycl.hpp>\n",
386+
"#include <sycl/sycl.hpp\n",
387+
"#include <dpct/dpct.hpp>>\n",
372388
"#include <iostream>\n",
373389
"#include <vector>\n",
374390
"#define N 16\n",
@@ -383,9 +399,12 @@
383399
"\n",
384400
"int main()\n",
385401
"{\n",
386-
" // sycl queue with out of order execution allowed\n",
387-
" sycl::queue q_ct1;\n",
388-
" std::cout << \"Device: \" << q_ct1.get_device().get_info<sycl::info::device::name>() << \"\\n\";\n",
402+
" // sycl queue with out of order execution allowed dpct::device_ext &dev_ct1 = dpct::get_current_device();\r\n",
403+
" sycl::queue &q_ct1 = dev_ct1out_ofn_order_queue();\r\n",
404+
" //# Print device name\r\n",
405+
" dpct::device_info dev;\r\n",
406+
" dpct::get_device_info(dev, dpct::dev_mgr::instance().get_device(0));\r\n",
407+
" std::cout << \"Device: \" << dev.get_name() << \"\\n\";;\n",
389408
"\n",
390409
" //# Initialize vectors on host\n",
391410
" float A[N] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};\n",
@@ -441,7 +460,7 @@
441460
"metadata": {},
442461
"outputs": [],
443462
"source": [
444-
"! ./q.sh run_vector_add_optimized.sh"
463+
"! ./q.sh run_sycl_migrated_optimized.sh"
445464
]
446465
},
447466
{

DirectProgramming/C++SYCL/Jupyter/cuda-to-sycl-migration-training/01_SYCL_Migration_Simple_VectorAdd/Readme.md

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
|:--- |:---
77
| OS | Linux* Ubuntu 18.04, 20; Windows* 10
88
| Hardware | Skylake with GEN9 or newer
9-
| Software | Intel&reg; oneAPI DPC++ Compiler, Jupyter Notebooks, Intel Devcloud
9+
| Software | Intel&reg; oneAPI DPC++ Compiler, Jupyter Notebooks, Intel Developer Cloud
1010

1111
## Purpose
1212
The hands-on exercises in this notebook show how to migrate CUDA source to SYCL source using the SYCLomatic tool.
@@ -18,11 +18,8 @@ Third party program Licenses can be found here: [third-party-programs.txt](https
1818

1919
## Install Directions
2020

21-
The Jupyter notebooks are tested and can be run on Intel Devcloud.
22-
Below are the steps to access these Jupyter notebooks on Intel Devcloud
23-
1. Register on [Intel Devcloud](https://devcloud.intel.com/oneapi)
24-
2. Go to the "Terminal" in the Intel Devcloud
25-
3. Type in the below command to download the oneAPI-essentials series notebooks into your Devcloud account
26-
/data/oneapi_workshop/get_jupyter_notebooks.sh
27-
4. Navigate to CUDA_To_SYCL_Migration folder and open the Welcome.ipynb
21+
The Jupyter notebooks are tested and can be run on Intel Developer Cloud.
22+
Below are the steps to access these Jupyter notebooks on Intel Developer Cloud
23+
1. Register on [Intel Developer Cloud](https://cloud.intel.com/)
24+
2. Download the Jupyter Notebooks and access the Welcome.ipynb
2825

DirectProgramming/C++SYCL/Jupyter/cuda-to-sycl-migration-training/01_SYCL_Migration_Simple_VectorAdd/cuda/vectoradd.cu

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,11 @@ __global__ void VectorAddKernel(float* A, float* B, float* C)
1717

1818
int main()
1919
{
20+
//# Print device name
21+
cudaDeviceProp dev;
22+
cudaGetDeviceProperties(&dev, 0);
23+
std::cout << "Device: " << dev.name << "\n";
24+
2025
//# Initialize vectors on host
2126
float A[N] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
2227
float B[N] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};

0 commit comments

Comments
 (0)