oneapi-src
diff --git a/‎DirectProgramming/C++SYCL/Jupyter/cuda-to-sycl-migration-training/00_SYCL_Migration_Introduction/00_SYCL_Migration_Introduction.ipynb
Lines changed: 11 additions & 5 deletions b/‎DirectProgramming/C++SYCL/Jupyter/cuda-to-sycl-migration-training/00_SYCL_Migration_Introduction/00_SYCL_Migration_Introduction.ipynb
Lines changed: 11 additions & 5 deletions
diff --git a/‎DirectProgramming/C++SYCL/Jupyter/cuda-to-sycl-migration-training/01_SYCL_Migration_Simple_VectorAdd/01_SYCL_Migration_Simple_VectorAdd.ipynb
Lines changed: 51 additions & 32 deletions b/‎DirectProgramming/C++SYCL/Jupyter/cuda-to-sycl-migration-training/01_SYCL_Migration_Simple_VectorAdd/01_SYCL_Migration_Simple_VectorAdd.ipynb
Lines changed: 51 additions & 32 deletions
diff --git a/‎DirectProgramming/C++SYCL/Jupyter/cuda-to-sycl-migration-training/01_SYCL_Migration_Simple_VectorAdd/Readme.md
Lines changed: 5 additions & 8 deletions b/‎DirectProgramming/C++SYCL/Jupyter/cuda-to-sycl-migration-training/01_SYCL_Migration_Simple_VectorAdd/Readme.md
Lines changed: 5 additions & 8 deletions
diff --git a/‎DirectProgramming/C++SYCL/Jupyter/cuda-to-sycl-migration-training/01_SYCL_Migration_Simple_VectorAdd/cuda/vectoradd.cu
Lines changed: 5 additions & 0 deletions b/‎DirectProgramming/C++SYCL/Jupyter/cuda-to-sycl-migration-training/01_SYCL_Migration_Simple_VectorAdd/cuda/vectoradd.cu
Lines changed: 5 additions & 0 deletions
@@ -204,7 +204,7 @@
     "\n",
     "#### dpct/dpct.hpp header file\n",
     "\n",
-    "The migrated code will use the header file `dpct/dpct.hpp` that has helper functions, which is a wrapper for SYCL calls. The SYCLomatic (`c2s`) option `--use-custom-helper=api` used during migration will include all the helper functions used in migrated code in a folder called `include` under the output folder. The header files with all helper functions are also available in the `include` folder of the SYCLomatic installation.\n",
+    "The migrated code will use the header file `dpct/dpct.hpp`, which provides helper functions that wrap SYCL calls. The SYCLomatic (`c2s`) option `--gen-helper-function` used during migration will include all the helper functions in a folder called `include` under the output folder. The header files with all helper functions are also available in the `include` folder of the SYCLomatic installation.\n",
     "\n",
     "#### In order queue property\n",
     "\n",
@@ -214,7 +214,7 @@
     "\n",
     "```cpp\n",
     "  dpct::device_ext &dev_ct1 = dpct::get_current_device();\n",
-    "  sycl::queue &q_ct1 = dev_ct1.default_queue();\n",
+    "  sycl::queue &q_ct1 = dev_ct1.in_order_queue();\n",
     "```\n",
     "\n",
     "The above code is using `dpct` helper functions to create a `sycl::queue` with the in_order queue property. The above code can be re-written without the `dpct` helper function, as shown below:\n",
@@ -227,6 +227,12 @@
     "```cpp\n",
     "   sycl::queue q_ct1;\n",
     "```\n",
+    "OR \n",
+    "\n",
+    "```cpp\n",
+    "  dpct::device_ext &dev_ct1 = dpct::get_current_device();\n",
+    "  sycl::queue &q_ct1 = dev_ct1.out_of_order_queue();\n",
+    "```\n",
     "\n",
     "Note that removing the `in_order` queue property may result in data race conditions if there are any data dependencies between the kernels. You may have to analyze the kernel code and add event-based dependencies where necessary.\n",
     "\n",
@@ -262,9 +268,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (Intel® oneAPI 2023.2)",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
-   "name": "c009-intel_distribution_of_python_3_oneapi-beta05-python"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -276,7 +282,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.16"
+   "version": "3.11.5"
   }
  },
  "nbformat": 4,
 
@@ -70,19 +70,24 @@
     "\n",
     "```cpp\n",
     "\n",
-    "#include <cuda.h>\n",
-    "#include <iostream>\n",
-    "#include <vector>\n",
-    "#define N 16\n",
-    "\n",
-    "//# kernel code to perform VectorAdd on GPU\n",
-    "__global__ void VectorAddKernel(float* A, float* B, float* C)\n",
-    "{\n",
-    "        C[threadIdx.x] = A[threadIdx.x] + B[threadIdx.x];\n",
-    "}\n",
-    "\n",
-    "int main()\n",
-    "{\n",
+    "#include <cuda.h>\r\n",
+    "#include <iostream>\r\n",
+    "#include <vector>\r\n",
+    "#define N 16\r\n",
+    "\r\n",
+    "//# kernel code to perform VectorAdd on GPU\r\n",
+    "__global__ void VectorAddKernel(float* A, float* B, float* C)\r\n",
+    "{\r\n",
+    "        C[threadIdx.x] = A[threadIdx.x] + B[threadIdx.x];\r\n",
+    "}\r\n",
+    "\r\n",
+    "int main()\r\n",
+    "{\r\n",
+    "        //# Print device name\r\n",
+    "        cudaDeviceProp dev;\r\n",
+    "        cudaGetDeviceProperties(&dev, 0);\r\n",
+    "        std::cout << \"Device: \" << dev.name << \"\\n\";\n",
+    "\n",
     "        //# Initialize vectors on host\n",
     "        float A[N] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};\n",
     "        float B[N] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};\n",
@@ -210,8 +215,11 @@
     "int main()\n",
     "{\n",
     "        dpct::device_ext &dev_ct1 = dpct::get_current_device();\n",
-    "        sycl::queue &q_ct1 = dev_ct1.default_queue();\n",
-    "        std::cout << \"Device: \" << q_ct1.get_device().get_info<sycl::info::device::name>() << \"\\n\";\n",
+    "        sycl::queue &q_ct1 = dev_ct1.in_order_queue();\n",
+    "        //# Print device name\n",
+    "        dpct::device_info dev;\n",
+    "        dpct::get_device_info(dev, dpct::dev_mgr::instance().get_device(0));\n",
+    "        std::cout << \"Device: \" << dev.get_name() << \"\\n\";\n",
     "\n",
     "        //# Initialize vectors on host\n",
     "        float A[N] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};\n",
@@ -226,7 +234,7 @@
     "\n",
     "        //# copy vector data from host to device\n",
     "        q_ct1.memcpy(d_A, A, N * sizeof(float));\n",
-    "        q_ct1.memcpy(d_B, B, N * sizeof(float)).wait();\n",
+    "        q_ct1.memcpy(d_B, B, N * sizeof(float));\n",
     "\n",
     "        //# submit task to compute VectorAdd on device\n",
     "        q_ct1.parallel_for(\n",
@@ -243,11 +251,12 @@
     "        std::cout << \"\\n\";\n",
     "\n",
     "        //# free allocation on device\n",
-    "        sycl::free(d_A, q_ct1);\n",
-    "        sycl::free(d_B, q_ct1);\n",
-    "        sycl::free(d_C, q_ct1);\n",
+    "        dpct::dpct_free(d_A, q_ct1);\n",
+    "        dpct::dpct_free(d_B, q_ct1);\n",
+    "        dpct::dpct_free(d_C, q_ct1);\n",
     "        return 0;\n",
     "}\n",
+    "\n",
     "```\n",
     "\n",
     "The migrated SYCL code can be compiled using the following command in terminal:\n",
@@ -274,7 +283,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "! ./q.sh run_vector_add.sh"
+    "! ./q.sh run_sycl_migrated.sh"
    ]
   },
   {
@@ -288,10 +297,10 @@
     "\n",
     "| Functionality|CUDA|SYCL\n",
     "|-|-|-\n",
-    "| header file|`#include <cuda.h>`|`#include <CL/sycl.hpp>`\n",
+    "| header file|`#include <cuda.h>`|`#include <sycl/sycl.hpp>`<br>`#include <dpct/dpct.hpp>`\n",
     "| Memory allocation on device| `cudaMalloc(&d_A, N*sizeof(float))`| `d_A = sycl::malloc_device<float>(N, q_ct1)`\n",
     "| Copy memory between host and device| `cudaMemcpy(d_A, A, N*sizeof(float), cudaMemcpyHostToDevice)`| `q.memcpy(d_A, A, N * sizeof(float))`\n",
-    " | Free device memory allocation| `cudaFree(d_A)` | `free(d_A, q)`\n",
+    " | Free device memory allocation| `cudaFree(d_A)` | `dpct::dpct_free(d_A, q)`\n",
     "\n",
     "The actual kernel function invocation is different. In CUDA, the kernel function is invoked with the execution configuration syntax `<<<1, N>>>` as follows, specifying 1 block and N threads:\n",
     "\n",
@@ -313,8 +322,8 @@
     "Another difference is that SYCL requires creating a SYCL queue with a device selector and other optional properties. The queue is used to submit the command group to execute on the device. The creation of a SYCL queue is necessary and is done as follows in the SYCL migrated code using some helper functions:\n",
     "\n",
     "```cpp\n",
-    "dpct::device_ext &dev_ct1 = dpct::get_current_device();\n",
-    "sycl::queue &q_ct1 = dev_ct1.default_queue();\n",
+    "dpct::device_ext &dev_ct1 = dpct::get_current_device();\n",
+    "sycl::queue &q_ct1 = dev_ct1.in_order_queue();\n",
     "```\n",
     "\n",
     "In CUDA, the equivalent is a CUDA stream; if no stream is created in the CUDA code, a default stream is implicitly created.\n"
@@ -336,8 +345,8 @@
     "Analyzing the migrated SYCL code, we can see that a SYCL queue is created using the following code:\n",
     "\n",
     "```cpp\n",
-    "dpct::device_ext &dev_ct1 = dpct::get_current_device();\n",
-    "sycl::queue &q_ct1 = dev_ct1.default_queue();\n",
+    "dpct::device_ext &dev_ct1 = dpct::get_current_device();\n",
+    "sycl::queue &q_ct1 = dev_ct1.in_order_queue();\n",
     "```\n",
     "\n",
     "The above code is creating a SYCL queue using dpct helper functions that can be unwrapped using the `dpct/dpct.hpp` header file.\n",
@@ -350,12 +359,18 @@
     "\n",
     "Using an `in_order` queue property will not allow kernels with no dependency to overlap execution. Therefore, we will remove the `in_order` queue property and add event-based dependency between kernels.\n",
     "\n",
-    "We can replace the SYCL queue creation with the following code:\n",
+    "We can replace the SYCL queue creation with the following code to make it an out-of-order queue:\n",
     "\n",
     "```cpp\n",
     "sycl::queue q_ct1;\n",
+    "\n",
+    "// OR\n",
+    "\n",
+    "dpct::device_ext &dev_ct1 = dpct::get_current_device();\n",
+    "sycl::queue &q_ct1 = dev_ct1.out_of_order_queue();\n",
     "```\n",
     "\n",
+    "\n",
     "This will create a queue with default device selection and allow kernels to overlap.\n",
     "\n",
     "The next step is to add kernel dependency. From the code above we can enable the two `memcpy` kernel submissions to overlap and then add dependency for the actual kernel that does the vector add. We will also add a dependency to the final `memcpy` kernel to copy back the results.\n",
@@ -368,7 +383,8 @@
     "//\n",
     "// SPDX-License-Identifier: MIT\n",
     "// =============================================================\n",
-    "#include <sycl/sycl.hpp>\n",
+    "#include <sycl/sycl.hpp>\n",
+    "#include <dpct/dpct.hpp>\n",
     "#include <iostream>\n",
     "#include <vector>\n",
     "#define N 16\n",
@@ -383,9 +399,12 @@
     "\n",
     "int main()\n",
     "{\n",
-    "        // sycl queue with out of order execution allowed\n",
-    "        sycl::queue q_ct1;\n",
-    "        std::cout << \"Device: \" << q_ct1.get_device().get_info<sycl::info::device::name>() << \"\\n\";\n",
+    "        // sycl queue with out of order execution allowed\r\n        dpct::device_ext &dev_ct1 = dpct::get_current_device();\r\n",
+    "        sycl::queue &q_ct1 = dev_ct1.out_of_order_queue();\r\n",
+    "        //# Print device name\r\n",
+    "        dpct::device_info dev;\r\n",
+    "        dpct::get_device_info(dev, dpct::dev_mgr::instance().get_device(0));\r\n",
+    "        std::cout << \"Device: \" << dev.get_name() << \"\\n\";\n",
     "\n",
     "        //# Initialize vectors on host\n",
     "        float A[N] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};\n",
@@ -441,7 +460,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "! ./q.sh run_vector_add_optimized.sh"
+    "! ./q.sh run_sycl_migrated_optimized.sh"
    ]
   },
   {
 
@@ -6,7 +6,7 @@
 |:---                               |:---
 | OS                                | Linux* Ubuntu 18.04, 20; Windows* 10
 | Hardware                          | Skylake with GEN9 or newer
-| Software                          | Intel&reg; oneAPI DPC++ Compiler, Jupyter Notebooks, Intel Devcloud
+| Software                          | Intel&reg; oneAPI DPC++ Compiler, Jupyter Notebooks, Intel Developer Cloud
 
 ## Purpose
 The hands-on exercises in this notebook show how to migrate CUDA source to SYCL source using the SYCLomatic Tool
@@ -18,11 +18,8 @@ Third party program Licenses can be found here: [third-party-programs.txt](https
 
 ## Install Directions
 
-The Jupyter notebooks are tested and can be run on Intel Devcloud.
-Below are the steps to access these Jupyter notebooks on Intel Devcloud
-1. Register on [Intel Devcloud](https://devcloud.intel.com/oneapi)
-2. Go to the "Terminal" in the Intel Devcloud
-3. Type in the below command to download the oneAPI-essentials series notebooks into your Devcloud account
-    /data/oneapi_workshop/get_jupyter_notebooks.sh
-4. Navigate to CUDA_To_SYCL_Migration folder and open the Welcome.ipynb
+The Jupyter notebooks are tested and can be run on Intel Developer Cloud.
+Below are the steps to access these Jupyter notebooks on Intel Developer Cloud
+1. Register on [Intel Developer Cloud](https://cloud.intel.com/)
+2. Download the Jupyter Notebooks and access the Welcome.ipynb
 
@@ -17,6 +17,11 @@ __global__ void VectorAddKernel(float* A, float* B, float* C)
 
 int main()
 {
+        //# Print device name
+        cudaDeviceProp dev;
+        cudaGetDeviceProperties(&dev, 0);
+        std::cout << "Device: " << dev.name << "\n";
+
         //# Initialize vectors on host
         float A[N] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
         float B[N] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};