
Commit cbe8d6d

[SYCL] Example of pure CUDA SYCL application
Signed-off-by: Ruyman Reyes <[email protected]>
1 parent c278fcc


4 files changed: +279 additions, 0 deletions


example-03/Makefile

Lines changed: 25 additions & 0 deletions
# Use the DPC++ clang++ as the CUDA compiler
CUDACXX=${SYCL_ROOT}/bin/clang++

SYCL_INCLUDE=${SYCL_ROOT}/include/sycl/

CUDAFLAGS=--cuda-gpu-arch=sm_30

CXXFLAGS=-std=c++17 ${CUDAFLAGS} -I${SYCL_INCLUDE} -g

CUDA_ROOT=/usr/local/cuda/

# Link against the SYCL runtime, OpenCL, and the CUDA runtime
LIBS=-L${SYCL_ROOT}/lib -lOpenCL -lsycl -L${CUDA_ROOT}/lib64 -lcudart

default: vec_add.exe usm_vec_add.exe

vec_add.exe: vec_add.cu
	${CUDACXX} ${CXXFLAGS} $< ${LIBS} -o $@

usm_vec_add.exe: vec_add_usm.cu
	${CUDACXX} ${CXXFLAGS} $< ${LIBS} -o $@

clean:
	rm -f vec_add.exe usm_vec_add.exe

example-03/README.md

Lines changed: 68 additions & 0 deletions
Example 03: Calling CUDA kernels from SYCL
===============================

In this example we re-use the trivial vector addition from Example 1, but
instead of writing a SYCL variant we keep the original CUDA kernel and
replace only the CUDA Runtime calls with the SYCL API.

This variant uses the buffer and accessor syntax, which is more verbose but
lets the SYCL runtime build the implicit dependency DAG between command
groups.
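
For illustration, this is how buffers and accessors express those
dependencies in plain SYCL (a generic sketch, not part of this example's
sources; all names are illustrative):

```cpp
#include <CL/sycl.hpp>
using namespace sycl;

int main() {
  constexpr size_t n = 1024;
  queue q;
  // Buffers left uninitialized for brevity.
  buffer<double> bA{range<1>(n)}, bB{range<1>(n)}, bC{range<1>(n)};

  // Accessors declare how each command group uses a buffer; the runtime
  // derives the execution order (the DAG) from these declarations.
  q.submit([&](handler &h) {
    auto a = bA.get_access<access::mode::read>(h);
    auto b = bB.get_access<access::mode::read>(h);
    auto c = bC.get_access<access::mode::write>(h);
    h.parallel_for<class vadd>(range<1>(n),
                               [=](id<1> i) { c[i] = a[i] + b[i]; });
  });
  return 0;
}
```
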
A USM variant is presented for exposition only; support for USM on CUDA is
unstable at the time of writing.

Pre-requisites
---------------

You will need an installation of DPC++ with CUDA support; see the
[Getting Started Guide](https://github.com/intel/llvm/doc/GetStartedWithSYCLCompiler.md)
for details on how to build it.

The example is built using Makefiles, since no released version of CMake
yet supports changing the CUDA compiler from nvcc.

Building the example
---------------------

```sh
$ SYCL_ROOT=/path/to/dpcpp make
```

This compiles the SYCL code with LLVM's CUDA support and generates two
binaries. NVCC is not used, but the CUDA device libraries must be available
in /usr/local/cuda/lib64/ to link against the device code.

The NVCC compiler does not support some of the advanced C++17 syntax used
in the SYCL runtime headers.

Running the example
--------------------

The path to `libsycl.so` and the PI plugins must be in `LD_LIBRARY_PATH`.
A simple way of running the example is as follows:

```sh
$ LD_LIBRARY_PATH=/path/to/dpcpp/lib ./vec_add.exe
```

Calling CUDA kernels from SYCL
-------------------------------

Using Codeplay's `interop_task` extension, the example calls a CUDA kernel
from a SYCL application.
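
The core pattern, excerpted from the full `vec_add.cu` listing below, looks
like this:

```cpp
// Inside main(), with buffers bA/bB/bC, the launch parameters, and queue
// myQueue already set up (see vec_add.cu below for the full listing):
myQueue.submit([&](sycl::handler &h) {
  auto accA = bA.get_access<sycl::access::mode::read>(h);
  auto accB = bB.get_access<sycl::access::mode::read>(h);
  auto accC = bC.get_access<sycl::access::mode::write>(h);

  h.interop_task([=](sycl::interop_handler ih) {
    // Convert the SYCL accessors into raw CUDA device pointers...
    auto d_a = reinterpret_cast<double *>(ih.get_mem<sycl::backend::cuda>(accA));
    auto d_b = reinterpret_cast<double *>(ih.get_mem<sycl::backend::cuda>(accB));
    auto d_c = reinterpret_cast<double *>(ih.get_mem<sycl::backend::cuda>(accC));
    // ...and launch the CUDA kernel with the usual <<<grid, block>>> syntax.
    vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
  });
});
```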

Note the example is compiled with the LLVM CUDA compiler, not with the SYCL
compiler, since it contains no SYCL kernels. It only needs to link against
the SYCL runtime library so that the application can use the SYCL runtime.

At the time of writing, it is not possible to have both CUDA and SYCL
kernels in the same file.
It is possible, however, to keep the CUDA and SYCL kernels in separate
files and call them together from a main application at runtime, as
sketched below.
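
A hypothetical layout (the file names and the `launch_vec_add` wrapper are
illustrative only, not part of this example):

```cpp
// kernels.cu -- compiled with the LLVM CUDA compiler; no SYCL code here.
__global__ void vecAdd(double *a, double *b, double *c, int n) {
  int id = blockIdx.x * blockDim.x + threadIdx.x;
  if (id < n)
    c[id] = a[id] + b[id];
}

// Host-side wrapper with a plain C++ signature that other translation
// units can call.
void launch_vec_add(double *a, double *b, double *c, int n) {
  int blockSize = 1024;
  int gridSize = (n + blockSize - 1) / blockSize;
  vecAdd<<<gridSize, blockSize>>>(a, b, c, n);
}
```

The main application, built with the SYCL compiler, declares
`void launch_vec_add(double *, double *, double *, int);` and links against
the object file produced from `kernels.cu`.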

The example uses an extension to the SYCL interface to interact with the
CUDA Runtime API.
At the time of writing the extension is not public, so only a boolean flag
is passed to the `sycl::context` creation.
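
As done in both source files below:

```cpp
// The third constructor argument is the temporary interoperability flag.
const bool UsePrimaryContext = true;
sycl::device dev{CUDASelector().select_device()};
sycl::context myContext{dev, {}, UsePrimaryContext};
sycl::queue myQueue{myContext, dev};
```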

example-03/vec_add.cu

Lines changed: 98 additions & 0 deletions
// Original source reproduced unmodified here from:
// https://github.com/olcf/vector_addition_tutorials/blob/master/CUDA/vecAdd.cu

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <iostream>
#include <vector>

#include <CL/sycl.hpp>
#include <CL/sycl/backend/cuda.hpp>

class CUDASelector : public sycl::device_selector {
public:
  int operator()(const sycl::device &Device) const override {
    using namespace sycl::info;

    const std::string DriverVersion = Device.get_info<device::driver_version>();

    if (Device.is_gpu() && (DriverVersion.find("CUDA") != std::string::npos)) {
      std::cout << " CUDA device found " << std::endl;
      return 1;
    }
    return -1;
  }
};

// CUDA kernel. Each thread takes care of one element of c
__global__ void vecAdd(double *a, double *b, double *c, int n) {
  // Get our global thread ID
  int id = blockIdx.x * blockDim.x + threadIdx.x;

  // Make sure we do not go out of bounds
  if (id < n)
    c[id] = a[id] + b[id];
}

int main(int argc, char *argv[]) {
  using namespace sycl;
  // Size of vectors
  int n = 100000;

  // Create a SYCL context for interoperability with the CUDA Runtime API
  // This is temporary until the property extension is implemented
  const bool UsePrimaryContext = true;
  sycl::device dev{CUDASelector().select_device()};
  sycl::context myContext{dev, {}, UsePrimaryContext};
  sycl::queue myQueue{myContext, dev};

  {
    buffer<double> bA{range<1>(n)};
    buffer<double> bB{range<1>(n)};
    buffer<double> bC{range<1>(n)};

    {
      auto h_a = bA.get_access<access::mode::write>();
      auto h_b = bB.get_access<access::mode::write>();

      // Initialize vectors on host
      for (int i = 0; i < n; i++) {
        h_a[i] = sin(i) * sin(i);
        h_b[i] = cos(i) * cos(i);
      }
    }

    // Dispatch a command group with all the dependencies
    myQueue.submit([&](handler &h) {
      auto accA = bA.get_access<access::mode::read>(h);
      auto accB = bB.get_access<access::mode::read>(h);
      auto accC = bC.get_access<access::mode::write>(h);

      h.interop_task([=](interop_handler ih) {
        auto d_a = reinterpret_cast<double *>(ih.get_mem<backend::cuda>(accA));
        auto d_b = reinterpret_cast<double *>(ih.get_mem<backend::cuda>(accB));
        auto d_c = reinterpret_cast<double *>(ih.get_mem<backend::cuda>(accC));

        int blockSize, gridSize;
        // Number of threads in each thread block
        blockSize = 1024;
        // Number of thread blocks in grid
        gridSize = (int)ceil((float)n / blockSize);
        // Call the CUDA kernel directly from SYCL
        vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
      });
    });

    {
      auto h_c = bC.get_access<access::mode::read>();
      // Sum up vector c and print the result divided by n; it should equal 1
      // within rounding error
      double sum = 0;
      for (int i = 0; i < n; i++)
        sum += h_c[i];
      printf("final result: %f\n", sum / n);
    }
  }

  return 0;
}

example-03/vec_add_usm.cu

Lines changed: 88 additions & 0 deletions
// Original source reproduced unmodified here from:
// https://github.com/olcf/vector_addition_tutorials/blob/master/CUDA/vecAdd.cu

#include <cmath>
#include <cstdio>
#include <iostream>
#include <string>

#include <CL/sycl.hpp>
#include <CL/sycl/backend/cuda.hpp>

class CUDASelector : public sycl::device_selector {
public:
  int operator()(const sycl::device &Device) const override {
    using namespace sycl::info;

    const std::string DriverVersion = Device.get_info<device::driver_version>();

    if (Device.is_gpu() && (DriverVersion.find("CUDA") != std::string::npos)) {
      std::cout << " CUDA device found " << std::endl;
      return 1;
    }
    return -1;
  }
};

// CUDA kernel. Each thread takes care of one element of c
__global__ void vecAdd(double *a, double *b, double *c, int n) {
  // Get our global thread ID
  int id = blockIdx.x * blockDim.x + threadIdx.x;

  // Make sure we do not go out of bounds
  if (id < n)
    c[id] = a[id] + b[id];
}

int main(int argc, char *argv[]) {
  using namespace sycl;
  // Size of vectors
  int n = 100000;

  // Size, in bytes, of each vector
  size_t bytes = n * sizeof(double);

  // Create a SYCL context for interoperability with the CUDA Runtime API
  // This is temporary until the property extension is implemented
  const bool UsePrimaryContext = true;
  sycl::device dev{CUDASelector().select_device()};
  sycl::context myContext{dev, {}, UsePrimaryContext};
  sycl::queue myQueue{myContext, dev};

  // Allocate USM shared memory for each vector, accessible from both the
  // host and the device
  double *d_a = (double *)malloc_shared(bytes, myQueue);
  double *d_b = (double *)malloc_shared(bytes, myQueue);
  double *d_c = (double *)malloc_shared(bytes, myQueue);

  // Initialize vectors on host
  for (int i = 0; i < n; i++) {
    d_a[i] = sin(i) * sin(i);
    d_b[i] = cos(i) * cos(i);
  }

  myQueue.submit([&](handler &h) {
    h.interop_task([=](interop_handler ih) {
      int blockSize, gridSize;

      // Number of threads in each thread block
      blockSize = 1024;

      // Number of thread blocks in grid
      gridSize = (int)ceil((float)n / blockSize);

      // Execute the kernel
      vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
    });
  });

  myQueue.wait();

  // Sum up vector c and print the result divided by n; it should equal 1
  // within rounding error
  double sum = 0;
  for (int i = 0; i < n; i++)
    sum += d_c[i];
  printf("final result: %f\n", sum / n);

  sycl::free(d_a, myContext);
  sycl::free(d_b, myContext);
  sycl::free(d_c, myContext);

  return 0;
}
