
Commit cbe8d6d

[SYCL] Example of pure CUDA SYCL application
Signed-off-by: Ruyman Reyes <[email protected]>
1 parent c278fcc


4 files changed: +279 additions, 0 deletions


example-03/Makefile

Lines changed: 25 additions & 0 deletions
# Use the DPC++ clang++ as the CUDA compiler
CUDACXX=${SYCL_ROOT}/bin/clang++

SYCL_INCLUDE=${SYCL_ROOT}/include/sycl/

CUDAFLAGS=--cuda-gpu-arch=sm_30

CXXFLAGS=-std=c++17 ${CUDAFLAGS} -I${SYCL_INCLUDE} -g

CUDA_ROOT=/usr/local/cuda/

# Link against the SYCL runtime, OpenCL, and the CUDA runtime
LIBS=-L${SYCL_ROOT}/lib -lOpenCL -lsycl -L${CUDA_ROOT}/lib64 -lcudart

default: vec_add.exe usm_vec_add.exe

vec_add.exe: vec_add.cu
	${CUDACXX} ${CXXFLAGS} $< ${LIBS} -o $@

usm_vec_add.exe: vec_add_usm.cu
	${CUDACXX} ${CXXFLAGS} $< ${LIBS} -o $@

clean:
	rm -f vec_add.exe usm_vec_add.exe

example-03/README.md

Lines changed: 68 additions & 0 deletions
Example 03: Calling CUDA kernels from SYCL
===============================

In this example we re-use the trivial vector addition from Example 1, but
instead of writing a SYCL variant we keep the original CUDA kernel and
replace only the CUDA Runtime calls with the SYCL API.

This variant uses the buffer and accessor syntax, which is more verbose but
lets the SYCL runtime build the implicit dependency DAG between command
groups.
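
For illustration, this is how buffers and accessors express those
dependencies in plain SYCL (a generic sketch, not part of this example's
sources; all names are illustrative):

```cpp
#include <CL/sycl.hpp>
using namespace sycl;

int main() {
  constexpr size_t n = 1024;
  queue q;
  // Buffers left uninitialized for brevity.
  buffer<double> bA{range<1>(n)}, bB{range<1>(n)}, bC{range<1>(n)};

  // Accessors declare how each command group uses a buffer; the runtime
  // derives the execution order (the DAG) from these declarations.
  q.submit([&](handler &h) {
    auto a = bA.get_access<access::mode::read>(h);
    auto b = bB.get_access<access::mode::read>(h);
    auto c = bC.get_access<access::mode::write>(h);
    h.parallel_for<class vadd>(range<1>(n),
                               [=](id<1> i) { c[i] = a[i] + b[i]; });
  });
  return 0;
}
```
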
A USM variant is presented for exposition only; support for USM on CUDA is
unstable at the time of writing.

Pre-requisites
---------------

You will need an installation of DPC++ with CUDA support; see the
[Getting Started Guide](https://github.com/intel/llvm/doc/GetStartedWithSYCLCompiler.md)
for details on how to build it.

The example is built using Makefiles, since no released version of CMake
yet supports changing the CUDA compiler from nvcc.

Building the example
---------------------

```sh
$ SYCL_ROOT=/path/to/dpcpp make
```

This compiles the SYCL code with LLVM's CUDA support and generates two
binaries. NVCC is not used, but the CUDA device libraries must be available
in /usr/local/cuda/lib64/ to link against the device code.

The NVCC compiler does not support some of the advanced C++17 syntax used
in the SYCL runtime headers.

Running the example
--------------------

The path to `libsycl.so` and the PI plugins must be in `LD_LIBRARY_PATH`.
A simple way of running the example is as follows:

```sh
$ LD_LIBRARY_PATH=/path/to/dpcpp/lib ./vec_add.exe
```

Calling CUDA kernels from SYCL
-------------------------------

Using Codeplay's `interop_task` extension, the example calls a CUDA kernel
from a SYCL application.
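
The core pattern, excerpted from the full `vec_add.cu` listing below, looks
like this:

```cpp
// Inside main(), with buffers bA/bB/bC, the launch parameters, and queue
// myQueue already set up (see vec_add.cu below for the full listing):
myQueue.submit([&](sycl::handler &h) {
  auto accA = bA.get_access<sycl::access::mode::read>(h);
  auto accB = bB.get_access<sycl::access::mode::read>(h);
  auto accC = bC.get_access<sycl::access::mode::write>(h);

  h.interop_task([=](sycl::interop_handler ih) {
    // Convert the SYCL accessors into raw CUDA device pointers...
    auto d_a = reinterpret_cast<double *>(ih.get_mem<sycl::backend::cuda>(accA));
    auto d_b = reinterpret_cast<double *>(ih.get_mem<sycl::backend::cuda>(accB));
    auto d_c = reinterpret_cast<double *>(ih.get_mem<sycl::backend::cuda>(accC));
    // ...and launch the CUDA kernel with the usual <<<grid, block>>> syntax.
    vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
  });
});
```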

Note the example is compiled with the LLVM CUDA compiler, not with the SYCL
compiler, since it contains no SYCL kernels. It only needs to link against
the SYCL runtime library so that the application can use the SYCL runtime.

At the time of writing, it is not possible to have both CUDA and SYCL
kernels in the same file.
It is possible, however, to keep the CUDA and SYCL kernels in separate
files and call them together from a main application at runtime, as
sketched below.
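
A hypothetical layout (the file names and the `launch_vec_add` wrapper are
illustrative only, not part of this example):

```cpp
// kernels.cu -- compiled with the LLVM CUDA compiler; no SYCL code here.
__global__ void vecAdd(double *a, double *b, double *c, int n) {
  int id = blockIdx.x * blockDim.x + threadIdx.x;
  if (id < n)
    c[id] = a[id] + b[id];
}

// Host-side wrapper with a plain C++ signature that other translation
// units can call.
void launch_vec_add(double *a, double *b, double *c, int n) {
  int blockSize = 1024;
  int gridSize = (n + blockSize - 1) / blockSize;
  vecAdd<<<gridSize, blockSize>>>(a, b, c, n);
}
```

The main application, built with the SYCL compiler, declares
`void launch_vec_add(double *, double *, double *, int);` and links against
the object file produced from `kernels.cu`.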

The example uses an extension to the SYCL interface to interact with the
CUDA Runtime API.
At the time of writing the extension is not public, so only a boolean flag
is passed to the `sycl::context` creation.
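
As done in both source files below:

```cpp
// The third constructor argument is the temporary interoperability flag.
const bool UsePrimaryContext = true;
sycl::device dev{CUDASelector().select_device()};
sycl::context myContext{dev, {}, UsePrimaryContext};
sycl::queue myQueue{myContext, dev};
```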

example-03/vec_add.cu

Lines changed: 98 additions & 0 deletions
// Original source reproduced unmodified here from:
// https://github.com/olcf/vector_addition_tutorials/blob/master/CUDA/vecAdd.cu

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <iostream>
#include <vector>

#include <CL/sycl.hpp>
#include <CL/sycl/backend/cuda.hpp>

class CUDASelector : public sycl::device_selector {
public:
  int operator()(const sycl::device &Device) const override {
    using namespace sycl::info;

    const std::string DriverVersion = Device.get_info<device::driver_version>();

    if (Device.is_gpu() && (DriverVersion.find("CUDA") != std::string::npos)) {
      std::cout << " CUDA device found " << std::endl;
      return 1;
    }
    return -1;
  }
};

// CUDA kernel. Each thread takes care of one element of c
__global__ void vecAdd(double *a, double *b, double *c, int n) {
  // Get our global thread ID
  int id = blockIdx.x * blockDim.x + threadIdx.x;

  // Make sure we do not go out of bounds
  if (id < n)
    c[id] = a[id] + b[id];
}

int main(int argc, char *argv[]) {
  using namespace sycl;
  // Size of vectors
  int n = 100000;

  // Create a SYCL context for interoperability with the CUDA Runtime API
  // This is temporary until the property extension is implemented
  const bool UsePrimaryContext = true;
  sycl::device dev{CUDASelector().select_device()};
  sycl::context myContext{dev, {}, UsePrimaryContext};
  sycl::queue myQueue{myContext, dev};

  {
    buffer<double> bA{range<1>(n)};
    buffer<double> bB{range<1>(n)};
    buffer<double> bC{range<1>(n)};

    {
      auto h_a = bA.get_access<access::mode::write>();
      auto h_b = bB.get_access<access::mode::write>();

      // Initialize vectors on host
      for (int i = 0; i < n; i++) {
        h_a[i] = sin(i) * sin(i);
        h_b[i] = cos(i) * cos(i);
      }
    }

    // Dispatch a command group with all the dependencies
    myQueue.submit([&](handler &h) {
      auto accA = bA.get_access<access::mode::read>(h);
      auto accB = bB.get_access<access::mode::read>(h);
      auto accC = bC.get_access<access::mode::write>(h);

      h.interop_task([=](interop_handler ih) {
        auto d_a = reinterpret_cast<double *>(ih.get_mem<backend::cuda>(accA));
        auto d_b = reinterpret_cast<double *>(ih.get_mem<backend::cuda>(accB));
        auto d_c = reinterpret_cast<double *>(ih.get_mem<backend::cuda>(accC));

        int blockSize, gridSize;
        // Number of threads in each thread block
        blockSize = 1024;
        // Number of thread blocks in grid
        gridSize = (int)ceil((float)n / blockSize);
        // Call the CUDA kernel directly from SYCL
        vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
      });
    });

    {
      auto h_c = bC.get_access<access::mode::read>();
      // Sum up vector c and print the result divided by n; it should equal 1
      // within rounding error
      double sum = 0;
      for (int i = 0; i < n; i++)
        sum += h_c[i];
      printf("final result: %f\n", sum / n);
    }
  }

  return 0;
}

example-03/vec_add_usm.cu

Lines changed: 88 additions & 0 deletions
// Original source reproduced unmodified here from:
// https://github.com/olcf/vector_addition_tutorials/blob/master/CUDA/vecAdd.cu

#include <cmath>
#include <cstdio>
#include <iostream>
#include <string>

#include <CL/sycl.hpp>
#include <CL/sycl/backend/cuda.hpp>

class CUDASelector : public sycl::device_selector {
public:
  int operator()(const sycl::device &Device) const override {
    using namespace sycl::info;

    const std::string DriverVersion = Device.get_info<device::driver_version>();

    if (Device.is_gpu() && (DriverVersion.find("CUDA") != std::string::npos)) {
      std::cout << " CUDA device found " << std::endl;
      return 1;
    }
    return -1;
  }
};

// CUDA kernel. Each thread takes care of one element of c
__global__ void vecAdd(double *a, double *b, double *c, int n) {
  // Get our global thread ID
  int id = blockIdx.x * blockDim.x + threadIdx.x;

  // Make sure we do not go out of bounds
  if (id < n)
    c[id] = a[id] + b[id];
}

int main(int argc, char *argv[]) {
  using namespace sycl;
  // Size of vectors
  int n = 100000;

  // Size, in bytes, of each vector
  size_t bytes = n * sizeof(double);

  // Create a SYCL context for interoperability with the CUDA Runtime API
  // This is temporary until the property extension is implemented
  const bool UsePrimaryContext = true;
  sycl::device dev{CUDASelector().select_device()};
  sycl::context myContext{dev, {}, UsePrimaryContext};
  sycl::queue myQueue{myContext, dev};

  // Allocate USM shared memory for each vector, accessible from both the
  // host and the device
  double *d_a = (double *)malloc_shared(bytes, myQueue);
  double *d_b = (double *)malloc_shared(bytes, myQueue);
  double *d_c = (double *)malloc_shared(bytes, myQueue);

  // Initialize vectors on host
  for (int i = 0; i < n; i++) {
    d_a[i] = sin(i) * sin(i);
    d_b[i] = cos(i) * cos(i);
  }

  myQueue.submit([&](handler &h) {
    h.interop_task([=](interop_handler ih) {
      int blockSize, gridSize;

      // Number of threads in each thread block
      blockSize = 1024;

      // Number of thread blocks in grid
      gridSize = (int)ceil((float)n / blockSize);

      // Execute the kernel
      vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
    });
  });

  myQueue.wait();

  // Sum up vector c and print the result divided by n; it should equal 1
  // within rounding error
  double sum = 0;
  for (int i = 0; i < n; i++)
    sum += d_c[i];
  printf("final result: %f\n", sum / n);

  sycl::free(d_a, myContext);
  sycl::free(d_b, myContext);
  sycl::free(d_c, myContext);

  return 0;
}
