File tree Expand file tree Collapse file tree 4 files changed +139
-0
lines changed
Expand file tree Collapse file tree 4 files changed +139
-0
# Builds the mixed CUDA Fortran + SYCL saxpy example:
#   saxpy.cpp  -> saxpy_sycl.so  (SYCL implementation, built with clang++)
#   saxpy.cuf  -> saxpy_cuf.o    (CUDA Fortran driver, built with nvfortran)
#   both       -> final.exe

CXX        = clang++
FORT       = nvfortran
FFLAGS     = -c++libs -cuda
CXXFLAGS   = -fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice -fsycl-unnamed-lambda

# Install prefix of the DPC++ toolchain; libsycl is looked up in $(DPCPP_PATH)/lib.
# '?=' keeps the historical default but lets the environment or the command
# line point at a different installation: make DPCPP_PATH=/opt/dpcpp
DPCPP_PATH ?= /home/ruyman/sycl_workspace/build_dpcpp/install

.PHONY: default clean

default: final.exe

saxpy_sycl.so: saxpy.cpp
	$(CXX) $(CXXFLAGS) -fPIC --shared $< -o $@

saxpy_cuf.o: saxpy.cuf
	$(FORT) $(FFLAGS) -c $< -o $@

final.exe: saxpy_cuf.o saxpy_sycl.so
	$(FORT) $(FFLAGS) -o $@ saxpy_cuf.o saxpy_sycl.so -L$(DPCPP_PATH)/lib -lsycl

# mathops.mod is presumably the module file produced while compiling saxpy.cuf.
clean:
	rm -f saxpy_cuf.o saxpy_sycl.so final.exe mathops.mod
22+
Original file line number Diff line number Diff line change 1+ CUDA Fortran and SYCL integration
2+ ======================================
3+
4+ This directory shows an example of how to call a SYCL function
5+ from CUDA Fortran code.
6+
7+ The SYCL routine is called through the Fortran ISO C bindings (`bind(C)`), like
8+ any other C function.
9+
10+ ``` fortran
11+ interface saxpy_sycl
12+ subroutine saxpy_call(x, y, a, N) &
13+ bind(C,name='saxpy_sycl_cuda_wrapper')
14+ implicit none
15+ real :: x(:), y(:)
16+ real, value :: a
17+ integer, value :: N
18+ end subroutine
19+ end interface
20+ ```
21+
22+ The SYCL code in the C++ part of the example works as usual, with one minor modification:
23+ it uses the CUDA primary context to interoperate with the CUDA Fortran code, ensuring that both share the same resources.
24+
25+ The following snippet highlights the construction of a SYCL context associated with the primary context.
26+ To ensure synchronization with the CUDA Fortran code, the queue will also be mapped to the default CUDA
27+ stream, instead of creating a new stream.
28+ It is possible to create a normal stream simply by using the default SYCL queue constructor on the CUDA
29+ context. Such a queue will run concurrently with (i.e. will not synchronize with) the main queue.
30+
31+ ``` cpp
32+ sycl::context c{sycl::property::context::cuda::use_primary_context ()};
33+ sycl::queue q{c, c.get_devices()[ 0] , sycl::property::queue::cuda::use_default_stream()};
34+ ```
35+
36+
Original file line number Diff line number Diff line change 1+ #include < iostream>
2+ #include < CL/sycl.hpp>
3+
4+ extern " C" {
5+ void saxpy_sycl_cuda_wrapper (float * x, float * y, float a, int N);
6+ };
7+
8+
9+ void saxpy_sycl_cuda_wrapper (float * x, float * y, float a, int N) {
10+ sycl::context c{sycl::property::context::cuda::use_primary_context ()};
11+ sycl::queue q{c, c.get_devices ()[0 ], sycl::property::queue::cuda::use_default_stream ()};
12+ {
13+ sycl::buffer bX {x, sycl::range<1 >(N)};
14+ sycl::buffer bY {y, sycl::range<1 >(N)};
15+
16+ q.submit ([&](sycl::handler& h) {
17+ auto aX = bX.get_access <sycl::access::mode::read_write>(h);
18+ auto aY = bY.get_access <sycl::access::mode::read_write>(h);
19+ h.parallel_for <class saxpy_kernel >(sycl::range<1 >(N), [=](sycl::id<1 > id) {
20+ if (id[0 ] < N)
21+ aY[id] = aX[id] * a + aY[id];
22+ });
23+ });
24+
25+ q.wait_and_throw ();
26+ }
27+ return ;
28+ }
!> CUDA Fortran device code for the saxpy example (saxpy.cuf).
module mathOps
  implicit none
contains

  !> Kernel computing y(i) = y(i) + a*x(i), one element per thread.
  !!
  !! x : input vector (read only)
  !! y : vector updated in place
  !! a : scalar multiplier, passed by value
  !!
  !! Threads whose global index exceeds the shorter of the two arrays do
  !! nothing, so the grid may be rounded up to a whole number of blocks.
  attributes(global) subroutine saxpy(x, y, a)
    implicit none
    real, intent(in)    :: x(:)
    real, intent(inout) :: y(:)
    real, value         :: a
    integer :: i, n

    ! Bound by both arrays so a shorter y is never indexed out of range.
    n = min(size(x), size(y))
    ! Global 1-based element index; blockIdx%x is shifted by one so the
    ! first block covers elements 1..blockDim%x.
    i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
    if (i <= n) y(i) = y(i) + a*x(i)
  end subroutine saxpy

end module mathOps
13+
!> Runs the same saxpy (y = y + a*x) twice on identical data: first with the
!> CUDA Fortran kernel from module mathOps, then with the SYCL implementation
!> reached through a C-interoperable interface. Prints the maximum deviation
!> from the expected value 4.0 after each run.
program testSaxpy
  use mathOps
  use cudafor

  implicit none

  ! C-interoperable view of the SYCL routine. The C side takes plain
  ! 'float*' pointers, so the arrays are declared assumed-size: an
  ! assumed-shape dummy in a bind(C) interface is passed as an array
  ! descriptor, not as the address of the data.
  interface saxpy_sycl
    subroutine saxpy_call(x, y, a, N) &
        bind(C, name='saxpy_sycl_cuda_wrapper')
      use, intrinsic :: iso_c_binding, only: c_float, c_int
      implicit none
      real(c_float), intent(in)    :: x(*)
      real(c_float), intent(inout) :: y(*)
      real(c_float), value         :: a
      integer(c_int), value        :: N
    end subroutine saxpy_call
  end interface saxpy_sycl

  integer, parameter :: N = 1024
  ! Default real/integer are assumed to have the same kinds as
  ! c_float/c_int; the compiler rejects the call below otherwise.
  real :: x(N), y(N), a
  real, device :: x_d(N), y_d(N)
  type(dim3) :: grid, tBlock

  ! 256 threads per block, with enough blocks to cover all N elements.
  tBlock = dim3(256,1,1)
  grid = dim3(ceiling(real(N)/tBlock%x),1,1)

  write (*,*) 'CUDA version: '
  x = 1.0; y = 2.0; a = 2.0
  ! Assignments between host and device arrays perform the transfers.
  x_d = x
  y_d = y
  call saxpy<<<grid, tBlock>>>(x_d, y_d, a)
  y = y_d
  write(*,*) 'Max error: ', maxval(abs(y-4.0))
  write(*,*) 'N ', N

  write (*,*) 'SYCL version: '
  ! Reset y on the host; the SYCL routine works directly on host arrays.
  y = 2.0
  call saxpy_call(x, y, a, N)
  write(*,*) 'Max error: ', maxval(abs(y-4.0))

end program testSaxpy
You can’t perform that action at this time.
0 commit comments