codeplaysoftware
diff --git a/‎.gitignore‎
Lines changed: 5 additions & 0 deletions b/‎.gitignore‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎example-01/CMakeLists.txt‎
Lines changed: 17 additions & 4 deletions b/‎example-01/CMakeLists.txt‎
Lines changed: 17 additions & 4 deletions
diff --git a/‎example-01/build.sh‎
Lines changed: 3 additions & 0 deletions b/‎example-01/build.sh‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎example-01/vector_addition.cpp‎
Lines changed: 7 additions & 11 deletions b/‎example-01/vector_addition.cpp‎
Lines changed: 7 additions & 11 deletions
diff --git a/‎example-01/vector_addition.cu‎
Lines changed: 4 additions & 4 deletions b/‎example-01/vector_addition.cu‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎example-01/vector_addition_usm.cpp‎
Lines changed: 100 additions & 0 deletions b/‎example-01/vector_addition_usm.cpp‎
Lines changed: 100 additions & 0 deletions
diff --git a/‎example-02/CMakeLists.txt‎
Lines changed: 13 additions & 8 deletions b/‎example-02/CMakeLists.txt‎
Lines changed: 13 additions & 8 deletions
diff --git a/‎example-02/README.md‎
Lines changed: 6 additions & 3 deletions b/‎example-02/README.md‎
Lines changed: 6 additions & 3 deletions
diff --git a/‎example-02/build.sh‎
Lines changed: 3 additions & 0 deletions b/‎example-02/build.sh‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎example-02/sycl_sgemm.cpp‎
Lines changed: 6 additions & 9 deletions b/‎example-02/sycl_sgemm.cpp‎
Lines changed: 6 additions & 9 deletions
@@ -30,3 +30,8 @@
 *.exe
 *.out
 *.app
+
+# Temporaries
+*~
+*#
+*/build
@@ -1,4 +1,7 @@
 cmake_minimum_required(VERSION 3.10 FATAL_ERROR)
+# Don't complain about empty CMAKE_CUDA_ARCHITECTURES. CMP0104 only exists in
+# CMake >= 3.18 and setting an unknown policy is an error, so guard the call to
+# keep the declared minimum version (3.10) usable.
+if(POLICY CMP0104)
+  cmake_policy(SET CMP0104 OLD)
+endif()
+
 project(cmake_and_cuda LANGUAGES CXX CUDA)
 
 include(CTest)
@@ -8,11 +11,12 @@ if (NOT SYCL_ROOT)
   message(FATAL_ERROR "No SYCL installation detected")
 endif(NOT SYCL_ROOT)
 
-set(SYCL_INCLUDE_DIR "${SYCL_ROOT}/lib/clang/11.0.0/include/")
+set(SYCL_INCLUDE_DIR "${SYCL_ROOT}/lib/clang/14.0.0/include/")
 set(SYCL_LIB "${SYCL_ROOT}/lib/libsycl.so")
-set(SYCL_FLAGS "-fsycl" 
-      "-fsycl-targets=nvptx64-nvidia-cuda-sycldevice" 
-      "-fsycl-unnamed-lambda")
+set(SYCL_FLAGS "-fsycl"
+      "-fsycl-targets=nvptx64-nvidia-cuda"
+      "-fsycl-unnamed-lambda"
+      "-Wno-linker-warnings")
 
 # Build the CUDA code
 add_executable(vector_addition vector_addition.cu)
@@ -28,3 +32,12 @@ target_link_libraries(sycl_vector_addition PUBLIC ${SYCL_FLAGS})
 target_include_directories(sycl_vector_addition PUBLIC ${SYCL_INCLUDE_DIR})
 target_link_libraries(sycl_vector_addition PUBLIC ${SYCL_LIB})
 
+
+# Build the SYCL (USM) code
+add_executable (sycl_vector_addition_usm vector_addition_usm.cpp)
+target_compile_features(sycl_vector_addition_usm PUBLIC cxx_std_17)
+target_compile_options(sycl_vector_addition_usm PUBLIC ${SYCL_FLAGS})
+target_link_libraries(sycl_vector_addition_usm PUBLIC ${SYCL_FLAGS})
+target_include_directories(sycl_vector_addition_usm PUBLIC ${SYCL_INCLUDE_DIR})
+target_link_libraries(sycl_vector_addition_usm PUBLIC ${SYCL_LIB})
+
@@ -0,0 +1,3 @@
+rm -rf build && mkdir build && cd build || exit 1  # stop if the fresh build dir is unusable
+cmake ../ -DSYCL_ROOT="${SYCL_ROOT_DIR:?SYCL_ROOT_DIR must point to the SYCL install}" -DCMAKE_CXX_COMPILER="${SYCL_ROOT_DIR}/bin/clang++" -DCMAKE_EXPORT_COMPILE_COMMANDS=yes || exit 1
+make -j 8
@@ -26,20 +26,16 @@
 
 class CUDASelector : public sycl::device_selector {
 public:
-  int operator()(const sycl::device &Device) const override {
-    using namespace sycl::info;
-
-    const std::string DriverVersion = Device.get_info<device::driver_version>();
-
-    if (Device.is_gpu() && (DriverVersion.find("CUDA") != std::string::npos)) {
+  int operator()(const sycl::device &device) const override {
+    if(device.get_platform().get_backend() == sycl::backend::cuda){
       std::cout << " CUDA device found " << std::endl;
       return 1;
-    };
-    return -1;
+    } else{
+      return -1;
+    }
   }
 };
 
-class vec_add;
 int main(int argc, char *argv[]) {
   constexpr const size_t N = 100000;
   const sycl::range VecSize{N};
@@ -71,8 +67,8 @@ int main(int argc, char *argv[]) {
     auto b = bufB.get_access<read_t>(h);
     auto c = bufC.get_access<write_t>(h);
 
-    h.parallel_for<vec_add>(VecSize,
-                            [=](sycl::id<1> i) { c[i] = a[i] + b[i]; });
+    h.parallel_for(VecSize,
+                   [=](sycl::id<1> i) { c[i] = a[i] + b[i]; });
   };
 
   myQueue.submit(cg);
 
@@ -4,6 +4,7 @@
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <iostream>
 
 // CUDA kernel. Each thread takes care of one element of c
 __global__ void vecAdd(double *a, double *b, double *c, int n) {
@@ -44,9 +45,8 @@ int main(int argc, char *argv[]) {
   cudaMalloc(&d_b, bytes);
   cudaMalloc(&d_c, bytes);
 
-  int i;
   // Initialize vectors on host
-  for (i = 0; i < n; i++) {
+  for (int i = 0; i < n; i++) {
     h_a[i] = sin(i) * sin(i);
     h_b[i] = cos(i) * cos(i);
   }
@@ -72,9 +72,9 @@ int main(int argc, char *argv[]) {
   // Sum up vector c and print the result; since sin^2 + cos^2 == 1 for every
   // element, this should equal n within error
   double sum = 0;
-  for (i = 0; i < n; i++)
+  for (int i = 0; i < n; i++)
     sum += h_c[i];
-  printf("final result: %f\n", sum / n);
+  std::cout << "Sum is : " << sum << std::endl;
 
   // Release device memory
   cudaFree(d_a);
 
@@ -0,0 +1,100 @@
+/**
+ * SYCL FOR CUDA : Vector Addition Example
+ *
+ * Copyright 2020 Codeplay Software Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *     Unless required by applicable law or agreed to in writing, software
+ *     distributed under the License is distributed on an "AS IS" BASIS,
+ *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *     See the License for the specific language governing permissions and
+ *     limitations under the License.
+ *
+ * @File: vector_addition_usm.cpp
+ */
+
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <iostream>
+#include <vector>
+
+#include <CL/sycl.hpp>
+
+// Selector that scores devices on a CUDA-backend platform 1 and all others -1.
+class CUDASelector : public sycl::device_selector {
+public:
+  int operator()(const sycl::device &device) const override {
+    const auto backend = device.get_platform().get_backend();
+    if (backend != sycl::backend::cuda)
+      return -1;
+    std::cout << " CUDA device found " << std::endl;
+    return 1;
+  }
+};
+
+// Adds two vectors on a CUDA device through SYCL USM device allocations and
+// prints the sum of the result; sin^2 + cos^2 == 1, so it should be close to n.
+int main(int argc, char *argv[]) {
+  constexpr size_t n = 100000;
+
+  // Create a sycl queue with our CUDASelector
+  sycl::queue myQueue{CUDASelector()};
+
+  // Size, in bytes, of each vector
+  const size_t bytes = n * sizeof(double);
+
+  // Allocate the host input vectors (h_a, h_b) and host output vector (h_c)
+  double *h_a = static_cast<double *>(std::malloc(bytes));
+  double *h_b = static_cast<double *>(std::malloc(bytes));
+  double *h_c = static_cast<double *>(std::malloc(bytes));
+
+  // Allocate the device input vectors (d_a, d_b) and output vector (d_c)
+  double *d_a = sycl::malloc_device<double>(n, myQueue);
+  double *d_b = sycl::malloc_device<double>(n, myQueue);
+  double *d_c = sycl::malloc_device<double>(n, myQueue);
+
+  // Initialize vectors on host
+  for (size_t i = 0; i < n; i++) {
+    const double x = static_cast<double>(i);
+    h_a[i] = std::sin(x) * std::sin(x);
+    h_b[i] = std::cos(x) * std::cos(x);
+  }
+
+  // Copy the inputs to the device, waiting for each copy to complete
+  myQueue.memcpy(d_a, h_a, bytes).wait();
+  myQueue.memcpy(d_b, h_b, bytes).wait();
+
+  // Command Group creation
+  auto cg = [&](sycl::handler &h) {
+    h.parallel_for(sycl::range(n),
+                   [=](sycl::id<1> i) {
+                     d_c[i] = d_a[i] + d_b[i];
+                   });
+  };
+
+  // Run the kernel defined above
+  myQueue.submit(cg).wait();
+
+  // Copy the result back to host
+  myQueue.memcpy(h_c, d_c, bytes).wait();
+
+  double sum = 0.0;
+  for (size_t i = 0; i < n; i++) {
+    sum += h_c[i];
+  }
+  std::cout << "Sum is : " << sum << std::endl;
+
+  // Release device and host memory
+  sycl::free(d_a, myQueue);
+  sycl::free(d_b, myQueue);
+  sycl::free(d_c, myQueue);
+  std::free(h_a);
+  std::free(h_b);
+  std::free(h_c);
+
+  return 0;
+}
@@ -1,4 +1,8 @@
 cmake_minimum_required(VERSION 3.17 FATAL_ERROR)
+
+# Don't complain about empty CMAKE_CUDA_ARCHITECTURES. CMP0104 only exists in
+# CMake >= 3.18 and setting an unknown policy is an error, so guard the call to
+# keep the declared minimum version (3.17) usable.
+if(POLICY CMP0104)
+  cmake_policy(SET CMP0104 OLD)
+endif()
+
 project(sycl_cuda_interop LANGUAGES CXX CUDA)
 
 find_package(CUDAToolkit)
@@ -8,19 +12,20 @@ if (NOT SYCL_ROOT)
   message(FATAL_ERROR "No SYCL installation detected")
 endif(NOT SYCL_ROOT)
 
-set(SYCL_INCLUDE_DIR "${SYCL_ROOT}/lib/clang/11.0.0/include/")
+set(SYCL_INCLUDE_DIR "${SYCL_ROOT}/lib/clang/14.0.0/include/")
 set(SYCL_LIB "${SYCL_ROOT}/lib/libsycl.so")
 set(SYCL_FLAGS "-fsycl" 
-      "-fsycl-targets=nvptx64-nvidia-cuda-sycldevice"
-      "-fsycl-unnamed-lambda")
+      "-fsycl-targets=nvptx64-nvidia-cuda"
+      "-fsycl-unnamed-lambda"
+      "-Wno-linker-warnings")
 
 
 # Build the CUDA code
-add_executable(sgemm_cuda sgemm.cu)
-target_compile_features(sgemm_cuda PUBLIC cxx_std_11)
-set_target_properties(sgemm_cuda PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-set_property(TARGET sgemm_cuda PROPERTY BUILD_RPATH "${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}")
-target_link_libraries(sgemm_cuda CUDA::toolkit CUDA::cublas)
+add_executable(cuda_sgemm sgemm.cu)
+target_compile_features(cuda_sgemm PUBLIC cxx_std_11)
+set_target_properties(cuda_sgemm PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+set_property(TARGET cuda_sgemm PROPERTY BUILD_RPATH "${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}")
+target_link_libraries(cuda_sgemm CUDA::toolkit CUDA::cublas)
 
 # Build the SYCL code
 add_executable (sycl_sgemm sycl_sgemm.cpp)
 
@@ -17,11 +17,14 @@ Building the example
 =====================
 
 ``` sh
-$ mkdir build && cd build
-$ cmake ../ -DSYCL_ROOT=${SYCL_ROOT_DIR} -DCMAKE_CXX_COMPILER=${SYCL_ROOT_DIR}/bin/clang++
-$ make -j 8
+$ bash build.sh
 ```
 
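+Alternatively, build only the SYCL version directly with `clang++` (adjust the `/usr/local/cuda` paths if CUDA is installed elsewhere):
+
+``` sh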
+${SYCL_ROOT_DIR}/bin/clang++ -DCUDA_NO_HALF -isystem /usr/local/cuda/include -fsycl -fsycl-targets=nvptx64-nvidia-cuda -fsycl-unnamed-lambda -std=gnu++17 -L/usr/local/cuda/lib64  -lcublas -lcudart -lcuda -o sycl_sgemm sycl_sgemm.cpp
+```
 Example
 =========
 
 
@@ -0,0 +1,3 @@
+rm -rf build && mkdir build && cd build || exit 1  # stop if the fresh build dir is unusable
+cmake ../ -DSYCL_ROOT="${SYCL_ROOT_DIR:?SYCL_ROOT_DIR must point to the SYCL install}" -DCMAKE_CXX_COMPILER="${SYCL_ROOT_DIR}/bin/clang++" || exit 1
+make -j 8  # bounded parallelism; a bare -j spawns unlimited jobs
@@ -33,16 +33,13 @@ void inline checkCudaErrorMsg(CUresult status, const char *msg) {
 
 class CUDASelector : public sycl::device_selector {
 public:
-  int operator()(const sycl::device &Device) const override {
-    using namespace sycl::info;
-
-    const std::string DriverVersion = Device.get_info<device::driver_version>();
-
-    if (Device.is_gpu() && (DriverVersion.find("CUDA") != std::string::npos)) {
+  int operator()(const sycl::device &device) const override {
+    if(device.get_platform().get_backend() == sycl::backend::cuda){
       std::cout << " CUDA device found " << std::endl;
       return 1;
-    };
-    return -1;
+    } else{
+      return -1;
+    }
   }
 };
 
@@ -83,7 +80,7 @@ int main() {
       auto d_B = b_B.get_access<sycl::access::mode::read>(h);
       auto d_C = b_C.get_access<sycl::access::mode::write>(h);
 
-      h.codeplay_host_task([=](sycl::interop_handle ih) {
+      h.host_task([=](sycl::interop_handle ih) {
         cuCtxSetCurrent(ih.get_native_context<backend::cuda>());
         cublasSetStream(handle, ih.get_native_queue<backend::cuda>());
         auto cuA = reinterpret_cast<float *>(ih.get_native_mem<backend::cuda>(d_A));
-Original file line number
+Diff line change
 *.exe
 *.out
 *.app
++
 +# Temporaries
 +*~
 +*#
 +*/build
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+rm -rf build && mkdir build && cd build`
	`2`	`+cmake ../ -DSYCL_ROOT=${SYCL_ROOT_DIR} -DCMAKE_CXX_COMPILER=${SYCL_ROOT_DIR}/bin/clang++ -DCMAKE_EXPORT_COMPILE_COMMANDS=yes`
	`3`	`+make -j 8`