Address review feedback: fix README link and commands, use C++ casts and iostream, rename variables

Ruyman Reyes · Ruyman Reyes · commit 090436764a75 · 2020-06-03T14:44:16.000Z
diff --git a/example-03/README.md b/example-03/README.md
@@ -14,7 +14,7 @@ Pre-requisites
 ---------------
 
 You would need an installation of DPC++ with CUDA support, 
-see [Getting Started Guide](https://github.com/intel/llvm/doc/GetStartedWithSYCLCompiler.md) 
+see [Getting Started Guide](https://github.com/intel/llvm/blob/sycl/sycl/doc/GetStartedGuide.md#build-dpc-toolchain-with-support-for-nvidia-cuda)
 for details on how to build it.
 
 The example is built using Makefiles, since there is no support yet on
@@ -24,7 +24,7 @@ Building the example
 ---------------------
 
 ```sh
-$ SYCL_ROOT=/path/to/dpcpp  make  
+$ SYCL_ROOT=/path/to/dpcpp make
 ```
 
 This compiles the SYCL code with the LLVM CUDA support, and generates
@@ -42,7 +42,7 @@ The path to `libsycl.so` and the PI plugins must be in `LD_LIBRARY_PATH`.
 A simple way of running the example is as follows:
 
 ```
-$ LD_LIBRARY_PATH=/path/to/dpcpp/lib  ./vec_add.exe
+$ LD_LIBRARY_PATH=/path/to/dpcpp/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} ./vec_add.exe
 ```
 
 
diff --git a/example-03/vec_add.cu b/example-03/vec_add.cu
@@ -16,7 +16,7 @@ public:
     const std::string DriverVersion = Device.get_info<device::driver_version>();
 
     if (Device.is_gpu() && (DriverVersion.find("CUDA") != std::string::npos)) {
-      std::cout << " CUDA device found " << std::endl;
+      std::cout << " CUDA device found \n";
       return 1;
     };
     return -1;
@@ -41,23 +41,23 @@ int main(int argc, char *argv[]) {
   // Create a SYCL context for interoperability with CUDA Runtime API
   // This is temporary until the property extension is implemented
   const bool UsePrimaryContext = true;
-  sycl::device dev{CUDASelector().select_device()};
-  sycl::context myContext{dev, {}, UsePrimaryContext};
-  sycl::queue myQueue{myContext, dev};
+  device dev{CUDASelector().select_device()};
+  context myContext{dev, {}, UsePrimaryContext};
+  queue myQueue{myContext, dev};
 
   {
     buffer<double> bA{range<1>(n)};
     buffer<double> bB{range<1>(n)};
     buffer<double> bC{range<1>(n)};
 
     {
-      auto h_a = bA.get_access<access::mode::write>();
-      auto h_b = bB.get_access<access::mode::write>();
+      auto hA = bA.get_access<access::mode::write>();
+      auto hB = bB.get_access<access::mode::write>();
 
       // Initialize vectors on host
       for (int i = 0; i < n; i++) {
-        h_a[i] = sin(i) * sin(i);
-        h_b[i] = cos(i) * cos(i);
+        hA[i] = sin(i) * sin(i);
+        hB[i] = cos(i) * cos(i);
       }
     }
 
@@ -68,28 +68,29 @@ int main(int argc, char *argv[]) {
       auto accC = bC.get_access<access::mode::write>(h);
 
       h.interop_task([=](interop_handler ih) {
-        auto d_a = reinterpret_cast<double*>(ih.get_mem<backend::cuda>(accA));
-        auto d_b = reinterpret_cast<double*>(ih.get_mem<backend::cuda>(accB));
-        auto d_c = reinterpret_cast<double*>(ih.get_mem<backend::cuda>(accC));
+        auto dA = reinterpret_cast<double*>(ih.get_mem<backend::cuda>(accA));
+        auto dB = reinterpret_cast<double*>(ih.get_mem<backend::cuda>(accB));
+        auto dC = reinterpret_cast<double*>(ih.get_mem<backend::cuda>(accC));
 
         int blockSize, gridSize;
         // Number of threads in each thread block
         blockSize = 1024;
         // Number of thread blocks in grid
-        gridSize = (int)ceil((float)n / blockSize);
+        gridSize = static_cast<int>(ceil(static_cast<float>(n) / blockSize));
         // Call the CUDA kernel directly from SYCL
-        vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
+        vecAdd<<<gridSize, blockSize>>>(dA, dB, dC, n);
       });
     });
 
     {
-     auto h_c = bC.get_access<access::mode::read>();
+     auto hC = bC.get_access<access::mode::read>();
      // Sum up vector c and print result divided by n, this should equal 1 within
      // error
      double sum = 0;
-      for (int i = 0; i < n; i++)
-        sum += h_c[i];
-      printf("final result: %f\n", sum / n);
+     for (int i = 0; i < n; i++) {
+        sum += hC[i];
+     }
+      std::cout << "Final result " << sum / n << std::endl;
     }
   }
 
diff --git a/example-03/vec_add_usm.cu b/example-03/vec_add_usm.cu
@@ -12,7 +12,7 @@ public:
     const std::string DriverVersion = Device.get_info<device::driver_version>();
 
     if (Device.is_gpu() && (DriverVersion.find("CUDA") != std::string::npos)) {
-      std::cout << " CUDA device found " << std::endl;
+      std::cout << " CUDA device found \n";
       return 1;
     };
     return -1;
@@ -41,33 +41,31 @@ int main(int argc, char *argv[]) {
   // Create a SYCL context for interoperability with CUDA Runtime API
   // This is temporary until the property extension is implemented
   const bool UsePrimaryContext = true;
-  sycl::device dev{CUDASelector().select_device()};
-  sycl::context myContext{dev, {}, UsePrimaryContext};
-  sycl::queue myQueue{myContext, dev};
+  device dev{CUDASelector().select_device()};
+  context myContext{dev, {}, UsePrimaryContext};
+  queue myQueue{myContext, dev};
 
   // Allocate memory for each vector on host
-  double* d_a = (double *)malloc_shared(bytes, myQueue);
-  double* d_b = (double *)malloc_shared(bytes, myQueue);
-  double* d_c = (double *)malloc_shared(bytes, myQueue);
+  auto d_A = reinterpret_cast<double*>(malloc_shared(bytes, myQueue));
+  auto d_B = reinterpret_cast<double*>(malloc_shared(bytes, myQueue));
+  auto d_C = reinterpret_cast<double*>(malloc_shared(bytes, myQueue));
 
   // Initialize vectors on host
   for (int i = 0; i < n; i++) {
-    d_a[i] = sin(i) * sin(i);
-    d_b[i] = cos(i) * cos(i);
+    d_A[i] = sin(i) * sin(i);
+    d_B[i] = cos(i) * cos(i);
   }
 
   myQueue.submit([&](handler& h) {
       h.interop_task([=](interop_handler ih) {
-        int blockSize, gridSize;
-
         // Number of threads in each thread block
-        blockSize = 1024;
+        int blockSize = 1024;
 
         // Number of thread blocks in grid
-        gridSize = (int)ceil((float)n / blockSize);
+        int gridSize = static_cast<int>(ceil(static_cast<float>(n) / blockSize));
 
         // Execute the kernel
-        vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
+        vecAdd<<<gridSize, blockSize>>>(d_A, d_B, d_C, n);
         });
   });
 
@@ -76,13 +74,14 @@ int main(int argc, char *argv[]) {
   // Sum up vector c and print result divided by n, this should equal 1 within
   // error
   double sum = 0;
-  for (int i = 0; i < n; i++)
-    sum += d_c[i];
-  printf("final result: %f\n", sum / n);
+  for (int i = 0; i < n; i++) {
+    sum += d_C[i];
+  }
+  std::cout << "Final result " << sum / n << std::endl;
 
-  sycl::free(d_a, myContext);
-  sycl::free(d_b, myContext);
-  sycl::free(d_c, myContext);
+  free(d_A, myContext);
+  free(d_B, myContext);
+  free(d_C, myContext);
 
   return 0;
 }