Commit 152f9d2

Author: Zixin Zhang (committed)
Commit message: Bloopers 1
Parent: dbeb241

2 files changed: 118 additions & 4 deletions


src/main.cpp

Lines changed: 47 additions & 2 deletions
@@ -7,6 +7,8 @@
 */
 
 #include <cstdio>
+#include <sstream>
+#include <fstream>
 #include <stream_compaction/cpu.h>
 #include <stream_compaction/naive.h>
 #include <stream_compaction/efficient.h>
@@ -24,7 +26,49 @@ int* bookArraya = new int[8]{ 3, 1, 7, 0 ,4 ,1 ,6, 3 };
 int* bookArrayb = new int[8]{};
 const int BOOK_SIZE = 8;
 
+std::string deviceName;
+int deviceMaxThreadsPerBlock;
+int deviceSharedMemPerBlock;
+int deviceMaxThreadsPerSM;
+int deviceMaxBlocksPerSM;
+
 int main(int argc, char* argv[]) {
+    cudaDeviceProp deviceProp;
+    int gpuDevice = 0;
+    int device_count = 0;
+    cudaGetDeviceCount(&device_count);
+    if (gpuDevice > device_count) {
+        std::cout
+            << "Error: GPU device number is greater than the number of devices!"
+            << " Perhaps a CUDA-capable GPU is not installed?"
+            << std::endl;
+        return false;
+    }
+    cudaGetDeviceProperties(&deviceProp, gpuDevice);
+    int major = deviceProp.major;
+    int minor = deviceProp.minor;
+    deviceMaxThreadsPerBlock = deviceProp.maxThreadsPerBlock;
+    deviceSharedMemPerBlock = deviceProp.sharedMemPerBlock;
+    deviceMaxThreadsPerSM = deviceProp.maxThreadsPerMultiProcessor;
+    deviceMaxBlocksPerSM = deviceProp.maxBlocksPerMultiProcessor;
+
+
+
+    std::ostringstream ss;
+    ss << " [SM " << major << "." << minor << " " << deviceProp.name << "]"
+        << "\n Max threads per block: " << deviceMaxThreadsPerBlock
+        << "\n Shared memory per block: " << deviceSharedMemPerBlock << " bytes"
+        // << "\n Shared memory in each block can fit " << deviceSharedMemPerBlock / sizeof(int) << " number of integers"
+        << "\n Max threads per SM: " << deviceMaxThreadsPerSM
+        << "\n Max blocks per SM: " << deviceMaxBlocksPerSM
+        << "\n Max grid size: " << deviceProp.maxGridSize[0] << ", "
+        << deviceProp.maxGridSize[1] << ", " << deviceProp.maxGridSize[2];
+
+
+    deviceName = ss.str();
+
+    std::cout << deviceName << '\n';
+
     // Scan tests
 
     printf("\n");
@@ -66,14 +110,15 @@ int main(int argc, char* argv[]) {
 
     printf("\n");
 
-#if 0
+
     zeroArray(SIZE, c);
     printDesc("naive scan, power-of-two");
     StreamCompaction::Naive::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
+#if 0
     /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
     onesArray(SIZE, c);
     printDesc("1s array for finding bugs");

stream_compaction/naive.cu

Lines changed: 71 additions & 2 deletions
@@ -3,6 +3,10 @@
 #include "common.h"
 #include "naive.h"
 
+/*! Block size used for CUDA kernel launch. */
+#define blockSize 128
+#define sectionSize 128
+
 namespace StreamCompaction {
     namespace Naive {
         using StreamCompaction::Common::PerformanceTimer;
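
Note: blockSize and sectionSize are both 128 here, and the kernel added in the next hunk stores one element per thread in a shared array of sectionSize entries, so the launch block size must never exceed sectionSize. A compile-time guard such as the following (an illustration, not part of the commit) would catch a mismatch if either constant changes:

// Illustration only: kernNaiveGPUScan indexes __shared__ int XY[sectionSize]
// with threadIdx.x, which ranges up to blockSize - 1.
static_assert(blockSize <= sectionSize,
    "blockSize must not exceed sectionSize for kernNaiveGPUScan");
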
@@ -11,15 +15,80 @@ namespace StreamCompaction {
             static PerformanceTimer timer;
             return timer;
         }
-        // TODO: __global__
+
+        __global__ void kernNaiveGPUScan(int* inputArray, int* outputArray,
+            int inputSize)
+        {
+            // Each thread loads one value from the input array into shared
+            // memory array XY
+            __shared__ int XY[sectionSize];
+            int i = blockIdx.x * blockDim.x + threadIdx.x;
+            // convert inclusive scan into exclusive scan by shifting
+            // all elements to the right by one position and fill the first
+            // element and out-of-bound elements with 0.
+            if (i < inputSize && threadIdx.x != 0)
+            {
+                XY[threadIdx.x] = inputArray[i - 1];
+            }
+            else {
+                XY[threadIdx.x] = 0;
+            }
+            // perform naive scan
+            for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
+            {
+                // make sure that input is in place
+                __syncthreads();
+                int index = threadIdx.x;
+                int previousIndex = index - stride;
+#if 0
+                if (previousIndex < 0)
+                {
+                    previousIndex = 0;
+                }
+#endif
+                int temp = XY[index] + XY[previousIndex];
+                // make sure previous output has been consumed
+                __syncthreads();
+                XY[index] = temp;
+            }
+
+            // each thread writes its result into the output array
+            outputArray[i] = XY[threadIdx.x];
+        }
 
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
         */
         void scan(int n, int *odata, const int *idata) {
+            int size = n * sizeof(int);
+            int* d_InputData;
+            int* d_OutputData;
+
+            cudaMalloc((void**)&d_InputData, size);
+            checkCUDAError("cudaMalloc d_InputData failed!");
+
+            cudaMalloc((void**)&d_OutputData, size);
+            checkCUDAError("cudaMalloc d_OutputData failed!");
+
+            cudaMemcpy(d_InputData, idata, size, cudaMemcpyHostToDevice);
+            cudaMemcpy(d_OutputData, odata, size, cudaMemcpyHostToDevice);
+
+            dim3 dimGrid((n + blockSize - 1) / blockSize, 1, 1);
+            dim3 dimBlock(blockSize, 1, 1);
+
             timer().startGpuTimer();
-            // TODO
+            kernNaiveGPUScan <<<dimGrid, dimBlock>>> (d_InputData,
+                d_OutputData, n);
+            checkCUDAError("kernNaiveGPUScan failed!");
             timer().endGpuTimer();
+
+            cudaMemcpy(odata, d_OutputData, size, cudaMemcpyDeviceToHost);
+            checkCUDAError("memCpy back failed!");
+
+            // cleanup
+            cudaFree(d_InputData);
+            cudaFree(d_OutputData);
+            checkCUDAError("cudaFree failed!");
         }
     }
 }
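
Note: in the committed kernel the clamping branch for previousIndex is compiled out by #if 0, so for threads with threadIdx.x < stride the line XY[index] + XY[previousIndex] reads the shared array at a negative offset, and the final write outputArray[i] = XY[threadIdx.x] is not guarded by i < inputSize. A common bounds-checked formulation of the same naive (Kogge-Stone) per-block scan step looks like the sketch below; the kernel name is hypothetical, and sectionSize is assumed to be the macro defined above.

// Sketch of a bounds-checked variant of the same naive per-block scan;
// not the commit's code, just one standard way to write the step.
__global__ void kernNaiveScanGuarded(const int* inputArray, int* outputArray,
                                     int inputSize) {
    __shared__ int XY[sectionSize];
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Shift right by one so the inclusive scan below yields an exclusive scan;
    // thread 0 and out-of-range threads load the identity value 0.
    XY[threadIdx.x] = (i < inputSize && threadIdx.x != 0) ? inputArray[i - 1] : 0;

    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) {
        __syncthreads();                       // inputs for this step are in place
        int temp = XY[threadIdx.x];
        if (threadIdx.x >= stride) {
            temp += XY[threadIdx.x - stride];  // only read a partner that exists
        }
        __syncthreads();                       // all reads done before any write
        XY[threadIdx.x] = temp;
    }

    if (i < inputSize) {                       // never write past the output array
        outputArray[i] = XY[threadIdx.x];
    }
}

Either way, this kernel only produces a correct exclusive scan within a single block of blockSize elements; arrays larger than one block need a second pass that scans the per-block sums and adds them back into each block's results.
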
