refactor and const correctness

Zixin Zhang · Zixin Zhang · commit 5f90d77a8623 · 2021-09-20T15:08:21.000-04:00
diff --git a/src/main.cpp b/src/main.cpp
@@ -16,7 +16,7 @@
 #include "testing_helpers.hpp"
 
-// The tests default to an array of size 1 << 8 = 256
+// The tests default to an array of size 1 << 9 = 512
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 9; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
@@ -112,14 +112,14 @@ int main(int argc, char* argv[]) {
     printf("\n");
 
 #endif
+    
     zeroArray(SIZE, c);
     printDesc("naive scan, power-of-two");
     StreamCompaction::Naive::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
     printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
-
-#if 0
+    
     /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
     onesArray(SIZE, c);
     printDesc("1s array for finding bugs");
@@ -133,6 +133,9 @@ int main(int argc, char* argv[]) {
     //printArray(SIZE, c, true);
     printCmpResult(NPOT, b, c);
 
+#if 0
+
+
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, power-of-two");
     StreamCompaction::Efficient::scan(SIZE, c, a);
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
@@ -17,13 +17,10 @@ namespace StreamCompaction {
             static PerformanceTimer timer;
             return timer;
         }
- 
-        __global__ void kernNaiveGPUScanFirstStep(int* inputArray, int* outputArray, 
-            int* SumArray, int inputSize)
+
+        __device__ void computeScanToOutputArray(const int* inputArray, int* outputArray,
+            int* XY, int inputSize)
         {
-            // Each thread loads one value from the input array into shared 
-            // memory array XY
-            __shared__ int XY[sectionSize];
             int i = blockIdx.x * blockDim.x + threadIdx.x;
             // convert inclusive scan into exclusive scan by shifting 
             // all elements to the right by one position and fill the frist 
@@ -39,7 +36,7 @@ namespace StreamCompaction {
             for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
             {
                 // make sure that input is in place
-                __syncthreads(); 
+                __syncthreads();
                 int index = threadIdx.x;
                 int previousIndex = index - stride;
                 if (previousIndex < 0)
@@ -54,6 +51,15 @@ namespace StreamCompaction {
 
             // each thread writes its result into the output array
             outputArray[i] = XY[threadIdx.x];
+        }
+        
+        __global__ void kernNaiveGPUScanFirstStep(const int* inputArray, 
+            int* outputArray, int* SumArray, int inputSize)
+        {
+            // Each thread loads one value from the input array into shared 
+            // memory array XY
+            __shared__ int XY[sectionSize];
+            computeScanToOutputArray(inputArray, outputArray, XY, inputSize);
 
             // the last thread in the block should write the output value of 
             // the last XY element in the block to the blockIdx.x position of 
@@ -67,46 +73,17 @@ namespace StreamCompaction {
             }
         }
 
-        __global__ void kernNaiveGPUScanSecondStep(int* inputArray, int* outputArray,
-            int inputSize)
+        __global__ void kernNaiveGPUScanSecondStep(const int* inputArray, 
+            int* outputArray, int inputSize)
         {
             // Each thread loads one value from the input array into shared 
             // memory array XY
             __shared__ int XY[sectionSize];
-            int i = blockIdx.x * blockDim.x + threadIdx.x;
-            // convert inclusive scan into exclusive scan by shifting 
-            // all elements to the right by one position and fill the frist 
-            // element and out-of-bound elements with 0. 
-            if (i < inputSize && threadIdx.x != 0)
-            {
-                XY[threadIdx.x] = inputArray[i - 1];
-            }
-            else {
-                XY[threadIdx.x] = 0;
-            }
-            // perform naive scan
-            for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
-            {
-                // make sure that input is in place
-                __syncthreads();
-                int index = threadIdx.x;
-                int previousIndex = index - stride;
-                if (previousIndex < 0)
-                {
-                    previousIndex = 0;
-                }
-                int temp = XY[index] + XY[previousIndex];
-                // make sure previous output has been consumed
-                __syncthreads();
-                XY[index] = temp;
-            }
-
-            // each thread writes its result into the output array
-            outputArray[i] = XY[threadIdx.x];
+            computeScanToOutputArray(inputArray, outputArray, XY, inputSize);
         }
 
-        __global__ void kernNaiveGPUScanThirdStep(int* inputArray, int* outputArray,
-            int inputSize)
+        __global__ void kernNaiveGPUScanThirdStep(const int* inputArray, 
+            int* outputArray, int inputSize)
         {
             int i = blockIdx.x * blockDim.x + threadIdx.x;
             if (i < inputSize && blockIdx.x > 0)
@@ -141,16 +118,24 @@ namespace StreamCompaction {
             cudaMemcpy(d_InputData, idata, size, cudaMemcpyHostToDevice);
             cudaMemcpy(d_OutputData, odata, size, cudaMemcpyHostToDevice);
 
-            dim3 dimGrid((n + blockSize - 1) / blockSize, 1, 1);
-            dim3 dimBlock(blockSize, 1, 1);
+            dim3 dimGridArray((n + blockSize - 1) / blockSize, 1, 1);
+            dim3 dimBlockArray(blockSize, 1, 1);
+
+            dim3 dimGridSumArray((sumArrayNumEle + blockSize - 1) / blockSize, 1, 1);
+            dim3 dimBlockSumArray(blockSize, 1, 1);
+
 
             timer().startGpuTimer();
             // First step: compute the scan result for individual sections
             // then, store their block sum to sumArray
-            kernNaiveGPUScanFirstStep <<<dimGrid, dimBlock>>> (d_InputData,
+            kernNaiveGPUScanFirstStep <<<dimGridArray, dimBlockArray >>> (d_InputData,
                 d_OutputData, d_SumArray, n);
             checkCUDAError("kernNaiveGPUScanFirstStep failed!");
+
+          
 #if 0
+            kernNaiveGPUScanSecondStep << <dimGridSumArray, dimBlockSumArray >> > (
+                sumArray, sumArray, sumArrayNumEle);
             // cudaDeviceSynchronize();
 
             kernNaiveGPUScanFirstStep << <dimGrid, dimBlock >> > (d_InputData,
@@ -170,6 +155,7 @@ namespace StreamCompaction {
             cudaMemcpy(odata, d_OutputData, size, cudaMemcpyDeviceToHost);
             checkCUDAError("memCpy back failed!");
 
+#if 1
             // testing: 
             cudaMemcpy(sumArray, d_SumArray, sumArraySize, cudaMemcpyDeviceToHost);
             checkCUDAError("memCpy back failed!");
@@ -182,7 +168,7 @@ namespace StreamCompaction {
             {
                 std::cout << odata[i] << '\n';
             }
-            
+#endif
 
             // cleanup
             cudaFree(d_InputData);