 #include "common.h"
 #include "naive.h"

+#include <iostream> // testing
+
 /*! Block size used for CUDA kernel launch. */
 #define blockSize 128
+/*! Elements per scanned section; each thread stages exactly one
+    element into shared memory, so this must equal blockSize. */
 #define sectionSize 128
@@ -16,8 +18,8 @@ namespace StreamCompaction {
             return timer;
         }

-        __global__ void kernNaiveGPUScan(int *inputArray, int *outputArray,
-            int inputSize)
+        __global__ void kernNaiveGPUScanFirstStep(int *inputArray, int *outputArray,
+            int *SumArray, int inputSize)
         {
             // Each thread loads one value from the input array into shared
             // memory array XY
@@ -52,6 +54,65 @@ namespace StreamCompaction {

             // each thread writes its result into the output array
-            outputArray[i] = XY[threadIdx.x];
+            if (i < inputSize) outputArray[i] = XY[threadIdx.x];
+
+            // the last thread in the block writes this block's total to
+            // the blockIdx.x position of SumArray
+
+            // make sure XY[sectionSize - 1] has the correct partial sum
+            __syncthreads();
+            if (threadIdx.x == blockDim.x - 1)
+            {
+                // XY holds an exclusive scan, so the block total is the last
+                // prefix plus this thread's own input element (when in range)
+                SumArray[blockIdx.x] = XY[sectionSize - 1]
+                    + ((i < inputSize) ? inputArray[i] : 0);
+            }
+        }
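+
+        // Worked example (hypothetical numbers): with sectionSize = 4 and
+        // input {3, 1, 7, 0, 4, 1, 6, 3}, block 0 writes the exclusive
+        // prefix sums {0, 3, 4, 11} and block 1 writes {0, 4, 5, 11};
+        // SumArray receives the block totals {11, 14}.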
+
+        __global__ void kernNaiveGPUScanSecondStep(int *inputArray, int *outputArray,
+            int inputSize)
+        {
+            // Each thread loads one value from the input array into shared
+            // memory array XY
+            __shared__ int XY[sectionSize];
+            int i = blockIdx.x * blockDim.x + threadIdx.x;
+            // convert the inclusive scan into an exclusive scan by shifting
+            // all elements to the right by one position, filling the first
+            // element and out-of-bounds elements with 0
+            if (i < inputSize && threadIdx.x != 0)
+            {
+                XY[threadIdx.x] = inputArray[i - 1];
+            }
+            else {
+                XY[threadIdx.x] = 0;
+            }
+            // perform naive scan
+            for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
+            {
+                // make sure that input is in place
+                __syncthreads();
+                int index = threadIdx.x;
+                int previousIndex = index - stride;
+                // clamping to 0 is safe because the exclusive shift keeps
+                // XY[0] at 0, so out-of-range threads just add 0
+                if (previousIndex < 0)
+                {
+                    previousIndex = 0;
+                }
+                int temp = XY[index] + XY[previousIndex];
+                // make sure previous output has been consumed
+                __syncthreads();
+                XY[index] = temp;
+            }
+
+            // each thread writes its result into the output array; guard
+            // the final partial block against an out-of-bounds write
+            if (i < inputSize)
+            {
+                outputArray[i] = XY[threadIdx.x];
+            }
+        }
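+
+        // Continuing the worked example: launched as a single block over
+        // SumArray = {11, 14}, this yields the exclusive block offsets
+        // {0, 11}. One block suffices only while the number of sections
+        // stays within sectionSize; larger inputs need another level.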
+
+        __global__ void kernNaiveGPUScanThirdStep(int *inputArray, int *outputArray,
+            int inputSize)
+        {
+            int i = blockIdx.x * blockDim.x + threadIdx.x;
+            // inputArray holds the exclusive scan of the block sums, so
+            // entry blockIdx.x is the total of all preceding sections
+            // (entry 0 holds 0, which lets block 0 skip the add entirely)
+            if (i < inputSize && blockIdx.x > 0)
+            {
+                outputArray[i] += inputArray[blockIdx.x];
+            }
         }
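+
+        // Completing the worked example: block 1 adds its offset of 11 to
+        // {0, 4, 5, 11}, producing the full exclusive scan
+        // {0, 3, 4, 11, 11, 15, 16, 22} of the original eight inputs.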

         /**
@@ -61,32 +122,75 @@ namespace StreamCompaction {
             int size = n * sizeof(int);
             int *d_InputData;
             int *d_OutputData;
+            // one partial sum per launched block
+            int sumArrayNumEle = (n + blockSize - 1) / blockSize;
+            int sumArraySize = sumArrayNumEle * sizeof(int);
+            int *d_SumArray;
+
+            // for testing
+            int *sumArray = new int[sumArrayNumEle];

             cudaMalloc((void**)&d_InputData, size);
             checkCUDAError("cudaMalloc d_InputData failed!");

             cudaMalloc((void**)&d_OutputData, size);
             checkCUDAError("cudaMalloc d_OutputData failed!");

+            cudaMalloc((void**)&d_SumArray, sumArraySize);
+            checkCUDAError("cudaMalloc d_SumArray failed!");
+
             cudaMemcpy(d_InputData, idata, size, cudaMemcpyHostToDevice);
             cudaMemcpy(d_OutputData, odata, size, cudaMemcpyHostToDevice);

             dim3 dimGrid((n + blockSize - 1) / blockSize, 1, 1);
             dim3 dimBlock(blockSize, 1, 1);
             timer().startGpuTimer();
-            kernNaiveGPUScan<<<dimGrid, dimBlock>>>(d_InputData,
-                d_OutputData, n);
-            checkCUDAError("kernNaiveGPUScan failed!");
+            // First step: compute the scan result for individual sections,
+            // then store their block sums to SumArray
+            kernNaiveGPUScanFirstStep<<<dimGrid, dimBlock>>>(d_InputData,
+                d_OutputData, d_SumArray, n);
+            checkCUDAError("kernNaiveGPUScanFirstStep failed!");
+#if 0
+            // extra launches of the first step, currently disabled
+            // cudaDeviceSynchronize();
+
+            kernNaiveGPUScanFirstStep<<<dimGrid, dimBlock>>>(d_InputData,
+                d_OutputData, d_SumArray, n);
+            checkCUDAError("kernNaiveGPUScanFirstStep failed!");
+
+            // cudaDeviceSynchronize();
+
+            kernNaiveGPUScanFirstStep<<<dimGrid, dimBlock>>>(d_InputData,
+                d_OutputData, d_SumArray, n);
+            checkCUDAError("kernNaiveGPUScanFirstStep failed!");
+
+            // cudaDeviceSynchronize();
+#endif
             timer().endGpuTimer();

             cudaMemcpy(odata, d_OutputData, size, cudaMemcpyDeviceToHost);
             checkCUDAError("memCpy back failed!");

+            // testing: dump the block sums and the scanned output
+            cudaMemcpy(sumArray, d_SumArray, sumArraySize, cudaMemcpyDeviceToHost);
+            checkCUDAError("memCpy back failed!");
+            for (int i = 0; i < sumArrayNumEle; i++)
+            {
+                std::cout << sumArray[i] << '\n';
+            }
+            printf("\n");
+            for (int i = 0; i < n; i++)
+            {
+                std::cout << odata[i] << '\n';
+            }
+
             // cleanup
             cudaFree(d_InputData);
             cudaFree(d_OutputData);
+            cudaFree(d_SumArray);
             checkCUDAError("cudaFree failed!");
+
+            // testing cleanup
+            delete[] sumArray;
         }
     }
 }
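
The timed region above still launches only the first kernel; the second and
third steps are not wired into scan() yet. A minimal host-side sketch of the
full pipeline, reusing the buffers and launch dimensions from scan() and
assuming a hypothetical d_ScannedSums buffer for the scanned block sums
(plus sumArrayNumEle <= sectionSize, so a single block can scan them):

    int *d_ScannedSums; // assumed extra buffer, not part of this commit
    cudaMalloc((void**)&d_ScannedSums, sumArraySize);

    // Step 1: exclusive scan within each section; block totals -> d_SumArray
    kernNaiveGPUScanFirstStep<<<dimGrid, dimBlock>>>(d_InputData,
        d_OutputData, d_SumArray, n);

    // Step 2: exclusive scan of the block totals in one block
    kernNaiveGPUScanSecondStep<<<1, dimBlock>>>(d_SumArray,
        d_ScannedSums, sumArrayNumEle);

    // Step 3: add each section's preceding total to all of its elements
    kernNaiveGPUScanThirdStep<<<dimGrid, dimBlock>>>(d_ScannedSums,
        d_OutputData, n);
    checkCUDAError("naive scan pipeline failed!");

    cudaFree(d_ScannedSums);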