93 changes: 87 additions & 6 deletions README.md
@@ -3,12 +3,93 @@ CUDA Stream Compaction

**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**

* (TODO) YOUR NAME HERE
* (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
* Guanlin Huang
* [LinkedIn](https://www.linkedin.com/in/guanlin-huang-4406668502/), [personal website](virulentkid.github.io/personal_web/index.html)
* Tested on: Windows 11, i9-10900K @ 4.9GHz 32GB, RTX3080 10GB; Compute Capability: 8.6

### (TODO: Your README)
## Project Description

Include analysis, etc. (Remember, this is public, so don't put
anything here that you don't want to share with the world.)
Stream compaction is an algorithm that produces a smaller output array containing only the desired elements of an input array;
it comes in handy in ray/path tracing in computer graphics and in compressing huge sparse matrices in AI.
The performance of this element selection is critical because of the enormous quantity of data that must be filtered.
Modern graphics processing units (GPUs) are increasingly used to accelerate such large, highly parallel data-processing workloads.
In this project, stream compaction is implemented in conjunction with a prefix-sum algorithm, also known as "scan."
For example, an exclusive scan of `[3 1 7 0 4]` is `[0 3 4 11 11]`: each output element is the sum of all inputs before it.
Here is a visual representation of the scan algorithm:

!["Scan Algorithm"](img/figure-39-2.jpg)

The following performance analysis of scan and stream compaction shows the benefit of using the GPU compared to the CPU alone.


## Performance Analysis

!["Scan"](img/scan.png)
!["Stream Compaction"](img/comp.png)

As the charts show, at large array sizes both the naive and the work-efficient GPU implementations outperform the CPU implementation.

### Performance Bottleneck
* The GPU-based implementations are constrained by global-memory read performance, since they do not take advantage of shared memory (see the sketch below); this is why they show little benefit at small array sizes.
However, thanks to parallel processing, the GPU implementations see a much less drastic rise in runtime than the CPU one as the array grows.
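
For reference, below is a minimal sketch of what a shared-memory scan could look like: a single block performs a Hillis-Steele inclusive scan entirely in shared memory, touching global memory only once on load and once on store. This is not the code used in this project; the kernel name, the single-block restriction, and the launch configuration are assumptions for illustration.

```
// Illustrative only: one block scans up to blockDim.x elements in shared
// memory (Hillis-Steele, inclusive). This project's kernels instead read and
// write global memory on every pass, which is the bottleneck noted above.
__global__ void kernSharedMemScanSketch(int n, int *odata, const int *idata) {
    extern __shared__ int temp[];
    int tid = threadIdx.x;

    // One global-memory read per element.
    temp[tid] = (tid < n) ? idata[tid] : 0;
    __syncthreads();

    // log2(blockDim.x) passes; all intermediate traffic stays in shared memory.
    for (int offset = 1; offset < blockDim.x; offset <<= 1) {
        int val = (tid >= offset) ? temp[tid - offset] : 0;
        __syncthreads();
        temp[tid] += val;
        __syncthreads();
    }

    // One global-memory write per element.
    if (tid < n) {
        odata[tid] = temp[tid];
    }
}

// Possible launch, assuming n <= 1024:
// kernSharedMemScanSketch<<<1, 1024, 1024 * sizeof(int)>>>(n, dev_out, dev_in);
```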

### Test output

```
****************
** SCAN TESTS **
****************
[ 0 7 11 5 35 28 17 42 37 26 38 7 26 ... 25 0 ]
==== cpu scan, power-of-two ====
elapsed time: 33.3664ms (std::chrono Measured)
[ 0 0 7 18 23 58 86 103 145 182 208 246 253 ... 821757286 821757311 ]
==== cpu scan, non-power-of-two ====
elapsed time: 25.4672ms (std::chrono Measured)
[ 0 0 7 18 23 58 86 103 145 182 208 246 253 ... 821757208 821757214 ]
passed
==== naive scan, power-of-two ====
elapsed time: 11.3899ms (CUDA Measured)
[ 0 0 7 18 23 58 86 103 145 182 208 246 253 ... 821757286 821757311 ]
passed
==== naive scan, non-power-of-two ====
elapsed time: 12.0769ms (CUDA Measured)
[ 0 0 7 18 23 58 86 103 145 182 208 246 253 ... 0 0 ]
passed
==== work-efficient scan, power-of-two ====
elapsed time: 11.4359ms (CUDA Measured)
[ 0 0 7 18 23 58 86 103 145 182 208 246 253 ... 821757286 821757311 ]
passed
==== work-efficient scan, non-power-of-two ====
elapsed time: 10.0679ms (CUDA Measured)
[ 0 0 7 18 23 58 86 103 145 182 208 246 253 ... 821757208 821757214 ]
passed
==== thrust scan, power-of-two ====
elapsed time: 1.05677ms (CUDA Measured)
passed
==== thrust scan, non-power-of-two ====
elapsed time: 0.978368ms (CUDA Measured)
passed

*****************************
** STREAM COMPACTION TESTS **
*****************************
[ 2 1 3 1 0 3 2 1 2 2 2 3 3 ... 0 0 ]
==== cpu compact without scan, power-of-two ====
elapsed time: 59.5784ms (std::chrono Measured)
[ 2 1 3 1 3 2 1 2 2 2 3 3 3 ... 2 2 ]
passed
==== cpu compact without scan, non-power-of-two ====
elapsed time: 60.795ms (std::chrono Measured)
[ 2 1 3 1 3 2 1 2 2 2 3 3 3 ... 3 2 ]
passed
==== cpu compact with scan ====
elapsed time: 129.865ms (std::chrono Measured)
[ 2 1 3 1 3 2 1 2 2 2 3 3 3 ... 2 2 ]
passed
==== work-efficient compact, power-of-two ====
elapsed time: 16.1052ms (CUDA Measured)
[ 2 1 3 1 3 2 1 2 2 2 3 3 3 ... 2 2 ]
passed
==== work-efficient compact, non-power-of-two ====
elapsed time: 16.0944ms (CUDA Measured)
[ 2 1 3 1 3 2 1 2 2 2 3 3 3 ... 3 2 ]
passed
Press any key to continue . . .
Binary file added img/comp.png
Binary file added img/scan.png
14 changes: 7 additions & 7 deletions src/main.cpp
@@ -13,7 +13,7 @@
#include <stream_compaction/thrust.h>
#include "testing_helpers.hpp"

const int SIZE = 1 << 8; // feel free to change the size of array
const int SIZE = 1 << 25; // feel free to change the size of array
const int NPOT = SIZE - 3; // Non-Power-Of-Two
int *a = new int[SIZE];
int *b = new int[SIZE];
@@ -51,7 +51,7 @@ int main(int argc, char* argv[]) {
printDesc("naive scan, power-of-two");
StreamCompaction::Naive::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(SIZE, c, true);
printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);

/* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
@@ -64,21 +64,21 @@ int main(int argc, char* argv[]) {
printDesc("naive scan, non-power-of-two");
StreamCompaction::Naive::scan(NPOT, c, a);
printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(SIZE, c, true);
printArray(SIZE, c, true);
printCmpResult(NPOT, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient scan, power-of-two");
StreamCompaction::Efficient::scan(SIZE, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(SIZE, c, true);
printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient scan, non-power-of-two");
StreamCompaction::Efficient::scan(NPOT, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(NPOT, c, true);
printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);

zeroArray(SIZE, c);
@@ -137,14 +137,14 @@ int main(int argc, char* argv[]) {
printDesc("work-efficient compact, power-of-two");
count = StreamCompaction::Efficient::compact(SIZE, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(count, c, true);
printArray(count, c, true);
printCmpLenResult(count, expectedCount, b, c);

zeroArray(SIZE, c);
printDesc("work-efficient compact, non-power-of-two");
count = StreamCompaction::Efficient::compact(NPOT, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(count, c, true);
printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);

system("pause"); // stop Win32 console from closing on exit
15 changes: 15 additions & 0 deletions stream_compaction/common.cu
@@ -1,4 +1,5 @@
#include "common.h"
#include <device_launch_parameters.h>

void checkCUDAErrorFn(const char *msg, const char *file, int line) {
cudaError_t err = cudaGetLastError();
@@ -24,6 +25,12 @@ namespace StreamCompaction {
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
// TODO
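// Map each element to 1 if it should be kept (non-zero) and 0 otherwise.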
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index >= n) {
return;
}

bools[index] = idata[index] == 0 ? 0 : 1;
}

/**
@@ -33,6 +40,14 @@
__global__ void kernScatter(int n, int *odata,
const int *idata, const int *bools, const int *indices) {
// TODO
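// Scatter: if this element is kept, write it to the compacted position given by the exclusive scan.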
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index >= n) {
return;
}

if (bools[index] == 1) {
odata[indices[index]] = idata[index];
}
}

}
125 changes: 82 additions & 43 deletions stream_compaction/cpu.cu
@@ -4,47 +4,86 @@
#include "common.h"

namespace StreamCompaction {
namespace CPU {
using StreamCompaction::Common::PerformanceTimer;
PerformanceTimer& timer()
{
static PerformanceTimer timer;
return timer;
}

/**
* CPU scan (prefix sum).
* For performance analysis, this is supposed to be a simple for loop.
* (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
*/
void scan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
timer().endCpuTimer();
}

/**
* CPU stream compaction without using the scan function.
*
* @returns the number of elements remaining after compaction.
*/
int compactWithoutScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
timer().endCpuTimer();
return -1;
}

/**
* CPU stream compaction using scan and scatter, like the parallel version.
*
* @returns the number of elements remaining after compaction.
*/
int compactWithScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
timer().endCpuTimer();
return -1;
}
}
namespace CPU {
using StreamCompaction::Common::PerformanceTimer;
PerformanceTimer& timer()
{
static PerformanceTimer timer;
return timer;
}

// Helper shared by scan() and compactWithScan() so the performance timer is not started twice.
void cpuScan(int n, int* odata, const int* idata) {
if (n > 0) {
odata[0] = 0;

int prevSum = 0;
for (int i = 1; i < n; i++) {
odata[i] = idata[i - 1] + prevSum;
prevSum = odata[i];
}
}
}

/**
* CPU scan (prefix sum).
* For performance analysis, this is supposed to be a simple for loop.
* (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
*/
void scan(int n, int* odata, const int* idata) {
timer().startCpuTimer();
// TODO
cpuScan(n, odata, idata);
timer().endCpuTimer();
}

/**
* CPU stream compaction without using the scan function.
*
* @returns the number of elements remaining after compaction.
*/
int compactWithoutScan(int n, int* odata, const int* idata) {
timer().startCpuTimer();
// TODO
int trueCounts = 0;
for (int i = 0; i < n; i++) {
if (idata[i] != 0) {
odata[trueCounts] = idata[i];
trueCounts++;
}
}
timer().endCpuTimer();
return trueCounts;
}

/**
* CPU stream compaction using scan and scatter, like the parallel version.
*
* @returns the number of elements remaining after compaction.
*/
int compactWithScan(int n, int* odata, const int* idata) {
timer().startCpuTimer();
int* boolArr = new int[n];
int* scanArr = new int[n];

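// Map each input element to a keep flag: 1 to keep (non-zero), 0 to discard.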
for (int i = 0; i < n; i++) {
boolArr[i] = idata[i] == 0 ? 0 : 1;
}

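// An exclusive scan of the flags gives each kept element's output index.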
cpuScan(n, scanArr, boolArr);

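// Scatter the kept elements to their scanned indices and count them.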
int trueCounts = 0;
for (int i = 0; i < n; i++) {
if (boolArr[i] == 1) {
trueCounts++;
odata[scanArr[i]] = idata[i];
}
}
delete[] boolArr;
delete[] scanArr;
timer().endCpuTimer();
return trueCounts;
}
}
}