diff --git a/README.md b/README.md
index 0e38ddb..48744cf 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,93 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Guanlin Huang
+  * [LinkedIn](https://www.linkedin.com/in/guanlin-huang-4406668502/), [personal website](virulentkid.github.io/personal_web/index.html)
+* Tested on: Windows 11, i9-10900K @ 4.9GHz 32GB, RTX3080 10GB; Compute Capability: 8.6
 
-### (TODO: Your README)
+## Project Description
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+Stream compaction is an algorithm to produce a smaller output array that contains the indices of the desired elements from the input array;
+such an algorithm will come in handy in ray path tracing of computer graphics or huge sparse matrix compression in AI.
+The performance of element selection is extremely important due to the enormous quantity of data pieces that need to be filtered. 
+Modern Graphics Processing Units (GPUs) have been utilized more frequently lately to speed up the processing of extremely big, concurrent data applications. 
+In this Project, the stream compaction algorithm is implemented in conjunction of an algorithm called prefix-sum, or, "scan." 
+Here is an visual representation of that algorithm:
 
+!["Scan Algorithm"](img/figure-39-2.jpg)
+
+The following perfomance analysis of scan and stream compaction will show the benefit of using GPU comparing to CPU only.
+
+
+## Performance Analysis
+
+!["Scan"](img/scan.png)
+!["Stream Compaction"](img/comp.png)
+As the charts are shown, when array size is large, both naive and work-efficient algorithm of GPU out-perform the CPU implemtation.
+
+### Performance Bottleneck
+* GPU-based implementations are constrained by global memory reading performance at small array sizes because of no optimization of using share memory.
+However, due to the advantages of parallel processing, GPU implementations see a less drastic rise in runtime comparing to the CPU one.
+
+### Test output
+
+```
+****************
+** SCAN TESTS **
+****************
+    [   0   7  11   5  35  28  17  42  37  26  38   7  26 ...  25   0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 33.3664ms    (std::chrono Measured)
+    [   0   0   7  18  23  58  86 103 145 182 208 246 253 ... 821757286 821757311 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 25.4672ms    (std::chrono Measured)
+    [   0   0   7  18  23  58  86 103 145 182 208 246 253 ... 821757208 821757214 ]
+    passed
+==== naive scan, power-of-two ====
+   elapsed time: 11.3899ms    (CUDA Measured)
+    [   0   0   7  18  23  58  86 103 145 182 208 246 253 ... 821757286 821757311 ]
+    passed
+==== naive scan, non-power-of-two ====
+   elapsed time: 12.0769ms    (CUDA Measured)
+    [   0   0   7  18  23  58  86 103 145 182 208 246 253 ...   0   0 ]
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 11.4359ms    (CUDA Measured)
+    [   0   0   7  18  23  58  86 103 145 182 208 246 253 ... 821757286 821757311 ]
+    passed
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 10.0679ms    (CUDA Measured)
+    [   0   0   7  18  23  58  86 103 145 182 208 246 253 ... 821757208 821757214 ]
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 1.05677ms    (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 0.978368ms    (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   2   1   3   1   0   3   2   1   2   2   2   3   3 ...   0   0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 59.5784ms    (std::chrono Measured)
+    [   2   1   3   1   3   2   1   2   2   2   3   3   3 ...   2   2 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 60.795ms    (std::chrono Measured)
+    [   2   1   3   1   3   2   1   2   2   2   3   3   3 ...   3   2 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 129.865ms    (std::chrono Measured)
+    [   2   1   3   1   3   2   1   2   2   2   3   3   3 ...   2   2 ]
+    passed
+==== work-efficient compact, power-of-two ====
+   elapsed time: 16.1052ms    (CUDA Measured)
+    [   2   1   3   1   3   2   1   2   2   2   3   3   3 ...   2   2 ]
+    passed
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 16.0944ms    (CUDA Measured)
+    [   2   1   3   1   3   2   1   2   2   2   3   3   3 ...   3   2 ]
+    passed
+Press any key to continue . . .
diff --git a/img/comp.png b/img/comp.png
new file mode 100644
index 0000000..fe4aaa5
Binary files /dev/null and b/img/comp.png differ
diff --git a/img/scan.png b/img/scan.png
new file mode 100644
index 0000000..ff85f1a
Binary files /dev/null and b/img/scan.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 896ac2b..879c8f6 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,7 +13,7 @@
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 25; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
@@ -51,7 +51,7 @@ int main(int argc, char* argv[]) {
     printDesc("naive scan, power-of-two");
     StreamCompaction::Naive::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
@@ -64,21 +64,21 @@ int main(int argc, char* argv[]) {
     printDesc("naive scan, non-power-of-two");
     StreamCompaction::Naive::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, power-of-two");
     StreamCompaction::Efficient::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, non-power-of-two");
     StreamCompaction::Efficient::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
+    printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
@@ -137,14 +137,14 @@ int main(int argc, char* argv[]) {
     printDesc("work-efficient compact, power-of-two");
     count = StreamCompaction::Efficient::compact(SIZE, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
+    printArray(count, c, true);
     printCmpLenResult(count, expectedCount, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient compact, non-power-of-two");
     count = StreamCompaction::Efficient::compact(NPOT, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
+    printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
 
     system("pause"); // stop Win32 console from closing on exit
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 2ed6d63..504514b 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -1,4 +1,5 @@
 #include "common.h"
+#include <device_launch_parameters.h>
 
 void checkCUDAErrorFn(const char *msg, const char *file, int line) {
     cudaError_t err = cudaGetLastError();
@@ -24,6 +25,12 @@ namespace StreamCompaction {
          */
         __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
             // TODO
+            int index = blockIdx.x * blockDim.x + threadIdx.x;
+            if (index >= n) {
+                return;
+            }
+
+            bools[index] = idata[index] == 0 ? 0 : 1;
         }
 
         /**
@@ -33,6 +40,14 @@ namespace StreamCompaction {
         __global__ void kernScatter(int n, int *odata,
                 const int *idata, const int *bools, const int *indices) {
             // TODO
+            int index = blockIdx.x * blockDim.x + threadIdx.x;
+            if (index >= n) {
+                return;
+            }
+
+            if (bools[index] == 1) {
+                odata[indices[index]] = idata[index];
+            }
         }
 
     }
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa11..d7a39ae 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -4,47 +4,86 @@
 #include "common.h"
 
 namespace StreamCompaction {
-    namespace CPU {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
-        }
-
-        /**
-         * CPU scan (prefix sum).
-         * For performance analysis, this is supposed to be a simple for loop.
-         * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
-         */
-        void scan(int n, int *odata, const int *idata) {
-            timer().startCpuTimer();
-            // TODO
-            timer().endCpuTimer();
-        }
-
-        /**
-         * CPU stream compaction without using the scan function.
-         *
-         * @returns the number of elements remaining after compaction.
-         */
-        int compactWithoutScan(int n, int *odata, const int *idata) {
-            timer().startCpuTimer();
-            // TODO
-            timer().endCpuTimer();
-            return -1;
-        }
-
-        /**
-         * CPU stream compaction using scan and scatter, like the parallel version.
-         *
-         * @returns the number of elements remaining after compaction.
-         */
-        int compactWithScan(int n, int *odata, const int *idata) {
-            timer().startCpuTimer();
-            // TODO
-            timer().endCpuTimer();
-            return -1;
-        }
-    }
+	namespace CPU {
+		using StreamCompaction::Common::PerformanceTimer;
+		PerformanceTimer& timer()
+		{
+			static PerformanceTimer timer;
+			return timer;
+		}
+
+		//helper function for stream compaction to remove timer error
+		void cpuScan(int n, int* odata, const int* idata) {
+			if (n > 0) {
+				odata[0] = 0;
+				
+				int prevSum = 0;
+				for (int i = 1; i < n; i++) {
+					odata[i] = idata[i - 1] + prevSum;
+					prevSum = odata[i];
+				}
+			}
+		}
+
+		/**
+		 * CPU scan (prefix sum).
+		 * For performance analysis, this is supposed to be a simple for loop.
+		 * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
+		 */
+		void scan(int n, int* odata, const int* idata) {
+			timer().startCpuTimer();
+			// TODO
+			cpuScan(n, odata, idata);
+			timer().endCpuTimer();
+		}
+
+		/**
+		 * CPU stream compaction without using the scan function.
+		 *
+		 * @returns the number of elements remaining after compaction.
+		 */
+		int compactWithoutScan(int n, int* odata, const int* idata) {
+			timer().startCpuTimer();
+			// TODO
+			int trueCounts = 0;
+			for (int i = 0; i < n; i++) {
+				if (idata[i] != 0) {
+					odata[trueCounts] = idata[i];
+					trueCounts++;
+				}
+			}
+			timer().endCpuTimer();
+			return trueCounts;
+		}
+
+		/**
+		 * CPU stream compaction using scan and scatter, like the parallel version.
+		 *
+		 * @returns the number of elements remaining after compaction.
+		 */
+		int compactWithScan(int n, int* odata, const int* idata) {
+			timer().startCpuTimer();
+			int* boolArr = new int[n];
+			int* scanArr = new int[n];
+
+			for (int i = 0; i < n; i++) {
+				boolArr[i] = idata[i] == 0 ? 0 : 1;
+			}
+
+			cpuScan(n, scanArr, boolArr);
+
+			int trueCounts = 0;
+			for (int i = 0; i < n; i++) {
+				if (boolArr[i] == 1) {
+					trueCounts++;
+					odata[scanArr[i]] = idata[i];
+					
+				}
+			}
+			delete[] boolArr;
+			delete[] scanArr;
+			timer().endCpuTimer();
+			return trueCounts;
+		}
+	}
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346e..523721f 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -2,39 +2,164 @@
 #include <cuda_runtime.h>
 #include "common.h"
 #include "efficient.h"
+#include <device_launch_parameters.h>
+
+#define blockSize 128
+int* dev_idata;
+int* dev_odata;
+int* dev_scan_idata;
+int* dev_boolArr;
+int* dev_indexArr;
 
 namespace StreamCompaction {
-    namespace Efficient {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
-        }
-
-        /**
-         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
-         */
-        void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
-            // TODO
-            timer().endGpuTimer();
-        }
-
-        /**
-         * Performs stream compaction on idata, storing the result into odata.
-         * All zeroes are discarded.
-         *
-         * @param n      The number of elements in idata.
-         * @param odata  The array into which to store elements.
-         * @param idata  The array of elements to compact.
-         * @returns      The number of elements remaining after compaction.
-         */
-        int compact(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
-            // TODO
-            timer().endGpuTimer();
-            return -1;
-        }
-    }
+	namespace Efficient {
+		using StreamCompaction::Common::PerformanceTimer;
+		PerformanceTimer& timer()
+		{
+			static PerformanceTimer timer;
+			return timer;
+		}
+
+		__global__ void kernUpSweep(int n, int* idata, int d) {
+			int index = blockIdx.x * blockDim.x + threadIdx.x;
+			if (index >= n) {
+				return;
+			}
+
+			if (index % (1 << (d + 1)) == 0) {
+				idata[index + (1 << (d + 1)) - 1] += idata[index + (1 << d) - 1];
+			}
+		}
+
+		__global__ void kernSetRootNode(int n, int* idata) {
+			int index = blockIdx.x * blockDim.x + threadIdx.x;
+			if (index >= n) {
+				return;
+			}
+
+			idata[n - 1] = 0;
+		}
+
+		__global__ void kernDownSweep(int n, int* idata, int d) {
+			int index = blockIdx.x * blockDim.x + threadIdx.x;
+			if (index >= n) {
+				return;
+			}
+
+			if (index % ((1 << (d + 1))) == 0) {
+				int t = idata[index + (1 << d) - 1];
+				idata[index + (1 << d) - 1] = idata[index + (1 << (d + 1)) - 1];
+				idata[index + (1 << (d + 1)) - 1] += t;
+			}
+		}
+
+		//helper function for stream compaction to remove timer error
+		void eff_scan(int n, int* odata, const int* idata) {
+			int log2n = ilog2ceil(n);
+			int sizeOfArr = 1 << log2n;
+
+			dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
+			cudaMalloc((void**)&dev_scan_idata, n * sizeof(int));
+			cudaMemcpy(dev_scan_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+			for (int d = 0; d <= log2n - 1; d++) {
+				kernUpSweep << <fullBlocksPerGrid, blockSize >> > (sizeOfArr, dev_scan_idata, d);
+			}
+
+			kernSetRootNode << <1, 1 >> > (sizeOfArr, dev_scan_idata);
+
+			for (int d = log2n - 1; d >= 0; d--) {
+				kernDownSweep << <fullBlocksPerGrid, blockSize >> > (sizeOfArr, dev_scan_idata, d);
+			}
+
+			cudaMemcpy(odata, dev_scan_idata, n * sizeof(int), cudaMemcpyDeviceToHost);
+			cudaFree(dev_scan_idata);
+		}
+
+		/**
+		 * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+		 */
+		void scan(int n, int* odata, const int* idata) {
+
+			int log2n = ilog2ceil(n);
+			int sizeOfArr = 1 << log2n;
+
+			dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
+			cudaMalloc((void**)&dev_scan_idata, n * sizeof(int));
+			cudaMemcpy(dev_scan_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+			timer().startGpuTimer();
+			for (int d = 0; d <= log2n - 1; d++) {
+				kernUpSweep << <fullBlocksPerGrid, blockSize >> > (sizeOfArr, dev_scan_idata, d);
+			}
+
+			kernSetRootNode << <1, 1 >> > (sizeOfArr, dev_scan_idata);
+
+			for (int d = log2n - 1; d >= 0; d--) {
+				kernDownSweep << <fullBlocksPerGrid, blockSize >> > (sizeOfArr, dev_scan_idata, d);
+			}
+			timer().endGpuTimer();
+
+			cudaMemcpy(odata, dev_scan_idata, n * sizeof(int), cudaMemcpyDeviceToHost);
+			cudaFree(dev_scan_idata);
+
+		}
+
+		/**
+		 * Performs stream compaction on idata, storing the result into odata.
+		 * All zeroes are discarded.
+		 *
+		 * @param n      The number of elements in idata.
+		 * @param odata  The array into which to store elements.
+		 * @param idata  The array of elements to compact.
+		 * @returns      The number of elements remaining after compaction.
+		 */
+		int compact(int n, int* odata, const int* idata) {
+
+			// TODO
+
+			int log2n = ilog2ceil(n);
+			int sizeOfArr = 1 << log2n;
+
+			dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+			cudaMalloc((void**)&dev_idata, n * sizeof(int));
+			cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+			cudaMalloc((void**)&dev_boolArr, sizeOfArr * sizeof(int));
+			cudaMalloc((void**)&dev_indexArr, sizeOfArr * sizeof(int));
+			cudaMalloc((void**)&dev_odata, sizeof(int) * n);
+
+			timer().startGpuTimer();
+			StreamCompaction::Common::kernMapToBoolean << <fullBlocksPerGrid, blockSize >> > (sizeOfArr, dev_boolArr, dev_idata);
+
+			eff_scan(n, dev_indexArr, dev_boolArr);
+
+			StreamCompaction::Common::kernScatter << <fullBlocksPerGrid, blockSize >> > (n, dev_odata, dev_idata, dev_boolArr, dev_indexArr);
+			timer().endGpuTimer();
+
+			cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+			int* indices = new int[n];
+			int count = 0;
+			cudaMemcpy(indices, dev_indexArr, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+			cudaFree(dev_idata);
+			cudaFree(dev_odata);
+			cudaFree(dev_boolArr);
+			cudaFree(dev_indexArr);
+
+
+			if (idata[n - 1] == 0) {
+				count = indices[n - 1];
+			}
+			else {
+				count = indices[n - 1] + 1;
+			}
+
+			delete[] indices;
+			return count;
+		}
+	}
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 4308876..482a858 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -2,24 +2,85 @@
 #include <cuda_runtime.h>
 #include "common.h"
 #include "naive.h"
+#include <device_launch_parameters.h>
+
+#define blockSize 128
+int* dev_bufferA;
+int* dev_bufferB;
+
 
 namespace StreamCompaction {
-    namespace Naive {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
-        }
-        // TODO: __global__
-
-        /**
-         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
-         */
-        void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
-            // TODO
-            timer().endGpuTimer();
-        }
-    }
+	namespace Naive {
+		using StreamCompaction::Common::PerformanceTimer;
+		PerformanceTimer& timer()
+		{
+			static PerformanceTimer timer;
+			return timer;
+		}
+		// TODO: __global__
+		__global__ void kernScanIteration(int n, int* odata, int* idata, int d) {
+			int index = blockIdx.x * blockDim.x + threadIdx.x;
+			if (index >= n) {
+				return;
+			}
+
+			if (index >= (1 << (d-1))) {
+				odata[index] = idata[index - (1 << (d - 1))] + idata[index];
+			}
+			else {
+				odata[index] = idata[index];
+			}
+		}
+
+		__global__ void kernCopyExclusive(int n, int* exclusive, int* inclusive) {
+			int index = blockIdx.x * blockDim.x + threadIdx.x;
+			if (index >= n) {
+				return;
+			}
+
+			if (index == 0) {
+				exclusive[index] = 0;
+			}
+			else {
+				exclusive[index] = inclusive[index - 1];
+			}
+		}
+
+		/**
+		 * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+		 */
+		void scan(int n, int* odata, const int* idata) {
+			if (n == 0) {
+				return;
+			}
+
+			cudaMalloc((void**)&dev_bufferA, n * sizeof(int));
+			cudaMalloc((void**)&dev_bufferB, n * sizeof(int));
+			cudaMemcpy(dev_bufferA, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+			timer().startGpuTimer();
+			int log2n = ilog2ceil(n);
+			dim3 fullBlocksPerGrid((n  + blockSize - 1) / blockSize);
+			
+			for (int d = 1; d <= log2n; d++) {
+				kernScanIteration << <fullBlocksPerGrid, blockSize >> > (n, dev_bufferB, dev_bufferA, d);
+
+				if (d < log2n) {
+					int* tempPtr = dev_bufferB;
+					dev_bufferB = dev_bufferA;
+					dev_bufferA = tempPtr;
+				}
+
+			}
+			//the inclusive scan is stored in bufferB
+			kernCopyExclusive << <fullBlocksPerGrid, blockSize >> > (n, dev_bufferA, dev_bufferB);
+			timer().endGpuTimer();
+
+			cudaMemcpy(odata, dev_bufferA, n * sizeof(int), cudaMemcpyDeviceToHost);
+			cudaFree(dev_bufferB);
+			cudaFree(dev_bufferA);
+
+			
+		}
+	}
 }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e..17fb36b 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -7,22 +7,28 @@
 #include "thrust.h"
 
 namespace StreamCompaction {
-    namespace Thrust {
-        using StreamCompaction::Common::PerformanceTimer;
-        PerformanceTimer& timer()
-        {
-            static PerformanceTimer timer;
-            return timer;
-        }
-        /**
-         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
-         */
-        void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
-            // TODO use `thrust::exclusive_scan`
-            // example: for device_vectors dv_in and dv_out:
-            // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
-            timer().endGpuTimer();
-        }
-    }
+	namespace Thrust {
+		using StreamCompaction::Common::PerformanceTimer;
+		PerformanceTimer& timer()
+		{
+			static PerformanceTimer timer;
+			return timer;
+		}
+		/**
+		 * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+		 */
+		void scan(int n, int* odata, const int* idata) {
+
+			// TODO use `thrust::exclusive_scan`
+			// example: for device_vectors dv_in and dv_out:
+			// thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+			thrust::device_vector<int> dev_odata(odata, odata + n);
+			thrust::device_vector<int> dev_idata(idata, idata + n);
+			timer().startGpuTimer();
+			thrust::exclusive_scan(dev_idata.begin(), dev_idata.end(), dev_odata.begin());
+			timer().endGpuTimer();
+			thrust::copy(dev_odata.begin(), dev_odata.end(), odata);
+
+		}
+	}
 }