diff --git a/README.md b/README.md index 0e38ddb..a2aee6f 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,91 @@ CUDA Stream Compaction ====================== -**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** +**University of Pennsylvania, CIS 565: GPU Programming and Architecture, +Project 2 - CUDA Stream Compaction** + +* Edward Zhang + * https://www.linkedin.com/in/edwardjczhang/ + * https://zedward23.github.io/personal_Website/ + +* Tested on: Windows 10 Home, i7-11800H @ 2.3GHz, 16.0GB, NVIDIA GeForce RTX 3060 Laptop GPU + +## Background +The project contains an implementation of the Scan and Compaction Algorithms. + +### Scan +Description: +Each index i of a scan output array is the sum of the corresponding elements in the input array at the indices that came before i. This algorithm was implemented in the following ways: + +1. CPU - Non-parallel Scan +2. Naive - Naively Parallel Scan +3. Efficient - Parallel Scan using Upsweep and Downsweep on a binary tree representation of an array +4. Thrust - Scan using Thrust API + +### Compaction +Description: +Condenses an array into just its non-zero elements without changing its order + +1. CPU - Non-parallel Compact +2. CPU with Scan - Non-parallel Compact while using Scan +3. GPU - Parallel Compaction using Efficient Parallel Scan + +## Block Size Performance Analysis + +![](img/Graph0.png) + +A blocksize of 256 seems to yield the best results since it was the first size large enough to take advantage of the parallelism offered by the GPU. + +## Scan Performance +### Powers of 2 + +![](img/Graph1.png) + +Observations: +- CPU Scan is our baseline +- Thrust Scan is the fastest; this is expected since it is a library provided to us. +- Efficient and Naive GPU scan were actually fairly inefficient; this is likely due to suboptimal thread allocation. 
+ +### Non-Powers of 2 + +![](img/Graph2.png) + +Observations: +- The same observations from running the implementations on array lengths that were powers of 2 + +## Compact + +![](img/Graph3.png) + +Observations: +- Compaction without Scan on the CPU is actually faster than with Scan +- GPU implementations are still slower than the CPU implementations + +## Why is My GPU Approach So Slow? (Extra Credit) (+5) + +If you implement your efficient scan version following the slides closely, there's a good chance +that you are getting an "efficient" gpu scan that is actually not that efficient -- it is slower than the CPU approach. + +Though it is totally acceptable for this assignment, +In addition to explaining the reason for this phenomenon, you are encouraged to try to upgrade your work-efficient gpu scan. + +Thinking about these may lead you to an aha moment: +- What is the occupancy at a deeper level in the upper/down sweep? Are most threads actually working? + + Most threads are just idling since at each level, fewer and fewer indices should be written to. + +- Are you always launching the same number of blocks throughout each level of the upper/down sweep? + + I am always launching the same number of blocks regardless of how many indices should actually be written to. + +- If some threads are being lazy, can we do an early termination on them? + + Even if we terminate them early, we cannot move onto the next iteration until the ones that need to be written to are properly finished. + +- How can I compact the threads? What should I modify to keep the remaining threads still working correctly? + + On each iteration, dynamically dispatch the optimal number of threads and blocks that operate only on the specific indices that need to be modified. -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) -### (TODO: Your README) -Include analysis, etc. 
(Remember, this is public, so don't put -anything here that you don't want to share with the world.) diff --git a/img/Graph0.png b/img/Graph0.png new file mode 100644 index 0000000..a7c5c8c Binary files /dev/null and b/img/Graph0.png differ diff --git a/img/Graph1.png b/img/Graph1.png new file mode 100644 index 0000000..fced70a Binary files /dev/null and b/img/Graph1.png differ diff --git a/img/Graph2.png b/img/Graph2.png new file mode 100644 index 0000000..cc39f4c Binary files /dev/null and b/img/Graph2.png differ diff --git a/img/Graph3.png b/img/Graph3.png new file mode 100644 index 0000000..2c54f68 Binary files /dev/null and b/img/Graph3.png differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..c0bc311 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 26; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -51,48 +51,49 @@ int main(int argc, char* argv[]) { printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); - /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan - onesArray(SIZE, c); + //For bug-finding only: Array of 1s to help find bugs in stream compaction or scan + /*onesArray(SIZE, c); printDesc("1s array for finding bugs"); StreamCompaction::Naive::scan(SIZE, c, a); - printArray(SIZE, c, true); */ + printArray(SIZE, c, true);*/ zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); + printArray(SIZE, a, true); StreamCompaction::Naive::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA 
Measured)"); - //printArray(SIZE, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); - zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, a, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); printf("\n"); @@ -131,20 +132,21 @@ int main(int argc, char* argv[]) { count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + 
printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); system("pause"); // stop Win32 console from closing on exit diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..ea7c16a 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -2,6 +2,7 @@ #include "cpu.h" #include "common.h" +#define IDENTITY 0 namespace StreamCompaction { namespace CPU { @@ -20,6 +21,12 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + + odata[0] = idata[0]; + for (int i = 1; i < n; i++) { + odata[i] = idata[i] + odata[i-1]; + } + timer().endCpuTimer(); } @@ -31,8 +38,18 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int currIdx = 0; + + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[currIdx] = idata[i]; + currIdx++; + } + } + + timer().endCpuTimer(); - return -1; + return currIdx; } /** @@ -42,9 +59,45 @@ namespace StreamCompaction { */ int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + int* scanned = new int[n]; + //Filter + for (int i = 0; i < n; i++) { + (idata[i] != 0) ? scanned[i] = 1 : scanned[i] = 0; + } + + //Exclusive Scan + idata[0] == 0 ? 
scanned[0] = 0 : scanned[0] = 1; + for (int i = 1; i < n; i++) { + scanned[i] = scanned[i] + scanned[i - 1]; + } + + //Scatter + int currIdx = 0; + //odata[currIdx] = idata[0]; + + + + //Scatter: copy each non-zero element to its compacted index + if (idata[0] > 0) { + odata[0] = idata[0]; + currIdx++; + } + + for (int i = 1; i < n; i++) { + if (scanned[i] > scanned[i - 1]) { + odata[scanned[i]-1] = idata[i]; + currIdx++; + } + else { + odata[i] = 0; + } + } + + delete[] scanned; + timer().endCpuTimer(); - return -1; + return currIdx; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..0ac5564 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -12,12 +12,142 @@ namespace StreamCompaction { return timer; } + #define blockSize 8 + + int* dev_idata; + int* dev_odata; + int* dev_buf; + + __global__ void upSweep(int N, int* idata, int* odata, int depth) { + int k = threadIdx.x + (blockIdx.x * blockDim.x); + if (k >= N) { + return; + } + if ((k+1)%(1 << depth) == 0) { + odata[k] = idata[k] + idata[k - (1 << (depth-1))]; + } + else { + odata[k] = idata[k]; + } + } + + __global__ void downSweep(int N, int* idata, int* odata, int depth) { + int k = threadIdx.x + (blockIdx.x * blockDim.x); + if (k >= N) { + return; + } + + if ((k + 1) % (1 << depth) == 0) { + if ((k + 1) % (1 << (depth + 1)) == 0) { + odata[k] = idata[k - (1 << depth)] + idata[k]; + } + else { + odata[k] = idata[k + (1 << depth)]; + } + } + else { + odata[k] = idata[k]; + } + } + + __global__ void toInclusive(int N, int* idata, int* odata, int* buf) { + int k = threadIdx.x + (blockIdx.x * blockDim.x); + if (k >= N) { + return; + } + if (k < N-1){ + odata[k] = buf[k + 1]; + } + else { + odata[k] = buf[k] + idata[k]; + } + } + + __global__ void scatter(int N, int* idata, int* odata, int* buf) { + int k = threadIdx.x + (blockIdx.x * blockDim.x); + if (k >= N) { + return; + } + if (k == 0 && idata[k] > 0) { + odata[k] = idata[k]; + } + else { + if (buf[k] > 
buf[k-1]) { + odata[buf[k] - 1] = idata[k]; + } + } + + //odata[k] = buf[k]; + } + + __global__ void binarize(int N, int* idata, int* odata) { + int k = threadIdx.x + (blockIdx.x * blockDim.x); + if (k >= N) { + return; + } + idata[k] == 0 ? odata[k] = 0 : odata[k] = 1; + } + + void zeroArray(int n, int* a) { + for (int i = 0; i < n; i++) { + a[i] = 0; + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { + void scan(int n, int* odata, const int* idata) { timer().startGpuTimer(); - // TODO + int arrLen; + int maxDepth = ilog2ceil(n); + maxDepth > ilog2(n) ? arrLen = pow(2, maxDepth) : arrLen = n; + zeroArray(arrLen, odata); + + dim3 threadsPerBlock(arrLen / blockSize); + + int* buf = new int[arrLen]; + + for (int i = 0; i < arrLen; i++) { + if (i < n) { + buf[i] = idata[i]; + } + else { + buf[i] = 0; + } + } + + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, arrLen * sizeof(int)); + cudaMalloc((void**)&dev_buf, arrLen * sizeof(int)); + + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + cudaMemcpy(dev_odata, odata, sizeof(int) * arrLen, cudaMemcpyHostToDevice); + cudaMemcpy(dev_buf, buf, sizeof(int) * arrLen, cudaMemcpyHostToDevice); + + for (int i = 1; i <= maxDepth; i++) { + upSweep << > > (arrLen, dev_buf, dev_odata, i); + cudaMemcpy(dev_buf, dev_odata, sizeof(int) * arrLen, cudaMemcpyDeviceToDevice); + } + + cudaMemset(&dev_buf[arrLen - 1], 0, sizeof(int) * 1); + + for (int i = maxDepth - 1; i >= 0; i--) { + downSweep << > > (arrLen, dev_buf, dev_odata, i); + cudaMemcpy(dev_buf, dev_odata, sizeof(int) * arrLen, cudaMemcpyDeviceToDevice); + } + + toInclusive << > > (arrLen, dev_idata, dev_odata, dev_buf); + + cudaMemcpy((void**)idata, dev_idata, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaMemcpy(odata, dev_odata, sizeof(int) * arrLen, cudaMemcpyDeviceToHost); + cudaMemcpy(buf, dev_buf, sizeof(int) * arrLen, 
cudaMemcpyDeviceToHost); + + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_buf); + + timer().endGpuTimer(); } @@ -32,9 +162,67 @@ namespace StreamCompaction { */ int compact(int n, int *odata, const int *idata) { timer().startGpuTimer(); - // TODO + + int arrLen; + int maxDepth = ilog2ceil(n); + maxDepth > ilog2(n) ? arrLen = pow(2, maxDepth) : arrLen = n; + zeroArray(arrLen, odata); + + dim3 threadsPerBlock(arrLen / blockSize); + + int* buf = new int[arrLen]; + + for (int i = 0; i < arrLen; i++) { + if (i < n) { + buf[i] = idata[i]; + } + else { + buf[i] = 0; + } + } + + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, arrLen * sizeof(int)); + cudaMalloc((void**)&dev_buf, arrLen * sizeof(int)); + + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + cudaMemcpy(dev_odata, odata, sizeof(int) * arrLen, cudaMemcpyHostToDevice); + cudaMemcpy(dev_buf, buf, sizeof(int) * arrLen, cudaMemcpyHostToDevice); + + binarize << > > (arrLen, dev_idata, dev_odata); + cudaMemcpy(dev_buf, dev_odata, sizeof(int) * arrLen, cudaMemcpyDeviceToDevice); + + for (int i = 1; i <= maxDepth; i++) { + upSweep << > > (arrLen, dev_buf, dev_odata, i); + cudaMemcpy(dev_buf, dev_odata, sizeof(int) * arrLen, cudaMemcpyDeviceToDevice); + } + + cudaMemset(&dev_buf[arrLen - 1], 0, sizeof(int) * 1); + + for (int i = maxDepth - 1; i >= 0; i--) { + downSweep << > > (arrLen, dev_buf, dev_odata, i); + cudaMemcpy(dev_buf, dev_odata, sizeof(int) * arrLen, cudaMemcpyDeviceToDevice); + } + + toInclusive << > > (arrLen, dev_idata, dev_odata, dev_buf); + + cudaMemcpy(odata, dev_odata, sizeof(int) * arrLen, cudaMemcpyDeviceToHost); + int retLen = odata[n - 1]; + + cudaMemcpy(dev_buf, dev_odata, sizeof(int) * arrLen, cudaMemcpyDeviceToDevice); + + scatter << > > (arrLen, dev_idata, dev_odata, dev_buf); + + cudaMemcpy((void**)idata, dev_idata, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaMemcpy(odata, dev_odata, sizeof(int) * arrLen, 
cudaMemcpyDeviceToHost); + cudaMemcpy(buf, dev_buf, sizeof(int) * arrLen, cudaMemcpyDeviceToHost); + + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_buf); + timer().endGpuTimer(); - return -1; + return retLen; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..521578f 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -1,24 +1,99 @@ +#define GLM_FORCE_CUDA #include +#include +#include #include +#include +#include #include "common.h" #include "naive.h" namespace StreamCompaction { namespace Naive { using StreamCompaction::Common::PerformanceTimer; + + #define blockSize 8 + + int* dev_idata; + int* dev_odata; + int* dev_buf; + PerformanceTimer& timer() { static PerformanceTimer timer; return timer; } + // TODO: __global__ + __global__ void kernScan(int N, int* idata, int* odata, int depth) { + int k = threadIdx.x + (blockIdx.x * blockDim.x); + if (k >= N) { + return; + } + + if (k >= 1 << (depth - 1)) { + odata[k] = idata[k - (1 << (depth - 1))] + idata[k]; + } + else { + odata[k] = idata[k]; + } + + } + + void zeroArray(int n, int* a) { + for (int i = 0; i < n; i++) { + a[i] = 0; + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { timer().startGpuTimer(); + + int arrLen; + int maxDepth = ilog2ceil(n); + maxDepth > ilog2(n) ? 
arrLen = pow(2, maxDepth) : arrLen = n; + zeroArray(arrLen, odata); + + dim3 threadsPerBlock(arrLen/blockSize); + + int* buf = new int[arrLen]; + + for (int i = 0; i < arrLen; i++) { + if (i < n) { + buf[i] = idata[i]; + } + else { + buf[i] = 0; + } + } + // TODO + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, arrLen * sizeof(int)); + cudaMalloc((void**)&dev_buf, arrLen * sizeof(int)); + + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + cudaMemcpy(dev_odata, odata, sizeof(int) * arrLen, cudaMemcpyHostToDevice); + cudaMemcpy(dev_buf, buf, sizeof(int) * arrLen, cudaMemcpyHostToDevice); + + for (int i = 1; i <= maxDepth; i++) { + kernScan << > > (arrLen, dev_buf, dev_odata, i); + cudaDeviceSynchronize(); + cudaMemcpy(dev_buf, dev_odata, sizeof(int) * arrLen, cudaMemcpyDeviceToDevice); + } + + cudaMemcpy((void**)idata, dev_idata, sizeof(int) * n, cudaMemcpyDeviceToHost); + cudaMemcpy(odata, dev_odata, sizeof(int) * arrLen, cudaMemcpyDeviceToHost); + cudaMemcpy(buf, dev_buf, sizeof(int) * arrLen, cudaMemcpyDeviceToHost); + + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_buf); + timer().endGpuTimer(); } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..62b6cbd 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -9,11 +9,30 @@ namespace StreamCompaction { namespace Thrust { using StreamCompaction::Common::PerformanceTimer; + + int* dev_idata; + int* dev_odata; + int* dev_buf; + PerformanceTimer& timer() { static PerformanceTimer timer; return timer; } + + __global__ void toInclusive(int N, int* idata, int* odata, int* buf) { + int k = threadIdx.x + (blockIdx.x * blockDim.x); + if (k >= N) { + return; + } + if (k < N - 1) { + odata[k] = buf[k + 1]; + } + else { + odata[k] = buf[k] + idata[k]; + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ @@ -22,6 +41,21 @@ namespace StreamCompaction { // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + cudaMalloc((void**)&dev_buf, n * sizeof(int)); + + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + + + thrust::device_ptr dev_thrust_idata = thrust::device_ptr(dev_idata); + thrust::device_ptr dev_thrust_odata = thrust::device_ptr(dev_odata); + + thrust::inclusive_scan(dev_thrust_idata, dev_thrust_idata+n, dev_thrust_odata); + //cudaMemcpy(dev_buf, dev_odata, sizeof(int) * arrLen, cudaMemcpyDeviceToDevice); + + cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost); + timer().endGpuTimer(); } }