From 2a72b3cf8256288facde21c1952db647b9e82518 Mon Sep 17 00:00:00 2001
From: Shixuan Fang <horo-ursa@users.noreply.github.com>
Date: Sun, 18 Sep 2022 18:27:06 -0400
Subject: [PATCH 1/4] finished!

---
 src/main.cpp                   |  58 ++++-
 src/testing_helpers.hpp        |   7 +
 stream_compaction/common.cu    |   8 +
 stream_compaction/cpu.cu       |  51 +++-
 stream_compaction/efficient.cu | 436 ++++++++++++++++++++++++++++++++-
 stream_compaction/efficient.h  |   4 +
 stream_compaction/naive.cu     |  58 +++++
 stream_compaction/thrust.cu    |   8 +-
 8 files changed, 613 insertions(+), 17 deletions(-)

diff --git a/src/main.cpp b/src/main.cpp
index 896ac2b..accaf66 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,15 +13,20 @@
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 26; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
 int *c = new int[SIZE];
+int* sort = new int[SIZE];
+
 
 int main(int argc, char* argv[]) {
     // Scan tests
 
+    //printPowerOf2(SIZE);
+    //printPowerOf2(NPOT);
+
     printf("\n");
     printf("****************\n");
     printf("** SCAN TESTS **\n");
@@ -34,6 +39,8 @@ int main(int argc, char* argv[]) {
     // initialize b using StreamCompaction::CPU::scan you implement
     // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct.
     // At first all cases passed because b && c are all zeroes.
+
+
     zeroArray(SIZE, b);
     printDesc("cpu scan, power-of-two");
     StreamCompaction::CPU::scan(SIZE, b, a);
@@ -44,7 +51,7 @@ int main(int argc, char* argv[]) {
     printDesc("cpu scan, non-power-of-two");
     StreamCompaction::CPU::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    printArray(NPOT, b, true);
+    //printArray(NPOT, b, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
@@ -54,11 +61,11 @@ int main(int argc, char* argv[]) {
     //printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
-    /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
-    onesArray(SIZE, c);
-    printDesc("1s array for finding bugs");
-    StreamCompaction::Naive::scan(SIZE, c, a);
-    printArray(SIZE, c, true); */
+    ////For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
+    //onesArray(SIZE, c);
+    //printDesc("1s array for finding bugs");
+    //StreamCompaction::Naive::scan(NPOT, c, a);
+    //printArray(SIZE, c, true); 
 
     zeroArray(SIZE, c);
     printDesc("naive scan, non-power-of-two");
@@ -66,6 +73,7 @@ int main(int argc, char* argv[]) {
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
     //printArray(SIZE, c, true);
     printCmpResult(NPOT, b, c);
+    
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, power-of-two");
@@ -81,6 +89,21 @@ int main(int argc, char* argv[]) {
     //printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
+    //printArray(SIZE, a, true);
+    zeroArray(SIZE, c);
+    printDesc("work-efficient scan with shared memory, power-of-two");
+    StreamCompaction::Efficient::scanWithSharedMemory(SIZE, c, a);
+    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+    //printArray(SIZE, c, true);
+    printCmpResult(SIZE, b, c);
+
+    zeroArray(SIZE, c);
+    printDesc("work-efficient scan with shared memory, non power-of-two");
+    StreamCompaction::Efficient::scanWithSharedMemory(NPOT, c, a);
+    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+    //printArray(SIZE, c, true);
+    printCmpResult(NPOT, b, c);
+
     zeroArray(SIZE, c);
     printDesc("thrust scan, power-of-two");
     StreamCompaction::Thrust::scan(SIZE, c, a);
@@ -95,6 +118,20 @@ int main(int argc, char* argv[]) {
     //printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
+    printf("\n");
+    printf("*****************************\n");
+    printf("** Radix Sort TESTS **\n");
+    printf("*****************************\n");
+
+    zeroArray(SIZE, c);
+    printDesc("Radix sort");
+    StreamCompaction::Efficient::RadixSort(SIZE, c, a);
+    printArray(SIZE, c, true);
+    for (int i = 0; i < SIZE; i++) sort[i] = c[i];
+    std::sort(sort, sort + SIZE);
+    printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+    printCmpResult(SIZE, sort, c);
+
     printf("\n");
     printf("*****************************\n");
     printf("** STREAM COMPACTION TESTS **\n");
@@ -115,7 +152,7 @@ int main(int argc, char* argv[]) {
     count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
     printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
     expectedCount = count;
-    printArray(count, b, true);
+    //printArray(count, b, true);
     printCmpLenResult(count, expectedCount, b, b);
 
     zeroArray(SIZE, c);
@@ -123,14 +160,14 @@ int main(int argc, char* argv[]) {
     count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
     printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
     expectedNPOT = count;
-    printArray(count, c, true);
+    //printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
 
     zeroArray(SIZE, c);
     printDesc("cpu compact with scan");
     count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
     printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    printArray(count, c, true);
+    //printArray(count, c, true);
     printCmpLenResult(count, expectedCount, b, c);
 
     zeroArray(SIZE, c);
@@ -147,6 +184,7 @@ int main(int argc, char* argv[]) {
     //printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
 
+
     system("pause"); // stop Win32 console from closing on exit
     delete[] a;
     delete[] b;
diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp
index 025e94a..15f522e 100644
--- a/src/testing_helpers.hpp
+++ b/src/testing_helpers.hpp
@@ -69,6 +69,13 @@ void printArray(int n, int *a, bool abridged = false) {
     printf("]\n");
 }
 
+void printPowerOf2(int n) {
+    if ((n & (n - 1)) == 0)
+        printf("%s", "N is a power of 2");
+    else
+        printf("%s", "N is not a power of 2");
+}
+
 template<typename T>
 void printElapsedTime(T time, std::string note = "")
 {
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 2ed6d63..6d77b67 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -1,4 +1,5 @@
 #include "common.h"
+#include "device_launch_parameters.h"
 
 void checkCUDAErrorFn(const char *msg, const char *file, int line) {
     cudaError_t err = cudaGetLastError();
@@ -24,6 +25,9 @@ namespace StreamCompaction {
          */
         __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
             // TODO
+            int index = blockIdx.x * blockDim.x + threadIdx.x;
+            if (index >= n) return;
+            bools[index] = (idata[index] == 0) ? 0 : 1;
         }
 
         /**
@@ -33,6 +37,10 @@ namespace StreamCompaction {
         __global__ void kernScatter(int n, int *odata,
                 const int *idata, const int *bools, const int *indices) {
             // TODO
+            int index = blockIdx.x * blockDim.x + threadIdx.x;
+            if (index >= n) return;
+            if (bools[index] == 1)
+                odata[indices[index]] = idata[index];
         }
 
     }
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa11..6bc575e 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -3,6 +3,9 @@
 
 #include "common.h"
 
+#include <iostream>
+
+int INCOMPACT = 0;
 namespace StreamCompaction {
     namespace CPU {
         using StreamCompaction::Common::PerformanceTimer;
@@ -18,9 +21,19 @@ namespace StreamCompaction {
          * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first.
          */
         void scan(int n, int *odata, const int *idata) {
-            timer().startCpuTimer();
-            // TODO
-            timer().endCpuTimer();
+            if (!INCOMPACT) {
+                timer().startCpuTimer();
+                // TODO
+                odata[0] = 0;
+                for (int i = 1; i < n; i++)
+                    odata[i] = odata[i - 1] + idata[i - 1];
+                timer().endCpuTimer();
+            }
+            else {
+                odata[0] = 0;
+                for (int i = 1; i < n; i++)
+                    odata[i] = odata[i - 1] + idata[i - 1];
+            }
         }
 
         /**
@@ -31,8 +44,15 @@ namespace StreamCompaction {
         int compactWithoutScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
             // TODO
+            int count = 0;
+            for (int i = 0; i < n; i++) {
+                if (idata[i] != 0) {
+                    odata[count] = idata[i];
+                    count ++ ;
+                }
+            }
             timer().endCpuTimer();
-            return -1;
+            return count;
         }
 
         /**
@@ -41,10 +61,31 @@ namespace StreamCompaction {
          * @returns the number of elements remaining after compaction.
          */
         int compactWithScan(int n, int *odata, const int *idata) {
+            INCOMPACT = 1;
+            int* copy = new int[n];
+            int* scanResult = new int[n];
             timer().startCpuTimer();
             // TODO
+            //copy data from idata to temporary copy buffer
+            for (int i = 0; i < n; i++) {
+                copy[i] = idata[i] == 0? 0 : 1;
+            }
+            // run exclusive scan on temporary array
+            scan(n, scanResult, copy);
+            //scatter
+            int count = 0;
+            for (int i = 0; i < n; i++) {
+                if (copy[i]) {
+                    count = scanResult[i];
+                    odata[count] = idata[i];
+                }
+            }
+
             timer().endCpuTimer();
-            return -1;
+            delete[] copy;
+            delete[] scanResult;
+            return count+1;
+
         }
     }
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346e..3f97018 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -3,6 +3,14 @@
 #include "common.h"
 #include "efficient.h"
 
+#include <bitset>
+#include "device_launch_parameters.h"
+
+#define NAIVE 1
+#define NUM_BANKS 16 
+#define LOG_NUM_BANKS 4 
+#define CONFLICT_FREE_OFFSET(n) ((n) >> NUM_BANKS + (n) >> (2 * LOG_NUM_BANKS)) 
+
 namespace StreamCompaction {
     namespace Efficient {
         using StreamCompaction::Common::PerformanceTimer;
@@ -12,13 +20,307 @@ namespace StreamCompaction {
             return timer;
         }
 
+        __global__ void kernUpSweep(int n, int stride, int* data)
+        {
+            int index = blockIdx.x * blockDim.x + threadIdx.x;
+            if (index >= n) return;
+            if (index % stride != 0 || index + stride -1  >= n) return;
+            data[index + stride - 1] += data[index + stride / 2 - 1];
+        }
+
+        __global__ void kernDownSweep(int n, int stride, int* data)
+        {
+            int index = blockIdx.x * blockDim.x + threadIdx.x;
+            if (index >= n) return;
+            if (index % stride != 0 || index + stride -1 >= n) return;
+            int temp = data[index + stride / 2 - 1];
+            data[index + stride / 2 - 1] = data[index + stride - 1];
+            data[index + stride - 1] += temp;
+        }
+
+        __global__ void kernOptimizedUpSweep(int n, int d, int offset, int* data)
+        {
+            int index = blockIdx.x * blockDim.x + threadIdx.x;
+            if (index >= n) return;
+            if (index < d) {
+                int ai = offset * (2 * index + 1) - 1;
+                int bi = offset * (2 * index + 2) - 1;
+                data[bi] += data[ai];
+            }
+        }
+
+        __global__ void kernOptimizedDownSweep(int n, int d, int offset, int* data)
+        {
+            int index = blockIdx.x * blockDim.x + threadIdx.x;
+            if (index >= n) return;
+            if (index < d) {
+                int ai = offset * (2 * index + 1) - 1;
+                int bi = offset * (2 * index + 2) - 1;
+                int t = data[ai];
+                data[ai] = data[bi];
+                data[bi] += t;
+            }
+        }
+
+        //adapted from GPU Gem 3
+        __global__ void kernOptimizedPerBlockScan(int* g_odata, int* g_idata, int n) {
+            // allocated on invocation 
+            extern __shared__ int temp[];
+
+            int thid = threadIdx.x;
+            int blockOffset = blockIdx.x * blockDim.x;
+
+            int index = blockOffset + thid;
+            __syncthreads();
+            temp[thid] = g_idata[index];
+
+            //temp[2 * thid] = g_idata[2 * index];
+            //temp[2 * thid + 1] = g_idata[2 * index + 1];
+
+            // build sum in place up the tree 
+            int offset = 1;
+            for (int d = n >> 1; d > 0; d >>= 1)
+            {
+                __syncthreads();
+                if (thid < d) {
+                    //int ai = offset * (2 * thid + 1) - 1; 
+                    //int bi = offset * (2 * thid + 2) - 1; 
+                    //ai += CONFLICT_FREE_OFFSET(ai);
+                    //bi += CONFLICT_FREE_OFFSET(bi);
+
+                    int ai = offset * (2 * thid + 1) - 1;
+                    int bi = offset * (2 * thid + 2) - 1;
+
+                    temp[bi] += temp[ai];
+                }
+                offset *= 2;
+            }
+            // clear the last element
+            if (thid == 0) { temp[n-1] = 0; }
+            // traverse down tree & build scan 
+            for (int d = 1; d < n; d *= 2)
+            {
+                offset >>= 1;
+                __syncthreads();
+                if (thid < d) {
+
+                    int ai = offset * (2 * thid + 1) - 1;
+                    int bi = offset * (2 * thid + 2) - 1;
+                    int t = temp[ai];
+                    temp[ai] = temp[bi];
+                    temp[bi] += t;
+                }
+            }
+            
+            //g_odata[2 * thid + blockOffset] = temp[2 * thid];
+            //g_odata[2 * thid + 1 + blockOffset] = temp[2 * thid + 1];
+
+            //g_odata[2 * index] = temp[2 * thid];
+            //g_odata[2 * index + 1] = temp[2 * thid + 1];
+            __syncthreads();
+            g_odata[index] = temp[thid];
+        }
+
+        __global__ void kernWriteSumArray(int* sum, int* odata)
+        {
+            int index = threadIdx.x;
+            if (index == blockDim.x - 1)
+                sum[blockIdx.x] = odata[(blockIdx.x + 1) * blockDim.x - 1];
+        }
+
+        __global__ void kernAddIncrement(int* outdata, int* sumArrayCopy)
+        {
+            int index = threadIdx.x;
+            //for (int i = 0; i < blockDim.x; i++) {
+            outdata[blockDim.x * blockIdx.x + index] += sumArrayCopy[blockIdx.x];
+            //}
+        }
+
+        __global__ void kernMakeInclusive(int* inclusive, int* odata, int* idata)
+        {
+            int index = blockIdx.x * blockDim.x + threadIdx.x;
+            if (threadIdx.x == blockDim.x - 1) {
+                inclusive[index] = odata[index] + idata[index];
+            }
+            else {
+                inclusive[index] = odata[index + 1];
+            }
+        }
+
+        __global__ void kernMakeExclusive(int* odata, int* idata)
+        {
+            int index = blockDim.x * blockIdx.x + threadIdx.x;
+            if (index == 0)
+                odata[index] = 0;
+            else
+                odata[index] = idata[index - 1];
+        }
+        
+        void scanInclusive(int n, int* odata, const int* idata)
+        {
+            //extend to pow of 2
+            int Num = 1 << ilog2ceil(n);
+            int blockSize = 128;
+            int blockNum = (Num + blockSize - 1) / blockSize;
+
+            int* dev_data;
+            cudaMalloc((void**)&dev_data, Num * sizeof(int));
+            cudaMemset(dev_data, 0, Num * sizeof(int));
+            cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            int* dev_inclusive;
+            cudaMalloc((void**)&dev_inclusive, n * sizeof(int));
+
+            int offset = 1;
+            for (int d = Num >> 1; d > 0; d >>= 1) {
+                blockNum = (d + blockSize - 1) / blockSize;
+                kernOptimizedUpSweep << <blockNum, blockSize >> > (Num, d, offset, dev_data);
+                offset <<= 1;
+            }
+            cudaDeviceSynchronize();
+            //get the last sum for inclusive scan
+            int lastSum = 0;
+            cudaMemcpy(&lastSum, dev_data + Num - 1, sizeof(int), cudaMemcpyDeviceToHost);
+
+            cudaMemset(dev_data + Num - 1, 0, sizeof(int));
+            for (int d = 1; d < Num; d <<= 1) {
+                blockNum = (d + blockSize - 1) / blockSize;
+                offset >>= 1;
+                kernOptimizedDownSweep << <blockNum, blockSize >> > (Num, d, offset, dev_data);
+            }
+            cudaMemcpy(odata, dev_inclusive, n * sizeof(int), cudaMemcpyDeviceToHost);
+            cudaFree(dev_data);
+        }
+
+        void scanWithoutTimer(int n, int* odata, const int* idata)
+        {
+            int Num = 1 << ilog2ceil(n);
+            int blockSize = 128;
+            int blockNum = (Num + blockSize - 1) / blockSize;
+
+            int* dev_data;
+            cudaMalloc((void**)&dev_data, Num * sizeof(int));
+            cudaMemset(dev_data, 0, Num * sizeof(int));
+            cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+            int offset = 1;
+            for (int d = Num >> 1; d > 0; d >>= 1) {
+                blockNum = (d + blockSize - 1) / blockSize;
+                kernOptimizedUpSweep << <blockNum, blockSize >> > (Num, d, offset, dev_data);
+                offset <<= 1;
+            }
+            cudaDeviceSynchronize();
+            cudaMemset(dev_data + Num - 1, 0, sizeof(int));
+            for (int d = 1; d < Num; d <<= 1) {
+                blockNum = (d + blockSize - 1) / blockSize;
+                offset >>= 1;
+                kernOptimizedDownSweep << <blockNum, blockSize >> > (Num, d, offset, dev_data);
+            }
+            cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost);
+            cudaFree(dev_data);
+        }
+
+        void scanWithSharedMemory(int n, int* odata, const int* idata)
+        {
+            int Num = 1 << ilog2ceil(n);
+            int blockSize = 64;
+            if (Num <= 512)
+                blockSize = Num/2;
+            else
+                blockSize = 256;
+            int blockNum = Num  / blockSize;
+
+            int* inData;
+            int* outData;
+            int* inclusiveData;
+            cudaMalloc((void**)&inData, Num * sizeof(int));
+            cudaMalloc((void**)&outData, Num * sizeof(int));
+            cudaMalloc((void**)&inclusiveData, Num * sizeof(int));
+            cudaMemset(inData, 0, Num * sizeof(int));
+            cudaMemcpy(inData, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+            int* sumArray;
+            cudaMalloc((void**)&sumArray, blockNum * sizeof(int));
+            int* sumArrayCopy;
+            cudaMalloc((void**)&sumArrayCopy, blockNum * sizeof(int));
+
+
+            timer().startGpuTimer();
+            //run scan on each block
+            kernOptimizedPerBlockScan<<<blockNum, blockSize, blockSize * sizeof(int)>>> (outData, inData, blockSize);
+            cudaDeviceSynchronize();
+
+            //cudaMemcpy(odata, outData, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+            kernMakeInclusive << <blockNum, blockSize >> > (inclusiveData, outData, inData);
+
+            //write total sum of each block into a new array
+            kernWriteSumArray << <blockNum, blockSize >> > (sumArray, inclusiveData);
+            cudaDeviceSynchronize();
+
+            //exclusive scan
+            scanWithoutTimer(blockNum, sumArrayCopy, sumArray);
+            cudaDeviceSynchronize();
+
+            //add block increment
+            kernAddIncrement << <blockNum, blockSize >> > (inclusiveData, sumArrayCopy);
+            cudaDeviceSynchronize();
+
+            timer().endGpuTimer();
+
+            kernMakeExclusive << <blockNum, blockSize >> > (inData, inclusiveData);
+            
+            cudaMemcpy(odata, inData, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+            cudaFree(inData);
+            cudaFree(outData);
+            cudaFree(inclusiveData);
+            cudaFree(sumArray);
+            cudaFree(sumArrayCopy);
+        }
+
+        
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            //extend to pow of 2
+            int Num = 1 << ilog2ceil(n);
+            int blockSize = 128;
+            int blockNum = (Num + blockSize - 1) / blockSize;
+            
+            int* dev_data;
+            cudaMalloc((void**)&dev_data, Num * sizeof(int));
+            cudaMemset(dev_data, 0, Num * sizeof(int));
+            cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+            int offset = 1;
             timer().startGpuTimer();
             // TODO
+            ////up-sweep
+            //for (int i = 0; i <= ilog2ceil(Num); i++) {
+            //    kernUpSweep << <blockNum, blockSize >> > (Num, pow(2, i + 1), dev_data);
+            //}
+            // cudaDeviceSynchronize()
+            // cudaMemset(dev_data + Num - 1, 0, sizeof(int));
+            ////down-sweep
+            //for (int i = ilog2ceil(Num); i >= 0; i--) {
+            //    kernDownSweep << <blockNum, blockSize >> > (Num, pow(2, i + 1), dev_data);
+            //}
+            for (int d = Num >> 1; d > 0; d >>= 1) {
+                blockNum = (d + blockSize - 1) / blockSize;
+                kernOptimizedUpSweep << <blockNum, blockSize >> > (Num,d, offset, dev_data);
+                offset <<= 1;
+            }
+            cudaDeviceSynchronize();
+            cudaMemset(dev_data + Num - 1, 0, sizeof(int));
+            for (int d = 1; d < Num; d <<= 1) {
+                blockNum = (d + blockSize - 1) / blockSize;
+                offset >>= 1;
+                kernOptimizedDownSweep << <blockNum, blockSize >> > (Num, d, offset, dev_data);
+            }
             timer().endGpuTimer();
+            cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost);
+            cudaFree(dev_data);
         }
 
         /**
@@ -31,10 +333,142 @@ namespace StreamCompaction {
          * @returns      The number of elements remaining after compaction.
          */
         int compact(int n, int *odata, const int *idata) {
+            int Num = pow(2, ilog2ceil(n));
+
+            int* dev_bools;
+            int* dev_scan_Result;
+            int* dev_dataCopy;
+            int* dev_ScatterResult;
+            cudaMalloc((void**)&dev_dataCopy, Num * sizeof(int));
+            cudaMalloc((void**)&dev_bools, Num * sizeof(int));
+            cudaMalloc((void**)&dev_scan_Result, Num * sizeof(int));
+            cudaMalloc((void**)&dev_ScatterResult, Num * sizeof(int));
+            
+            cudaMemset(dev_dataCopy, 0, Num * sizeof(int));
+            cudaMemcpy(dev_dataCopy, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+            dim3 blockSize(128);
+            dim3 blockNum((n + blockSize.x - 1) / blockSize.x);
+
+
             timer().startGpuTimer();
             // TODO
+            StreamCompaction::Common::kernMapToBoolean <<<blockNum, blockSize>>>
+                (Num, dev_bools, dev_dataCopy);
+            cudaDeviceSynchronize();
+            //make a copy of bools data
+            cudaMemcpy(dev_scan_Result, dev_bools, Num * sizeof(int), cudaMemcpyDeviceToDevice);
+
+            //sweep up
+            for (int i = 0; i <= ilog2ceil(Num); i++) {
+                kernUpSweep << <blockNum, blockSize >> > (Num, pow(2, i + 1), dev_scan_Result);
+            }
+            cudaDeviceSynchronize();
+
+            //set dev_scan_Result[n-1] = 0?
+            cudaMemset(dev_scan_Result + Num - 1, 0, sizeof(int));
+
+            //down-sweep
+            for (int i = ilog2ceil(Num); i >= 0; i--) {
+                kernDownSweep <<<blockNum, blockSize >> > (Num, pow(2, i + 1), dev_scan_Result);
+            }
+            cudaDeviceSynchronize();
+            //scatter
+            StreamCompaction::Common::kernScatter << <blockNum, blockSize >> >
+                (Num, dev_ScatterResult, dev_dataCopy, dev_bools, dev_scan_Result);
+
             timer().endGpuTimer();
-            return -1;
+
+            //get num of result
+            int num = 0;
+            cudaMemcpy(&num, dev_scan_Result + Num - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            cudaMemcpy(odata, dev_ScatterResult, num * sizeof(int), cudaMemcpyDeviceToHost);
+
+            
+            cudaFree(dev_bools);
+            cudaFree(dev_dataCopy);
+            cudaFree(dev_scan_Result);
+            cudaFree(dev_ScatterResult);
+            
+            return num;
+        }
+
+        __global__ void kernBittoBool(int n, int whichbit, int* idata, int* b, int* e)
+        {
+            int index = blockIdx.x * blockDim.x + threadIdx.x;
+            if (index >= n) return;
+            int bit = (idata[index] >> whichbit) & 1;
+            b[index] = bit;
+            e[index] = !bit;
+        }
+
+        __global__ void kernComputeTarray(int n, int totalFalse, int* f, int* t)
+        {
+            int index = blockIdx.x * blockDim.x + threadIdx.x;
+            if (index >= n) return;
+            t[index] = index - f[index] + totalFalse;
+        }
+
+        __global__ void kernComputeDarray(int n, int* b, int* t, int* f, int* d)
+        {
+            int index = blockIdx.x * blockDim.x + threadIdx.x;
+            if (index >= n) return;
+            d[index] = b[index] ? t[index] : f[index];
         }
+
+        __global__ void kernScatterToOutput(int n, int* d, int* idata, int* odata)
+        {
+            int index = blockIdx.x * blockDim.x + threadIdx.x;
+            if (index >= n) return;
+            odata[d[index]] = idata[index];
+        }
+
+        void RadixSort(int n, int *odata, const int* idata )
+        {
+            int maxElement = *(std::max_element(idata, idata + n));
+            int bitNum = ilog2ceil(maxElement);
+            int* dev_idata;
+            int* dev_b;
+            int* dev_e;
+            int* dev_f;
+            int* dev_t;
+            int* dev_d;
+            int* dev_output;
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            cudaMalloc((void**)&dev_b, n * sizeof(int));
+            cudaMalloc((void**)&dev_e, n * sizeof(int));
+            cudaMalloc((void**)&dev_f, n * sizeof(int));
+            cudaMalloc((void**)&dev_t, n * sizeof(int));
+            cudaMalloc((void**)&dev_d, n * sizeof(int));
+            cudaMalloc((void**)&dev_output, n * sizeof(int));
+
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+            int blkSize = 128;
+            int blkNum = (n + blkSize - 1) / blkSize;
+
+            int totalFalse = 0;
+            timer().startGpuTimer();
+            for (int i = 0; i < bitNum; i++) {
+                //create dev_b and dev_e
+                kernBittoBool << <blkNum, blkSize >> > (n,i, dev_idata, dev_b, dev_e);
+                //scan to create dev_f
+                scanWithoutTimer(n, dev_f, dev_e);
+                //compute totalFalse --> should use memcpy
+                int e, f;
+                cudaMemcpy(&e, dev_e + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+                cudaMemcpy(&f, dev_f + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+                totalFalse = e + f;
+                //compute t array
+                kernComputeTarray << < blkNum, blkSize >> > (n, totalFalse, dev_f, dev_t);
+                //scatter
+                kernComputeDarray << < blkNum, blkSize >> > (n, dev_b, dev_t, dev_f, dev_d);
+                kernScatterToOutput << <blkNum, blkSize >> > (n, dev_d, dev_idata, dev_output);
+                std::swap(dev_output, dev_idata);
+            }
+            timer().endGpuTimer();
+            cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost);
+        }
+
     }
 }
diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h
index 803cb4f..54a2185 100644
--- a/stream_compaction/efficient.h
+++ b/stream_compaction/efficient.h
@@ -9,5 +9,9 @@ namespace StreamCompaction {
         void scan(int n, int *odata, const int *idata);
 
         int compact(int n, int *odata, const int *idata);
+
+        void RadixSort(int n, int* odata, const int* idata);
+        void scanWithSharedMemory(int n, int* odata, const int* idata);
+        void scanWithoutTimer(int n, int* odata, const int* idata);
     }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 4308876..5187297 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -3,6 +3,8 @@
 #include "common.h"
 #include "naive.h"
 
+#include "device_launch_parameters.h"
+
 namespace StreamCompaction {
     namespace Naive {
         using StreamCompaction::Common::PerformanceTimer;
@@ -12,14 +14,70 @@ namespace StreamCompaction {
             return timer;
         }
         // TODO: __global__
+        __global__ void kernNaiveScan(int d, int n, int* odata, int* idata)
+        {
+            int index = blockIdx.x * blockDim.x + threadIdx.x;
+            if (index >= n) return;
+            int pow2 = 1 << (d -1);
+            if (index >= pow2) {
+                odata[index] = idata[index - pow2] + idata[index];
+            }
+            else {
+                odata[index] = idata[index];
+            }
+        }
+
+        __host__ bool isPowerof2(int n) {
+            return (n & (n - 1)) == 0;
+        }
 
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            int* dev_array1 = nullptr;
+            int* dev_array2 = nullptr;
+            int Num = 1 << ilog2ceil(n);
+
+            cudaMalloc((void**)&dev_array1, Num * sizeof(int));
+            checkCUDAErrorFn("malloc dev_array1 failed.");
+            cudaMalloc((void**)&dev_array2, Num * sizeof(int));
+            checkCUDAErrorFn("malloc dev_array2 failed.");
+
+            //cudaMalloc((void**)&dev_array1, n * sizeof(int));
+            //checkCUDAErrorFn("malloc dev_array1 failed.");
+            //cudaMalloc((void**)&dev_array2, n * sizeof(int));
+            //checkCUDAErrorFn("malloc dev_array2 failed.");
+
+            cudaMemset(dev_array1, 0, Num * sizeof(int));
+            checkCUDAErrorFn("memset dev_array1 to 0 failed.");
+            cudaMemset(dev_array2, 0, Num * sizeof(int));
+            checkCUDAErrorFn("memset dev_array2 to 0 failed.");
+
+            cudaMemcpy(dev_array1, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAErrorFn("memcopy idata to dev_array1 failed.");
+
+            dim3 blocksize(512);
+            dim3 blockCount = (n + 512 - 1) / 512;
+
+            
+
             timer().startGpuTimer();
             // TODO
+            for (int i = 1; i <= ilog2ceil(Num); i++) {
+                kernNaiveScan <<<blockCount, blocksize >>> (i, Num, dev_array2, dev_array1);
+                cudaDeviceSynchronize();
+                std::swap(dev_array1, dev_array2);
+            }
+
             timer().endGpuTimer();
+            cudaMemcpy(odata+1, dev_array1, (n-1) * sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAErrorFn("memcopy dev_array1 to odata+1 failed.");
+            odata[0] = 0;
+            cudaFree(dev_array1);
+            checkCUDAErrorFn("free dev_array1 failed.");
+            cudaFree(dev_array2);
+            checkCUDAErrorFn("free dev_array2 failed.");
         }
     }
 }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e..2b5c325 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -18,11 +18,17 @@ namespace StreamCompaction {
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            int num = 1 << ilog2ceil(n);
+            thrust::device_vector<int> dv_in(idata, idata + num);
+            thrust::device_vector<int> dv_out(num);
+
             timer().startGpuTimer();
             // TODO use `thrust::exclusive_scan`
+            thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+            timer().endGpuTimer();
             // example: for device_vectors dv_in and dv_out:
             // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
-            timer().endGpuTimer();
+            thrust::copy(dv_out.begin(), dv_out.end(), odata);
         }
     }
 }

From 591d0d5513fcb77f13eb7fc4a34c00b57ca6b9ce Mon Sep 17 00:00:00 2001
From: Shixuan Fang <shixuanf@usc.edu>
Date: Sun, 18 Sep 2022 21:43:40 -0400
Subject: [PATCH 2/4] Update README.md

---
 README.md | 172 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 166 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 0e38ddb..cfdd2fd 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,172 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Shixuan Fang
+  * [LinkedIn](https://www.linkedin.com/in/shixuan-fang-4aba78222/)
+* Tested on: Windows 11, i7-12700k, RTX3080Ti (Personal Computer)
 
-### (TODO: Your README)
+# Overview
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+This project is mainly about **Parallel Scan(prefix sum)** and **Stream Compaction** algorithms implemented with CUDA. Scan is about computing the prefix sum of an array, and stream compaction is about deleting all elements in an array that meet certain condition. These algorithms seem to be inherently sequential at the first glance, but with GPU we can convert these algorithms into very efficient parallel algorithms.
 
+One application of Parallel Scan is Summed Area Table, which is a very important algorithm real-time rendering, especially for pre-computation. Another one is Radix Sort, which is a sorting algorithm that can run in parallel. Stream Compaction is very important in ray tracing, which can help delete unnecessory rays.
+
+# Description
+In this project, I mainly implemented these algorithms:
+- Naive GPU Scan & Stream compaction
+- Naive GPU Scan 
+- Optimized Efficient GPU Scan & Stream Compaction (which break the scanning process into up-sweep and down-sweep stages)
+- GPU Scan with Thrust Library
+- More-Efficient GPU Scan with dynamic number of blocks and threads (extra credit)
+- GPU Scan Using Shared Memory && Hardware Optimization (extra credit)
+- Radix Sort (extra credit)
+
+# Output
+
+```
+****************
+** SCAN TESTS **
+****************
+    [  37   0  28  30  42  12  17  24  24  11   7  27   6 ...  23   0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 53.0417ms    (std::chrono Measured)
+==== cpu scan, non-power-of-two ====
+   elapsed time: 53.5831ms    (std::chrono Measured)
+    passed
+==== naive scan, power-of-two ====
+   elapsed time: 9.51712ms    (CUDA Measured)
+    passed
+==== naive scan, non-power-of-two ====
+   elapsed time: 9.58666ms    (CUDA Measured)
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 8.21565ms    (CUDA Measured)
+    passed
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 7.78179ms    (CUDA Measured)
+    passed
+==== optimized work-efficient scan, power-of-two ====
+   elapsed time: 4.05126ms    (CUDA Measured)
+    passed
+==== optimized work-efficient scan, non-power-of-two ====
+   elapsed time: 4.13146ms    (CUDA Measured)
+    passed
+==== work-efficient scan with shared memory, power-of-two ====
+   elapsed time: 1.93226ms    (CUDA Measured)
+    passed
+==== work-efficient scan with shared memory, non power-of-two ====
+   elapsed time: 1.75846ms    (CUDA Measured)
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 1.15917ms    (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 0.641792ms    (CUDA Measured)
+    passed
+
+*****************************
+** Radix Sort TESTS **
+*****************************
+==== Radix sort ====
+   elapsed time: 0.641792ms    (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   2   1   2   1   3   1   2   2   2   2   3   3   2 ...   0   0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 58.6994ms    (std::chrono Measured)
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 58.559ms    (std::chrono Measured)
+    passed
+==== cpu compact with scan ====
+   elapsed time: 146.589ms    (std::chrono Measured)
+    passed
+==== work-efficient compact, power-of-two ====
+   elapsed time: 11.0158ms    (CUDA Measured)
+    passed
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 10.992ms    (CUDA Measured)
+    passed
+```
+
+
+# Performance Analysis
+
+- Scan runtime with different block size with array size = 2^25.
+
+![Performance with different block size (1)](https://user-images.githubusercontent.com/54868517/190933826-32efa973-fad8-4ba4-9508-5a614bd00540.png)
+
+
+As seen in this graph, **blockSize = 256** has the best performance for Naive, Efficient, and Efficient with shared memory, therefore these blockSize are set to **256**, **blockSize = 128** has the best performance for Optimized Efficient, which is then set to **128**
+
+
+- Compare all of these GPU Scan implementations to CPU version of Scan 
+
+![17d41e321d148779ee125d48673dcc7](https://user-images.githubusercontent.com/54868517/190935115-3d188d84-bad3-4d8c-b0e6-bf09634840b1.jpg)
+
+- In this graph, array size starts with 2^20 and ends with 2^28; generally, GPU scan algorithms are faster than CPU serial scan with array size larger than 2^16 due to great amount of parallel computing.
+- As seen clearly in this graph, other than thrust library, Efficient scan with shared memory performed the best, which is much faster than CPU scan, even cost only half time compared with Optimized Efficient scan. This implies that **global memory accessing** is one of the biggest performance bottleneck.
+- We also noticed that the Optimized Efficient Scan(with dynamic block numer/grid size) outperform the naive Efficient Scan. This implies that **the number of blocks that are launched in a kernel do affect performance**. When we update the number of blocks in Up-Sweep stage and Down-Sweep stage dynamically, the kernels can run faster.
+- Thust::exclusive_scan is always the fastest method, which is even much master than Efficient Scan with shared memory.
+ - As seen in the following screenshot from Nsight Timeline, the thrust api has ```cudaMemcpyAsync()``` and ``` cudaStreamSynchronize``` functions which are not used in my implementation. According to a stackoverflow post, ```cudaMemcpyAsync()``` will **returns control to the host immediately (just like a kernel call does) rather than waiting for the data copy to be completed.** I assume this will greatly improve the performance since we can do something else while cpu is copying data.
+![be2a7736d65f0a09e21ce97efc51d27](https://user-images.githubusercontent.com/54868517/190936662-7465d555-925c-4a09-aa21-9e67cc1f1aea.jpg)
+
+# Extra Credit
+
+**1. Optimized GPU Efficient Scan**
+```
+__global__ void kernUpSweep(int n, int stride, int* data)
+{
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= n) return;
+    if (index % stride != 0 || index + stride -1  >= n) return;
+    data[index + stride - 1] += data[index + stride / 2 - 1];
+}
+```
+For both kernUpSweep and kernDownSweep kernels used in the basic scan function, there are ```%``` operations, which are very slow to compute in GPU. After optimization, the new KernOptimizedUpSweep looks like this:
+```
+__global__ void kernOptimizedUpSweep(int n, int d, int offset, int* data)
+{
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= n) return;
+    if (index < d) {
+        int ai = offset * (2 * index + 1) - 1;
+        int bi = offset * (2 * index + 2) - 1;
+        data[bi] += data[ai];
+    }
+}
+```
+This simple change actually improved the performance a lot. 
+Another part I've changed is how many blocked are launched, this is achieved by shrinking block number by two in the for loop during up sweep stage, and expand by two during the down sweep stage.
+```
+for (int d = Num >> 1; d > 0; d >>= 1) {
+    blockNum = (d + blockSize - 1) / blockSize;
+    kernOptimizedUpSweep << <blockNum, blockSize >> > (Num, d, offset, dev_data);
+    offset <<= 1;
+}
+```
+
+**2. GPU Efficient Scan with shared memory**
+
+This is mainly achieve by the kernal ```kernOptimizedPerBlockScan```, which is adapted from GPU Gem 3 Ch 39. 
+The source code in GPU Gem 3 only works with 1 block, so I changed it to allow multiple blocks run this algorithm in parallel and then implemented the algorithm showed in class slide.
+
+**3. Radix Sort**
+
+I've also implemented Radix Sort, which can be see in ```StreamCompaction::Efficient::RadixSort()```
+
+Here is an example of how I tested it.
+```
+** Radix Sort TESTS **
+*****************************
+    [  43  34  35  24  32   8  19   0 ]
+==== Radix sort ====
+    [   0   8  19  24  32  34  35  43 ]
+==== Result from std::sort ====
+    [   0   8  19  24  32  34  35  43 ]
+   elapsed time: 0.493472ms    (CUDA Measured)
+    passed
+```

From dbf5a8eb30b859fc37af63eb9f05c0c4e99f040a Mon Sep 17 00:00:00 2001
From: Shixuan Fang <shixuanf@usc.edu>
Date: Sun, 18 Sep 2022 21:44:16 -0400
Subject: [PATCH 3/4] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index cfdd2fd..1567399 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@ One application of Parallel Scan is Summed Area Table, which is a very important
 
 # Description
 In this project, I mainly implemented these algorithms:
-- Naive GPU Scan & Stream compaction
+- Naive CPU Scan & Stream compaction
 - Naive GPU Scan 
 - Optimized Efficient GPU Scan & Stream Compaction (which break the scanning process into up-sweep and down-sweep stages)
 - GPU Scan with Thrust Library

From af41ae6741494df2377b8561599edbb2540be28b Mon Sep 17 00:00:00 2001
From: Shixuan Fang <horo-ursa@users.noreply.github.com>
Date: Sun, 18 Sep 2022 22:21:30 -0400
Subject: [PATCH 4/4] little fix

---
 src/main.cpp                   | 23 ++++++++++++----
 stream_compaction/efficient.cu | 50 ++++++++++++++++++++++++++--------
 stream_compaction/efficient.h  |  1 +
 stream_compaction/naive.cu     |  4 +--
 4 files changed, 59 insertions(+), 19 deletions(-)

diff --git a/src/main.cpp b/src/main.cpp
index accaf66..b4e0037 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,7 +13,7 @@
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 26; // feel free to change the size of array
+const int SIZE = 1 << 20; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
@@ -45,7 +45,7 @@ int main(int argc, char* argv[]) {
     printDesc("cpu scan, power-of-two");
     StreamCompaction::CPU::scan(SIZE, b, a);
     printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    printArray(SIZE, b, true);
+    //printArray(SIZE, b, true);
 
     zeroArray(SIZE, c);
     printDesc("cpu scan, non-power-of-two");
@@ -89,7 +89,21 @@ int main(int argc, char* argv[]) {
     //printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
-    //printArray(SIZE, a, true);
+    zeroArray(SIZE, c);
+    printDesc("optimized work-efficient scan, power-of-two");
+    StreamCompaction::Efficient::scanOptimized(SIZE, c, a);
+    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+    //printArray(SIZE, c, true);
+    printCmpResult(SIZE, b, c);
+
+    zeroArray(SIZE, c);
+    printDesc("optimized work-efficient scan, non-power-of-two");
+    StreamCompaction::Efficient::scanOptimized(NPOT, c, a);
+    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+    //printArray(NPOT, c, true);
+    printCmpResult(NPOT, b, c);
+
+    ////printArray(SIZE, a, true);
     zeroArray(SIZE, c);
     printDesc("work-efficient scan with shared memory, power-of-two");
     StreamCompaction::Efficient::scanWithSharedMemory(SIZE, c, a);
@@ -97,7 +111,7 @@ int main(int argc, char* argv[]) {
     //printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
-    zeroArray(SIZE, c);
+    //zeroArray(SIZE, c);
     printDesc("work-efficient scan with shared memory, non power-of-two");
     StreamCompaction::Efficient::scanWithSharedMemory(NPOT, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
@@ -126,7 +140,6 @@ int main(int argc, char* argv[]) {
     zeroArray(SIZE, c);
     printDesc("Radix sort");
     StreamCompaction::Efficient::RadixSort(SIZE, c, a);
-    printArray(SIZE, c, true);
     for (int i = 0; i < SIZE; i++) sort[i] = c[i];
     std::sort(sort, sort + SIZE);
     printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 3f97018..863044a 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -285,7 +285,7 @@ namespace StreamCompaction {
         void scan(int n, int *odata, const int *idata) {
             //extend to pow of 2
             int Num = 1 << ilog2ceil(n);
-            int blockSize = 128;
+            int blockSize = 256;
             int blockNum = (Num + blockSize - 1) / blockSize;
             
             int* dev_data;
@@ -296,19 +296,37 @@ namespace StreamCompaction {
             int offset = 1;
             timer().startGpuTimer();
             // TODO
-            ////up-sweep
-            //for (int i = 0; i <= ilog2ceil(Num); i++) {
-            //    kernUpSweep << <blockNum, blockSize >> > (Num, pow(2, i + 1), dev_data);
-            //}
-            // cudaDeviceSynchronize()
-            // cudaMemset(dev_data + Num - 1, 0, sizeof(int));
-            ////down-sweep
-            //for (int i = ilog2ceil(Num); i >= 0; i--) {
-            //    kernDownSweep << <blockNum, blockSize >> > (Num, pow(2, i + 1), dev_data);
-            //}
+            //up-sweep
+            for (int i = 0; i <= ilog2ceil(Num); i++) {
+                kernUpSweep << <blockNum, blockSize >> > (Num, pow(2, i + 1), dev_data);
+            }
+            cudaDeviceSynchronize();
+            cudaMemset(dev_data + Num - 1, 0, sizeof(int));
+            //down-sweep
+            for (int i = ilog2ceil(Num); i >= 0; i--) {
+                kernDownSweep << <blockNum, blockSize >> > (Num, pow(2, i + 1), dev_data);
+            }
+            
+            timer().endGpuTimer();
+            cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost);
+            cudaFree(dev_data);
+        }
+
+        void scanOptimized(int n, int* odata, const int* idata) {
+            int Num = 1 << ilog2ceil(n);
+            int blockSize = 128;
+            int blockNum = (Num + blockSize - 1) / blockSize;
+
+            int* dev_data;
+            cudaMalloc((void**)&dev_data, Num * sizeof(int));
+            cudaMemset(dev_data, 0, Num * sizeof(int));
+            cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+            int offset = 1;
+            timer().startGpuTimer();
             for (int d = Num >> 1; d > 0; d >>= 1) {
                 blockNum = (d + blockSize - 1) / blockSize;
-                kernOptimizedUpSweep << <blockNum, blockSize >> > (Num,d, offset, dev_data);
+                kernOptimizedUpSweep << <blockNum, blockSize >> > (Num, d, offset, dev_data);
                 offset <<= 1;
             }
             cudaDeviceSynchronize();
@@ -468,6 +486,14 @@ namespace StreamCompaction {
             }
             timer().endGpuTimer();
             cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+            cudaFree(dev_idata);
+            cudaFree(dev_b);
+            cudaFree(dev_e);
+            cudaFree(dev_f);
+            cudaFree(dev_t);
+            cudaFree(dev_d);
+            cudaFree(dev_output);
         }
 
     }
diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h
index 54a2185..715b85d 100644
--- a/stream_compaction/efficient.h
+++ b/stream_compaction/efficient.h
@@ -13,5 +13,6 @@ namespace StreamCompaction {
         void RadixSort(int n, int* odata, const int* idata);
         void scanWithSharedMemory(int n, int* odata, const int* idata);
         void scanWithoutTimer(int n, int* odata, const int* idata);
+        void scanOptimized(int n, int* odata, const int* idata);
     }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 5187297..0ebc12e 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -57,8 +57,8 @@ namespace StreamCompaction {
             cudaMemcpy(dev_array1, idata, n * sizeof(int), cudaMemcpyHostToDevice);
             checkCUDAErrorFn("memcopy idata to dev_array1 failed.");
 
-            dim3 blocksize(512);
-            dim3 blockCount = (n + 512 - 1) / 512;
+            int blocksize = 256;
+            int blockCount = (n + blocksize - 1) / blocksize;