diff --git a/README.md b/README.md index 0e38ddb..5d004e0 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,141 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +Jiajun Li -### (TODO: Your README) +LinkedIn: [link](https://www.linkedin.com/in/jiajun-li-5063a4217/) + +Tested on: Windows 10, i7-12700 @ 2.10GHz, 32GB, RTX3080 12GB + +CUDA Compute Capability: 8.6 + +## **Overview** + +In this project, different scan methods and some scan applications are implemented: + +CPU side: + 1. CPU naive scan + 2. CPU compaction using CPU naive scan + 3. CPU naive radix sort using CPU naive scan + +GPU side: + 1. GPU naive scan + 2. GPU efficient scan with threads reduction + 3. GPU efficient stream compaction using GPU efficient scan + +Full explanations of each method can be found in [GPU Gem 3 Ch 39](https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda). + +The project also includes thrust::scan as a benchmark in performance analysis. + +## **Project Setup** + +This project includes the following changes from the original project: + +1. Add ```radix_sort.h``` and ```radix_sort.cu``` to ```stream_compaction/CMakeLists``` + +2. Add radix sort test code in ```src\main.cpp``` + +## **Output example** + +``` +**************** +** SCAN TESTS ** +**************** + [ 45 11 41 13 34 22 1 22 6 5 3 5 21 ... 9 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.0519ms (std::chrono Measured) + [ 0 45 56 97 110 144 166 167 189 195 200 203 208 ... 801487 801496 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0.0519ms (std::chrono Measured) + [ 0 45 56 97 110 144 166 167 189 195 200 203 208 ... 
801379 801414 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.453632ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.403456ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.31744ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.08192ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.044032ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.045056ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 1 0 3 0 3 0 2 3 1 3 3 1 1 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.0657ms (std::chrono Measured) + [ 1 3 3 2 3 1 3 3 1 1 2 3 1 ... 3 1 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.0648ms (std::chrono Measured) + [ 1 3 3 2 3 1 3 3 1 1 2 3 1 ... 3 3 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.1507ms (std::chrono Measured) + [ 1 3 3 2 3 1 3 3 1 1 2 3 1 ... 3 1 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.17408ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.171008ms (CUDA Measured) + passed + +***************************** +** RADIX SORT TESTS ** +***************************** +==== cpu radix sort, power-of-two ==== + elapsed time: 0.4766ms (std::chrono Measured) + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 49 49 ] +==== cpu radix sort, non-power-of-two ==== + elapsed time: 0.4721ms (std::chrono Measured) + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 49 49 ] +``` + + +## **Performance Analysis** + +In all the following analysis, less time is better. + +### **Scan** + +![](img/Scan.png) + +* Work efficient scan outperforms the CPU scan when the number of elements is greater than 2^16. 
+ +* Work efficient scan roughly aligns with thrust scan when the number of elements is greater than 2^18. + +* Naive GPU scan is always slower than naive CPU scan. This is because the GPU method accesses data through global memory, which is considerably costly. + + +### **Stream Compaction** + +![](img/StreamCompaction.png) + +* Using scan will make it slower in the CPU implementation because scan introduces more iterations over the array. + +* Work efficient scan starts to outperform the CPU scan when the number of elements is greater than 2^18. + +* For work efficient scan, it performs slightly better when the number of elements is not a power of two. + +### **Radix Sort** + +![](img/RadixSort.png) + +### **Future Improvement** + +1. Implement parallel radix sort and compare it with naive radix sort. + +2. Make GPU scans even more efficient by using shared memory. -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) diff --git a/img/RadixSort.png b/img/RadixSort.png new file mode 100644 index 0000000..5f19786 Binary files /dev/null and b/img/RadixSort.png differ diff --git a/img/Scan.png b/img/Scan.png new file mode 100644 index 0000000..eb22bb6 Binary files /dev/null and b/img/Scan.png differ diff --git a/img/StreamCompaction.png b/img/StreamCompaction.png new file mode 100644 index 0000000..f24750f Binary files /dev/null and b/img/StreamCompaction.png differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..b164274 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -11,9 +11,10 @@ #include #include #include +#include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 20; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; int *c = new int[SIZE]; @@ -28,6 +29,7 @@ int main(int argc, char* argv[]) { printf("****************\n"); genArray(SIZE - 1, a, 50); // Leave a 0 at the end to 
test that edge case + a[SIZE - 1] = 0; printArray(SIZE, a, true); @@ -67,18 +69,32 @@ int main(int argc, char* argv[]) { //printArray(SIZE, c, true); printCmpResult(NPOT, b, c); + zeroArray(SIZE, c); + printDesc("naive scan with shared memory, power-of-two"); + StreamCompaction::Naive::scan2(SIZE, c, a); + printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, false); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + printDesc("naive scan with shared memory, non-power-of-two"); + StreamCompaction::Naive::scan2(NPOT, c, a); + printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(SIZE, c, true); + printCmpResult(NPOT, b, c); + zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + //printArray(SIZE, c, false); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + //printArray(NPOT, c, false); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); @@ -147,6 +163,42 @@ int main(int argc, char* argv[]) { //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + + printf("\n"); + printf("*****************************\n"); + printf("** RADIX SORT TESTS **\n"); + printf("*****************************\n"); + + genArray(SIZE - 1, a, 50); + + zeroArray(SIZE, c); + printDesc("cpu radix sort, power-of-two"); + StreamCompaction::Radix_Sort::radix_sort_cpu(SIZE, 6, c, a); + count = SIZE; + 
printElapsedTime(StreamCompaction::Radix_Sort::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(count, c, true); + + zeroArray(SIZE, c); + printDesc("cpu radix sort, non-power-of-two"); + count = NPOT; + StreamCompaction::Radix_Sort::radix_sort_cpu(NPOT, 6, c, a); + printElapsedTime(StreamCompaction::Radix_Sort::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); + printArray(count, c, true); + + zeroArray(SIZE, c); + printDesc("parallel radix sort, power-of-two"); + StreamCompaction::Radix_Sort::radix_sort_parallel(SIZE, 6, c, a); + count = SIZE; + printElapsedTime(StreamCompaction::Radix_Sort::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(count, c, true); + + zeroArray(SIZE, c); + printDesc("parallel radix sort, non-power-of-two"); + count = NPOT; + StreamCompaction::Radix_Sort::radix_sort_parallel(NPOT, 6, c, a); + printElapsedTime(StreamCompaction::Radix_Sort::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(count, c, true); + system("pause"); // stop Win32 console from closing on exit delete[] a; delete[] b; diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index 567795b..7b34ba9 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -4,6 +4,7 @@ set(headers "naive.h" "efficient.h" "thrust.h" + "radix_sort.h" ) set(sources @@ -12,6 +13,7 @@ set(sources "naive.cu" "efficient.cu" "thrust.cu" + "radix_sort.cu" ) list(SORT headers) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..86597ae 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -1,4 +1,5 @@ #include "common.h" +#include void checkCUDAErrorFn(const char *msg, const char *file, int line) { cudaError_t err = cudaGetLastError(); @@ -24,6 +25,12 @@ namespace StreamCompaction { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // 
TODO + int k = (blockDim.x * blockIdx.x) + threadIdx.x; + + if (k >= n) + return; + + bools[k] = idata[k] > 0 ? 1 : 0; } /** @@ -33,6 +40,16 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + + int k = (blockDim.x * blockIdx.x) + threadIdx.x; + + if (k >= n) + return; + + if (bools[k] > 0) + { + odata[indices[k]] = idata[k]; + } } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..4125392 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -19,10 +19,22 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + doScan(n, odata, idata); + timer().endCpuTimer(); } + void doScan(int n, int* odata, const int* idata) + { + // Exclusive scan + odata[0] = 0; + for (int i = 1; i < n; ++i) + { + odata[i] = idata[i - 1] + odata[i - 1]; + } + } + /** * CPU stream compaction without using the scan function. * @@ -31,8 +43,18 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + + int count = 0; + for (int i = 0; i < n; ++i) + { + if (idata[i] > 0) + { + odata[count++] = idata[i]; + } + } + timer().endCpuTimer(); - return -1; + return count; } /** @@ -41,10 +63,39 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { + + int* tmpInputData = new int[n]; + int* tmpOutputData = new int[n]; + timer().startCpuTimer(); - // TODO + + // Transfer idata to 0,1 set + for (int i = 0; i < n; ++i) + { + tmpInputData[i] = idata[i] > 0 ? 
1 : 0; + } + + // Exclusive scan + doScan(n, tmpOutputData, tmpInputData); + + // Final array size + int count = tmpOutputData[n - 1] + tmpInputData[n - 1]; + + // Scatter + for (int i = 0; i < n; ++i) + { + if (tmpInputData[i] > 0) + { + odata[tmpOutputData[i]] = idata[i]; + } + } + timer().endCpuTimer(); - return -1; + + delete[] tmpInputData; + delete[] tmpOutputData; + + return count; } } } diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h index 873c047..859a092 100644 --- a/stream_compaction/cpu.h +++ b/stream_compaction/cpu.h @@ -6,6 +6,8 @@ namespace StreamCompaction { namespace CPU { StreamCompaction::Common::PerformanceTimer& timer(); + void doScan(int n, int* odata, const int* idata); + void scan(int n, int *odata, const int *idata); int compactWithoutScan(int n, int *odata, const int *idata); diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..b348226 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -1,6 +1,8 @@ #include #include +#include #include "common.h" +#include #include "efficient.h" namespace StreamCompaction { @@ -12,13 +14,142 @@ namespace StreamCompaction { return timer; } + __global__ void kernUpSweep2(int n, int d, int st, int* odata) + { + // This uses less threads than kernUpSweep() + + int index = (blockDim.x * blockIdx.x) + threadIdx.x; + + int gap = 1 << d; + + int readIndex = st + 2 * gap * index; + int writeIndex = readIndex + gap; + + if (writeIndex < n) + { + odata[writeIndex] += odata[readIndex]; + } + + __syncthreads(); + } + + __global__ void kernDownSweep2(int n, int d, int* odata) + { + int index = (blockDim.x * blockIdx.x) + threadIdx.x; + + int gap = 1 << d; + + int rightIndex = n - 1 - 2 * gap * index; + int leftIndex = rightIndex - gap; + + if (leftIndex >= 0) + { + int tmp = odata[leftIndex]; + odata[leftIndex] = odata[rightIndex]; + odata[rightIndex] += tmp; + } + + __syncthreads(); + } + + __global__ void kernUpSweep(int n, int d, 
int* odata) + { + int k = (blockDim.x * blockIdx.x) + threadIdx.x; + + int powd = 1 << d; + int powd2 = 1 << (d + 1); + + if (k % powd2 == 0) + { + odata[k + powd2 - 1] += odata[k + powd - 1]; + } + } + + __global__ void kernDownSweep(int n, int d, int* odata) + { + int k = (blockDim.x * blockIdx.x) + threadIdx.x; + + if (k >= n) + return; + + int powd = 1 << d; + int powd2 = 1 << (d + 1); + + if (k % powd2 == 0) + { + int tmp = odata[k + powd - 1]; + odata[k + powd - 1] = odata[k + powd2 - 1]; + odata[k + powd2 - 1] += tmp; + } + } + + void doScan(int n, int* dev_odata) + { + // Exclusive scan + + int round_n = 1 << ilog2ceil(n); + + int BLOCK_SIZE = 128; + dim3 fullBlocksPerGrid((round_n + BLOCK_SIZE - 1) / BLOCK_SIZE); + + // Up sweep 2 (Halve thread count in each iteration) + int st = 0; + int round_n_up_sweep = round_n / 2; + for (int d = 0; d <= ilog2ceil(round_n) - 1; ++d) + { + dim3 fullBlocksPerGridUpSweep((round_n_up_sweep + BLOCK_SIZE - 1) / BLOCK_SIZE); + kernUpSweep2 <<>> (round_n, d, st, dev_odata); + checkCUDAError("kernUpSweep fails."); + st += (1 << d); + round_n_up_sweep /= 2; + } + // Down sweep 2 (Double thread count in each iteration) + int round_n_down_sweep = 1; + cudaMemset(dev_odata + round_n - 1, 0, sizeof(int)); + for (int d = ilog2ceil(round_n) - 1; d >= 0; --d) + { + dim3 fullBlocksPerGridDownSweep((round_n_down_sweep + BLOCK_SIZE - 1) / BLOCK_SIZE); + kernDownSweep2 <<>> (round_n, d, dev_odata); + checkCUDAError("kernDownSweep fails."); + round_n_down_sweep *= 2; + } + + //// Up sweep + //for (int d = 0; d <= ilog2ceil(round_n) - 1; ++d) + //{ + // kernUpSweep <<>> (round_n, d, dev_odata); + // checkCUDAError("kernUpSweep fails."); + //} + //// Down sweep + //cudaMemset(dev_odata + round_n - 1, 0, sizeof(int)); + //for (int d = ilog2ceil(round_n) - 1; d >= 0; --d) + //{ + // kernDownSweep << > > (round_n, d, dev_odata); + // checkCUDAError("kernDownSweep fails."); + //} + } + /** * Performs prefix-sum (aka scan) on idata, storing the 
result into odata. */ void scan(int n, int *odata, const int *idata) { + + // Round to 'balanced tree' + int round_n = 1 << ilog2ceil(n); + + // Device memory setup + int* dev_odata; + cudaMalloc((void**)&dev_odata, round_n * sizeof(int)); + cudaMemset(dev_odata, 0, round_n * sizeof(int)); + cudaMemcpy(dev_odata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); + doScan(n, dev_odata); + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_odata); } /** @@ -31,10 +162,58 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { + + int round_n = 1 << ilog2ceil(n); + + // Config + int BLOCK_SIZE = 128; + dim3 fullBlocksPerGrid((round_n + BLOCK_SIZE - 1) / BLOCK_SIZE); + + // Device Memory setup + int* dev_idata; + int* dev_odata_bool; + int* dev_odata_scan; + + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata_bool, round_n * sizeof(int)); + cudaMalloc((void**)&dev_odata_scan, round_n * sizeof(int)); + + cudaMemset(dev_odata_bool, 0, round_n * sizeof(int)); + cudaMemset(dev_odata_scan, 0, round_n * sizeof(int)); + + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + + // Mark 1 0 + Common::kernMapToBoolean<<>>(round_n, dev_odata_bool, dev_idata); + + // Scan + cudaMemcpy(dev_odata_scan, dev_odata_bool, n * sizeof(int), cudaMemcpyDeviceToDevice); + doScan(n, dev_odata_scan); + + // Read total number from scan result + int* tmp = new int[2]; + cudaMemcpy(tmp, dev_odata_scan + n - 1, 1 * sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(tmp + 1, dev_odata_bool + n - 1, 1 * sizeof(int), cudaMemcpyDeviceToHost); + int count = tmp[0] + tmp[1]; + delete[] tmp; + + // Scatter + int* dev_odata; + cudaMalloc((void**)&dev_odata, count * sizeof(int)); + Common::kernScatter 
<<>>(n, dev_odata, dev_idata, dev_odata_bool, dev_odata_scan); + timer().endGpuTimer(); - return -1; + + cudaMemcpy(odata, dev_odata, count * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_odata); + + cudaFree(dev_idata); + cudaFree(dev_odata_bool); + cudaFree(dev_odata_scan); + + return count; } } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4f..5485f24 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -6,7 +6,9 @@ namespace StreamCompaction { namespace Efficient { StreamCompaction::Common::PerformanceTimer& timer(); - void scan(int n, int *odata, const int *idata); + void doScan(int n, int* dev_odata); + + void scan(int n, int* odata, const int* idata); int compact(int n, int *odata, const int *idata); } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..96807fe 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -1,5 +1,7 @@ #include #include +#include +#include #include "common.h" #include "naive.h" @@ -11,7 +13,22 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - // TODO: __global__ + + __global__ void kernScan(int n, int d, int* odata, int *idata) + { + // Inclusive scan + + int k = (blockDim.x * blockIdx.x) + threadIdx.x; + + if (k >= n) + return; + + int powd = 1 << (d - 1); // More efficient than pow() + if (k >= powd) + { + odata[k] = idata[k - powd] + idata[k]; + } + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
@@ -19,6 +36,99 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startGpuTimer(); // TODO + + // Config + int BLOCK_SIZE = 512; // This out performs BLOCK_SIZE = 128 in large data sets + dim3 fullBlocksPerGrid((n + BLOCK_SIZE - 1) / BLOCK_SIZE); + + // 2 Buffers + int* dev_idata; + int* dev_odata; + + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + cudaMemcpy(dev_odata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + // Scan + for (int d = 1; d <= ilog2ceil(n); ++d) + { + kernScan<<>>(n, d, dev_odata, dev_idata); + checkCUDAError("kernScan fails."); + cudaMemcpy(dev_idata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToDevice); + } + + + // Inclusive to exclusive (shift right, insert identity) + cudaMemcpy(odata + 1, dev_odata, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost); + odata[0] = 0; + + cudaFree(dev_idata); + cudaFree(dev_odata); + + timer().endGpuTimer(); + } + + + __global__ void kernScan2(int n, int* odata, int* idata) + { + // This method should be only used for one block and small data sets + + extern __shared__ int temp[]; // allocated on invocation + int thid = threadIdx.x; + int pout = 0, pin = 1; + + // Load input into shared memory. + // This is exclusive scan, so shift right by one + // and set first element to 0 + temp[pout * n + thid] = (thid > 0) ? idata[thid - 1] : 0; + __syncthreads(); + + for (int offset = 1; offset < n; offset *= 2) + { + pout = 1 - pout; // swap double buffer indices + pin = 1 - pout; + + if (thid >= offset) + temp[pout * n + thid] += temp[pin * n + thid - offset]; + else + temp[pout * n + thid] = temp[pin * n + thid]; + __syncthreads(); + } + + odata[thid] = temp[pout * n + thid]; // write output + } + + /** + * Performs prefix-sum (aka scan) on idata using shared memory, storing the result into odata. 
+ */ + void scan2(int n, int* odata, const int* idata) + { + timer().startGpuTimer(); + + if (n > 1024) + { + std::cout << "n > 1024, navie scan with shared memory failed." << std::endl; + timer().endGpuTimer(); + return; + } + + int* dev_idata; + int* dev_odata; + + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + kernScan2<<<1, n, 2 * n * sizeof(int)>>>(n, dev_odata, dev_idata); + + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_idata); + cudaFree(dev_odata); + timer().endGpuTimer(); } } diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h index 37dcb06..612b00e 100644 --- a/stream_compaction/naive.h +++ b/stream_compaction/naive.h @@ -6,6 +6,10 @@ namespace StreamCompaction { namespace Naive { StreamCompaction::Common::PerformanceTimer& timer(); + __global__ void kernScan(int n, int d, int* odata, int* idata); + void scan(int n, int *odata, const int *idata); + + void scan2(int n, int* odata, const int* idata); } } diff --git a/stream_compaction/radix_sort.cu b/stream_compaction/radix_sort.cu new file mode 100644 index 0000000..5204126 --- /dev/null +++ b/stream_compaction/radix_sort.cu @@ -0,0 +1,168 @@ +#include +#include +#include +#include +#include "radix_sort.h" +#include "cpu.h" +#include "efficient.h" +#include + +#include "common.h" + +namespace StreamCompaction +{ + namespace Radix_Sort + { + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; + } + + __device__ __host__ int checkDigit(int num, int whichDigit) + { + return (num >> whichDigit) & 1; + } + + void radix_sort_cpu(int n, int digitMax, int* odata, const int* idata) + { + int* oArray = new int[n]; // A copy of odata for scattering + int* eArray = new int[n]; + int* fArray = new int[n]; + int* tArray = new int[n]; + + // Init odata + 
memcpy(odata, idata, n * sizeof(int)); + + timer().startCpuTimer(); + + for (int i = 0; i < digitMax; ++i) + { + // Save the orignal odata + memcpy(oArray, odata, n * sizeof(int)); + + // Build eArray + for (int j = 0; j < n; ++j) + { + eArray[j] = checkDigit(oArray[j], i) == 1 ? 0 : 1; + } + + // Build fArray by scaning eArray + CPU::doScan(n, fArray, eArray); + + // Scatter data by d + int d; + int totalFalses = fArray[n - 1] + eArray[n - 1]; + for (int j = 0; j < n; ++j) + { + if (eArray[j] == 0) // b[j] == 1 + { + d = j - fArray[j] + totalFalses; // d[j] = t[j] + } + else + { + d = fArray[j]; // d[j] = f[j] + } + odata[d] = oArray[j]; + } + } + + timer().endCpuTimer(); + + delete[] oArray; + delete[] eArray; + delete[] fArray; + delete[] tArray; + } + + + + + __global__ void kernBuildErrorArray(int n, int whichOne, int* eArray, int* idata) + { + int index = (blockDim.x * blockIdx.x) + threadIdx.x; + + if (index >= n) + return; + + eArray[index] = 1 - checkDigit(idata[index], whichOne); + } + + __global__ void kernSplit(int n, int totalFalses, int* odata, int *idata, int *fArray, int* eArray) + { + int index = (blockDim.x * blockIdx.x) + threadIdx.x; + + if (index >= n) + return; + + int d = -1; + if (eArray[index] == 0) + { + d = index - fArray[index] + totalFalses; + } + else + { + d = fArray[index]; + } + + odata[d] = idata[index]; + } + + void radix_sort_parallel(int n, int digitMax, int* odata, int* idata) + { + int round_n = 1 << ilog2ceil(n); + + // Configuration + int BLOCK_SIZE = 128; + dim3 fullBlocksPerGrid((n + BLOCK_SIZE - 1) / BLOCK_SIZE); + + int* dev_eArray; + int* dev_fArray; + int* dev_odata; + int* dev_odata2; + int* tmp = new int[2]; + + cudaMalloc((void**)&dev_eArray, round_n * sizeof(int)); + cudaMalloc((void**)&dev_fArray, round_n * sizeof(int)); + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + cudaMalloc((void**)&dev_odata2, n * sizeof(int)); + + cudaMemset(dev_eArray, 0, round_n * sizeof(int)); + cudaMemcpy(dev_odata, idata, n * 
sizeof(int), cudaMemcpyHostToDevice); + + timer().startGpuTimer(); + + int totalFalses = -1; + for (int i = 0; i < digitMax; ++i) + { + cudaMemcpy(dev_odata2, dev_odata, n * sizeof(int), cudaMemcpyDeviceToDevice); + + // Build eArray + kernBuildErrorArray<<>>(n, i, dev_eArray, dev_odata2); + + // Build fArray + cudaMemcpy(dev_fArray, dev_eArray, round_n * sizeof(int), cudaMemcpyDeviceToDevice); + Efficient::doScan(round_n, dev_fArray); + + // Read totalfalses + cudaMemcpy(tmp, dev_fArray + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(tmp + 1, dev_eArray + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + totalFalses = tmp[0] + tmp[1]; + + // Scatter data + kernSplit<<>>(n, totalFalses, dev_odata, dev_odata2, dev_fArray, dev_eArray); + } + + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + + delete[] tmp; + cudaFree(dev_eArray); + cudaFree(dev_fArray); + cudaFree(dev_odata); + cudaFree(dev_odata2); + } + } +} diff --git a/stream_compaction/radix_sort.h b/stream_compaction/radix_sort.h new file mode 100644 index 0000000..eefd062 --- /dev/null +++ b/stream_compaction/radix_sort.h @@ -0,0 +1,15 @@ +#pragma once + +#include "common.h" + +namespace StreamCompaction { + namespace Radix_Sort { + StreamCompaction::Common::PerformanceTimer& timer(); + + int checkDigit(int num, int whichDigit); + + void radix_sort_parallel(int n, int digitMax, int* odata, int* idata); + + void radix_sort_cpu(int n, int digitMax, int* odata, const int* idata); + } +} diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..928de63 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,25 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { + + thrust::host_vector host_in; + + for (int i = 0; i < n; ++i) + { + host_in.push_back(idata[i]); + } + + thrust::device_vector dv_in = host_in; + thrust::device_vector dv_out(n); + timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); timer().endGpuTimer(); + + for (int i = 0; i < n; ++i) + { + odata[i] = dv_out[i]; + } } } }