diff --git a/CMakeLists.txt b/CMakeLists.txt
index d3d976c..0c5e133 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -86,10 +86,4 @@ target_link_libraries(${CMAKE_PROJECT_NAME}
     ${CORELIBS}
     )
 
-add_custom_command(
-    TARGET ${CMAKE_PROJECT_NAME}
-    POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E copy_directory
-        ${CMAKE_SOURCE_DIR}/shaders
-        ${CMAKE_BINARY_DIR}/shaders
-    )
+
diff --git a/README.md b/README.md
index 110697c..2dcb27f 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,99 @@
 CUDA Path Tracer
 ================
 
-**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 3**
+**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Yu Sun 
+* [LinkedIn](https://www.linkedin.com/in/yusun3/)
+* Tested on: Windows 10, i7-6700HQ CPU @ 2.60GHz × 8, GeForce GTX 960M/PCIe/SSE2, 7.7GB Memory (Personal Laptop)
 
-### (TODO: Your README)
+## Introduction
 
-*DO NOT* leave the README to the last minute! It is a crucial part of the
-project, and we will not be able to grade you without a good README.
+This project implements a basic CUDA path tracer that renders and shades static scenes. The tracer can shade objects with
+diffuse, specular, and refractive surfaces lit by the light sources in the scene.
+
+Below is a very simple scene demonstrating the effect.
+![](img/scene.png)
+![](img/closed.png)
+
+The techniques used to make the shading process faster are caching the first bounce, sorting by material id, and
+stream compaction. Caching helps because the first bounce of each ray is almost always the same from one iteration to the next. Sorting by material id improves the
+memory access pattern of the threads, and stream compaction reduces the amount of computation that needs to be done.
+
+Additional features include anti-aliasing, depth of field, and motion blur.
+
+***Anti-aliasing*** 
+
+Anti-aliasing is a simple technique: each camera ray is jittered by a small random offset within its pixel, so object boundaries look smoother and more natural.
+
+An example of a scene with and without anti-aliasing is shown below. Notice how jagged the edges look in the scene without anti-aliasing.
+
+Scene with anti-aliasing       | Scene without anti-aliasing
+:-------------------------:|:-------------------------:
+![](img/white.png)  |  ![](img/alias.png)
+
+
+***Depth of Field*** 
+
+Depth of field is achieved with a camera model that has a configurable focal length and lens size. Lens positions are chosen with a technique called concentric disk sampling, which
+is explained in detail in the book Physically Based Rendering. The results are shown below:
+
+Focal Length = 10      | Focal Length = 15
+:-------------------------:|:-------------------------:
+![](img/focal10.png)  |  ![](img/focal15.png)
+
+Lens Size = 0.2      | Lens Size = 0.5
+:-------------------------:|:-------------------------:
+![](img/lensize.png)  |  ![](img/focal15.png)
+
+
+***Motion Blur***
+
+Motion blur is achieved by sampling a moving object at different positions along its path. The movement speed affects how the object is captured by the camera.
+
+Speed = 1      | Speed = 2
+:-------------------------:|:-------------------------:
+![](img/speed1.png)  |  ![](img/speed2.png)
+
+
+
+***Refraction*** 
+
+Refraction is implemented using Snell's law. It makes objects look more realistic by letting light pass through them.
+
+Without refraction      | With Refraction
+:-------------------------:|:-------------------------:
+![](img/no_refract.png)  |  ![](img/scene.png)
+
+
+## Analysis 
+
+As mentioned briefly above, stream compaction helps reduce the amount of computation. Below is a more concrete analysis of how the number of rays is reduced by stream compaction. Notice the difference between the closed scene and the open scene.
+
+![](img/sc.png)
+
+This makes sense: in a closed scene a ray needs to bounce more times before it hits a light source and terminates, while in an open scene a ray can terminate early by escaping the scene.
+
+It can also be seen that, in general, the algorithm runs faster on the open scene.
+
+![](img/time.png)
+
+Caching the first bounce also improves the speed, as shown below.
+
+However, caching can no longer be used when the rays are randomized, as they are when creating the anti-aliasing or motion blur effects.
+
+On the other hand, sorting can still be used to increase the speed. However, although sorting can improve performance in theory, in practice
+one needs to consider its overhead, as sorting comes at a cost.
+
+Closed Scene      | Open scene 
+:-------------------------:|:-------------------------:
+![](img/sort_closed.png)  |  ![](img/sort_open.png)
+
+In general, sorting is more helpful in the open scene because in the closed scene the rays rarely terminate. Therefore, the gain from making the memory accesses contiguous
+may not be that large.
+
+## References
+
+* [PBRT] Physically Based Rendering, Second Edition: From Theory To Implementation. Pharr, Matt and Humphreys, Greg. 2010.
+* Wikipedia 
 
diff --git a/img/alias.png b/img/alias.png
new file mode 100644
index 0000000..95615a1
Binary files /dev/null and b/img/alias.png differ
diff --git a/img/cache.png b/img/cache.png
new file mode 100644
index 0000000..16ce36a
Binary files /dev/null and b/img/cache.png differ
diff --git a/img/closed.png b/img/closed.png
new file mode 100644
index 0000000..0853521
Binary files /dev/null and b/img/closed.png differ
diff --git a/img/cornell.2018-10-01_00-01-49z.195samp.png b/img/cornell.2018-10-01_00-01-49z.195samp.png
new file mode 100644
index 0000000..33c33a3
Binary files /dev/null and b/img/cornell.2018-10-01_00-01-49z.195samp.png differ
diff --git a/img/focal10.png b/img/focal10.png
new file mode 100644
index 0000000..d5f5d0f
Binary files /dev/null and b/img/focal10.png differ
diff --git a/img/focal15.png b/img/focal15.png
new file mode 100644
index 0000000..ef2f0ab
Binary files /dev/null and b/img/focal15.png differ
diff --git a/img/lensize.png b/img/lensize.png
new file mode 100644
index 0000000..4fbac03
Binary files /dev/null and b/img/lensize.png differ
diff --git a/img/no_refract.png b/img/no_refract.png
new file mode 100644
index 0000000..1463cf1
Binary files /dev/null and b/img/no_refract.png differ
diff --git a/img/purple.png b/img/purple.png
new file mode 100644
index 0000000..7e3f8fb
Binary files /dev/null and b/img/purple.png differ
diff --git a/img/sc.png b/img/sc.png
new file mode 100644
index 0000000..d98912f
Binary files /dev/null and b/img/sc.png differ
diff --git a/img/scene.png b/img/scene.png
new file mode 100644
index 0000000..ac3a3c0
Binary files /dev/null and b/img/scene.png differ
diff --git a/img/sort_closed.png b/img/sort_closed.png
new file mode 100644
index 0000000..c238c29
Binary files /dev/null and b/img/sort_closed.png differ
diff --git a/img/sort_open.png b/img/sort_open.png
new file mode 100644
index 0000000..1c59ffc
Binary files /dev/null and b/img/sort_open.png differ
diff --git a/img/speed1.png b/img/speed1.png
new file mode 100644
index 0000000..b707a36
Binary files /dev/null and b/img/speed1.png differ
diff --git a/img/speed2.png b/img/speed2.png
new file mode 100644
index 0000000..e82dc49
Binary files /dev/null and b/img/speed2.png differ
diff --git a/img/time.png b/img/time.png
new file mode 100644
index 0000000..bbea4c6
Binary files /dev/null and b/img/time.png differ
diff --git a/img/white.png b/img/white.png
new file mode 100644
index 0000000..ece997e
Binary files /dev/null and b/img/white.png differ
diff --git a/scenes/cornell.txt b/scenes/cornell.txt
index 83ff820..5fdc76e 100644
--- a/scenes/cornell.txt
+++ b/scenes/cornell.txt
@@ -40,9 +40,9 @@ EMITTANCE   0
 
 // Specular white
 MATERIAL 4
-RGB         .98 .98 .98
+RGB         .98 0.5 .98
 SPECEX      0
-SPECRGB     .98 .98 .98
+SPECRGB     .98 .5 .98
 REFL        1
 REFR        0
 REFRIOR     0
diff --git a/scenes/cornell_blur.txt b/scenes/cornell_blur.txt
new file mode 100644
index 0000000..7bae0c8
--- /dev/null
+++ b/scenes/cornell_blur.txt
@@ -0,0 +1,171 @@
+// Emissive material (light)
+MATERIAL 0
+RGB         1 1 1
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   5
+
+// Diffuse white
+MATERIAL 1
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse red
+MATERIAL 2
+RGB         .85 .35 .35
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse green
+MATERIAL 3
+RGB         .35 .85 .35
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Specular white
+MATERIAL 4
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     .98 .98 .98
+REFL        1
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Camera
+CAMERA
+RES         800 800
+FOVY        45
+ITERATIONS  5000
+DEPTH       8
+FILE        cornell
+EYE         0.0 5 10.5
+LOOKAT      0 5 0
+UP          0 1 0
+LENSIZE 0.5
+FOCALLEN 15
+
+
+// Ceiling light
+OBJECT 0
+cube
+material 0
+TRANS       0 10 0
+ROTAT       0 0 0
+SCALE       3 .3 3
+SPEED 0  
+
+// Floor
+OBJECT 1
+cube
+material 1
+TRANS       0 0 0
+ROTAT       0 0 0
+SCALE       10 .01 10
+SPEED 0  
+
+// Ceiling
+OBJECT 2
+cube
+material 1
+TRANS       0 10 0
+ROTAT       0 0 90
+SCALE       .01 10 10
+SPEED 0   
+
+// Back wall
+OBJECT 3
+cube
+material 1
+TRANS       0 5 -15
+ROTAT       0 90 0
+SCALE       .01 10 10
+SPEED 0 
+
+// Left wall
+OBJECT 4
+cube
+material 2
+TRANS       -5 5 0
+ROTAT       0 0 0
+SCALE       .01 10 10
+SPEED 0
+
+// Right wall
+OBJECT 5
+cube
+material 3
+TRANS       5 5 0
+ROTAT       0 0 0
+SCALE       .01 10 10
+SPEED 0
+
+// Sphere
+OBJECT 6
+sphere
+material 4
+TRANS       -1 5 -2
+ROTAT       0 0 0
+SCALE       2 2 2
+SPEED 1.8
+
+// Ceiling light
+OBJECT 7
+cube
+material 0
+TRANS       0 10 -10
+ROTAT       0 0 0
+SCALE       3 .3 3
+SPEED 0  
+
+// Floor
+OBJECT 8
+cube
+material 1
+TRANS       0 0 -10
+ROTAT       0 0 0
+SCALE       10 .01 10
+SPEED 0  
+
+// Ceiling
+OBJECT 9
+cube
+material 1
+TRANS       0 10 -10
+ROTAT       0 0 90
+SCALE       .01 10 10
+SPEED 0  
+
+// Left wall
+OBJECT 10
+cube
+material 2
+TRANS       -5 5 -10
+ROTAT       0 0 0
+SCALE       .01 10 10
+SPEED 0  
+
+// Right wall
+OBJECT 11
+cube
+material 3
+TRANS       5 5 -10
+ROTAT       0 0 0
+SCALE       .01 10 10
+SPEED 0  
\ No newline at end of file
diff --git a/scenes/cornell_closed.txt b/scenes/cornell_closed.txt
new file mode 100644
index 0000000..5b79813
--- /dev/null
+++ b/scenes/cornell_closed.txt
@@ -0,0 +1,125 @@
+// Emissive material (light)
+MATERIAL 0
+RGB         1 1 1
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   5
+
+// Diffuse white
+MATERIAL 1
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse red
+MATERIAL 2
+RGB         .85 .35 .35
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse green
+MATERIAL 3
+RGB         .35 .85 .35
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Specular pink (tinted mirror)
+MATERIAL 4
+RGB         .98 0.5 .98
+SPECEX      0
+SPECRGB     .98 .5 .98
+REFL        1
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Camera
+CAMERA
+RES         800 800
+FOVY        45
+ITERATIONS  5000
+DEPTH       8
+FILE        cornell
+EYE         0.0 5 0
+LOOKAT      0 5 -5
+UP          0 1 0
+
+
+// Ceiling light
+OBJECT 0
+cube
+material 0
+TRANS       0 10 0
+ROTAT       0 0 0
+SCALE       3 .3 3
+
+// Floor
+OBJECT 1
+cube
+material 1
+TRANS       0 0 0
+ROTAT       0 0 0
+SCALE       10 .01 10
+
+// Ceiling
+OBJECT 2
+cube
+material 1
+TRANS       0 10 0
+ROTAT       0 0 90
+SCALE       .01 10 10
+
+// Back wall
+OBJECT 3
+cube
+material 1
+TRANS       0 5 -5
+ROTAT       0 90 0
+SCALE       .01 10 10
+
+// Left wall
+OBJECT 4
+cube
+material 2
+TRANS       -5 5 0
+ROTAT       0 0 0
+SCALE       .01 10 10
+
+// Right wall
+OBJECT 5
+cube
+material 3
+TRANS       5 5 0
+ROTAT       0 0 0
+SCALE       .01 10 10
+
+// Sphere
+OBJECT 6
+sphere
+material 4
+TRANS       -1 4 -5
+ROTAT       0 0 0
+SCALE       1 1 1
+
+// Closed wall
+OBJECT 7
+cube
+material 1
+TRANS       0 5 5
+ROTAT       0 90 0
+SCALE       .01 10 10
diff --git a/scenes/cornell_dof.txt b/scenes/cornell_dof.txt
new file mode 100644
index 0000000..e708298
--- /dev/null
+++ b/scenes/cornell_dof.txt
@@ -0,0 +1,191 @@
+// Emissive material (light)
+MATERIAL 0
+RGB         1 1 1
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   5
+
+// Diffuse white
+MATERIAL 1
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse red
+MATERIAL 2
+RGB         .85 .35 .35
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse green
+MATERIAL 3
+RGB         .35 .85 .35
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Specular white
+MATERIAL 4
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     .98 .98 .98
+REFL        1
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Camera
+CAMERA
+RES         800 800
+FOVY        45
+ITERATIONS  5000
+DEPTH       8
+FILE        cornell
+EYE         0.0 5 10.5
+LOOKAT      0 5 0
+UP          0 1 0
+LENSIZE 0.5
+FOCALLEN 15
+
+
+// Ceiling light
+OBJECT 0
+cube
+material 0
+TRANS       0 10 0
+ROTAT       0 0 0
+SCALE       3 .3 3
+
+// Floor
+OBJECT 1
+cube
+material 1
+TRANS       0 0 0
+ROTAT       0 0 0
+SCALE       10 .01 10
+
+// Ceiling
+OBJECT 2
+cube
+material 1
+TRANS       0 10 0
+ROTAT       0 0 90
+SCALE       .01 10 10
+
+// Back wall
+OBJECT 3
+cube
+material 1
+TRANS       0 5 -15
+ROTAT       0 90 0
+SCALE       .01 10 10
+
+// Left wall
+OBJECT 4
+cube
+material 2
+TRANS       -5 5 0
+ROTAT       0 0 0
+SCALE       .01 10 10
+
+// Right wall
+OBJECT 5
+cube
+material 3
+TRANS       5 5 0
+ROTAT       0 0 0
+SCALE       .01 10 10
+
+// Sphere
+OBJECT 6
+sphere
+material 4
+TRANS       -1 1.5 -2
+ROTAT       0 0 0
+SCALE       2 2 2
+
+// Ceiling light
+OBJECT 7
+cube
+material 0
+TRANS       0 10 -10
+ROTAT       0 0 0
+SCALE       3 .3 3
+
+// Floor
+OBJECT 8
+cube
+material 1
+TRANS       0 0 -10
+ROTAT       0 0 0
+SCALE       10 .01 10
+
+// Ceiling
+OBJECT 9
+cube
+material 1
+TRANS       0 10 -10
+ROTAT       0 0 90
+SCALE       .01 10 10
+
+// Left wall
+OBJECT 10
+cube
+material 2
+TRANS       -5 5 -10
+ROTAT       0 0 0
+SCALE       .01 10 10
+
+// Right wall
+OBJECT 11
+cube
+material 3
+TRANS       5 5 -10
+ROTAT       0 0 0
+SCALE       .01 10 10
+
+// Sphere
+OBJECT 12
+sphere
+material 4
+TRANS       -1 1.5 2
+ROTAT       0 0 0
+SCALE       2 2 2
+
+// Sphere
+OBJECT 13
+sphere
+material 4
+TRANS       -1 1.5 -6
+ROTAT       0 0 0
+SCALE       2 2 2
+
+// Sphere
+OBJECT 14
+sphere
+material 4
+TRANS       -1 1.5 -9
+ROTAT       0 0 0
+SCALE       2 2 2
+
+// Sphere
+OBJECT 15
+sphere
+material 4
+TRANS       -1 1.5 -12
+ROTAT       0 0 0
+SCALE       2 2 2
diff --git a/scenes/cornell_refract.txt b/scenes/cornell_refract.txt
new file mode 100644
index 0000000..ca689d1
--- /dev/null
+++ b/scenes/cornell_refract.txt
@@ -0,0 +1,165 @@
+// Emissive material (light)
+MATERIAL 0
+RGB 1 1 1
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 5
+
+// Diffuse white
+MATERIAL 1
+RGB .98 .98 .98
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Diffuse red
+MATERIAL 2
+RGB .85 .35 .35
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Diffuse green
+MATERIAL 3
+RGB .35 .85 .35
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Specular white
+MATERIAL 4
+RGB .98 .98 .98
+SPECEX 0
+SPECRGB .98 .98 .98
+REFL 1
+REFR 0
+REFRIOR 0
+EMITTANCE 0
+
+// Emissive material (light)
+MATERIAL 5
+RGB 0.98 .65 0
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 5
+
+// Refractive white
+MATERIAL 6
+RGB .98 .98 .98
+SPECEX 0
+SPECRGB .98 .98 .98
+REFL 0
+REFR 1
+REFRIOR 1.5
+EMITTANCE 0
+
+// Emissive material (light)
+MATERIAL 7
+RGB 0 0.7 0.98
+SPECEX 0
+SPECRGB 0 0 0
+REFL 0
+REFR 0
+REFRIOR 0
+EMITTANCE 2.5
+
+// Camera
+CAMERA
+RES 800 800
+FOVY 45
+ITERATIONS 5000
+DEPTH 8
+FILE cornell
+EYE 0.0 5 10.5
+LOOKAT 0 5 0
+UP 0 1 0
+DOFX	0
+DOFY	10
+
+
+// Ceiling light
+OBJECT 0
+cube
+material 0
+TRANS 0 10 0
+ROTAT 0 0 0
+SCALE 3 .3 3
+
+// Floor
+OBJECT 1
+cube
+material 1
+TRANS 0 0 0
+ROTAT 0 0 0
+SCALE 10 .01 10
+
+// Ceiling
+OBJECT 2
+cube
+material 1
+TRANS 0 10 0
+ROTAT 0 0 90
+SCALE .01 10 10
+
+// Back wall
+OBJECT 3
+cube
+material 1
+TRANS 0 5 -5
+ROTAT 0 90 0
+SCALE .01 10 10
+
+// Left wall
+OBJECT 4
+cube
+material 2
+TRANS -5 5 0
+ROTAT 0 0 0
+SCALE .01 10 10
+
+// Right wall
+OBJECT 5
+cube
+material 3
+TRANS 5 5 0
+ROTAT 0 0 0
+SCALE .01 10 10
+
+// Sphere
+OBJECT 6
+sphere
+material 4
+TRANS -2 3 0
+ROTAT 0 0 0
+SCALE 1 1 1
+
+// Floor light
+OBJECT 7
+cube
+material 5
+TRANS 0 0 0
+ROTAT 0 0 0
+SCALE 5 .3 5
+
+// Sphere
+OBJECT 8
+sphere
+material 6
+TRANS 3 2 0
+ROTAT 0 0 0
+SCALE 3 3 3
\ No newline at end of file
diff --git a/scenes/sphere.txt b/scenes/sphere.txt
index a74b545..dfcc691 100644
--- a/scenes/sphere.txt
+++ b/scenes/sphere.txt
@@ -25,4 +25,4 @@ sphere
 material 0
 TRANS       0 0 0
 ROTAT       0 0 0
-SCALE       3 3 3
+SCALE       3 3 3
\ No newline at end of file
diff --git a/src/interactions.h b/src/interactions.h
index 5ce3628..28f0735 100644
--- a/src/interactions.h
+++ b/src/interactions.h
@@ -41,6 +41,11 @@ glm::vec3 calculateRandomDirectionInHemisphere(
         + sin(around) * over * perpendicularDirection2;
 }
 
+
+//__host__ __device__
+//glm::vec3 calculateRandomDirectionReflective(
+//        glm::vec3 normal, thrust::default_random_engine &rng)
+
 /**
  * Scatter a ray with some probabilities according to the material properties.
  * For example, a diffuse surface scatters in a cosine-weighted hemisphere.
@@ -76,4 +81,28 @@ void scatterRay(
     // TODO: implement this.
     // A basic implementation of pure-diffuse shading will just call the
     // calculateRandomDirectionInHemisphere defined above.
+    thrust::uniform_real_distribution<float> u01(0, 1);
+
+    if (m.hasReflective > 0){
+        pathSegment.ray.direction = glm::reflect(pathSegment.ray.direction, normal);
+        pathSegment.color *= m.specular.color;
+    }else if (m.hasRefractive > 0){
+        float cos_theta = glm::dot(pathSegment.ray.direction, normal);
+        float R0 = powf((1 - m.indexOfRefraction) / (1 + m.indexOfRefraction), 2);
+        float R = R0 + (1 - R0) * powf((1 - fabs(cos_theta)), 5);
+        if (R > u01(rng)) pathSegment.ray.direction = glm::reflect(pathSegment.ray.direction, normal);
+        else {
+            if (cos_theta < 0)
+            pathSegment.ray.direction = glm::refract(pathSegment.ray.direction, normal, 1.0f / m.indexOfRefraction);
+            else
+                pathSegment.ray.direction = glm::refract(pathSegment.ray.direction, normal, m.indexOfRefraction);
+            pathSegment.color *= m.specular.color;
+        }   
+    }
+    else
+        pathSegment.ray.direction = calculateRandomDirectionInHemisphere(normal, rng);
+    pathSegment.color *= m.color;
+    pathSegment.ray.origin = intersect + 0.005f * pathSegment.ray.direction;
+
+
 }
diff --git a/src/pathtrace.cu b/src/pathtrace.cu
index c1ec122..86392fc 100644
--- a/src/pathtrace.cu
+++ b/src/pathtrace.cu
@@ -9,62 +9,73 @@
 #include "scene.h"
 #include "glm/glm.hpp"
 #include "glm/gtx/norm.hpp"
+#include "glm/gtc/matrix_transform.hpp"
+#include "glm/gtc/matrix_inverse.hpp"
+
 #include "utilities.h"
 #include "pathtrace.h"
 #include "intersections.h"
 #include "interactions.h"
 
+#include "stream_compaction/efficient_sm.h"
+
 #define ERRORCHECK 1
+#define ANTI_ALIAS 0 // 1: jitter each camera ray by a random sub-pixel offset in generateRayFromCamera
+#define MOTION_BLUR 0 // 1: give each path a random time_diff and rebuild geometry transforms from it in computeIntersections
+#define DOF 0 // 1: offset the ray origin on the lens (ConcentricSampleDisk) and re-aim it at the focal point
+#define WORK_EFFICIENT 0 // do not set this to 1: feature not tested
+#define CACHING 0 // NOTE(review): presumably toggles first-bounce intersection caching; its use is not visible here - confirm
+
 
 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
 #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
 void checkCUDAErrorFn(const char *msg, const char *file, int line) {
 #if ERRORCHECK
-    cudaDeviceSynchronize();
-    cudaError_t err = cudaGetLastError();
-    if (cudaSuccess == err) {
-        return;
-    }
-
-    fprintf(stderr, "CUDA error");
-    if (file) {
-        fprintf(stderr, " (%s:%d)", file, line);
-    }
-    fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err));
+	cudaDeviceSynchronize();
+	cudaError_t err = cudaGetLastError();
+	if (cudaSuccess == err) {
+		return;
+	}
+
+	fprintf(stderr, "CUDA error");
+	if (file) {
+		fprintf(stderr, " (%s:%d)", file, line);
+	}
+	fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err));
 #  ifdef _WIN32
-    getchar();
+	getchar();
 #  endif
-    exit(EXIT_FAILURE);
+	exit(EXIT_FAILURE);
 #endif
 }
 
 __host__ __device__
 thrust::default_random_engine makeSeededRandomEngine(int iter, int index, int depth) {
-    int h = utilhash((1 << 31) | (depth << 22) | iter) ^ utilhash(index);
-    return thrust::default_random_engine(h);
+	int h = utilhash((1 << 31) | (depth << 22) | iter) ^ utilhash(index);
+	return thrust::default_random_engine(h);
 }
 
 //Kernel that writes the image to the OpenGL PBO directly.
 __global__ void sendImageToPBO(uchar4* pbo, glm::ivec2 resolution,
-        int iter, glm::vec3* image) {
-    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-
-    if (x < resolution.x && y < resolution.y) {
-        int index = x + (y * resolution.x);
-        glm::vec3 pix = image[index];
-
-        glm::ivec3 color;
-        color.x = glm::clamp((int) (pix.x / iter * 255.0), 0, 255);
-        color.y = glm::clamp((int) (pix.y / iter * 255.0), 0, 255);
-        color.z = glm::clamp((int) (pix.z / iter * 255.0), 0, 255);
-
-        // Each thread writes one pixel location in the texture (textel)
-        pbo[index].w = 0;
-        pbo[index].x = color.x;
-        pbo[index].y = color.y;
-        pbo[index].z = color.z;
-    }
+							   int iter, glm::vec3* image) {
+	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+	int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+
+	if (x < resolution.x && y < resolution.y) {
+		int index = x + (y * resolution.x);
+		glm::vec3 pix = image[index];
+
+		glm::ivec3 color;
+		color.x = glm::clamp((int) (pix.x / iter * 255.0), 0, 255);
+		color.y = glm::clamp((int) (pix.y / iter * 255.0), 0, 255);
+		color.z = glm::clamp((int) (pix.z / iter * 255.0), 0, 255);
+
+		// Each thread writes one pixel location in the texture (textel)
+		pbo[index].w = 0;
+		pbo[index].x = color.x;
+		pbo[index].y = color.y;
+		pbo[index].z = color.z;
+	}
 }
 
 static Scene * hst_scene = NULL;
@@ -74,43 +85,66 @@ static Material * dev_materials = NULL;
 static PathSegment * dev_paths = NULL;
 static ShadeableIntersection * dev_intersections = NULL;
 // TODO: static variables for device memory, any extra info you need, etc
-// ...
+static ShadeableIntersection * dev_intersections_cache = NULL; // device buffer sized one entry per pixel in pathtraceInit; presumably holds cached first-bounce hits - confirm
+static cudaEvent_t start, stop; // NOTE(review): presumably CUDA timing events; their creation and use are not visible here - confirm
 
 void pathtraceInit(Scene *scene) {
-    hst_scene = scene;
-    const Camera &cam = hst_scene->state.camera;
-    const int pixelcount = cam.resolution.x * cam.resolution.y;
+	hst_scene = scene;
+	const Camera &cam = hst_scene->state.camera;
+	const int pixelcount = cam.resolution.x * cam.resolution.y;
 
-    cudaMalloc(&dev_image, pixelcount * sizeof(glm::vec3));
-    cudaMemset(dev_image, 0, pixelcount * sizeof(glm::vec3));
+	cudaMalloc(&dev_image, pixelcount * sizeof(glm::vec3));
+	cudaMemset(dev_image, 0, pixelcount * sizeof(glm::vec3));
 
-  	cudaMalloc(&dev_paths, pixelcount * sizeof(PathSegment));
+	cudaMalloc(&dev_paths, pixelcount * sizeof(PathSegment));
 
-  	cudaMalloc(&dev_geoms, scene->geoms.size() * sizeof(Geom));
-  	cudaMemcpy(dev_geoms, scene->geoms.data(), scene->geoms.size() * sizeof(Geom), cudaMemcpyHostToDevice);
+	cudaMalloc(&dev_geoms, scene->geoms.size() * sizeof(Geom));
+	cudaMemcpy(dev_geoms, scene->geoms.data(), scene->geoms.size() * sizeof(Geom), cudaMemcpyHostToDevice);
+
+	cudaMalloc(&dev_materials, scene->materials.size() * sizeof(Material));
+	cudaMemcpy(dev_materials, scene->materials.data(), scene->materials.size() * sizeof(Material), cudaMemcpyHostToDevice);
+
+	cudaMalloc(&dev_intersections, pixelcount * sizeof(ShadeableIntersection));
+	cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
+
+	// TODO: initialize any extra device memeory you need
+	cudaMalloc(&dev_intersections_cache, pixelcount * sizeof(ShadeableIntersection));
+	cudaMemset(dev_intersections_cache, 0, pixelcount * sizeof(ShadeableIntersection));
 
-  	cudaMalloc(&dev_materials, scene->materials.size() * sizeof(Material));
-  	cudaMemcpy(dev_materials, scene->materials.data(), scene->materials.size() * sizeof(Material), cudaMemcpyHostToDevice);
 
-  	cudaMalloc(&dev_intersections, pixelcount * sizeof(ShadeableIntersection));
-  	cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
 
-    // TODO: initialize any extra device memeory you need
 
-    checkCUDAError("pathtraceInit");
+	checkCUDAError("pathtraceInit");
 }
 
 void pathtraceFree() {
-    cudaFree(dev_image);  // no-op if dev_image is null
-  	cudaFree(dev_paths);
-  	cudaFree(dev_geoms);
-  	cudaFree(dev_materials);
-  	cudaFree(dev_intersections);
-    // TODO: clean up any extra device memory you created
-
-    checkCUDAError("pathtraceFree");
+	cudaFree(dev_image);  // no-op if dev_image is null
+	cudaFree(dev_paths);
+	cudaFree(dev_geoms);
+	cudaFree(dev_materials);
+	cudaFree(dev_intersections);
+	cudaFree(dev_intersections_cache);
+	// TODO: clean up any extra device memory you created
+
+	checkCUDAError("pathtraceFree");
 }
 
+// Maps a sample u (each component expected in [0,1]) to a point on the unit disk using the concentric mapping from PBRT.
+__device__ glm::vec2 ConcentricSampleDisk(const glm::vec2 & u){
+	const glm::vec2 uoffset = 2.f * u - glm::vec2(1.0, 1.0); // remap the sample to [-1,1]^2
+	if (uoffset == glm::vec2(0, 0)) return uoffset; // disk centre: return early to avoid 0/0 below
+	float theta, r;
+	if (fabsf(uoffset.x) > fabsf(uoffset.y)){ // wedge where |x| dominates: radius from x, angle from y/x
+		r = uoffset.x;
+		theta = PI/4 * (uoffset.y / uoffset.x);
+	}else{ // wedge where |y| dominates: radius from y, angle from x/y
+		r = uoffset.y;
+		theta = PI/2 - PI/4 * (uoffset.x / uoffset.y); // fixed: ratio x/y, not the difference x - y
+	}
+	return r * glm::vec2(cos(theta), sin(theta)); // polar to Cartesian; r may be negative, which mirrors the point
+}
+
+
 /**
 * Generate PathSegments with rays from the camera through the screen into the
 * scene, which is the first bounce of rays.
@@ -129,31 +163,57 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path
 		PathSegment & segment = pathSegments[index];
 
 		segment.ray.origin = cam.position;
-    segment.color = glm::vec3(1.0f, 1.0f, 1.0f);
+		segment.color = glm::vec3(1.0f, 1.0f, 1.0f);
 
 		// TODO: implement antialiasing by jittering the ray
+		thrust::default_random_engine rng = makeSeededRandomEngine(iter, index, 0);
+		thrust::uniform_real_distribution<float> u01(0, 1);
+
+#if ANTI_ALIAS
 		segment.ray.direction = glm::normalize(cam.view
-			- cam.right * cam.pixelLength.x * ((float)x - (float)cam.resolution.x * 0.5f)
-			- cam.up * cam.pixelLength.y * ((float)y - (float)cam.resolution.y * 0.5f)
-			);
+				- cam.right * cam.pixelLength.x * ((float)(x + u01(rng)) - (float)cam.resolution.x * 0.5f)
+				- cam.up * cam.pixelLength.y * ((float)(y + u01(rng)) - (float)cam.resolution.y * 0.5f)
+		);
+#else
+		segment.ray.direction = glm::normalize(cam.view
+				- cam.right * cam.pixelLength.x * ((float)x - (float)cam.resolution.x * 0.5f)
+				- cam.up * cam.pixelLength.y * ((float)y - (float)cam.resolution.y * 0.5f)
+		);
+#endif
+
+#if MOTION_BLUR
+		segment.time_diff = u01(rng);
+#endif
+
+#if DOF
+	glm::vec2 u(u01(rng), u01(rng));
+	glm::vec2 pLens = cam.lensSize * ConcentricSampleDisk(u);
+	glm::vec3 pFocus = segment.ray.origin + glm::abs(cam.focalLength / segment.ray.direction.z) * segment.ray.direction;
 
+	segment.ray.origin += pLens.x * cam.right + pLens.y * cam.up;
+	segment.ray.direction = glm::normalize(pFocus - segment.ray.origin);
+
+#endif
 		segment.pixelIndex = index;
 		segment.remainingBounces = traceDepth;
+
 	}
 }
 
+
+
 // TODO:
 // computeIntersections handles generating ray intersections ONLY.
 // Generating new rays is handled in your shader(s).
 // Feel free to modify the code below.
 __global__ void computeIntersections(
-	int depth
-	, int num_paths
-	, PathSegment * pathSegments
-	, Geom * geoms
-	, int geoms_size
-	, ShadeableIntersection * intersections
-	)
+		int depth
+		, int num_paths
+		, PathSegment * pathSegments
+		, Geom * geoms
+		, int geoms_size
+		, ShadeableIntersection * intersections
+)
 {
 	int path_index = blockIdx.x * blockDim.x + threadIdx.x;
 
@@ -177,6 +237,18 @@ __global__ void computeIntersections(
 		{
 			Geom & geom = geoms[i];
 
+#if MOTION_BLUR
+			glm::vec3 interpolate_pos = (pathSegment.time_diff * geom.speed + 1) * geom.translation;
+			geom.transform = glm::translate(glm::mat4(), interpolate_pos) * 
+									 glm::rotate(glm::mat4(), geom.rotation.x * PI / 180, glm::vec3(1, 0, 0)) * 
+									 glm::rotate(glm::mat4(), geom.rotation.y * PI / 180, glm::vec3(0, 1, 0)) * 
+									 glm::rotate(glm::mat4(), geom.rotation.z * PI / 180, glm::vec3(0, 0, 1)) * 
+									 glm::scale(glm::mat4(), geom.scale);
+
+			geom.inverseTransform  = glm::inverse(geom.transform);
+			geom.invTranspose = glm::inverseTranspose(geom.transform);
+#endif
+
 			if (geom.type == CUBE)
 			{
 				t = boxIntersectionTest(geom, pathSegment.ray, tmp_intersect, tmp_normal, outside);
@@ -208,6 +280,7 @@ __global__ void computeIntersections(
 			intersections[path_index].t = t_min;
 			intersections[path_index].materialId = geoms[hit_geom_index].materialid;
 			intersections[path_index].surfaceNormal = normal;
+			intersections[path_index].point = intersect_point;
 		}
 	}
 }
@@ -221,48 +294,50 @@ __global__ void computeIntersections(
 // Note that this shader does NOT do a BSDF evaluation!
 // Your shaders should handle that - this can allow techniques such as
 // bump mapping.
-__global__ void shadeFakeMaterial (
-  int iter
-  , int num_paths
-	, ShadeableIntersection * shadeableIntersections
-	, PathSegment * pathSegments
-	, Material * materials
-	)
+__global__ void shadeMaterialNaive (
+		int iter
+		, int num_paths
+		, ShadeableIntersection * shadeableIntersections
+		, PathSegment * pathSegments
+		, Material * materials
+)
 {
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < num_paths)
-  {
-    ShadeableIntersection intersection = shadeableIntersections[idx];
-    if (intersection.t > 0.0f) { // if the intersection exists...
-      // Set up the RNG
-      // LOOK: this is how you use thrust's RNG! Please look at
-      // makeSeededRandomEngine as well.
-      thrust::default_random_engine rng = makeSeededRandomEngine(iter, idx, 0);
-      thrust::uniform_real_distribution<float> u01(0, 1);
-
-      Material material = materials[intersection.materialId];
-      glm::vec3 materialColor = material.color;
-
-      // If the material indicates that the object was a light, "light" the ray
-      if (material.emittance > 0.0f) {
-        pathSegments[idx].color *= (materialColor * material.emittance);
-      }
-      // Otherwise, do some pseudo-lighting computation. This is actually more
-      // like what you would expect from shading in a rasterizer like OpenGL.
-      // TODO: replace this! you should be able to start with basically a one-liner
-      else {
-        float lightTerm = glm::dot(intersection.surfaceNormal, glm::vec3(0.0f, 1.0f, 0.0f));
-        pathSegments[idx].color *= (materialColor * lightTerm) * 0.3f + ((1.0f - intersection.t * 0.02f) * materialColor) * 0.7f;
-        pathSegments[idx].color *= u01(rng); // apply some noise because why not
-      }
-    // If there was no intersection, color the ray black.
-    // Lots of renderers use 4 channel color, RGBA, where A = alpha, often
-    // used for opacity, in which case they can indicate "no opacity".
-    // This can be useful for post-processing and image compositing.
-    } else {
-      pathSegments[idx].color = glm::vec3(0.0f);
-    }
-  }
+	int idx = blockIdx.x * blockDim.x + threadIdx.x;
+	if (idx < num_paths)
+	{
+		ShadeableIntersection intersection = shadeableIntersections[idx];
+		if (intersection.t > 0.0f) { // if the intersection exists...
+			// Set up the RNG
+			// LOOK: this is how you use thrust's RNG! Please look at
+			// makeSeededRandomEngine as well.
+			thrust::default_random_engine rng = makeSeededRandomEngine(iter, idx, pathSegments[idx].remainingBounces);
+			thrust::uniform_real_distribution<float> u01(0, 1);
+
+			Material material = materials[intersection.materialId];
+			glm::vec3 materialColor = material.color;
+
+			// If the material indicates that the object was a light, "light" the ray
+			if (material.emittance > 0.0f) {
+				pathSegments[idx].color *= (materialColor * material.emittance);
+				pathSegments[idx].remainingBounces = 0;
+			}
+				// Otherwise the path continues: scatterRay is handed the path segment, the
+				// hit point, the surface normal, the material and the RNG, and one bounce
+				// is consumed afterwards.
+			else {
+				scatterRay(pathSegments[idx], intersection.point, intersection.surfaceNormal, material, rng);
+				pathSegments[idx].remainingBounces--;
+
+			}
+			// If there was no intersection, color the ray black.
+			// Lots of renderers use 4 channel color, RGBA, where A = alpha, often
+			// used for opacity, in which case they can indicate "no opacity".
+			// This can be useful for post-processing and image compositing.
+		} else {
+			pathSegments[idx].color = glm::vec3(0.0f);
+			pathSegments[idx].remainingBounces = 0;
+		}
+	}
 }
 
 // Add the current iteration's output to the overall image
@@ -282,49 +357,64 @@ __global__ void finalGather(int nPaths, glm::vec3 * image, PathSegment * iterati
  * of memory management
  */
 void pathtrace(uchar4 *pbo, int frame, int iter) {
-    const int traceDepth = hst_scene->state.traceDepth;
-    const Camera &cam = hst_scene->state.camera;
-    const int pixelcount = cam.resolution.x * cam.resolution.y;
+	cudaEventCreate(&start);
+	cudaEventCreate(&stop);
+	const int traceDepth = hst_scene->state.traceDepth;
+	const Camera &cam = hst_scene->state.camera;
+	const int pixelcount = cam.resolution.x * cam.resolution.y;
 
 	// 2D block for generating ray from camera
-    const dim3 blockSize2d(8, 8);
-    const dim3 blocksPerGrid2d(
-            (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x,
-            (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
+	const dim3 blockSize2d(8, 8);
+	const dim3 blocksPerGrid2d(
+			(cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x,
+			(cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
 
 	// 1D block for path tracing
 	const int blockSize1d = 128;
 
-    ///////////////////////////////////////////////////////////////////////////
-
-    // Recap:
-    // * Initialize array of path rays (using rays that come out of the camera)
-    //   * You can pass the Camera object to that kernel.
-    //   * Each path ray must carry at minimum a (ray, color) pair,
-    //   * where color starts as the multiplicative identity, white = (1, 1, 1).
-    //   * This has already been done for you.
-    // * For each depth:
-    //   * Compute an intersection in the scene for each path ray.
-    //     A very naive version of this has been implemented for you, but feel
-    //     free to add more primitives and/or a better algorithm.
-    //     Currently, intersection distance is recorded as a parametric distance,
-    //     t, or a "distance along the ray." t = -1.0 indicates no intersection.
-    //     * Color is attenuated (multiplied) by reflections off of any object
-    //   * TODO: Stream compact away all of the terminated paths.
-    //     You may use either your implementation or `thrust::remove_if` or its
-    //     cousins.
-    //     * Note that you can't really use a 2D kernel launch any more - switch
-    //       to 1D.
-    //   * TODO: Shade the rays that intersected something or didn't bottom out.
-    //     That is, color the ray by performing a color computation according
-    //     to the shader, then generate a new ray to continue the ray path.
-    //     We recommend just updating the ray's PathSegment in place.
-    //     Note that this step may come before or after stream compaction,
-    //     since some shaders you write may also cause a path to terminate.
-    // * Finally, add this iteration's results to the image. This has been done
-    //   for you.
-
-    // TODO: perform one iteration of path tracing
+
+	// Motion blur: update each object's speed on the CPU and re-upload the geometry; this is cheap because scenes have few objects.
+	#if MOTION_BLUR
+	Geom * geoms =  &(hst_scene->geoms)[0];
+	for (int i = 0; i < hst_scene->geoms.size(); i++){
+		if (iter > 1 && geoms[i].speed != 0.0f){
+			if (geoms[i].speed > 0 ) geoms[i].speed -= 0.001f;
+			else geoms[i].speed = 0.0f;
+		}
+	}
+	cudaMemcpy(dev_geoms, geoms, hst_scene->geoms.size()* sizeof(Geom), cudaMemcpyHostToDevice);
+	#endif
+
+	///////////////////////////////////////////////////////////////////////////
+
+	// Recap:
+	// * Initialize array of path rays (using rays that come out of the camera)
+	//   * You can pass the Camera object to that kernel.
+	//   * Each path ray must carry at minimum a (ray, color) pair,
+	//   * where color starts as the multiplicative identity, white = (1, 1, 1).
+	//   * This has already been done for you.
+	// * For each depth:
+	//   * Compute an intersection in the scene for each path ray.
+	//     A very naive version of this has been implemented for you, but feel
+	//     free to add more primitives and/or a better algorithm.
+	//     Currently, intersection distance is recorded as a parametric distance,
+	//     t, or a "distance along the ray." t = -1.0 indicates no intersection.
+	//     * Color is attenuated (multiplied) by reflections off of any object
+	//   * TODO: Stream compact away all of the terminated paths.
+	//     You may use either your implementation or `thrust::remove_if` or its
+	//     cousins.
+	//     * Note that you can't really use a 2D kernel launch any more - switch
+	//       to 1D.
+	//   * TODO: Shade the rays that intersected something or didn't bottom out.
+	//     That is, color the ray by performing a color computation according
+	//     to the shader, then generate a new ray to continue the ray path.
+	//     We recommend just updating the ray's PathSegment in place.
+	//     Note that this step may come before or after stream compaction,
+	//     since some shaders you write may also cause a path to terminate.
+	// * Finally, add this iteration's results to the image. This has been done
+	//   for you.
+
+	// TODO: perform one iteration of path tracing
 
 	generateRayFromCamera <<<blocksPerGrid2d, blockSize2d >>>(cam, iter, traceDepth, dev_paths);
 	checkCUDAError("generate camera ray");
@@ -336,58 +426,124 @@ void pathtrace(uchar4 *pbo, int frame, int iter) {
 	// --- PathSegment Tracing Stage ---
 	// Shoot ray into scene, bounce between objects, push shading chunks
 
-  bool iterationComplete = false;
+	bool iterationComplete = false;
 	while (!iterationComplete) {
 
-	// clean shading chunks
-	cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
-
-	// tracing
-	dim3 numblocksPathSegmentTracing = (num_paths + blockSize1d - 1) / blockSize1d;
-	computeIntersections <<<numblocksPathSegmentTracing, blockSize1d>>> (
-		depth
-		, num_paths
-		, dev_paths
-		, dev_geoms
-		, hst_scene->geoms.size()
-		, dev_intersections
+		// clean shading chunks
+		cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
+
+		// tracing
+		dim3 numblocksPathSegmentTracing = (num_paths + blockSize1d - 1) / blockSize1d;
+
+		// compute the intersections and put them into cache
+		cudaEventRecord(start);
+		#if CACHING
+			if (depth == 0){
+				if (iter == 1){
+					computeIntersections <<<numblocksPathSegmentTracing, blockSize1d>>> (
+							depth
+									, num_paths
+									, dev_paths
+									, dev_geoms
+									, hst_scene->geoms.size()
+									, dev_intersections
+					);
+					cudaMemcpy(dev_intersections_cache, dev_intersections, pixelcount * sizeof(ShadeableIntersection), cudaMemcpyDeviceToDevice);
+				}
+				else {
+					cudaMemcpy(dev_intersections, dev_intersections_cache, pixelcount * sizeof(ShadeableIntersection), cudaMemcpyDeviceToDevice);
+				}
+			}else{
+				computeIntersections <<<numblocksPathSegmentTracing, blockSize1d>>> (
+						depth
+								, num_paths
+								, dev_paths
+								, dev_geoms
+								, hst_scene->geoms.size()
+								, dev_intersections
+				);
+			}
+		#else
+			computeIntersections <<<numblocksPathSegmentTracing, blockSize1d>>> (
+						depth
+								, num_paths
+								, dev_paths
+								, dev_geoms
+								, hst_scene->geoms.size()
+								, dev_intersections
+				);
+		#endif
+
+		checkCUDAError("trace one bounce");
+		cudaDeviceSynchronize();
+		cudaEventRecord(stop);
+		cudaEventSynchronize(stop);
+		float miliseconds = 0;
+		cudaEventElapsedTime(&miliseconds, start, stop);
+		if (iter % 10 == 0) cout << "Computing intersections " << miliseconds << endl;
+
+		// sort by the materials
+		cudaEventRecord(start);
+		thrust::sort_by_key(thrust::device, dev_intersections, dev_intersections + num_paths, dev_paths, cmp_material());
+		cudaEventRecord(stop);
+		cudaEventSynchronize(stop);
+		cudaEventElapsedTime(&miliseconds, start, stop);
+		if (iter % 10 == 0) cout << "Sorting " << miliseconds << endl;
+		// TODO:
+		// --- Shading Stage ---
+		// Shade path segments based on intersections and generate new rays by
+		// evaluating the BSDF.
+		// Start off with just a big kernel that handles all the different
+		// materials you have in the scenefile.
+		// TODO: compare between directly shading the path segments and shading
+		// path segments that have been reshuffled to be contiguous in memory.
+		cudaEventRecord(start);
+		shadeMaterialNaive<<<numblocksPathSegmentTracing, blockSize1d>>> (
+				iter,
+						num_paths,
+						dev_intersections,
+						dev_paths,
+						dev_materials
 		);
-	checkCUDAError("trace one bounce");
-	cudaDeviceSynchronize();
-	depth++;
-
-
-	// TODO:
-	// --- Shading Stage ---
-	// Shade path segments based on intersections and generate new rays by
-  // evaluating the BSDF.
-  // Start off with just a big kernel that handles all the different
-  // materials you have in the scenefile.
-  // TODO: compare between directly shading the path segments and shading
-  // path segments that have been reshuffled to be contiguous in memory.
-
-  shadeFakeMaterial<<<numblocksPathSegmentTracing, blockSize1d>>> (
-    iter,
-    num_paths,
-    dev_intersections,
-    dev_paths,
-    dev_materials
-  );
-  iterationComplete = true; // TODO: should be based off stream compaction results.
+		cudaEventRecord(stop);
+		cudaEventSynchronize(stop);
+		cudaEventElapsedTime(&miliseconds, start, stop);
+		if (iter % 10 == 0) cout << "Shading " << miliseconds << endl;
+
+		cudaEventRecord(start);
+
+		#if WORK_EFFICIENT
+			int * indices_buff = new int [pixelcount];
+			num_paths = StreamCompaction::EfficientSM::compact(pixelcount, dev_paths, dev_paths, indices_buff);
+			delete[] indices_buff;
+		#else
+			// stream compaction with thrust
+			PathSegment* new_end = thrust::partition(thrust::device, dev_paths, dev_paths + num_paths, terminate_ray());
+			// NOTE: thrust::partition only reorders dev_paths. Segments for which terminate_ray
+			// returns true (remainingBounces > 0) come first; finished ones follow from new_end on.
+			num_paths = new_end - dev_paths;
+		#endif
+		cudaEventRecord(stop);
+		cudaEventSynchronize(stop);
+		cudaEventElapsedTime(&miliseconds, start, stop);
+		if (iter % 10 == 0) cout << "stream compaction " << miliseconds << endl;
+
+		depth++;
+		iterationComplete = (num_paths <= 0) || (depth > traceDepth);
 	}
 
-  // Assemble this iteration and apply it to the image
-  dim3 numBlocksPixels = (pixelcount + blockSize1d - 1) / blockSize1d;
-	finalGather<<<numBlocksPixels, blockSize1d>>>(num_paths, dev_image, dev_paths);
+	// Assemble this iteration and apply it to the image
+	dim3 numBlocksPixels = (pixelcount + blockSize1d - 1) / blockSize1d;
+	finalGather<<<numBlocksPixels, blockSize1d>>>(pixelcount, dev_image, dev_paths);
 
-    ///////////////////////////////////////////////////////////////////////////
+	///////////////////////////////////////////////////////////////////////////
 
-    // Send results to OpenGL buffer for rendering
-    sendImageToPBO<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, iter, dev_image);
+	// Send results to OpenGL buffer for rendering
+	sendImageToPBO<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, iter, dev_image);
 
-    // Retrieve image from GPU
-    cudaMemcpy(hst_scene->state.image.data(), dev_image,
-            pixelcount * sizeof(glm::vec3), cudaMemcpyDeviceToHost);
+	// Retrieve image from GPU
+	cudaMemcpy(hst_scene->state.image.data(), dev_image,
+			   pixelcount * sizeof(glm::vec3), cudaMemcpyDeviceToHost);
 
-    checkCUDAError("pathtrace");
+	checkCUDAError("pathtrace");
 }
diff --git a/src/preview.cpp b/src/preview.cpp
index 4eb0bc1..47f48bc 100644
--- a/src/preview.cpp
+++ b/src/preview.cpp
@@ -169,12 +169,28 @@ bool init() {
 }
 
 void mainLoop() {
+    double fps = 0;
+    double timebase = 0;
+    int frame = 0;
     while (!glfwWindowShouldClose(window)) {
         glfwPollEvents();
+        frame ++;
+        double time = glfwGetTime();
+
+        if (time - timebase > 1.0){
+            fps = frame / (time - timebase);
+            timebase = time;
+            frame = 0;
+        }
+
         runCuda();
 
-        string title = "CIS565 Path Tracer | " + utilityCore::convertIntToString(iteration) + " Iterations";
-        glfwSetWindowTitle(window, title.c_str());
+        std::ostringstream title;
+        title << "CIS565 Path Tracer | " + utilityCore::convertIntToString(iteration) + " Iterations [";
+        title.precision(1);
+        title << std::fixed << fps;
+        title << " fps]";
+        glfwSetWindowTitle(window, title.str().c_str());
 
         glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo);
         glBindTexture(GL_TEXTURE_2D, displayImage);
diff --git a/src/scene.cpp b/src/scene.cpp
index cbae043..06c789c 100644
--- a/src/scene.cpp
+++ b/src/scene.cpp
@@ -74,7 +74,9 @@ int Scene::loadGeom(string objectid) {
                 newGeom.rotation = glm::vec3(atof(tokens[1].c_str()), atof(tokens[2].c_str()), atof(tokens[3].c_str()));
             } else if (strcmp(tokens[0].c_str(), "SCALE") == 0) {
                 newGeom.scale = glm::vec3(atof(tokens[1].c_str()), atof(tokens[2].c_str()), atof(tokens[3].c_str()));
-            }
+            } else if (strcmp(tokens[0].c_str(), "SPEED") == 0) {
+                newGeom.speed = atof(tokens[1].c_str());
+            } 
 
             utilityCore::safeGetline(fp_in, line);
         }
@@ -124,6 +126,10 @@ int Scene::loadCamera() {
             camera.lookAt = glm::vec3(atof(tokens[1].c_str()), atof(tokens[2].c_str()), atof(tokens[3].c_str()));
         } else if (strcmp(tokens[0].c_str(), "UP") == 0) {
             camera.up = glm::vec3(atof(tokens[1].c_str()), atof(tokens[2].c_str()), atof(tokens[3].c_str()));
+        } else if (strcmp(tokens[0].c_str(), "LENSIZE") == 0) {
+            camera.lensSize = atof(tokens[1].c_str());
+        }else if (strcmp(tokens[0].c_str(), "FOCALLEN") == 0) {
+            camera.focalLength = atof(tokens[1].c_str());
         }
 
         utilityCore::safeGetline(fp_in, line);
diff --git a/src/sceneStructs.h b/src/sceneStructs.h
index b38b820..20cd6bc 100644
--- a/src/sceneStructs.h
+++ b/src/sceneStructs.h
@@ -26,6 +26,7 @@ struct Geom {
     glm::mat4 transform;
     glm::mat4 inverseTransform;
     glm::mat4 invTranspose;
+    float speed;
 };
 
 struct Material {
@@ -49,6 +50,8 @@ struct Camera {
     glm::vec3 right;
     glm::vec2 fov;
     glm::vec2 pixelLength;
+	float lensSize;
+	float focalLength;
 };
 
 struct RenderState {
@@ -64,6 +67,7 @@ struct PathSegment {
 	glm::vec3 color;
 	int pixelIndex;
 	int remainingBounces;
+	float time_diff;
 };
 
 // Use with a corresponding PathSegment to do:
@@ -72,5 +76,20 @@ struct PathSegment {
 struct ShadeableIntersection {
   float t;
   glm::vec3 surfaceNormal;
+  glm::vec3 point;
   int materialId;
 };
+
+
+struct cmp_material{ // comparator passed to thrust::sort_by_key: the larger materialId sorts first
+    __host__ __device__ bool operator()(const ShadeableIntersection& a, const ShadeableIntersection& b) const { // const: callable on a const functor
+        return a.materialId > b.materialId;
+    }
+};
+ 
+
+ struct terminate_ray{ // NOTE: despite the name, true means the path is still alive (it has bounces left)
+     __host__ __device__ bool operator()(const PathSegment& path) const { // const: callable on a const functor
+        return path.remainingBounces > 0;
+    }
+ };
\ No newline at end of file
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
index ac358c9..c4aebf5 100644
--- a/stream_compaction/CMakeLists.txt
+++ b/stream_compaction/CMakeLists.txt
@@ -1,4 +1,8 @@
 set(SOURCE_FILES
+        "common.cu"
+        "common.h"
+        "efficient_sm.cu"
+        "efficient_sm.h"
     )
 
 cuda_add_library(stream_compaction
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
new file mode 100644
index 0000000..d570d59
--- /dev/null
+++ b/stream_compaction/common.cu
@@ -0,0 +1,31 @@
+#include "common.h"
+
+
+namespace StreamCompaction {
+    namespace Common {
+
+        /**
+         * Writes one keep-flag per path segment for stream compaction: bools[i] becomes 1 when
+         * idata[i] still has bounces left (kept) and 0 otherwise (removed). n is the element count.
+         */
+        __global__ void kernMapToBoolean(int n, int *bools, const PathSegment *idata) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index < n){
+                bools[index] = (idata[index].remainingBounces > 0); // same liveness test as terminate_ray; a negative count is not alive
+            }
+        }
+
+        /**
+         * Performs scatter on an array: for each idx < n with bools[idx] != 0, copies
+         * idata[idx] to odata[indices[idx]]. Elements whose bools[idx] is 0 are not copied.
+         */
+        __global__ void kernScatter(int n, PathSegment *odata,
+                const PathSegment *idata, const int *bools, const int *indices) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x); // global thread index, used as the input element index
+            if (index < n && bools[index]){
+                odata[indices[index]] = idata[index];
+            }
+        }
+
+    }
+}
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
new file mode 100644
index 0000000..207a169
--- /dev/null
+++ b/stream_compaction/common.h
@@ -0,0 +1,135 @@
+#pragma once
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <cstdio>
+#include <cstring>
+#include <cmath>
+#include <algorithm>
+#include <chrono>
+#include <stdexcept>
+
+#include <src/sceneStructs.h>
+
+#define blockSize 128
+
+#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
+#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
+
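+/**
+ * checkCUDAError(msg) is meant to check for CUDA errors and to print and exit if there was a problem. NOTE(review): checkCUDAErrorFn is not declared in this header - confirm it is provided elsewhere.
+ */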
+
+inline int ilog2(int x) { // floor(log2(x)) for x >= 1; returns 0 for x == 0. NOTE(review): negative x is presumably never passed - confirm
+    int lg = 0;
+    while (x >>= 1) { // halve x until it reaches zero, counting the halvings
+        ++lg;
+    }
+    return lg;
+}
+
+inline int ilog2ceil(int x) { // ceil(log2(x)) for x >= 1; returns 0 for any x <= 1
+    return x <= 1 ? 0 : ilog2(x - 1) + 1; // x <= 0 used to reach ilog2 with a negative value, whose shift loop may never end
+}
+
+namespace StreamCompaction {
+    namespace Common {
+        __global__ void kernMapToBoolean(int n, int *bools, const PathSegment *idata);
+
+        __global__ void kernScatter(int n, PathSegment *odata,
+                const PathSegment *idata, const int *bools, const int *indices);
+
+	    /**
+	    * Measures elapsed time in milliseconds: CPU intervals with std::chrono, GPU intervals with CUDA events.
+	    * Uncopyable and unmovable. The start/end functions throw std::runtime_error when called out of order.
+        *
+        * Adapted from WindyDarian(https://github.com/WindyDarian)
+	    */
+	    class PerformanceTimer
+	    {
+	    public:
+		    PerformanceTimer()
+		    {
+			    cudaEventCreate(&event_start);
+			    cudaEventCreate(&event_end);
+		    }
+
+		    ~PerformanceTimer()
+		    {
+			    cudaEventDestroy(event_start);
+			    cudaEventDestroy(event_end);
+		    }
+
+		    void startCpuTimer()
+		    {
+			    if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); }
+			    cpu_timer_started = true;
+
+			    time_start_cpu = std::chrono::high_resolution_clock::now();
+		    }
+
+		    void endCpuTimer()
+		    {
+			    time_end_cpu = std::chrono::high_resolution_clock::now(); // sample the clock before the state check
+
+			    if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); }
+
+			    std::chrono::duration<double, std::milli> duro = time_end_cpu - time_start_cpu;
+			    prev_elapsed_time_cpu_milliseconds =
+				    static_cast<decltype(prev_elapsed_time_cpu_milliseconds)>(duro.count());
+
+			    cpu_timer_started = false;
+		    }
+
+		    void startGpuTimer()
+		    {
+			    if (gpu_timer_started) { throw std::runtime_error("GPU timer already started"); } // a second start without an end is an error
+			    gpu_timer_started = true;
+
+			    cudaEventRecord(event_start);
+		    }
+
+		    void endGpuTimer()
+		    {
+			    cudaEventRecord(event_end);
+			    cudaEventSynchronize(event_end); // wait for event_end before the elapsed time is queried
+
+			    if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); }
+
+			    cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end);
+			    gpu_timer_started = false;
+		    }
+
+		    float getCpuElapsedTimeForPreviousOperation() // milliseconds of the last CPU interval; noexcept left off (original note: needs VS 2015)
+		    {
+			    return prev_elapsed_time_cpu_milliseconds;
+		    }
+
+		    float getGpuElapsedTimeForPreviousOperation() // milliseconds of the last GPU interval; noexcept left off as above
+		    {
+			    return prev_elapsed_time_gpu_milliseconds;
+		    }
+
+		    // copying and moving are deleted: the object holds the CUDA events its destructor destroys
+		    PerformanceTimer(const PerformanceTimer&) = delete;
+		    PerformanceTimer(PerformanceTimer&&) = delete;
+		    PerformanceTimer& operator=(const PerformanceTimer&) = delete;
+		    PerformanceTimer& operator=(PerformanceTimer&&) = delete;
+
+	    private:
+		    cudaEvent_t event_start = nullptr;
+		    cudaEvent_t event_end = nullptr;
+
+		    using time_point_t = std::chrono::high_resolution_clock::time_point;
+		    time_point_t time_start_cpu;
+		    time_point_t time_end_cpu;
+
+		    bool cpu_timer_started = false;
+		    bool gpu_timer_started = false;
+
+		    float prev_elapsed_time_cpu_milliseconds = 0.f;
+		    float prev_elapsed_time_gpu_milliseconds = 0.f;
+	    };
+    }
+}
diff --git a/stream_compaction/efficient_sm.cu b/stream_compaction/efficient_sm.cu
new file mode 100644
index 0000000..0b14318
--- /dev/null
+++ b/stream_compaction/efficient_sm.cu
@@ -0,0 +1,135 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "common.h"
+#include "efficient_sm.h"
+
+namespace StreamCompaction {
+    namespace EfficientSM {
+        using StreamCompaction::Common::PerformanceTimer;
+        PerformanceTimer& timer()
+        {
+            static PerformanceTimer timer; // function-local static: the one timer instance used by scan() and compact()
+            return timer;
+        }
+
+
+        // Single-block work-efficient exclusive scan of idata[0..N) into odata[0..N), done in dynamic shared memory (at least
+        // N ints). Expects N to be a power of two with N <= 2 * blockDim.x; thread t owns elements 2t and 2t + 1.
+        __global__ void kernEfficientScan(int N, int *odata, int *idata){
+            extern __shared__ int tmp[];
+            const int index = threadIdx.x;
+            const int lo = 2 * index, hi = lo + 1;
+            if (lo < N) tmp[lo] = idata[lo]; // guarded loads instead of an early return: no out-of-range access,
+            if (hi < N) tmp[hi] = idata[hi]; // and every thread still reaches each __syncthreads() below
+            int offset = 1;
+            // up sweep
+            for (int d = (N >> 1); d > 0; d >>= 1){
+                __syncthreads();
+                if (index < d) tmp[offset * (2 * index + 2) - 1] += tmp[offset * (2 * index + 1) - 1];
+                offset <<= 1;
+            }
+            // clear the last element so the down sweep yields an exclusive scan
+            if (index == 0) tmp[N - 1] = 0;
+            // down sweep
+            for (int d = 1; d < N; d <<= 1){
+                offset >>= 1;
+                __syncthreads();
+                if (index < d){
+                    int t = tmp[offset * (2 * index + 1) - 1];
+                    tmp[offset * (2 * index + 1) - 1] = tmp[offset * (2 * index + 2) - 1];
+                    tmp[offset * (2 * index + 2) - 1] += t;
+                }
+            }
+            __syncthreads();
+            if (lo < N) odata[lo] = tmp[lo];
+            if (hi < N) odata[hi] = tmp[hi];
+        }
+
+        /**
+         * Performs an exclusive prefix-sum (aka scan) on the n host ints in idata, storing the result
+         * into the host array odata (odata may alias idata). kernEfficientScan works inside a single
+         * block, so inputs that pad to more than 2 * blockSize elements are scanned on the host
+         * instead; only the GPU path updates timer(). Does nothing for n <= 0.
+         */
+        void scan(int n, int *odata, const int *idata) {
+            if (n <= 0) return; // nothing to scan, and ilog2ceil is only meaningful for n >= 1
+            const int N = 1 << ilog2ceil(n); // pad to the next power of two
+            if (N > 2 * blockSize) {
+                int running = 0;
+                for (int i = 0; i < n; ++i) {
+                    const int value = idata[i]; // read first: odata may be the same array
+                    odata[i] = running;
+                    running += value;
+                }
+                return;
+            }
+
+            int* dev_in, *dev_out;
+            cudaMalloc((void**) &dev_in, N * sizeof(int));
+            cudaMalloc((void**) &dev_out, N * sizeof(int));
+            cudaMemset(dev_in, 0, sizeof(int) * N); // zero the padding between n and N
+            cudaMemcpy(dev_in, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+            timer().startGpuTimer();
+            // One block is enough: the kernel indexes with threadIdx.x only, so extra blocks would repeat the same work.
+            kernEfficientScan <<< 1, blockSize, 2 * N * sizeof(int) >>> (N, dev_out, dev_in);
+            timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_out, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+            cudaFree(dev_in);
+            cudaFree(dev_out);
+        }
+
+        /**
+         * Stream-compacts the n PathSegments in idata into odata, keeping the elements that
+         * kernMapToBoolean flags, and returns how many were kept. idata and odata may be host
+         * or device pointers (the copies use cudaMemcpyDefault) and may be the same buffer,
+         * because the input is staged into a private device copy first. Only the first
+         * <return value> elements of odata are written. indices_buff is a host scratch array
+         * of at least n ints. Returns 0 for n <= 0.
+         */
+        int compact(int n, PathSegment *odata, const PathSegment *idata, int* indices_buff) {
+            if (n <= 0) return 0; // also keeps indices_buff[n - 1] below in bounds
+
+            dim3 fullBlockPerGrid((n + blockSize - 1) / blockSize);
+            int* bools, *indices;
+            PathSegment *dev_in, *dev_out;
+
+            // Device working set: keep-flags, scatter indices, and staging copies of input and output.
+            cudaMalloc((void**)&bools, sizeof(int) * n);
+            cudaMalloc((void**)&indices, sizeof(int) * n);
+            cudaMalloc((void**)&dev_out, sizeof(PathSegment) * n);
+            cudaMalloc((void**)&dev_in, sizeof(PathSegment) * n);
+
+            // cudaMemcpyDefault lets the runtime infer the copy direction from the pointers, so
+            // idata may live on the host or on the device.
+            cudaMemcpy(dev_in, idata, sizeof(PathSegment) * n, cudaMemcpyDefault);
+
+            // timer() is deliberately not started here: scan() below starts the same timer, and
+            // PerformanceTimer::startGpuTimer() throws when the timer is already running.
+            StreamCompaction::Common::kernMapToBoolean<<<fullBlockPerGrid, blockSize>>>(n, bools, dev_in);
+
+            // The last flag plus the last exclusive-scan value is the number of kept elements.
+            cudaMemcpy(indices_buff, bools, sizeof(int) * n, cudaMemcpyDeviceToHost);
+            int num_element = indices_buff[n - 1];
+            scan(n, indices_buff, indices_buff);
+            num_element += indices_buff[n - 1];
+
+            cudaMemcpy(indices, indices_buff, sizeof(int) * n, cudaMemcpyHostToDevice);
+
+            StreamCompaction::Common::kernScatter<<<fullBlockPerGrid, blockSize>>>(n, dev_out, dev_in, bools, indices);
+
+            // kernScatter fills only the first num_element slots of dev_out; copying all n would
+            // overwrite the tail of odata with uninitialized device memory.
+            cudaMemcpy(odata, dev_out, sizeof(PathSegment) * num_element, cudaMemcpyDefault);
+
+            cudaFree(bools);
+            cudaFree(indices);
+            cudaFree(dev_in);
+            cudaFree(dev_out);
+
+            return num_element;
+        }
+    }
+}
diff --git a/stream_compaction/efficient_sm.h b/stream_compaction/efficient_sm.h
new file mode 100644
index 0000000..2148d78
--- /dev/null
+++ b/stream_compaction/efficient_sm.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include "common.h"
+
+
+namespace StreamCompaction {
+    namespace EfficientSM {
+        StreamCompaction::Common::PerformanceTimer& timer(); // accessor for the PerformanceTimer used by scan() and compact()
+
+        void scan(int n, int *odata, const int *idata); // prefix-sum of the n ints in idata, written to odata
+        int compact(int n, PathSegment *odata, const PathSegment *idata, int* indices_buff); // returns the number of elements kept
+    }
+}
\ No newline at end of file