diff --git a/CMakeLists.txt b/CMakeLists.txt
index 162568b..1aa258e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -73,6 +73,7 @@ set(headers
     src/sceneStructs.h
     src/preview.h
     src/utilities.h
+    stream_compaction/common.h
     )
 
 set(sources
@@ -84,6 +85,7 @@ set(sources
     src/scene.cpp
     src/preview.cpp
     src/utilities.cpp
+    stream_compaction/common.cu
     )
 
 set(imgui
diff --git a/README.md b/README.md
index f044c82..aecf29c 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,240 @@
-CUDA Denoiser For CUDA Path Tracer
-==================================
+Project 4 CUDA A Trous Denoiser
+======================
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 4**
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Raymond Yang
+	* [LinkedIn](https://www.linkedin.com/in/raymond-yang-b85b19168)
+	* Tested on: 
+		* 10/22/2021
+		* Windows 10
+		* NVIDIA GeForce GTX 1080 Ti. 
+	* Submitted on: 10/22/2021
 
-### (TODO: Your README)
 
-*DO NOT* leave the README to the last minute! It is a crucial part of the
-project, and we will not be able to grade you without a good README.
+## Introduction 
+The objective of this project was to implement A Trous denoiser detailed in this [paper](https://jo.dreggn.org/home/2010_atrous.pdf). To construct the denoised image, we modulate a noisy path traced image (represented as a color buffer) with a geometry buffer/G buffer (represented as an array of positions and an array of normals). Both elements of the G buffer are cahced on first bounce and only done once per render. 
 
+### Components of A Trous Denoiser
+Scene Colors:
+<p align="center">
+  <img src="img/a0.PNG" alt="drawing" width="400" />
+</p>
+
+Scene Positions (relative to camera origin):
+<p align="center">
+  <img src="img/a1.PNG" alt="drawing" width="400" />
+</p>
+
+Scene Normals: 
+<p align="center">
+  <img src="img/a2.PNG" alt="drawing" width="400" />
+</p>
+
+Scene Time to Intersect (unused): 
+<p align="center">
+  <img src="img/a3.PNG" alt="drawing" width="400" />
+</p>
+
+### Denoised Output
+<p align="center">
+  <img src="img/a4.PNG" alt="drawing" width="400" />
+</p>
+
+## Performance Analysis
+In this section, we compare the runtime costs of A Trous denoising, recorded in microseconds. For Simple Scene, we will use `scene/cornell_ceiling_light.txt`. For Slightly Less Simple Scene, we will use `scene/demo.txt`. For the purposes of consistency between runs, the following values will be hardcoded to the window unless otherwise specified: 
+* `Filter Size = 80`
+* `Color Weight = 8.f`
+* `Normal Weight = 1.f`
+* `Position Weight = 1.5f`
+
+### Simple Scene
+Baseline image for reference at 1000 iterations: 
+<p align="center">
+  <img src="img/b0.PNG" alt="drawing" width="800" />
+</p>
+Path trace time for 1000 iterations: 33679.50 ms
+Denoise time for 1000 iterations: 62.97 ms
+
+Path traced image after 10 iterations:
+<p align="center">
+  <img src="img/b1.PNG" alt="drawing" width="800" />
+</p>
+
+Path traced image after 10 iterations with A Trous denoiser:
+<p align="center">
+  <img src="img/b2.PNG" alt="drawing" width="800" />
+</p>
+
+#### Cost of Running Denoiser vs Path Tracer
+| Trial           | 1        | 2        | 3        | 4        | 5        |
+|-----------------|----------|----------|----------|----------|----------|
+| Path Trace Time | 326.05   | 338.88   | 336.49   | 345.53   | 338.01   |
+| Denoise Time    | 62.95    | 64.02    | 62.62    | 64.47    | 64.56    |
+
+Lower is better:
+<p align="center">
+  <img src="img/c0.PNG" alt="drawing" width="400" />
+</p>
+
+#### Cost of Running Denoiser vs Resolution
+| Resolution | 256x256   | 512x512   | 1024x1024 |
+|------------|-----------|-----------|-----------|
+| Time (ms)  | 4.76      | 20.41     | 116.61    |
+
+Lower is better:
+<p align="center">
+  <img src="img/c1.PNG" alt="drawing" width="400" />
+</p>
+
+Lower is better:
+<p align="center">
+  <img src="img/c3.PNG" alt="drawing" width="400" />
+</p>
+* The increase in time is proportional to the increase in pixelcount for smaller resolutions. 
+
+#### Cost of Running Denoiser vs Filter Size
+| Filter Size | 25    | 50    | 100   |
+|-------------|-------|-------|-------|
+| Time (ms)   | 45.66 | 60.41 | 65.53 |
+
+Lower is better:
+<p align="center">
+  <img src="img/c2.PNG" alt="drawing" width="400" />
+</p>
+
+* Note that the increase in time is not substanctial indicating that I am likely calculating the number of iterations required for A Trous denoise incorrectly. 
+
+### Slightly Less Simple Scene
+Baseline image for reference at 1000 iterations: 
+<p align="center">
+  <img src="img/d0.PNG" alt="drawing" width="800" />
+</p>
+Path trace time for 1000 iterations: 237650.00 ms
+Denoise time for 1000 iterations: 86.21 ms
+
+Path traced image after 10 iterations:
+<p align="center">
+  <img src="img/d1.PNG" alt="drawing" width="800" />
+</p>
+
+Path traced image after 10 iterations with A Trous denoiser:
+<p align="center">
+  <img src="img/d2.PNG" alt="drawing" width="800" />
+</p>
+
+#### Cost of Running Denoiser vs Path Tracer
+| Trial           | 1         | 2         | 3         | 4         | 5         |
+|-----------------|-----------|-----------|-----------|-----------|-----------|
+| Path Trace Time | 2431.57   | 2365.28   | 2374.86   | 2480.54   | 2387.57   |
+| Denoise Time    | 86.39     | 87.75     | 87.64     | 87.86     | 85.97     |
+
+Lower is better:
+<p align="center">
+  <img src="img/e0.PNG" alt="drawing" width="400" />
+</p>
+
+#### Cost of Running Denoiser vs Resolution
+| Resolution | 256x256   | 512x512   | 1024x1024 |
+|------------|-----------|-----------|-----------|
+| Time (ms)  | 6.61      | 30.41     | 134.44    |
+
+Lower is better:
+<p align="center">
+  <img src="img/e1.PNG" alt="drawing" width="400" />
+</p>
+
+Lower is better:
+<p align="center">
+  <img src="img/e2.PNG" alt="drawing" width="400" />
+</p>
+* The increase in time is proportional to the increase in pixelcount for tested resolutions. 
+
+#### Cost of Running Denoiser vs Filter Size
+| Filter Size | 25    | 50    | 100   |
+|-------------|-------|-------|-------|
+| Time (ms)   | 64.50 | 82.70 | 91.55 |
+
+Lower is better:
+<p align="center">
+  <img src="img/e3.PNG" alt="drawing" width="400" />
+</p>
+
+* Note that the increase in time is not substanctial indicating that I am likely calculating the number of iterations required for A Trous denoise incorrectly. 
+
+## Visual Analysis
+In this section, we compare visual effects of A Trous denoising. 
+
+### Visual Results vs Filter Size
+Filter size is the two dimensional area around a pixel. That is, for a filter size of 25 and a pixel located at `(i,j)` of the image, the surrounding `5x5` grid centered around the pixel are factored into the weighting A Trous denoiser. 
+* Since I am using a gaussian weighting spread of `5x5`, filter sizes below 25 are not considered because the area is smaller than the gaussian spread. 
+* Filter sizes in `[25, 81)`, are mapped `1:1` to the gaussian spread. Filter sizes in `[81, 100]` are mapped 2:1 to the gaussian spread. That is, the gaussian weight at `(-2, -2)` relative to the pixel will actually be the pixel located at `(-4, -4)` relative to the pixel. Likewise, `(1, 1)` is mapped to `(2, 2)`. 
+* NOTE: that this assumes I understand and implemented filter size correction. Given the observations above (runtime for a filter size of 50 is greater than runtime for a filter size of 25), it is very likely I have not implemented filter size correctly.
+
+`filterSize = 25`:
+<p align="center">
+  <img src="img/f0.PNG" alt="drawing" width="800" />
+</p>
+
+`filterSize = 50`:
+<p align="center">
+  <img src="img/f1.PNG" alt="drawing" width="800" />
+</p>
+
+`filterSize = 75`:
+<p align="center">
+  <img src="img/f2.PNG" alt="drawing" width="800" />
+</p>
+
+`filterSize = 100`:
+<p align="center">
+  <img src="img/f3.PNG" alt="drawing" width="800" />
+</p>
+
+The only substantive change seems to be between `filterSize = 25` and `filterSize = 50`.
+
+### Visual Results vs Material
+From left to right, the spheres are of properties reflective, refractive (with index of 1.5), and diffuse. Scene can be found in file `/scenes/cornell_ceiling_light_plus.txt`.
+
+1000 iterations:
+<p align="center">
+  <img src="img/g0.PNG" alt="drawing" width="800" />
+</p>
+
+10 iterations:
+<p align="center">
+  <img src="img/g1.PNG" alt="drawing" width="800" />
+</p>
+
+10 iterations with A Trous diffuse:
+<p align="center">
+  <img src="img/g2.PNG" alt="drawing" width="800" />
+</p>
+
+With my implementation, A Trous denoiser fails to adequately simulate refractive properties. It is moderately capable of simulating diffuse properties. It is capable of simulating reflective properties. 
+
+### Visual Results vs Lighting
+Small light uses scene: `/scenes/cornell.txt`
+Large light uses scene: `/scenes/cornell_ceiling_light.txt`
+
+Cornell Box with small ceiling light: 
+<p align="center">
+  <img src="img/h0.PNG" alt="drawing" width="800" />
+</p>
+
+Cornell Box with large ceiling light:
+<p align="center">
+  <img src="img/h1.PNG" alt="drawing" width="800" />
+</p>
+
+Cornell Box with small ceiling light and A Trous denoise:
+<p align="center">
+  <img src="img/h2.PNG" alt="drawing" width="800" />
+</p>
+
+Cornell Box with large ceiling light and A Trous denoise:
+<p align="center">
+  <img src="img/h3.PNG" alt="drawing" width="800" />
+</p>
+
+Given poorer lighting conditions, more "holes" are left in the image. As the A Trous denoiser interpolates neighboring colors for each pixel, it emphasizes darker colors leading to poorer results. 
\ No newline at end of file
diff --git a/img/a0.PNG b/img/a0.PNG
new file mode 100644
index 0000000..84c4ddd
Binary files /dev/null and b/img/a0.PNG differ
diff --git a/img/a1.PNG b/img/a1.PNG
new file mode 100644
index 0000000..bb2e760
Binary files /dev/null and b/img/a1.PNG differ
diff --git a/img/a2.PNG b/img/a2.PNG
new file mode 100644
index 0000000..3190dbf
Binary files /dev/null and b/img/a2.PNG differ
diff --git a/img/a3.PNG b/img/a3.PNG
new file mode 100644
index 0000000..81df20b
Binary files /dev/null and b/img/a3.PNG differ
diff --git a/img/a4.PNG b/img/a4.PNG
new file mode 100644
index 0000000..f2f05c9
Binary files /dev/null and b/img/a4.PNG differ
diff --git a/img/b0.PNG b/img/b0.PNG
new file mode 100644
index 0000000..4c55315
Binary files /dev/null and b/img/b0.PNG differ
diff --git a/img/b1.PNG b/img/b1.PNG
new file mode 100644
index 0000000..ff9cf86
Binary files /dev/null and b/img/b1.PNG differ
diff --git a/img/b2.PNG b/img/b2.PNG
new file mode 100644
index 0000000..4638966
Binary files /dev/null and b/img/b2.PNG differ
diff --git a/img/c0.PNG b/img/c0.PNG
new file mode 100644
index 0000000..be94eec
Binary files /dev/null and b/img/c0.PNG differ
diff --git a/img/c1.PNG b/img/c1.PNG
new file mode 100644
index 0000000..a142f83
Binary files /dev/null and b/img/c1.PNG differ
diff --git a/img/c2.PNG b/img/c2.PNG
new file mode 100644
index 0000000..76fc9f6
Binary files /dev/null and b/img/c2.PNG differ
diff --git a/img/c3.PNG b/img/c3.PNG
new file mode 100644
index 0000000..ed9b7d1
Binary files /dev/null and b/img/c3.PNG differ
diff --git a/img/d0.PNG b/img/d0.PNG
new file mode 100644
index 0000000..4022ab2
Binary files /dev/null and b/img/d0.PNG differ
diff --git a/img/d1.PNG b/img/d1.PNG
new file mode 100644
index 0000000..dc9d94c
Binary files /dev/null and b/img/d1.PNG differ
diff --git a/img/d2.PNG b/img/d2.PNG
new file mode 100644
index 0000000..79a2eb6
Binary files /dev/null and b/img/d2.PNG differ
diff --git a/img/e0.PNG b/img/e0.PNG
new file mode 100644
index 0000000..128a81e
Binary files /dev/null and b/img/e0.PNG differ
diff --git a/img/e1.PNG b/img/e1.PNG
new file mode 100644
index 0000000..fb17cfc
Binary files /dev/null and b/img/e1.PNG differ
diff --git a/img/e2.PNG b/img/e2.PNG
new file mode 100644
index 0000000..7e122f5
Binary files /dev/null and b/img/e2.PNG differ
diff --git a/img/e3.PNG b/img/e3.PNG
new file mode 100644
index 0000000..0dd71e8
Binary files /dev/null and b/img/e3.PNG differ
diff --git a/img/f0.PNG b/img/f0.PNG
new file mode 100644
index 0000000..a2b7913
Binary files /dev/null and b/img/f0.PNG differ
diff --git a/img/f1.PNG b/img/f1.PNG
new file mode 100644
index 0000000..a35d9f5
Binary files /dev/null and b/img/f1.PNG differ
diff --git a/img/f2.PNG b/img/f2.PNG
new file mode 100644
index 0000000..4b0c1bb
Binary files /dev/null and b/img/f2.PNG differ
diff --git a/img/f3.PNG b/img/f3.PNG
new file mode 100644
index 0000000..da4cc69
Binary files /dev/null and b/img/f3.PNG differ
diff --git a/img/g0.PNG b/img/g0.PNG
new file mode 100644
index 0000000..ed94eb8
Binary files /dev/null and b/img/g0.PNG differ
diff --git a/img/g1.PNG b/img/g1.PNG
new file mode 100644
index 0000000..b75bfe4
Binary files /dev/null and b/img/g1.PNG differ
diff --git a/img/g2.PNG b/img/g2.PNG
new file mode 100644
index 0000000..095929f
Binary files /dev/null and b/img/g2.PNG differ
diff --git a/img/h0.PNG b/img/h0.PNG
new file mode 100644
index 0000000..4542536
Binary files /dev/null and b/img/h0.PNG differ
diff --git a/img/h1.PNG b/img/h1.PNG
new file mode 100644
index 0000000..4274c97
Binary files /dev/null and b/img/h1.PNG differ
diff --git a/img/h2.PNG b/img/h2.PNG
new file mode 100644
index 0000000..78d2d80
Binary files /dev/null and b/img/h2.PNG differ
diff --git a/img/h3.PNG b/img/h3.PNG
new file mode 100644
index 0000000..7214bcb
Binary files /dev/null and b/img/h3.PNG differ
diff --git a/scenes/cornell.txt b/scenes/cornell.txt
index 83ff820..77ad551 100644
--- a/scenes/cornell.txt
+++ b/scenes/cornell.txt
@@ -52,7 +52,7 @@ EMITTANCE   0
 CAMERA
 RES         800 800
 FOVY        45
-ITERATIONS  5000
+ITERATIONS  10
 DEPTH       8
 FILE        cornell
 EYE         0.0 5 10.5
diff --git a/scenes/cornell_ceiling_light_plus.txt b/scenes/cornell_ceiling_light_plus.txt
new file mode 100644
index 0000000..54a596b
--- /dev/null
+++ b/scenes/cornell_ceiling_light_plus.txt
@@ -0,0 +1,153 @@
+// Emissive material (light)
+MATERIAL 0
+RGB         1 1 1
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   1
+
+// Diffuse white
+MATERIAL 1
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse red
+MATERIAL 2
+RGB         .85 .35 .35
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse green
+MATERIAL 3
+RGB         .35 .85 .35
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Specular white
+MATERIAL 4
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     .98 .98 .98
+REFL        1
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Refractive white
+MATERIAL 5
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     .98 .98 .98
+REFL        0
+REFR        1
+REFRIOR     1.5
+EMITTANCE   0
+
+// Diffuse white
+MATERIAL 6
+RGB         .98 .98 .98
+SPECEX      0
+SPECRGB     .98 .98 .98
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Camera
+CAMERA
+RES         800 800
+FOVY        45
+ITERATIONS  10
+DEPTH       8
+FILE        cornell
+EYE         0.0 5 10.5
+LOOKAT      0 5 0
+UP          0 1 0
+
+
+// Ceiling light
+OBJECT 0
+cube
+material 0
+TRANS       0 10 0
+ROTAT       0 0 0
+SCALE       10 .3 10
+
+// Floor
+OBJECT 1
+cube
+material 1
+TRANS       0 0 0
+ROTAT       0 0 0
+SCALE       10 .01 10
+
+// Ceiling
+OBJECT 2
+cube
+material 1
+TRANS       0 10 0
+ROTAT       0 0 90
+SCALE       .01 10 10
+
+// Back wall
+OBJECT 3
+cube
+material 1
+TRANS       0 5 -5
+ROTAT       0 90 0
+SCALE       .01 10 10
+
+// Left wall
+OBJECT 4
+cube
+material 2
+TRANS       -5 5 0
+ROTAT       0 0 0
+SCALE       .01 10 10
+
+// Right wall
+OBJECT 5
+cube
+material 3
+TRANS       5 5 0
+ROTAT       0 0 0
+SCALE       .01 10 10
+
+// Sphere
+OBJECT 6
+sphere
+material 4
+TRANS       -4 4 -1
+ROTAT       0 0 0
+SCALE       3 3 3
+
+// Sphere
+OBJECT 7
+sphere
+material 5
+TRANS       0 4 -1
+ROTAT       0 0 0
+SCALE       3 3 3
+
+// Sphere
+OBJECT 8
+sphere
+material 6
+TRANS       4 4 -1
+ROTAT       0 0 0
+SCALE       3 3 3
diff --git a/scenes/demo.txt b/scenes/demo.txt
new file mode 100644
index 0000000..5534bba
--- /dev/null
+++ b/scenes/demo.txt
@@ -0,0 +1,528 @@
+/////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////
+
+// Light White
+MATERIAL 0
+RGB         1 1 1
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   7
+
+// Reflective 
+MATERIAL 1
+RGB         1 1 1
+SPECEX      0
+SPECRGB     0 0 0
+REFL        1
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse 
+MATERIAL 2
+RGB         0.35 0.85 0.85
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse 
+MATERIAL 3
+RGB         0.85 0.35 0.85
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse 
+MATERIAL 4
+RGB         0.85 0.85 0.35
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse 
+MATERIAL 5
+RGB         0.35 0.85 0.35
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse 
+MATERIAL 6
+RGB         0.35 0.35 0.85
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Refractive 
+MATERIAL 7
+RGB         1 1 1
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        1
+REFRIOR     1.5
+EMITTANCE   0
+
+// Diffuse 
+MATERIAL 8
+RGB         0.98 0.98 0.98
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse 
+MATERIAL 9
+RGB         0.43 0.34 0.24
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Diffuse 
+MATERIAL 10
+RGB         0.5 0.5 0.5
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+// Light White
+MATERIAL 11
+RGB         1 0.1 1
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   7
+
+// Light White
+MATERIAL 12
+RGB         1 0.1 0.1
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   7
+
+// Light White
+MATERIAL 13
+RGB         0.1 0.1 1
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   7
+
+// Light White
+MATERIAL 14
+RGB         0.1 1 0.1
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0
+REFR        0
+REFRIOR     0
+EMITTANCE   7
+
+// Diffuse Reflective
+MATERIAL 15
+RGB         0.35 0.75 0.35
+SPECEX      0
+SPECRGB     0 0 0
+REFL        0.4
+REFR        0
+REFRIOR     0
+EMITTANCE   0
+
+/////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////
+
+// Camera
+CAMERA
+RES         800 800
+FOVY        45
+ITERATIONS  10
+DEPTH       12
+FILE        cornell
+EYE         0 -10 50
+LOOKAT      0 30 0
+UP          0 1 0
+
+/////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////
+
+// Ceiling light
+OBJECT 0
+cube
+material 11
+TRANS       -20 80 0
+ROTAT       0 0 0
+SCALE       10 1 10
+
+// Ceiling light
+OBJECT 1
+cube
+material 11
+TRANS       20 80 0
+ROTAT       0 0 0
+SCALE       10 1 10
+
+// Ceiling light
+OBJECT 2
+cube
+material 0
+TRANS       0 80 0
+ROTAT       0 0 0
+SCALE       10 1 10
+
+// Ceiling light
+OBJECT 3
+cube
+material 12
+TRANS       -20 80 -20
+ROTAT       0 0 0
+SCALE       10 1 10
+
+// Ceiling light
+OBJECT 4
+cube
+material 13
+TRANS       20 80 -20
+ROTAT       0 0 0
+SCALE       10 1 10
+
+// Ceiling light
+OBJECT 5
+cube
+material 14
+TRANS       0 80 -20
+ROTAT       0 0 0
+SCALE       10 1 10
+
+// Ceiling light
+OBJECT 6
+cube
+material 13
+TRANS       -20 80 20
+ROTAT       0 0 0
+SCALE       10 1 10
+
+// Ceiling light
+OBJECT 7
+cube
+material 12
+TRANS       20 80 20
+ROTAT       0 0 0
+SCALE       10 1 10
+
+// Ceiling light
+OBJECT 8
+cube
+material 14
+TRANS       0 80 20
+ROTAT       0 0 0
+SCALE       10 1 10
+
+/////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////
+
+// Cube 
+OBJECT 8
+cube
+material 7
+TRANS       -12 38.960193405 0
+ROTAT       4 60 53
+SCALE       3.375 3.375 3.375
+
+// Cube 
+OBJECT 9
+cube
+material 1
+TRANS       -12 32.9939799387 0
+ROTAT       42 25 28
+SCALE       5.0625 5.0625 5.0625
+
+// Cube 
+OBJECT 10
+cube
+material 7
+TRANS       -12 24.0446597393 0
+ROTAT       19 63 72
+SCALE       7.59375 7.59375 7.59375
+
+// Cube 
+OBJECT 11
+cube
+material 7
+TRANS       -12 15.0953395399 0
+ROTAT       32 0 58
+SCALE       5.0625 5.0625 5.0625
+
+// Cube 
+OBJECT 12
+cube
+material 7
+TRANS       -12 9.12912607362 0
+ROTAT       80 14 27
+SCALE       3.375 3.375 3.375
+
+// Cube 
+OBJECT 13
+cube
+material 1
+TRANS       -12 5.15165042945 0
+ROTAT       67 69 47
+SCALE       2.25 2.25 2.25
+
+// Cube 
+OBJECT 14
+cube
+material 7
+TRANS       -12 2.5 0
+ROTAT       73 3 57
+SCALE       1.5 1.5 1.5
+
+/////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////
+
+// Cube 
+OBJECT 15
+cube
+material 1
+TRANS       12 47.6954830935 0
+ROTAT       65 29 48
+SCALE       3.375 3.375 3.375
+
+// Cube 
+OBJECT 16
+cube
+material 1
+TRANS       12 41.7292696272 0
+ROTAT       44 16 24
+SCALE       5.0625 5.0625 5.0625
+
+// Cube 
+OBJECT 17
+cube
+material 7
+TRANS       12 35.7630561609 0
+ROTAT       34 41 88
+SCALE       3.375 3.375 3.375
+
+// Cube 
+OBJECT 18
+cube
+material 7
+TRANS       12 31.7855805167 0
+ROTAT       52 59 49
+SCALE       2.25 2.25 2.25
+
+// Cube 
+OBJECT 19
+cube
+material 7
+TRANS       12 29.1339300873 0
+ROTAT       84 70 54
+SCALE       1.5 1.5 1.5
+
+/////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////
+
+// Cube 
+OBJECT 20
+cube
+material 7
+TRANS       0 43.3278382492 12
+ROTAT       18 21 68
+SCALE       7.529536 7.529536 7.529536
+
+// Cube 
+OBJECT 21
+cube
+material 7
+TRANS       0 35.6914949854 12
+ROTAT       71 41 16
+SCALE       5.37824 5.37824 5.37824
+
+// Cube 
+OBJECT 22
+cube
+material 7
+TRANS       0 30.2369640827 12
+ROTAT       36 20 1
+SCALE       3.8416 3.8416 3.8416
+
+// Cube 
+OBJECT 23
+cube
+material 1
+TRANS       0 26.3408705807 12
+ROTAT       87 76 24
+SCALE       2.744 2.744 2.744
+
+// Cube 
+OBJECT 24
+cube
+material 1
+TRANS       0 23.5579466507 12
+ROTAT       34 54 6
+SCALE       1.96 1.96 1.96
+
+// Cube 
+OBJECT 25
+cube
+material 7
+TRANS       0 21.5701438436 12
+ROTAT       62 74 44
+SCALE       1.4 1.4 1.4
+
+/////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////
+
+// Cube 
+OBJECT 26
+cube
+material 7
+TRANS       0 43.3278382492 -12
+ROTAT       55 26 0
+SCALE       7.529536 7.529536 7.529536
+
+// Cube 
+OBJECT 27
+cube
+material 1
+TRANS       0 35.6914949854 -12
+ROTAT       29 75 49
+SCALE       5.37824 5.37824 5.37824
+
+// Cube 
+OBJECT 28
+cube
+material 7
+TRANS       0 30.2369640827 -12
+ROTAT       13 40 53
+SCALE       3.8416 3.8416 3.8416
+
+// Cube 
+OBJECT 29
+cube
+material 7
+TRANS       0 26.3408705807 -12
+ROTAT       82 83 8
+SCALE       2.744 2.744 2.744
+
+// Cube 
+OBJECT 30
+cube
+material 7
+TRANS       0 23.5579466507 -12
+ROTAT       39 88 60
+SCALE       1.96 1.96 1.96
+
+// Cube 
+OBJECT 31
+cube
+material 1
+TRANS       0 21.5701438436 -12
+ROTAT       4 15 84
+SCALE       1.4 1.4 1.4
+
+/////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////
+
+// Cube 
+OBJECT 32
+cube
+material 9
+TRANS       0 53.3278382492 0
+ROTAT       0 0 20
+SCALE       27 1.3 3
+
+// Cube 
+OBJECT 33
+cube
+material 9
+TRANS       0 54.5278382492 0
+ROTAT       0 0 20
+SCALE       3 1.3 27
+
+// Cube 
+OBJECT 34
+cube
+material 8
+TRANS       -12 27.9139191246 0
+ROTAT       0 0 0
+SCALE       0.5 45.8278382492 0.5
+
+// Cube 
+OBJECT 35
+cube
+material 8
+TRANS       12 45.2308841683 0
+ROTAT       0 0 0
+SCALE       0.5 28.1939081619 0.5
+
+// Cube 
+OBJECT 36
+cube
+material 8
+TRANS       0 38.2308841683 12
+ROTAT       0 0 0
+SCALE       0.5 35.1939081619 0.5
+
+// Cube 
+OBJECT 37
+cube
+material 8
+TRANS       0 38.2308841683 -12
+ROTAT       0 0 0
+SCALE       0.5 35.1939081619 0.5
+
+
+
+/////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////
+
+// World
+OBJECT 38
+cube
+material 15
+TRANS       0 -10 0
+ROTAT       0 0 0
+SCALE       200 200 200
\ No newline at end of file
diff --git a/src/interactions.h b/src/interactions.h
index 144a9f5..f11c08f 100644
--- a/src/interactions.h
+++ b/src/interactions.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#define EPSILON_SCALE 10
 #include "intersections.h"
 
 /**
@@ -45,18 +46,52 @@ glm::vec3 calculateRandomDirectionInHemisphere(
  */
 __host__ __device__
 void scatterRay(
-		PathSegment & pathSegment,
-        glm::vec3 intersect,
-        glm::vec3 normal,
-        const Material &m,
-        thrust::default_random_engine &rng) {
-    glm::vec3 newDirection;
-    if (m.hasReflective) {
-        newDirection = glm::reflect(pathSegment.ray.direction, normal);
-    } else {
-        newDirection = calculateRandomDirectionInHemisphere(normal, rng);
+    PathSegment& pathSegment,
+    glm::vec3 intersect,
+    glm::vec3 normal,
+    const Material& m,
+    thrust::default_random_engine& rng) {
+    // TODO: implement this.
+    // A basic implementation of pure-diffuse shading will just call the
+    // calculateRandomDirectionInHemisphere defined above.
+
+    thrust::uniform_real_distribution<float> u01(0, 1);
+    float prob = u01(rng);
+    glm::vec3 rayVec = glm::normalize(pathSegment.ray.direction);
+    glm::vec3 norVec = glm::normalize(normal);
+
+    if (prob < m.hasReflective) {
+        // Reflection
+        pathSegment.ray.origin = intersect + (float)EPSILON * EPSILON_SCALE * norVec;
+        pathSegment.ray.direction = glm::normalize(glm::reflect(rayVec, norVec));
     }
+    else if (prob < (m.hasReflective + m.hasRefractive)) {
+        // Refraction 
+        // Reference: https://raytracing.github.io/books/RayTracingInOneWeekend.html#dielectrics
+        float refractionRatio = (glm::dot(rayVec, norVec) > 0) ? m.indexOfRefraction : (1.0f / m.indexOfRefraction); // inside sphere : outside sphere
 
-    pathSegment.ray.direction = newDirection;
-    pathSegment.ray.origin = intersect + (newDirection * 0.0001f);
-}
+        float cosTheta = glm::min(glm::dot(-1.0f * rayVec, norVec), 1.0f);
+        float sinTheta = sqrt(1.0 - cosTheta * cosTheta);
+        bool cannotRefract = refractionRatio * sinTheta > 1.0f;
+
+        float r0 = (1 - refractionRatio) / (1 + refractionRatio);
+        r0 *= r0;
+        float schlickAppro = r0 + (1 - r0) * pow(1 - cosTheta, 5);
+        bool  schlickBool = schlickAppro > u01(rng);
+
+        pathSegment.ray.origin = intersect + (float)EPSILON * EPSILON_SCALE * rayVec;
+        if (cannotRefract || schlickBool) {
+            pathSegment.ray.direction = glm::reflect(rayVec, norVec);
+        }
+        else {
+            // Snell's Law
+            pathSegment.ray.direction = glm::refract(rayVec, norVec, refractionRatio);
+        }
+    }
+    else {
+        // Diffusion
+        pathSegment.ray.origin = intersect + (float)EPSILON * EPSILON_SCALE * norVec;
+        pathSegment.ray.direction = glm::normalize(calculateRandomDirectionInHemisphere(norVec, rng));
+    }
+    pathSegment.color *= m.color;
+}
\ No newline at end of file
diff --git a/src/main.cpp b/src/main.cpp
index 4092ae4..d7a10d7 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -6,6 +6,13 @@
 #include "../imgui/imgui_impl_glfw.h"
 #include "../imgui/imgui_impl_opengl3.h"
 
+/*******************************************************************
+* TIMER 0 for no timer
+* TIMER 1 for timing path tracer and denoise
+*******************************************************************/
+#define TIMER 1
+
+
 static std::string startTimeString;
 
 // For camera controls
@@ -25,9 +32,9 @@ int lastLoopIterations = 0;
 bool ui_showGbuffer = false;
 bool ui_denoise = false;
 int ui_filterSize = 80;
-float ui_colorWeight = 0.45f;
-float ui_normalWeight = 0.35f;
-float ui_positionWeight = 0.2f;
+float ui_colorWeight = 8.f;
+float ui_normalWeight = 1.f;
+float ui_positionWeight = 1.5f;
 bool ui_saveAndExit = false;
 
 static bool camchanged = true;
@@ -45,6 +52,17 @@ int iteration;
 int width;
 int height;
 
+static float timePT;
+static float timeAT;
+static bool  hasPrinted;
+using StreamCompaction::Common::PerformanceTimer;
+PerformanceTimer& timer()
+{
+    static PerformanceTimer timer;
+    return timer;
+}
+
+
 //-------------------------------
 //-------------MAIN--------------
 //-------------------------------
@@ -144,12 +162,15 @@ void runCuda() {
         cameraPosition += cam.lookAt;
         cam.position = cameraPosition;
         camchanged = false;
+
+        //std::cout << cam.position.x << " " << cam.position.y << " " << cam.position.z << " " << std::endl;
       }
 
     // Map OpenGL buffer object for writing from CUDA on a single GPU
-    // No data is moved (Win & Linux). When mapped to CUDA, OpenGL should not use this buffer
+    // No data is moved (Win & Linux). When mapped to CUDA, OpenGL should not use this buffer
 
     if (iteration == 0) {
+        hasPrinted = false;
         pathtraceFree();
         pathtraceInit(scene);
     }
@@ -157,18 +178,53 @@ void runCuda() {
     uchar4 *pbo_dptr = NULL;
     cudaGLMapBufferObject((void**)&pbo_dptr, pbo);
 
+    int num_paths = -1; 
     if (iteration < ui_iterations) {
         iteration++;
 
         // execute the kernel
         int frame = 0;
-        pathtrace(frame, iteration);
+#if TIMER
+        // Start Timer
+        if (iteration == 1)
+        {
+            timePT = 0.f;
+        }
+        timer().startCpuTimer();
+#endif // TIMER
+        num_paths = pathtrace(frame, iteration, iteration == ui_iterations);
+#if TIMER
+        timer().endCpuTimer();
+        timePT += timer().getCpuElapsedTimeForPreviousOperation();
+        if (iteration == ui_iterations) {
+            std::cout << "Path-trace time for " << iteration << " iterations: " << timePT << "ms" << std::endl;
+        }
+#endif // TIMER
+    }
+
+    if (iteration == ui_iterations) {
+#if TIMER
+        // Start Timer
+        timeAT = 0.f;
+        if (!hasPrinted) {
+            timer().startCpuTimer();
+        }
+#endif // TIMER
+        denoise(ui_filterSize, ui_colorWeight, ui_positionWeight, ui_normalWeight);
+#if TIMER
+        if (!hasPrinted) {
+            hasPrinted = true;
+            timer().endCpuTimer();
+            timeAT += timer().getCpuElapsedTimeForPreviousOperation();
+            std::cout << "Denoise time for " << iteration << " iterations: " << timeAT << "ms\n\n" << std::endl;
+        }
+#endif // TIMER
     }
 
     if (ui_showGbuffer) {
       showGBuffer(pbo_dptr);
     } else {
-      showImage(pbo_dptr, iteration);
+      showImage(pbo_dptr, iteration, ui_denoise);
     }
 
     // unmap buffer object
diff --git a/src/main.h b/src/main.h
index 06d311a..8e3f281 100644
--- a/src/main.h
+++ b/src/main.h
@@ -19,6 +19,7 @@
 #include "pathtrace.h"
 #include "utilities.h"
 #include "scene.h"
+#include "../stream_compaction/common.h"
 
 using namespace std;
 
@@ -46,3 +47,4 @@ void runCuda();
 void keyCallback(GLFWwindow *window, int key, int scancode, int action, int mods);
 void mousePositionCallback(GLFWwindow* window, double xpos, double ypos);
 void mouseButtonCallback(GLFWwindow* window, int button, int action, int mods);
+
diff --git a/src/pathtrace.cu b/src/pathtrace.cu
index 23e5f90..f2dcbd7 100644
--- a/src/pathtrace.cu
+++ b/src/pathtrace.cu
@@ -1,433 +1,584 @@
-#include <cstdio>
-#include <cuda.h>
-#include <cmath>
-#include <thrust/execution_policy.h>
-#include <thrust/random.h>
-#include <thrust/remove.h>
-
-#include "sceneStructs.h"
-#include "scene.h"
-#include "glm/glm.hpp"
-#include "glm/gtx/norm.hpp"
-#include "utilities.h"
-#include "pathtrace.h"
-#include "intersections.h"
-#include "interactions.h"
-
-#define ERRORCHECK 1
-
-#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
-#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
-void checkCUDAErrorFn(const char *msg, const char *file, int line) {
-#if ERRORCHECK
-    cudaDeviceSynchronize();
-    cudaError_t err = cudaGetLastError();
-    if (cudaSuccess == err) {
-        return;
-    }
-
-    fprintf(stderr, "CUDA error");
-    if (file) {
-        fprintf(stderr, " (%s:%d)", file, line);
-    }
-    fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err));
-#  ifdef _WIN32
-    getchar();
-#  endif
-    exit(EXIT_FAILURE);
-#endif
-}
-
-__host__ __device__
-thrust::default_random_engine makeSeededRandomEngine(int iter, int index, int depth) {
-    int h = utilhash((1 << 31) | (depth << 22) | iter) ^ utilhash(index);
-    return thrust::default_random_engine(h);
-}
-
-//Kernel that writes the image to the OpenGL PBO directly.
-__global__ void sendImageToPBO(uchar4* pbo, glm::ivec2 resolution,
-        int iter, glm::vec3* image) {
-    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-
-    if (x < resolution.x && y < resolution.y) {
-        int index = x + (y * resolution.x);
-        glm::vec3 pix = image[index];
-
-        glm::ivec3 color;
-        color.x = glm::clamp((int) (pix.x / iter * 255.0), 0, 255);
-        color.y = glm::clamp((int) (pix.y / iter * 255.0), 0, 255);
-        color.z = glm::clamp((int) (pix.z / iter * 255.0), 0, 255);
-
-        // Each thread writes one pixel location in the texture (textel)
-        pbo[index].w = 0;
-        pbo[index].x = color.x;
-        pbo[index].y = color.y;
-        pbo[index].z = color.z;
-    }
-}
-
-__global__ void gbufferToPBO(uchar4* pbo, glm::ivec2 resolution, GBufferPixel* gBuffer) {
-    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-
-    if (x < resolution.x && y < resolution.y) {
-        int index = x + (y * resolution.x);
-        float timeToIntersect = gBuffer[index].t * 256.0;
-
-        pbo[index].w = 0;
-        pbo[index].x = timeToIntersect;
-        pbo[index].y = timeToIntersect;
-        pbo[index].z = timeToIntersect;
-    }
-}
-
-static Scene * hst_scene = NULL;
-static glm::vec3 * dev_image = NULL;
-static Geom * dev_geoms = NULL;
-static Material * dev_materials = NULL;
-static PathSegment * dev_paths = NULL;
-static ShadeableIntersection * dev_intersections = NULL;
-static GBufferPixel* dev_gBuffer = NULL;
-// TODO: static variables for device memory, any extra info you need, etc
-// ...
-
-void pathtraceInit(Scene *scene) {
-    hst_scene = scene;
-    const Camera &cam = hst_scene->state.camera;
-    const int pixelcount = cam.resolution.x * cam.resolution.y;
-
-    cudaMalloc(&dev_image, pixelcount * sizeof(glm::vec3));
-    cudaMemset(dev_image, 0, pixelcount * sizeof(glm::vec3));
-
-  	cudaMalloc(&dev_paths, pixelcount * sizeof(PathSegment));
-
-  	cudaMalloc(&dev_geoms, scene->geoms.size() * sizeof(Geom));
-  	cudaMemcpy(dev_geoms, scene->geoms.data(), scene->geoms.size() * sizeof(Geom), cudaMemcpyHostToDevice);
-
-  	cudaMalloc(&dev_materials, scene->materials.size() * sizeof(Material));
-  	cudaMemcpy(dev_materials, scene->materials.data(), scene->materials.size() * sizeof(Material), cudaMemcpyHostToDevice);
-
-  	cudaMalloc(&dev_intersections, pixelcount * sizeof(ShadeableIntersection));
-  	cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
-
-    cudaMalloc(&dev_gBuffer, pixelcount * sizeof(GBufferPixel));
-
-    // TODO: initialize any extra device memeory you need
-
-    checkCUDAError("pathtraceInit");
-}
-
-void pathtraceFree() {
-    cudaFree(dev_image);  // no-op if dev_image is null
-  	cudaFree(dev_paths);
-  	cudaFree(dev_geoms);
-  	cudaFree(dev_materials);
-  	cudaFree(dev_intersections);
-    cudaFree(dev_gBuffer);
-    // TODO: clean up any extra device memory you created
-
-    checkCUDAError("pathtraceFree");
-}
-
-/**
-* Generate PathSegments with rays from the camera through the screen into the
-* scene, which is the first bounce of rays.
-*
-* Antialiasing - add rays for sub-pixel sampling
-* motion blur - jitter rays "in time"
-* lens effect - jitter ray origin positions based on a lens
-*/
-__global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, PathSegment* pathSegments)
-{
-	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-	int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-
-	if (x < cam.resolution.x && y < cam.resolution.y) {
-		int index = x + (y * cam.resolution.x);
-		PathSegment & segment = pathSegments[index];
-
-		segment.ray.origin = cam.position;
-    segment.color = glm::vec3(1.0f, 1.0f, 1.0f);
-
-		segment.ray.direction = glm::normalize(cam.view
-			- cam.right * cam.pixelLength.x * ((float)x - (float)cam.resolution.x * 0.5f)
-			- cam.up * cam.pixelLength.y * ((float)y - (float)cam.resolution.y * 0.5f)
-			);
-
-		segment.pixelIndex = index;
-		segment.remainingBounces = traceDepth;
-	}
-}
-
-__global__ void computeIntersections(
-	int depth
-	, int num_paths
-	, PathSegment * pathSegments
-	, Geom * geoms
-	, int geoms_size
-	, ShadeableIntersection * intersections
-	)
-{
-	int path_index = blockIdx.x * blockDim.x + threadIdx.x;
-
-	if (path_index < num_paths)
-	{
-		PathSegment pathSegment = pathSegments[path_index];
-
-		float t;
-		glm::vec3 intersect_point;
-		glm::vec3 normal;
-		float t_min = FLT_MAX;
-		int hit_geom_index = -1;
-		bool outside = true;
-
-		glm::vec3 tmp_intersect;
-		glm::vec3 tmp_normal;
-
-		// naive parse through global geoms
-
-		for (int i = 0; i < geoms_size; i++)
-		{
-			Geom & geom = geoms[i];
-
-			if (geom.type == CUBE)
-			{
-				t = boxIntersectionTest(geom, pathSegment.ray, tmp_intersect, tmp_normal, outside);
-			}
-			else if (geom.type == SPHERE)
-			{
-				t = sphereIntersectionTest(geom, pathSegment.ray, tmp_intersect, tmp_normal, outside);
-			}
-
-			// Compute the minimum t from the intersection tests to determine what
-			// scene geometry object was hit first.
-			if (t > 0.0f && t_min > t)
-			{
-				t_min = t;
-				hit_geom_index = i;
-				intersect_point = tmp_intersect;
-				normal = tmp_normal;
-			}
-		}
-
-		if (hit_geom_index == -1)
-		{
-			intersections[path_index].t = -1.0f;
-		}
-		else
-		{
-			//The ray hits something
-			intersections[path_index].t = t_min;
-			intersections[path_index].materialId = geoms[hit_geom_index].materialid;
-			intersections[path_index].surfaceNormal = normal;
-		}
-	}
-}
-
-__global__ void shadeSimpleMaterials (
-  int iter
-  , int num_paths
-	, ShadeableIntersection * shadeableIntersections
-	, PathSegment * pathSegments
-	, Material * materials
-	)
-{
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < num_paths)
-  {
-    ShadeableIntersection intersection = shadeableIntersections[idx];
-    PathSegment segment = pathSegments[idx];
-    if (segment.remainingBounces == 0) {
-      return;
-    }
-
-    if (intersection.t > 0.0f) { // if the intersection exists...
-      segment.remainingBounces--;
-      // Set up the RNG
-      thrust::default_random_engine rng = makeSeededRandomEngine(iter, idx, segment.remainingBounces);
-
-      Material material = materials[intersection.materialId];
-      glm::vec3 materialColor = material.color;
-
-      // If the material indicates that the object was a light, "light" the ray
-      if (material.emittance > 0.0f) {
-        segment.color *= (materialColor * material.emittance);
-        segment.remainingBounces = 0;
-      }
-      else {
-        segment.color *= materialColor;
-        glm::vec3 intersectPos = intersection.t * segment.ray.direction + segment.ray.origin;
-        scatterRay(segment, intersectPos, intersection.surfaceNormal, material, rng);
-      }
-    // If there was no intersection, color the ray black.
-    // Lots of renderers use 4 channel color, RGBA, where A = alpha, often
-    // used for opacity, in which case they can indicate "no opacity".
-    // This can be useful for post-processing and image compositing.
-    } else {
-      segment.color = glm::vec3(0.0f);
-      segment.remainingBounces = 0;
-    }
-
-    pathSegments[idx] = segment;
-  }
-}
-
-__global__ void generateGBuffer (
-  int num_paths,
-  ShadeableIntersection* shadeableIntersections,
-	PathSegment* pathSegments,
-  GBufferPixel* gBuffer) {
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < num_paths)
-  {
-    gBuffer[idx].t = shadeableIntersections[idx].t;
-  }
-}
-
-// Add the current iteration's output to the overall image
-__global__ void finalGather(int nPaths, glm::vec3 * image, PathSegment * iterationPaths)
-{
-	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
-
-	if (index < nPaths)
-	{
-		PathSegment iterationPath = iterationPaths[index];
-		image[iterationPath.pixelIndex] += iterationPath.color;
-	}
-}
-
-/**
- * Wrapper for the __global__ call that sets up the kernel calls and does a ton
- * of memory management
- */
-void pathtrace(int frame, int iter) {
-    const int traceDepth = hst_scene->state.traceDepth;
-    const Camera &cam = hst_scene->state.camera;
-    const int pixelcount = cam.resolution.x * cam.resolution.y;
-
-	// 2D block for generating ray from camera
-    const dim3 blockSize2d(8, 8);
-    const dim3 blocksPerGrid2d(
-            (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x,
-            (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
-
-	// 1D block for path tracing
-	const int blockSize1d = 128;
-
-    ///////////////////////////////////////////////////////////////////////////
-
-    // Pathtracing Recap:
-    // * Initialize array of path rays (using rays that come out of the camera)
-    //   * You can pass the Camera object to that kernel.
-    //   * Each path ray must carry at minimum a (ray, color) pair,
-    //   * where color starts as the multiplicative identity, white = (1, 1, 1).
-    //   * This has already been done for you.
-    // * NEW: For the first depth, generate geometry buffers (gbuffers)
-    // * For each depth:
-    //   * Compute an intersection in the scene for each path ray.
-    //     A very naive version of this has been implemented for you, but feel
-    //     free to add more primitives and/or a better algorithm.
-    //     Currently, intersection distance is recorded as a parametric distance,
-    //     t, or a "distance along the ray." t = -1.0 indicates no intersection.
-    //     * Color is attenuated (multiplied) by reflections off of any object
-    //   * Stream compact away all of the terminated paths.
-    //     You may use either your implementation or `thrust::remove_if` or its
-    //     cousins.
-    //     * Note that you can't really use a 2D kernel launch any more - switch
-    //       to 1D.
-    //   * Shade the rays that intersected something or didn't bottom out.
-    //     That is, color the ray by performing a color computation according
-    //     to the shader, then generate a new ray to continue the ray path.
-    //     We recommend just updating the ray's PathSegment in place.
-    //     Note that this step may come before or after stream compaction,
-    //     since some shaders you write may also cause a path to terminate.
-    // * Finally:
-    //     * if not denoising, add this iteration's results to the image
-    //     * TODO: if denoising, run kernels that take both the raw pathtraced result and the gbuffer, and put the result in the "pbo" from opengl
-
-	generateRayFromCamera <<<blocksPerGrid2d, blockSize2d >>>(cam, iter, traceDepth, dev_paths);
-	checkCUDAError("generate camera ray");
-
-	int depth = 0;
-	PathSegment* dev_path_end = dev_paths + pixelcount;
-	int num_paths = dev_path_end - dev_paths;
-
-	// --- PathSegment Tracing Stage ---
-	// Shoot ray into scene, bounce between objects, push shading chunks
-
-  // Empty gbuffer
-  cudaMemset(dev_gBuffer, 0, pixelcount * sizeof(GBufferPixel));
-
-	// clean shading chunks
-	cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
-
-  bool iterationComplete = false;
-	while (!iterationComplete) {
-
-	// tracing
-	dim3 numblocksPathSegmentTracing = (num_paths + blockSize1d - 1) / blockSize1d;
-	computeIntersections <<<numblocksPathSegmentTracing, blockSize1d>>> (
-		depth
-		, num_paths
-		, dev_paths
-		, dev_geoms
-		, hst_scene->geoms.size()
-		, dev_intersections
-		);
-	checkCUDAError("trace one bounce");
-	cudaDeviceSynchronize();
-
-  if (depth == 0) {
-    generateGBuffer<<<numblocksPathSegmentTracing, blockSize1d>>>(num_paths, dev_intersections, dev_paths, dev_gBuffer);
-  }
-
-	depth++;
-
-  shadeSimpleMaterials<<<numblocksPathSegmentTracing, blockSize1d>>> (
-    iter,
-    num_paths,
-    dev_intersections,
-    dev_paths,
-    dev_materials
-  );
-  iterationComplete = depth == traceDepth;
-	}
-
-  // Assemble this iteration and apply it to the image
-  dim3 numBlocksPixels = (pixelcount + blockSize1d - 1) / blockSize1d;
-	finalGather<<<numBlocksPixels, blockSize1d>>>(num_paths, dev_image, dev_paths);
-
-    ///////////////////////////////////////////////////////////////////////////
-
-    // CHECKITOUT: use dev_image as reference if you want to implement saving denoised images.
-    // Otherwise, screenshots are also acceptable.
-    // Retrieve image from GPU
-    cudaMemcpy(hst_scene->state.image.data(), dev_image,
-            pixelcount * sizeof(glm::vec3), cudaMemcpyDeviceToHost);
-
-    checkCUDAError("pathtrace");
-}
-
-// CHECKITOUT: this kernel "post-processes" the gbuffer/gbuffers into something that you can visualize for debugging.
-void showGBuffer(uchar4* pbo) {
-    const Camera &cam = hst_scene->state.camera;
-    const dim3 blockSize2d(8, 8);
-    const dim3 blocksPerGrid2d(
-            (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x,
-            (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
-
-    // CHECKITOUT: process the gbuffer results and send them to OpenGL buffer for visualization
-    gbufferToPBO<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, dev_gBuffer);
-}
-
-void showImage(uchar4* pbo, int iter) {
-const Camera &cam = hst_scene->state.camera;
-    const dim3 blockSize2d(8, 8);
-    const dim3 blocksPerGrid2d(
-            (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x,
-            (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
-
-    // Send results to OpenGL buffer for rendering
-    sendImageToPBO<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, iter, dev_image);
-}
+#include <cstdio>
+#include <cuda.h>
+#include <cmath>
+#include <thrust/execution_policy.h>
+#include <thrust/random.h>
+#include <thrust/remove.h>
+
+#include "sceneStructs.h"
+#include "scene.h"
+#include "glm/glm.hpp"
+#include "glm/gtx/norm.hpp"
+#include "utilities.h"
+#include "pathtrace.h"
+#include "intersections.h"
+#include "interactions.h"
+
+#define ERRORCHECK 0
+
+/*******************************************************************
+* GBUFFER_RENDER 0 for gbufferToPBO to render intersects as color
+* GBUFFER_RENDER 1 for gbufferToPBO to render positions as color
+* GBUFFER_RENDER 2 for gbufferToPBO to render normals as color
+*******************************************************************/
+#define GBUFFER_RENDER 0
+
+/*******************************************************************
+* BLOCK_LENGTH  is 1D lenght of blocks per grid
+*******************************************************************/
+#define BLOCK_LENGTH  8
+// TODO: update this to be variable length given FILTER_LENGTH
+const float gaussian[25] = {
+    0.003765, 0.015019, 0.023792, 0.015019, 0.003765,
+    0.015019, 0.059912, 0.094907, 0.059912, 0.015019,
+    0.023792, 0.094907, 0.150342, 0.094907, 0.023792,
+    0.015019, 0.059912, 0.094907, 0.059912, 0.015019,
+    0.003765, 0.015019, 0.023792, 0.015019, 0.003765 };
+
+#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
+#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
+void checkCUDAErrorFn(const char *msg, const char *file, int line) {
+#if ERRORCHECK
+    cudaDeviceSynchronize();
+    cudaError_t err = cudaGetLastError();
+    if (cudaSuccess == err) {
+        return;
+    }
+
+    fprintf(stderr, "CUDA error");
+    if (file) {
+        fprintf(stderr, " (%s:%d)", file, line);
+    }
+    fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err));
+#  ifdef _WIN32
+    getchar();
+#  endif
+    exit(EXIT_FAILURE);
+#endif
+}
+
+__host__ __device__
+thrust::default_random_engine makeSeededRandomEngine(int iter, int index, int depth) {
+    int h = utilhash((1 << 31) | (depth << 22) | iter) ^ utilhash(index);
+    return thrust::default_random_engine(h);
+}
+
+//Kernel that writes the image to the OpenGL PBO directly.
+__global__ void sendImageToPBO(uchar4* pbo, glm::ivec2 resolution, int iter, glm::vec3* image) 
+{
+    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+
+    if (x < resolution.x && y < resolution.y) {
+        int index = x + (y * resolution.x);
+        glm::vec3 pix = image[index];
+
+        glm::ivec3 color;
+        color.x = glm::clamp((int) (pix.x / iter * 255.0), 0, 255);
+        color.y = glm::clamp((int) (pix.y / iter * 255.0), 0, 255);
+        color.z = glm::clamp((int) (pix.z / iter * 255.0), 0, 255);
+
+        // Each thread writes one pixel location in the texture (textel)
+        pbo[index].w = 0;
+        pbo[index].x = color.x;
+        pbo[index].y = color.y;
+        pbo[index].z = color.z;
+    }
+}
+
+__global__ void gbufferToPBO(uchar4* pbo, glm::ivec2 resolution, GBufferPixel* gBuffer) {
+    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+
+    if (x < resolution.x && y < resolution.y) {
+        int index = x + (y * resolution.x);
+
+#if   GBUFFER_RENDER == 0
+        // Intersect 
+        float timeToIntersect = gBuffer[index].t * 255.f;
+        pbo[index].w = 0;
+        pbo[index].x = timeToIntersect;
+        pbo[index].y = timeToIntersect;
+        pbo[index].z = timeToIntersect;
+#elif GBUFFER_RENDER == 1
+        glm::vec4 posToColor = glm::vec4(glm::normalize(glm::abs(gBuffer[index].pos)) * 255.f, 0.f);
+        pbo[index].w = posToColor.w;
+        pbo[index].x = posToColor.x;
+        pbo[index].y = posToColor.y;
+        pbo[index].z = posToColor.z;
+#elif GBUFFER_RENDER == 2
+        glm::vec4 norToColor = glm::vec4(glm::normalize(glm::abs(gBuffer[index].nor)) * 255.f, 0.f);
+        pbo[index].w = norToColor.w;
+        pbo[index].x = norToColor.x;
+        pbo[index].y = norToColor.y;
+        pbo[index].z = norToColor.z;
+#endif // GBUFFER_RENDER 
+    }
+}
+
+static Scene* hst_scene = NULL;
+static glm::vec3* dev_image = NULL;
+static glm::vec3* dev_imageDenoise = NULL; 
+static glm::vec3* dev_imageDenoiseDup = NULL;
+static Geom* dev_geoms = NULL;
+static Material* dev_materials = NULL;
+static PathSegment* dev_paths = NULL;
+static ShadeableIntersection* dev_intersections = NULL;
+static GBufferPixel* dev_gBuffer = NULL;
+static float* dev_gaussian = NULL;
+// TODO: static variables for device memory, any extra info you need, etc
+// ...
+
+void pathtraceInit(Scene *scene) {
+    hst_scene = scene;
+    const Camera &cam = hst_scene->state.camera;
+    const int pixelcount = cam.resolution.x * cam.resolution.y;
+
+    cudaMalloc(&dev_image, pixelcount * sizeof(glm::vec3));
+    cudaMemset(dev_image, 0, pixelcount * sizeof(glm::vec3));
+
+    cudaMalloc(&dev_imageDenoise, pixelcount * sizeof(glm::vec3));
+    cudaMemset(dev_imageDenoise, 0, pixelcount * sizeof(glm::vec3));
+
+    cudaMalloc(&dev_imageDenoiseDup, pixelcount * sizeof(glm::vec3));
+    cudaMemset(dev_imageDenoiseDup, 0, pixelcount * sizeof(glm::vec3));
+
+  	cudaMalloc(&dev_paths, pixelcount * sizeof(PathSegment));
+
+  	cudaMalloc(&dev_geoms, scene->geoms.size() * sizeof(Geom));
+  	cudaMemcpy(dev_geoms, scene->geoms.data(), scene->geoms.size() * sizeof(Geom), cudaMemcpyHostToDevice);
+
+  	cudaMalloc(&dev_materials, scene->materials.size() * sizeof(Material));
+  	cudaMemcpy(dev_materials, scene->materials.data(), scene->materials.size() * sizeof(Material), cudaMemcpyHostToDevice);
+
+  	cudaMalloc(&dev_intersections, pixelcount * sizeof(ShadeableIntersection));
+  	cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
+
+    cudaMalloc(&dev_gBuffer, pixelcount * sizeof(GBufferPixel));
+
+    cudaMalloc(&dev_gaussian, 25 * sizeof(float));
+    cudaMemcpy(dev_gaussian, gaussian, 25 * sizeof(float), cudaMemcpyHostToDevice);
+
+    // TODO: initialize any extra device memeory you need
+
+    checkCUDAError("pathtraceInit");
+}
+
+void pathtraceFree() {
+    cudaFree(dev_image);  // no-op if dev_image is null
+    cudaFree(dev_imageDenoise);
+    cudaFree(dev_imageDenoiseDup);
+  	cudaFree(dev_paths);
+  	cudaFree(dev_geoms);
+  	cudaFree(dev_materials);
+  	cudaFree(dev_intersections);
+    cudaFree(dev_gBuffer);
+    cudaFree(dev_gaussian);
+    // TODO: clean up any extra device memory you created
+
+    checkCUDAError("pathtraceFree");
+}
+
+/**
+* Generate PathSegments with rays from the camera through the screen into the
+* scene, which is the first bounce of rays.
+*
+* Antialiasing - add rays for sub-pixel sampling
+* motion blur - jitter rays "in time"
+* lens effect - jitter ray origin positions based on a lens
+*/
+__global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, PathSegment* pathSegments)
+{
+	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+	int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+
+	if (x < cam.resolution.x && y < cam.resolution.y) {
+		int index = x + (y * cam.resolution.x);
+		PathSegment & segment = pathSegments[index];
+
+        thrust::default_random_engine rng = makeSeededRandomEngine(iter, index, 0);
+        thrust::uniform_real_distribution<float> uhh(-0.5f, 0.5f);
+
+		segment.ray.origin = cam.position;
+        segment.color = glm::vec3(1.0f, 1.0f, 1.0f);
+
+        segment.ray.direction = glm::normalize(
+            cam.view -
+            cam.right * cam.pixelLength.x * ((float)x + uhh(rng) - (float)cam.resolution.x * 0.5f) -
+            cam.up * cam.pixelLength.y * ((float)y + uhh(rng) - (float)cam.resolution.y * 0.5f));
+
+		segment.pixelIndex = index;
+		segment.remainingBounces = traceDepth;
+	}
+}
+
+__global__ void computeIntersections(
+	int depth, 
+    int num_paths, 
+    PathSegment * pathSegments,
+    Geom * geoms, 
+    int geoms_size, 
+    ShadeableIntersection * intersections)
+{
+	int path_index = blockIdx.x * blockDim.x + threadIdx.x;
+
+	if (path_index < num_paths)
+	{
+		PathSegment pathSegment = pathSegments[path_index];
+
+		float t;
+		glm::vec3 intersect_point;
+		glm::vec3 normal;
+		float t_min = FLT_MAX;
+		int hit_geom_index = -1;
+		bool outside = true;
+
+		glm::vec3 tmp_intersect;
+		glm::vec3 tmp_normal;
+
+		// naive parse through global geoms
+
+		for (int i = 0; i < geoms_size; i++)
+		{
+			Geom & geom = geoms[i];
+
+			if (geom.type == CUBE)
+			{
+				t = boxIntersectionTest(geom, pathSegment.ray, tmp_intersect, tmp_normal, outside);
+			}
+			else if (geom.type == SPHERE)
+			{
+				t = sphereIntersectionTest(geom, pathSegment.ray, tmp_intersect, tmp_normal, outside);
+			}
+
+			// Compute the minimum t from the intersection tests to determine what
+			// scene geometry object was hit first.
+			if (t > 0.0f && t_min > t)
+			{
+				t_min = t;
+				hit_geom_index = i;
+				intersect_point = tmp_intersect;
+				normal = tmp_normal;
+			}
+		}
+
+		if (hit_geom_index == -1)
+		{
+			intersections[path_index].t = -1.0f;
+		}
+		else
+		{
+			//The ray hits something
+			intersections[path_index].t = t_min;
+			intersections[path_index].materialId = geoms[hit_geom_index].materialid;
+			intersections[path_index].surfaceNormal = normal;
+		}
+	}
+}
+
+__global__ void shadeSimpleMaterials (
+    int iter, 
+    int num_paths, 
+    ShadeableIntersection * shadeableIntersections, 
+    PathSegment * pathSegments, 
+    Material * materials)
+{
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < num_paths)
+  {
+    ShadeableIntersection intersection = shadeableIntersections[idx];
+    PathSegment segment = pathSegments[idx];
+    if (segment.remainingBounces == 0) {
+      return;
+    }
+
+    if (intersection.t > 0.0f) { // if the intersection exists...
+      segment.remainingBounces--;
+      // Set up the RNG
+      thrust::default_random_engine rng = makeSeededRandomEngine(iter, idx, segment.remainingBounces);
+
+      Material material = materials[intersection.materialId];
+      glm::vec3 materialColor = material.color;
+
+      // If the material indicates that the object was a light, "light" the ray
+      if (material.emittance > 0.0f) {
+        segment.color *= (materialColor * material.emittance);
+        segment.remainingBounces = 0;
+      }
+      else {
+        glm::vec3 intersectPos = intersection.t * segment.ray.direction + segment.ray.origin;
+        scatterRay(segment, intersectPos, intersection.surfaceNormal, material, rng);
+      }
+    // If there was no intersection, color the ray black.
+    // Lots of renderers use 4 channel color, RGBA, where A = alpha, often
+    // used for opacity, in which case they can indicate "no opacity".
+    // This can be useful for post-processing and image compositing.
+    } else {
+      segment.color = glm::vec3(0.0f);
+      segment.remainingBounces = 0;
+    }
+
+    pathSegments[idx] = segment;
+  }
+}
+
+__global__ void generateGBuffer (
+    int num_paths,
+    ShadeableIntersection* shadeableIntersections,
+    PathSegment* pathSegments,
+    GBufferPixel* gBuffer) 
+{
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index < num_paths)
+    {
+        ShadeableIntersection& i = shadeableIntersections[index];
+        Ray&                   r = pathSegments[index].ray;
+
+        gBuffer[index].t   = i.t;
+        gBuffer[index].pos = r.origin + i.t * r.direction;
+        gBuffer[index].nor = i.surfaceNormal;
+
+    }
+}
+
+// Add the current iteration's output to the overall image
+__global__ void finalGather(int nPaths, glm::vec3 * image, PathSegment * iterationPaths)
+{
+	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+	if (index < nPaths)
+	{
+		PathSegment iterationPath = iterationPaths[index];
+		image[iterationPath.pixelIndex] += iterationPath.color;
+	}
+
+    //showImage(uchar4 * pbo, int iter, bool ui_denoise)
+}
+    
+/**
+ * Wrapper for the __global__ call that sets up the kernel calls and does a ton
+ * of memory management
+ */
+int pathtrace(int frame, int iter, int lastIter) {
+    const int traceDepth = hst_scene->state.traceDepth;
+    const Camera &cam = hst_scene->state.camera;
+    const int pixelcount = cam.resolution.x * cam.resolution.y;
+
+	// 2D block for generating ray from camera
+    const dim3 blockSize2d(8, 8);
+    const dim3 blocksPerGrid2d(
+            (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x,
+            (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
+
+	// 1D block for path tracing
+	const int blockSize1d = 128;
+
+    ///////////////////////////////////////////////////////////////////////////
+
+    // Pathtracing Recap:
+    // * Initialize array of path rays (using rays that come out of the camera)
+    //   * You can pass the Camera object to that kernel.
+    //   * Each path ray must carry at minimum a (ray, color) pair,
+    //   * where color starts as the multiplicative identity, white = (1, 1, 1).
+    //   * This has already been done for you.
+    // * NEW: For the first depth, generate geometry buffers (gbuffers)
+    // * For each depth:
+    //   * Compute an intersection in the scene for each path ray.
+    //     A very naive version of this has been implemented for you, but feel
+    //     free to add more primitives and/or a better algorithm.
+    //     Currently, intersection distance is recorded as a parametric distance,
+    //     t, or a "distance along the ray." t = -1.0 indicates no intersection.
+    //     * Color is attenuated (multiplied) by reflections off of any object
+    //   * Stream compact away all of the terminated paths.
+    //     You may use either your implementation or `thrust::remove_if` or its
+    //     cousins.
+    //     * Note that you can't really use a 2D kernel launch any more - switch
+    //       to 1D.
+    //   * Shade the rays that intersected something or didn't bottom out.
+    //     That is, color the ray by performing a color computation according
+    //     to the shader, then generate a new ray to continue the ray path.
+    //     We recommend just updating the ray's PathSegment in place.
+    //     Note that this step may come before or after stream compaction,
+    //     since some shaders you write may also cause a path to terminate.
+    // * Finally:
+    //     * if not denoising, add this iteration's results to the image
+    //     * TODO: if denoising, run kernels that take both the raw pathtraced result and the gbuffer, and put the result in the "pbo" from opengl
+
+    generateRayFromCamera <<<blocksPerGrid2d, blockSize2d >>>(cam, iter, traceDepth, dev_paths);
+    checkCUDAError("generate camera ray");
+
+    int depth = 0;
+    PathSegment* dev_path_end = dev_paths + pixelcount;
+    int num_paths = dev_path_end - dev_paths;
+    int ret = (lastIter) ? num_paths : -1; 
+
+    // --- PathSegment Tracing Stage ---
+    // Shoot ray into scene, bounce between objects, push shading chunks
+
+    // Empty gbuffer
+    cudaMemset(dev_gBuffer, 0, pixelcount * sizeof(GBufferPixel));
+
+    // clean shading chunks
+    cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
+
+    bool iterationComplete = false;
+	while (!iterationComplete) {
+
+	    dim3 numblocksPathSegmentTracing = (num_paths + blockSize1d - 1) / blockSize1d;
+	    computeIntersections <<<numblocksPathSegmentTracing, blockSize1d>>> (
+		    depth, 
+            num_paths, 
+            dev_paths, 
+            dev_geoms, 
+            hst_scene->geoms.size(), 
+            dev_intersections);
+	    checkCUDAError("trace one bounce");
+	    cudaDeviceSynchronize();
+
+        if (depth == 0 && lastIter) {
+            generateGBuffer<<<numblocksPathSegmentTracing, blockSize1d>>>(num_paths, dev_intersections, dev_paths, dev_gBuffer);
+        }
+        depth++;
+
+        shadeSimpleMaterials<<<numblocksPathSegmentTracing, blockSize1d>>> (
+            iter,
+            num_paths,
+            dev_intersections,
+            dev_paths,
+            dev_materials);
+        iterationComplete = depth == traceDepth;
+
+	}
+
+    // Assemble this iteration and apply it to the image
+    dim3 numBlocksPixels = (pixelcount + blockSize1d - 1) / blockSize1d;
+    finalGather<<<numBlocksPixels, blockSize1d>>>(num_paths, dev_image, dev_paths);
+    ///////////////////////////////////////////////////////////////////////////
+
+    // CHECKITOUT: use dev_image as reference if you want to implement saving denoised images.
+    // Otherwise, screenshots are also acceptable.
+    // Retrieve image from GPU
+    cudaMemcpy(hst_scene->state.image.data(), dev_image,
+            pixelcount * sizeof(glm::vec3), cudaMemcpyDeviceToHost);
+
+    checkCUDAError("pathtrace");
+    cudaDeviceSynchronize();
+    return ret;
+}
+
+// CHECKITOUT: this kernel "post-processes" the gbuffer/gbuffers into something that you can visualize for debugging.
+void showGBuffer(uchar4* pbo) {
+    const Camera &cam = hst_scene->state.camera;
+    const dim3 blockSize2d(8, 8);
+    const dim3 blocksPerGrid2d(
+            (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x,
+            (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
+
+    // CHECKITOUT: process the gbuffer results and send them to OpenGL buffer for visualization
+    gbufferToPBO<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, dev_gBuffer);
+}
+
+void showImage(uchar4* pbo, int iter, bool ui_denoise) {
+const Camera &cam = hst_scene->state.camera;
+    const dim3 blockSize2d(8, 8);
+    const dim3 blocksPerGrid2d(
+            (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x,
+            (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
+
+    // Send results to OpenGL buffer for rendering
+    if (ui_denoise) {
+        sendImageToPBO << <blocksPerGrid2d, blockSize2d >> > (pbo, cam.resolution, iter, dev_imageDenoise);
+    }
+    else {
+        sendImageToPBO << <blocksPerGrid2d, blockSize2d >> > (pbo, cam.resolution, iter, dev_image);
+    }
+}
+
+__global__ void denoiseIter(
+    const Camera cam,
+    const int step, 
+    const float c_phi, 
+    const float p_phi,
+    const float n_phi,
+    const float* gaussian, 
+    glm::vec3* imageDenoise, 
+    const glm::vec3* image, 
+    const GBufferPixel* gBuffer) 
+{
+    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+
+    if (x < cam.resolution.x && y < cam.resolution.y) {
+
+        glm::vec3 sumColor = glm::vec3(0.f);
+        float     sumWeight = 0.f;
+
+        int index = x + (y * cam.resolution.x);
+
+        glm::vec3 col = image[index];
+        glm::vec3 pos = gBuffer[index].pos;
+        glm::vec3 nor = gBuffer[index].nor;
+
+        for (int j = -2; j <= 2; j++) {
+
+            //int relY = glm::clamp(y + j * step, 0, cam.resolution.y);
+            int relY = y + j * step; 
+            if (relY < 0 || relY >= cam.resolution.y) continue;
+
+            for (int i = -2; i <= 2; i++) {
+
+                //int relX = glm::clamp(x + i * step, 0, cam.resolution.x);
+                int relX = x + i * step;
+                if (relX < 0 || relX >= cam.resolution.x) continue;
+
+                int relIndex = relX + cam.resolution.x * relY;
+
+                glm::vec3 t; glm::vec3 colTemp; float dist2;
+                // Color weighting
+                colTemp = image[relIndex];
+                t = col - colTemp;
+                dist2 = glm::dot(t, t);
+                float c_w = glm::min(std::exp(-(dist2) / (c_phi + EPSILON)), 1.f);
+
+                // Position weighting  
+                t = pos - gBuffer[relIndex].pos;
+                dist2 = glm::dot(t, t);
+                float p_w = glm::min(std::exp(-(dist2) / (p_phi + EPSILON)), 1.f);
+
+                // Normal weighting 
+                t = nor - gBuffer[relIndex].nor;
+                dist2 = glm::max(glm::dot(t, t), 0.f);
+                float n_w = glm::min(std::exp(-(dist2) / (n_phi + EPSILON)), 1.f);
+
+                //float weight = c_w * c_w * p_w * p_w * n_w * n_w;
+                float weight =  c_w * p_w * n_w;
+                float influence = weight * gaussian[((i + 2) + 5 * (j + 2))];
+                sumColor  += (colTemp * influence);
+                sumWeight += influence;
+            }
+        }
+
+        imageDenoise[index] = sumColor / sumWeight;
+
+    }
+}
+
+void denoise(const int filterSize, const float cPhi, const float pPhi, const float nPhi) {
+
+    if (filterSize < 25) return; 
+    
+    const Camera& cam = hst_scene->state.camera;
+
+    const dim3 blockSize2d(BLOCK_LENGTH, BLOCK_LENGTH);
+    const dim3 blocksPerGrid2d(
+        (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x,
+        (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
+
+    cudaMemcpy(dev_imageDenoiseDup, dev_image, cam.resolution.x * cam.resolution.y * sizeof(glm::vec3), cudaMemcpyDeviceToDevice);
+
+    for (int step = 1; step <= std::floor(std::sqrt(filterSize)); step++) {
+        denoiseIter << <blocksPerGrid2d, blockSize2d >> > (cam, 1 << step, cPhi, pPhi, nPhi, dev_gaussian, dev_imageDenoise, dev_imageDenoiseDup, dev_gBuffer);
+        std::swap(dev_imageDenoise, dev_imageDenoiseDup);
+    }
+    cudaDeviceSynchronize(); 
+}   
\ No newline at end of file
diff --git a/src/pathtrace.h b/src/pathtrace.h
index 9e12f44..5a26cdf 100644
--- a/src/pathtrace.h
+++ b/src/pathtrace.h
@@ -5,6 +5,9 @@
 
 void pathtraceInit(Scene *scene);
 void pathtraceFree();
-void pathtrace(int frame, int iteration);
+int  pathtrace(int frame, int iteration, int lastIter);
 void showGBuffer(uchar4 *pbo);
-void showImage(uchar4 *pbo, int iter);
+void showImage(uchar4 *pbo, int iter, bool ui_denoise);
+
+void denoise(const int filterSize, const float cPhi, const float pPhi, const float nPhi);
+
diff --git a/src/sceneStructs.h b/src/sceneStructs.h
index da7e558..7f1ddcb 100644
--- a/src/sceneStructs.h
+++ b/src/sceneStructs.h
@@ -78,5 +78,8 @@ struct ShadeableIntersection {
 // CHECKITOUT - a simple struct for storing scene geometry information per-pixel.
 // What information might be helpful for guiding a denoising filter?
 struct GBufferPixel {
-  float t;
+    float t;
+    glm::vec3 pos; // Positions
+    glm::vec3 nor; // Normals 
+                   // Optional diffuse determined from pathtrace.cu's finalGather
 };
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
new file mode 100644
index 0000000..062427d
--- /dev/null
+++ b/stream_compaction/common.cu
@@ -0,0 +1,40 @@
+#include "common.h"
+
+
+namespace StreamCompaction {
+    namespace Common {
+
+        /**
+         * Maps an array to an array of 0s and 1s for stream compaction. Elements
+         * which map to 0 will be removed, and elements which map to 1 will be kept.
+         */
+        __global__ void kernMapToBoolean(int n, int *bools1, int *bools2, const int *idata) {
+            // TODO
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+
+            int result = idata[index] != 0; 
+            bools1[index] = result;
+            bools2[index] = result; 
+        }
+
+        /**
+         * Performs scatter on an array. That is, for each element in idata,
+         * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]].
+         */
+        __global__ void kernScatter(int n, int *odata,
+                const int *idata, const int *bools, const int *indices) {
+            // TODO
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+
+            if (bools[index]) {
+                odata[indices[index]] = idata[index];
+            }
+        }
+    }
+}
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
new file mode 100644
index 0000000..74fe808
--- /dev/null
+++ b/stream_compaction/common.h
@@ -0,0 +1,129 @@
+#pragma once
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <cstdio>
+#include <cstring>
+#include <cmath>
+#include <algorithm>
+#include <chrono>
+#include <stdexcept>
+
+#define useCommon 1
+
+/**
+ * Check for CUDA errors; print and exit if there was a problem.
+ */
+inline int ilog2(int x) {
+    int lg = 0;
+    while (x >>= 1) {
+        ++lg;
+    }
+    return lg;
+}
+
+inline int ilog2ceil(int x) {
+    return x == 1 ? 0 : ilog2(x - 1) + 1;
+}
+
+namespace StreamCompaction {
+    namespace Common {
+        __global__ void kernMapToBoolean(int n, int *bools1, int *bools2, const int *idata);
+
+        __global__ void kernScatter(int n, int *odata,
+                const int *idata, const int *bools, const int *indices);
+
+        /**
+        * This class is used for timing the performance
+        * Uncopyable and unmovable
+        *
+        * Adapted from WindyDarian(https://github.com/WindyDarian)
+        */
+        class PerformanceTimer
+        {
+        public:
+            PerformanceTimer()
+            {
+                cudaEventCreate(&event_start);
+                cudaEventCreate(&event_end);
+            }
+
+            ~PerformanceTimer()
+            {
+                cudaEventDestroy(event_start);
+                cudaEventDestroy(event_end);
+            }
+
+            void startCpuTimer()
+            {
+                if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); }
+                cpu_timer_started = true;
+
+                time_start_cpu = std::chrono::high_resolution_clock::now();
+            }
+
+            void endCpuTimer()
+            {
+                time_end_cpu = std::chrono::high_resolution_clock::now();
+
+                if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); }
+
+                std::chrono::duration<double, std::milli> duro = time_end_cpu - time_start_cpu;
+                prev_elapsed_time_cpu_milliseconds =
+                    static_cast<decltype(prev_elapsed_time_cpu_milliseconds)>(duro.count());
+
+                cpu_timer_started = false;
+            }
+
+            void startGpuTimer()
+            {
+                if (gpu_timer_started) { throw std::runtime_error("GPU timer already started"); }
+                gpu_timer_started = true;
+
+                cudaEventRecord(event_start);
+            }
+
+            void endGpuTimer()
+            {
+                cudaEventRecord(event_end);
+                cudaEventSynchronize(event_end);
+
+                if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); }
+
+                cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end);
+                gpu_timer_started = false;
+            }
+
+            float getCpuElapsedTimeForPreviousOperation() //noexcept //(damn I need VS 2015
+            {
+                return prev_elapsed_time_cpu_milliseconds;
+            }
+
+            float getGpuElapsedTimeForPreviousOperation() //noexcept
+            {
+                return prev_elapsed_time_gpu_milliseconds;
+            }
+
+            // remove copy and move functions
+            PerformanceTimer(const PerformanceTimer&) = delete;
+            PerformanceTimer(PerformanceTimer&&) = delete;
+            PerformanceTimer& operator=(const PerformanceTimer&) = delete;
+            PerformanceTimer& operator=(PerformanceTimer&&) = delete;
+
+        private:
+            cudaEvent_t event_start = nullptr;
+            cudaEvent_t event_end = nullptr;
+
+            using time_point_t = std::chrono::high_resolution_clock::time_point;
+            time_point_t time_start_cpu;
+            time_point_t time_end_cpu;
+
+            bool cpu_timer_started = false;
+            bool gpu_timer_started = false;
+
+            float prev_elapsed_time_cpu_milliseconds = 0.f;
+            float prev_elapsed_time_gpu_milliseconds = 0.f;
+        };
+    }
+}