diff --git a/README.md b/README.md
index f044c821..ec3abe8f 100644
--- a/README.md
+++ b/README.md
@@ -3,11 +3,56 @@ CUDA Denoiser For CUDA Path Tracer
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 4**
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Yiyang Chen
+  * [LinkedIn](https://www.linkedin.com/in/yiyang-chen-6a7641210/), [personal website](https://cyy0915.github.io/)
+* Tested on: Windows 10, i5-8700k @ 3.7GHz, GTX 1080, personal computer
 
-### (TODO: Your README)
+### G-buffer visualization
 
-*DO NOT* leave the README to the last minute! It is a crucial part of the
-project, and we will not be able to grade you without a good README.
+| normal | position | intersection distance (t) |
+|---|---|---|
+|![](img/gbuffer_normal.png)|![](img/gbuffer_pos.png)|![](img/gbuffer_t.png)|
+
+### Denoising results
+
+| no denoising | naive denoising | edge-avoiding denoising |
+|---|---|---|
+|![](img/no.png)|![](img/naive.png)|![](img/edgeAvoiding.png)|
+
+Note: both denoised images use a filter size of 2, with a color weight of 0.45, a normal weight of 0.35, and a position weight of 0.2.
+
+### Performance analysis
+
+* How much time does denoising add to a render?
+
+  * Rendering takes about 26,000 microseconds per iteration, while denoising with a filter size of 10 takes only about 5 microseconds, so the added cost is negligible.
+
+* How does denoising influence the number of iterations needed to get an "acceptably smooth" result?
+
+![](img/number_of_iteration.png)
+| no denoising (100 iterations) | denoising (20 iterations) |
+|---|---|
+|![](img/100.png)|![](img/20.png)|
+
+The two images above differ slightly, but both are acceptably smooth, so denoising cuts the required iteration count by roughly 5x in this scene.
+
+* How do varying filter sizes affect performance?
+  * Denoising time increases as the filter size grows.
+![](img/filtersize.png)
+
+* How do visual results vary with filter size -- does the visual quality scale uniformly with filter size?
+
+  * A filter size of 1 is much smoother than a filter size of 0 (no denoising), and quality improves slightly as the filter size goes from 1 to 2, 4, and 8. Beyond a filter size of about 10, however, the result looks almost the same, because the weights assigned to far-away pixels become very small. So visual quality does not scale uniformly with filter size.
+
+* How effective/ineffective is this method with different material types?
+
+  * It is effective for materials that have no texture map or whose color is nearly uniform. If the color changes rapidly across a surface while the normals and positions stay similar, the denoiser can blur the material slightly.
+
+* How do results compare across different scenes -- for example, between cornell.txt and cornell_ceiling_light.txt? Does one scene produce better denoised results? Why or why not?
+
+  * cornell_ceiling_light.txt produces better denoised results than cornell.txt at 10 iterations. The reason is the size of the light: a larger light makes the render converge faster, so the denoised result is better as well (see the weight sketch and comparison images below).
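For reference, here is a minimal sketch of how the three sliders above (color, normal, position weights) typically become per-tap edge-stopping weights in an edge-avoiding à-trous filter in the style of Dammertz et al.; the function name and the exact falloff are illustrative and may differ slightly from the kernel in `pathtrace.cu`:

```cuda
// Illustrative sketch, not the exact repo kernel: combine G-buffer differences into
// one weight per filter tap. Smaller sigma => that feature stops the blur more strongly at edges.
__device__ float edgeStoppingWeight(const GBufferPixel& p, const GBufferPixel& q,
                                    float sigmaColor, float sigmaNormal, float sigmaPos) {
    glm::vec3 dc = p.color - q.color;
    glm::vec3 dn = p.normal - q.normal;
    glm::vec3 dp = p.position - q.position;
    float wc = expf(-glm::dot(dc, dc) / fmaxf(sigmaColor * sigmaColor, 1e-6f));
    float wn = expf(-glm::dot(dn, dn) / fmaxf(sigmaNormal * sigmaNormal, 1e-6f));
    float wp = expf(-glm::dot(dp, dp) / fmaxf(sigmaPos * sigmaPos, 1e-6f));
    return wc * wn * wp;  // multiplied with the 5x5 B3-spline kernel weight for each tap
}
```

With weights like these, pixels whose color, normal, or position differ strongly from the center pixel contribute little, which is why the filter smooths noise without blurring across geometric edges.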
+ +| cornell (10 iterations) | cornell_ceiling_light (10 iterations) | +|---|---| +|![](img/cornell.png)|![](img/edgeAvoiding.png)| \ No newline at end of file diff --git a/img/100.png b/img/100.png new file mode 100644 index 00000000..b67de742 Binary files /dev/null and b/img/100.png differ diff --git a/img/20.png b/img/20.png new file mode 100644 index 00000000..287fa860 Binary files /dev/null and b/img/20.png differ diff --git a/img/cornell.png b/img/cornell.png new file mode 100644 index 00000000..77984ce0 Binary files /dev/null and b/img/cornell.png differ diff --git a/img/edgeAvoiding.png b/img/edgeAvoiding.png new file mode 100644 index 00000000..1799364c Binary files /dev/null and b/img/edgeAvoiding.png differ diff --git a/img/filtersize.png b/img/filtersize.png new file mode 100644 index 00000000..f4299d12 Binary files /dev/null and b/img/filtersize.png differ diff --git a/img/gbuffer_normal.png b/img/gbuffer_normal.png new file mode 100644 index 00000000..e5e471ee Binary files /dev/null and b/img/gbuffer_normal.png differ diff --git a/img/gbuffer_pos.png b/img/gbuffer_pos.png new file mode 100644 index 00000000..01363ca6 Binary files /dev/null and b/img/gbuffer_pos.png differ diff --git a/img/gbuffer_t.png b/img/gbuffer_t.png new file mode 100644 index 00000000..fc05b87e Binary files /dev/null and b/img/gbuffer_t.png differ diff --git a/img/naive.png b/img/naive.png new file mode 100644 index 00000000..d9576e89 Binary files /dev/null and b/img/naive.png differ diff --git a/img/no.png b/img/no.png new file mode 100644 index 00000000..2e0ea101 Binary files /dev/null and b/img/no.png differ diff --git a/img/number_of_iteration.png b/img/number_of_iteration.png new file mode 100644 index 00000000..d44ba202 Binary files /dev/null and b/img/number_of_iteration.png differ diff --git a/scenes/cornell.txt b/scenes/cornell.txt index 83ff8202..77ad5512 100644 --- a/scenes/cornell.txt +++ b/scenes/cornell.txt @@ -52,7 +52,7 @@ EMITTANCE 0 CAMERA RES 800 800 FOVY 45 -ITERATIONS 5000 +ITERATIONS 10 DEPTH 8 FILE cornell EYE 0.0 5 10.5 diff --git a/scenes/cornell_ceiling_light.txt b/scenes/cornell_ceiling_light.txt index 15af5f19..72fe9306 100644 --- a/scenes/cornell_ceiling_light.txt +++ b/scenes/cornell_ceiling_light.txt @@ -52,7 +52,7 @@ EMITTANCE 0 CAMERA RES 800 800 FOVY 45 -ITERATIONS 10 +ITERATIONS 20 DEPTH 8 FILE cornell EYE 0.0 5 10.5 diff --git a/scenes/my_cornell.txt b/scenes/my_cornell.txt new file mode 100644 index 00000000..8ea5c4c8 --- /dev/null +++ b/scenes/my_cornell.txt @@ -0,0 +1,148 @@ +// Emissive material (light) +MATERIAL 0 +RGB 1 1 1 +SPECEX 0 +SPECRGB 0 0 0 +REFL 0 +REFR 0 +REFRIOR 0 +EMITTANCE 5 + +// Diffuse white +MATERIAL 1 +RGB .98 .98 .98 +SPECEX 0 +SPECRGB 0 0 0 +REFL 0 +REFR 0 +REFRIOR 0 +EMITTANCE 0 + +// Diffuse red +MATERIAL 2 +RGB .85 .35 .35 +SPECEX 0 +SPECRGB 0 0 0 +REFL 0 +REFR 0 +REFRIOR 0 +EMITTANCE 0 + +// Diffuse green +MATERIAL 3 +RGB .35 .85 .35 +SPECEX 0 +SPECRGB 0 0 0 +REFL 0 +REFR 0 +REFRIOR 0 +EMITTANCE 0 + +// Specular white 1 +MATERIAL 4 +RGB .98 .98 .98 +SPECEX 0 +SPECRGB .98 .98 .98 +REFL 1 +REFR 0 +REFRIOR 0 +EMITTANCE 0 + +// Specular white 2 +MATERIAL 5 +RGB .98 .98 .98 +SPECEX 0 +SPECRGB .98 .98 .98 +REFL 0.5 +REFR 0 +REFRIOR 0 +EMITTANCE 0 + +// Specular white 3 +MATERIAL 6 +RGB .98 .98 .98 +SPECEX 0 +SPECRGB .98 .98 .98 +REFL 1 +REFR 1 +REFRIOR 1.5 +EMITTANCE 0 +REFRRGB .98 .98 .98 + +// Camera +CAMERA +RES 800 800 +FOVY 45 +ITERATIONS 10 +DEPTH 8 +FILE cornell +EYE 0.0 5 10.5 +LOOKAT 0 5 0 +UP 0 1 0 +LENS 0.025 +FOCAL 10 + + +// 
Ceiling light +OBJECT 0 +cube +material 0 +TRANS 0 10 0 +ROTAT 0 0 0 +SCALE 3 .3 3 + +// Floor +OBJECT 1 +cube +material 1 +TRANS 0 0 0 +ROTAT 0 0 0 +SCALE 10 .01 10 + +// Ceiling +OBJECT 2 +cube +material 1 +TRANS 0 10 0 +ROTAT 0 0 90 +SCALE .01 10 10 + +// Back wall +OBJECT 3 +cube +material 1 +TRANS 0 5 -5 +ROTAT 0 90 0 +SCALE .01 10 10 + +// Left wall +OBJECT 4 +cube +material 2 +TRANS -5 5 0 +ROTAT 0 0 0 +SCALE .01 10 10 + +// Right wall +OBJECT 5 +cube +material 3 +TRANS 5 5 0 +ROTAT 0 0 0 +SCALE .01 10 10 + +// Sphere 1 +OBJECT 6 +sphere +material 4 +TRANS -3 2 -1 +ROTAT 0 0 0 +SCALE 3 3 3 + +// Sphere 2 +OBJECT 7 +sphere +material 5 +TRANS 0 4 -1 +ROTAT 0 0 0 +SCALE 3 3 3 diff --git a/src/denoise.h b/src/denoise.h new file mode 100644 index 00000000..45dcbb04 --- /dev/null +++ b/src/denoise.h @@ -0,0 +1,3 @@ +#pragma once + + diff --git a/src/interactions.h b/src/interactions.h index 144a9f5b..d0744a08 100644 --- a/src/interactions.h +++ b/src/interactions.h @@ -2,30 +2,61 @@ #include "intersections.h" -/** - * Computes a cosine-weighted random direction in a hemisphere. - * Used for diffuse lighting. - */ __host__ __device__ -glm::vec3 calculateRandomDirectionInHemisphere( - glm::vec3 normal, thrust::default_random_engine &rng) { - thrust::uniform_real_distribution u01(0, 1); - - float up = sqrt(u01(rng)); // cos(theta) - float over = sqrt(1 - up * up); // sin(theta) - float around = u01(rng) * TWO_PI; +glm::vec3 squareToDiskUniform(const glm::vec2& sample) +{ + float phi, r, u, v; + r = sqrt(sample.x); + phi = 2 * PI * sample.y; + u = r * cos(phi); + v = r * sin(phi); + return glm::vec3(u, v, 0); +} - // Find a direction that is not the normal based off of whether or not the - // normal's components are all equal to sqrt(1/3) or whether or not at - // least one component is less than sqrt(1/3). Learned this trick from - // Peter Kutz. +__host__ __device__ +glm::vec3 squareToDiskConcentric(const glm::vec2& sample) +{ + float phi, r, u, v; + float a = 2 * sample.x - 1; + float b = 2 * sample.y - 1; + if (a > -b) { // region 1 or 2 + if (a > b) {// region 1, also |a| > |b| + r = a; + phi = (PI / 4) * (b / a); + } + else {// region 2, also |b| > |a| + r = b; + phi = (PI / 4) * (2 - (a / b)); + } + } + else {// region 3 or 4 + if (a < b) { // region 3, also |a| >= |b|, a != 0 + r = -a; + phi = (PI / 4) * (4 + (b / a)); + } + else {// region 4, |b| >= |a|, but a==0 and b==0 could occur. + r = -b; + if (b != 0) + phi = (PI / 4) * (6 - (a / b)); + else + phi = 0; + } + } + u = r * cos(phi); + v = r * sin(phi); + return glm::vec3(u, v, 0); +} +__host__ __device__ +glm::vec3 localToWorldWithNormal(glm::vec3 pos, glm::vec3 normal) { glm::vec3 directionNotNormal; if (abs(normal.x) < SQRT_OF_ONE_THIRD) { directionNotNormal = glm::vec3(1, 0, 0); - } else if (abs(normal.y) < SQRT_OF_ONE_THIRD) { + } + else if (abs(normal.y) < SQRT_OF_ONE_THIRD) { directionNotNormal = glm::vec3(0, 1, 0); - } else { + } + else { directionNotNormal = glm::vec3(0, 0, 1); } @@ -35,28 +66,234 @@ glm::vec3 calculateRandomDirectionInHemisphere( glm::vec3 perpendicularDirection2 = glm::normalize(glm::cross(normal, perpendicularDirection1)); - return up * normal - + cos(around) * over * perpendicularDirection1 - + sin(around) * over * perpendicularDirection2; + return pos.x * perpendicularDirection1 + pos.y * perpendicularDirection2 + pos.z * normal; } +// CHECKITOUT /** - * Simple ray scattering with diffuse and perfect specular support. + * Computes a cosine-weighted random direction in a hemisphere. 
+ * Used for diffuse lighting. + */ +__host__ __device__ +glm::vec3 calculateRandomDirectionInHemisphere( + glm::vec3 normal, thrust::default_random_engine& rng, float& pdf) { + thrust::uniform_real_distribution u01(0, 1); + + glm::vec3 pos = squareToDiskConcentric(glm::vec2(u01(rng), u01(rng))); + pos.z = sqrt(1 - pos.x * pos.x - pos.y * pos.y); + pdf = pos.z * INV_PI; + return localToWorldWithNormal(pos, normal); + + //float up = sqrt(u01(rng)); // cos(theta) + //float over = sqrt(1 - up * up); // sin(theta) + //float around = u01(rng) * TWO_PI; + + //// Find a direction that is not the normal based off of whether or not the + //// normal's components are all equal to sqrt(1/3) or whether or not at + //// least one component is less than sqrt(1/3). Learned this trick from + //// Peter Kutz. + + //glm::vec3 directionNotNormal; + //if (abs(normal.x) < SQRT_OF_ONE_THIRD) { + // directionNotNormal = glm::vec3(1, 0, 0); + //} else if (abs(normal.y) < SQRT_OF_ONE_THIRD) { + // directionNotNormal = glm::vec3(0, 1, 0); + //} else { + // directionNotNormal = glm::vec3(0, 0, 1); + //} + + //// Use not-normal direction to generate two perpendicular directions + //glm::vec3 perpendicularDirection1 = + // glm::normalize(glm::cross(normal, directionNotNormal)); + //glm::vec3 perpendicularDirection2 = + // glm::normalize(glm::cross(normal, perpendicularDirection1)); + + //pdf = up * INV_PI; + //return up * normal + // + cos(around) * over * perpendicularDirection1 + // + sin(around) * over * perpendicularDirection2; +} + +__host__ __device__ +glm::vec3 calculateRandomDirectionInSpecularLobe( + glm::vec3 wiCenter, float specex, thrust::default_random_engine& rng, float& pdf) { + thrust::uniform_real_distribution u01(0, 1); + + float up = powf(u01(rng), 1.f / (specex + 1.f)); // cos(alpha) + float over = sqrt(1.f - up * up); // sin(alpha) + float around = u01(rng) * TWO_PI; + + pdf = (specex + 1) * powf(up, specex) * over / TWO_PI; + return localToWorldWithNormal(glm::vec3(cos(around) * over, sin(around) * over, up), wiCenter); + +} + +__host__ __device__ +float FrDielectric(float cosThetaI, float etaI, float etaT) +{ + cosThetaI = glm::clamp(cosThetaI, -1.f, 1.f); + if (cosThetaI <= 0.f) + { + float tmp = etaI; + etaI = etaT; + etaT = tmp; + cosThetaI = abs(cosThetaI); + } + + float sinThetaI = sqrt(glm::max((float)0, 1 - cosThetaI * cosThetaI)); + float sinThetaT = etaI / etaT * sinThetaI; + if (sinThetaT >= 1) { + return 1.f; + } + + float cosThetaT = sqrt(glm::max((float)0, 1 - sinThetaT * sinThetaT)); + float Rparl = ((etaT * cosThetaI) - (etaI * cosThetaT)) / ((etaT * cosThetaI) + (etaI * cosThetaT)); + float Rperp = ((etaI * cosThetaI) - (etaT * cosThetaT)) / ((etaI * cosThetaI) + (etaT * cosThetaT)); + return (Rparl * Rparl + Rperp * Rperp) / 2; +} + +__host__ __device__ +bool Refract(const glm::vec3& wi, const glm::vec3& n, float eta, + glm::vec3* wt) { + // Compute cos theta using Snell's law + float cosThetaI = glm::dot(n, wi); + float sin2ThetaI = glm::max(float(0), float(1 - cosThetaI * cosThetaI)); + float sin2ThetaT = eta * eta * sin2ThetaI; + + // Handle total internal reflection for transmission + if (sin2ThetaT >= 1) return false; + float cosThetaT = sqrt(1 - sin2ThetaT); + *wt = eta * -wi + (eta * cosThetaI - cosThetaT) * glm::vec3(n); + return true; +} + +/** + * Scatter a ray with some probabilities according to the material properties. + * For example, a diffuse surface scatters in a cosine-weighted hemisphere. + * A perfect specular surface scatters in the reflected ray direction. 
+ * In order to apply multiple effects to one surface, probabilistically choose + * between them. + * + * The visual effect you want is to straight-up add the diffuse and specular + * components. You can do this in a few ways. This logic also applies to + * combining other types of materias (such as refractive). + * + * - Always take an even (50/50) split between a each effect (a diffuse bounce + * and a specular bounce), but divide the resulting color of either branch + * by its probability (0.5), to counteract the chance (0.5) of the branch + * being taken. + * - This way is inefficient, but serves as a good starting point - it + * converges slowly, especially for pure-diffuse or pure-specular. + * - Pick the split based on the intensity of each material color, and divide + * branch result by that branch's probability (whatever probability you use). + * + * This method applies its changes to the Ray parameter `ray` in place. + * It also modifies the color `color` of the ray in place. + * + * You may need to change the parameter list for your purposes! */ __host__ __device__ void scatterRay( - PathSegment & pathSegment, - glm::vec3 intersect, - glm::vec3 normal, - const Material &m, - thrust::default_random_engine &rng) { - glm::vec3 newDirection; - if (m.hasReflective) { - newDirection = glm::reflect(pathSegment.ray.direction, normal); - } else { - newDirection = calculateRandomDirectionInHemisphere(normal, rng); - } - - pathSegment.ray.direction = newDirection; - pathSegment.ray.origin = intersect + (newDirection * 0.0001f); + PathSegment& pathSegment, + glm::vec3 intersect, + glm::vec3 normal, + const Material& m, + thrust::default_random_engine& rng) { + // TODO: implement this. + // A basic implementation of pure-diffuse shading will just call the + // calculateRandomDirectionInHemisphere defined above. 
+ /*if (pathSegment.remainingBounces < 0) { + int a = pathSegment.remainingBounces; + return; + } */ + //todo + + + glm::vec3 scatterDir; + float pdf = 0.f; + glm::vec3 color(0.f); + + if (!m.hasReflective && !m.hasRefractive) { //pure diffuse + scatterDir = calculateRandomDirectionInHemisphere(normal, rng, pdf); + float cosine = glm::dot(normal, scatterDir); + color = glm::max(cosine, 0.f) * m.color * INV_PI; + } + else if (m.hasReflective > 0 && m.hasReflective < 1) { //imperfect reflection + thrust::uniform_real_distribution u01(0, 1); + float randNum = u01(rng); + float frac = m.hasReflective; + + if (randNum < frac) { + scatterDir = calculateRandomDirectionInHemisphere(normal, rng, pdf); + float cosine = glm::dot(normal, scatterDir); + color = glm::max(cosine, 0.f) * m.color * INV_PI * frac; + pdf *= frac; + } + else { + glm::vec3 wiCenter = glm::reflect(pathSegment.ray.direction, normal); + scatterDir = calculateRandomDirectionInSpecularLobe(wiCenter, m.specular.exponent, rng, pdf); + if (glm::dot(normal, scatterDir) <= 0) { + pdf = 0; + } + else { + float cosRI = glm::dot(scatterDir, wiCenter); + color = m.specular.color * powf(cosRI, m.specular.exponent) * INV_PI * (1 - frac); + pdf *= (1 - frac); + } + } + } + else if (m.hasReflective == 1 && !m.hasRefractive) { //perfect reflection + scatterDir = glm::reflect(pathSegment.ray.direction, normal); + pdf = 1; + color = m.specular.color; + } + else if (m.hasReflective == 1 && m.hasRefractive == 1) { //reflection and refraction, like glass + thrust::uniform_real_distribution u01(0, 1); + float randNum = u01(rng); + + if (randNum < 0.5) { + scatterDir = glm::reflect(pathSegment.ray.direction, normal); + pdf = 0.5; + float cosine = glm::dot(scatterDir, normal); + float fresnel = FrDielectric(cosine, 1, m.indexOfRefraction); + color = fresnel * m.specular.color; + } + else { + float eta = m.indexOfRefraction; + glm::vec3 trueNormal = normal; + if (glm::dot(pathSegment.ray.direction, normal) > 0) { + eta = 1.f / eta; + trueNormal = -normal; + } + + glm::vec3 refractDir; + bool fullReflect = !Refract(-pathSegment.ray.direction, trueNormal, 1.f / eta, &refractDir); + if (fullReflect) { + scatterDir = glm::reflect(pathSegment.ray.direction, trueNormal); + pdf = 0.5; + float cosine = glm::dot(scatterDir, trueNormal); + float fresnel = FrDielectric(cosine, 1, eta); + color = fresnel * m.specular.color; + } + else { + scatterDir = refractDir; + pdf = 0.5; + float cosine = glm::dot(scatterDir, trueNormal); + float fresnel = (1 - FrDielectric(cosine, 1, eta)); + color = fresnel * m.refractionColor; + } + } + } + + if (pdf < 0.01f) { + pathSegment.color = glm::vec3(0.f); + } + else { + pathSegment.color *= color / pdf; + } + pathSegment.ray.direction = scatterDir; + pathSegment.ray.origin = intersect + scatterDir * 0.01f; + pathSegment.remainingBounces--; + + } diff --git a/src/main.cpp b/src/main.cpp index 4092ae4a..63508b92 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -23,12 +23,13 @@ int ui_iterations = 0; int startupIterations = 0; int lastLoopIterations = 0; bool ui_showGbuffer = false; -bool ui_denoise = false; -int ui_filterSize = 80; +int ui_denoise = 0; +int ui_filterSize = 30; float ui_colorWeight = 0.45f; float ui_normalWeight = 0.35f; float ui_positionWeight = 0.2f; bool ui_saveAndExit = false; +bool ui_regenerate = false; static bool camchanged = true; static float dtheta = 0, dphi = 0; @@ -112,7 +113,7 @@ void saveImage() { std::string filename = renderState->imageName; std::ostringstream ss; - ss << filename << "." 
<< startTimeString << "." << samples << "samp"; + ss << filename << "." << currentTimeString() << "." << samples << "samp"; filename = ss.str(); // CHECKITOUT @@ -146,6 +147,11 @@ void runCuda() { camchanged = false; } + if (ui_regenerate) { + iteration = 0; + ui_regenerate = false; + } + // Map OpenGL buffer object for writing from CUDA on a single GPU // No data is moved (Win & Linux). When mapped to CUDA, OpenGL should not use this buffer @@ -162,7 +168,7 @@ void runCuda() { // execute the kernel int frame = 0; - pathtrace(frame, iteration); + pathtrace(frame, iteration, {ui_denoise, ui_filterSize, ui_colorWeight, ui_normalWeight, ui_positionWeight }); } if (ui_showGbuffer) { @@ -176,9 +182,10 @@ void runCuda() { if (ui_saveAndExit) { saveImage(); - pathtraceFree(); + ui_saveAndExit = false; + /*pathtraceFree(); cudaDeviceReset(); - exit(EXIT_SUCCESS); + exit(EXIT_SUCCESS);*/ } } diff --git a/src/main.h b/src/main.h index 06d311a8..aa2b0346 100644 --- a/src/main.h +++ b/src/main.h @@ -35,12 +35,13 @@ extern int height; extern int ui_iterations; extern int startupIterations; extern bool ui_showGbuffer; -extern bool ui_denoise; +extern int ui_denoise; extern int ui_filterSize; extern float ui_colorWeight; extern float ui_normalWeight; extern float ui_positionWeight; extern bool ui_saveAndExit; +extern bool ui_regenerate; void runCuda(); void keyCallback(GLFWwindow *window, int key, int scancode, int action, int mods); diff --git a/src/pathtrace.cu b/src/pathtrace.cu index 23e5f909..aa635bbb 100644 --- a/src/pathtrace.cu +++ b/src/pathtrace.cu @@ -4,6 +4,10 @@ #include #include #include +#include +#include +#include +#include #include "sceneStructs.h" #include "scene.h" @@ -13,6 +17,15 @@ #include "pathtrace.h" #include "intersections.h" #include "interactions.h" +using namespace std::chrono; + +//option +#define CACHE_FIRST_INTERSECTION 1 +#define MATERIAL_CONTIGUOUS 0 +#define ANTIALIASING 1 +#define DEPTH_OF_FIELD 0 + +#define ENABLE_CACHE_FIRST_INTERSECTION (CACHE_FIRST_INTERSECTION && !ANTIALIASING && !DEPTH_OF_FIELD) #define ERRORCHECK 1 @@ -44,6 +57,17 @@ thrust::default_random_engine makeSeededRandomEngine(int iter, int index, int de return thrust::default_random_engine(h); } +//for stream compaction +struct shouldContinue +{ + __host__ __device__ + bool operator()(const PathSegment x) + { + bool stop = x.remainingBounces <= 0 || (x.color.r < EPSILON&& x.color.b < EPSILON&& x.color.g < EPSILON); + return !stop; + } +}; + //Kernel that writes the image to the OpenGL PBO directly. 
__global__ void sendImageToPBO(uchar4* pbo, glm::ivec2 resolution, int iter, glm::vec3* image) { @@ -67,19 +91,35 @@ __global__ void sendImageToPBO(uchar4* pbo, glm::ivec2 resolution, } } -__global__ void gbufferToPBO(uchar4* pbo, glm::ivec2 resolution, GBufferPixel* gBuffer) { +__global__ void gbufferToPBO(uchar4* pbo, glm::ivec2 resolution, GBufferPixel* gBuffer, int type) { int x = (blockIdx.x * blockDim.x) + threadIdx.x; int y = (blockIdx.y * blockDim.y) + threadIdx.y; - if (x < resolution.x && y < resolution.y) { - int index = x + (y * resolution.x); - float timeToIntersect = gBuffer[index].t * 256.0; + if (x < resolution.x && y < resolution.y) { + int index = x + (y * resolution.x); - pbo[index].w = 0; - pbo[index].x = timeToIntersect; - pbo[index].y = timeToIntersect; - pbo[index].z = timeToIntersect; - } + if (type == 0) { + pbo[index].w = 0; + pbo[index].x = gBuffer[index].normal.x * 128 + 128; + pbo[index].y = gBuffer[index].normal.y * 128 + 128; + pbo[index].z = gBuffer[index].normal.z * 128 + 128; + } + else if (type == 1) { + float timeToIntersect = gBuffer[index].t * 256.0; + + pbo[index].w = 0; + pbo[index].x = timeToIntersect; + pbo[index].y = timeToIntersect; + pbo[index].z = timeToIntersect; + } + else if (type == 2) { + glm::vec3 scalePos = gBuffer[index].position / 15.f * 255.f; + pbo[index].w = 0; + pbo[index].x = scalePos.x; + pbo[index].y = scalePos.y; + pbo[index].z = scalePos.z; + } + } } static Scene * hst_scene = NULL; @@ -91,6 +131,11 @@ static ShadeableIntersection * dev_intersections = NULL; static GBufferPixel* dev_gBuffer = NULL; // TODO: static variables for device memory, any extra info you need, etc // ... +#if ENABLE_CACHE_FIRST_INTERSECTION +static ShadeableIntersection* dev_cacheIntersections = NULL; +#endif // ENABLE_CACHE_FIRST_INTERSECTION +static glm::vec3* dev_denoiseBuffer = NULL; +static float* dev_denoiseKernel = NULL; void pathtraceInit(Scene *scene) { hst_scene = scene; @@ -114,6 +159,22 @@ void pathtraceInit(Scene *scene) { cudaMalloc(&dev_gBuffer, pixelcount * sizeof(GBufferPixel)); // TODO: initialize any extra device memeory you need +#if ENABLE_CACHE_FIRST_INTERSECTION + cudaMalloc(&dev_cacheIntersections, pixelcount * sizeof(ShadeableIntersection)); + cudaMemset(dev_cacheIntersections, 0, pixelcount * sizeof(ShadeableIntersection)); +#endif // ENABLE_CACHE_FIRST_INTERSECTION + + cudaMalloc(&dev_denoiseBuffer, pixelcount * sizeof(glm::vec3)); + cudaMemset(dev_denoiseBuffer, 0, pixelcount * sizeof(glm::vec3)); + cudaMalloc(&dev_denoiseKernel, 25 * sizeof(float)); + float tmpDenoiseKernel1D[] = { 1.f / 16.f, 1.f / 4.f, 3.f / 8.f, 1.f / 4.f, 1.f / 16.f }; + float tmpDenoiseKernel2D[5][5]; + for (size_t i = 0; i < 5; ++i) { + for (size_t j = 0; j < 5; ++j) { + tmpDenoiseKernel2D[i][j] = tmpDenoiseKernel1D[i] * tmpDenoiseKernel1D[j]; + } + } + cudaMemcpy(dev_denoiseKernel, tmpDenoiseKernel2D, sizeof(float) * 5 * 5, cudaMemcpyHostToDevice); checkCUDAError("pathtraceInit"); } @@ -126,6 +187,11 @@ void pathtraceFree() { cudaFree(dev_intersections); cudaFree(dev_gBuffer); // TODO: clean up any extra device memory you created +#if ENABLE_CACHE_FIRST_INTERSECTION + cudaFree(dev_cacheIntersections); +#endif // ENABLE_CACHE_FIRST_INTERSECTION + cudaFree(dev_denoiseBuffer); + cudaFree(dev_denoiseKernel); checkCUDAError("pathtraceFree"); } @@ -140,26 +206,52 @@ void pathtraceFree() { */ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, PathSegment* pathSegments) { - int x = (blockIdx.x * blockDim.x) + threadIdx.x; - int y = 
(blockIdx.y * blockDim.y) + threadIdx.y; + int x = (blockIdx.x * blockDim.x) + threadIdx.x; + int y = (blockIdx.y * blockDim.y) + threadIdx.y; - if (x < cam.resolution.x && y < cam.resolution.y) { - int index = x + (y * cam.resolution.x); - PathSegment & segment = pathSegments[index]; - segment.ray.origin = cam.position; - segment.color = glm::vec3(1.0f, 1.0f, 1.0f); + if (x < cam.resolution.x && y < cam.resolution.y) { + int index = x + (y * cam.resolution.x); + PathSegment& segment = pathSegments[index]; - segment.ray.direction = glm::normalize(cam.view - - cam.right * cam.pixelLength.x * ((float)x - (float)cam.resolution.x * 0.5f) - - cam.up * cam.pixelLength.y * ((float)y - (float)cam.resolution.y * 0.5f) - ); + thrust::default_random_engine rng = makeSeededRandomEngine(iter, cam.resolution.y * cam.resolution.x - index, 0); + thrust::uniform_real_distribution u01(0, 1); - segment.pixelIndex = index; - segment.remainingBounces = traceDepth; - } + segment.ray.origin = cam.position; + segment.color = glm::vec3(1.0f, 1.0f, 1.0f); + float pixelx = x, pixely = y; + glm::mat4 cameraToWorld(glm::vec4(cam.right, 0), glm::vec4(cam.up, 0), glm::vec4(cam.view, 0), glm::vec4(cam.position, 1)); + +#if ANTIALIASING + pixelx = x + u01(rng) - 0.5; + pixely = y + u01(rng) - 0.5; +#endif + +#if DEPTH_OF_FIELD + float phi, r, u, v; + r = sqrt(u01(rng)); + phi = TWO_PI * u01(rng); + u = r * cos(phi); + v = r * sin(phi); + glm::vec3 pLens = cam.lensRadius * glm::vec3(u, v, 0); + glm::vec3 pPixel = glm::vec3(-cam.pixelLength.x * (pixelx - (float)cam.resolution.x * 0.5f), -cam.pixelLength.y * (pixely - (float)cam.resolution.y * 0.5f), 1); + glm::vec3 pFocus = cam.focalDistance * pPixel; + segment.ray.origin = glm::vec3(cameraToWorld * glm::vec4(pLens, 1)); + segment.ray.direction = glm::normalize(glm::mat3(cameraToWorld) * (pFocus - pLens)); +#else + glm::vec3 pPixel = glm::vec3(-cam.pixelLength.x * (pixelx - (float)cam.resolution.x * 0.5f), -cam.pixelLength.y * (pixely - (float)cam.resolution.y * 0.5f), 1); + segment.ray.direction = glm::mat3(cameraToWorld) * pPixel; + /*segment.ray.direction = glm::normalize(cam.view + - cam.right * cam.pixelLength.x * (pixelx - (float)cam.resolution.x * 0.5f) + - cam.up * cam.pixelLength.y * (pixely - (float)cam.resolution.y * 0.5f) + );*/ +#endif + segment.pixelIndex = index; + segment.remainingBounces = traceDepth; + } } + __global__ void computeIntersections( int depth , int num_paths @@ -231,46 +323,51 @@ __global__ void shadeSimpleMaterials ( , ShadeableIntersection * shadeableIntersections , PathSegment * pathSegments , Material * materials + , int depth + , GBufferPixel* gbuffer ) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < num_paths) - { - ShadeableIntersection intersection = shadeableIntersections[idx]; - PathSegment segment = pathSegments[idx]; - if (segment.remainingBounces == 0) { - return; - } + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_paths) + { + ShadeableIntersection intersection = shadeableIntersections[idx]; + if (intersection.t > 0.0f) { // if the intersection exists... + // Set up the RNG + // LOOK: this is how you use thrust's RNG! Please look at + // makeSeededRandomEngine as well. 
+ thrust::default_random_engine rng = makeSeededRandomEngine(iter, idx, 0); + thrust::uniform_real_distribution u01(0, 1); + + Material material = materials[intersection.materialId]; + glm::vec3 materialColor = material.color; + if (depth == 1) { + gbuffer[idx].color = materialColor; + } - if (intersection.t > 0.0f) { // if the intersection exists... - segment.remainingBounces--; - // Set up the RNG - thrust::default_random_engine rng = makeSeededRandomEngine(iter, idx, segment.remainingBounces); - - Material material = materials[intersection.materialId]; - glm::vec3 materialColor = material.color; - - // If the material indicates that the object was a light, "light" the ray - if (material.emittance > 0.0f) { - segment.color *= (materialColor * material.emittance); - segment.remainingBounces = 0; - } - else { - segment.color *= materialColor; - glm::vec3 intersectPos = intersection.t * segment.ray.direction + segment.ray.origin; - scatterRay(segment, intersectPos, intersection.surfaceNormal, material, rng); - } - // If there was no intersection, color the ray black. - // Lots of renderers use 4 channel color, RGBA, where A = alpha, often - // used for opacity, in which case they can indicate "no opacity". - // This can be useful for post-processing and image compositing. - } else { - segment.color = glm::vec3(0.0f); - segment.remainingBounces = 0; + // If the material indicates that the object was a light, "light" the ray + if (material.emittance > 0.0f) { + pathSegments[idx].color *= (materialColor * material.emittance); + pathSegments[idx].remainingBounces = 0; //stop when hit light source + } + // Otherwise, do some pseudo-lighting computation. This is actually more + // like what you would expect from shading in a rasterizer like OpenGL. + // TODO: replace this! you should be able to start with basically a one-liner + else { + glm::vec3 isectPoint = getPointOnRay(pathSegments[idx].ray, intersection.t); + scatterRay(pathSegments[idx], isectPoint, intersection.surfaceNormal, material, rng); + //float lightTerm = glm::dot(intersection.surfaceNormal, glm::vec3(0.0f, 1.0f, 0.0f)); + //pathSegments[idx].color *= (materialColor * lightTerm) * 0.3f + ((1.0f - intersection.t * 0.02f) * materialColor) * 0.7f; + //pathSegments[idx].color *= u01(rng); // apply some noise because why not + } + // If there was no intersection, color the ray black. + // Lots of renderers use 4 channel color, RGBA, where A = alpha, often + // used for opacity, in which case they can indicate "no opacity". + // This can be useful for post-processing and image compositing. 
+ } + else { + pathSegments[idx].color = glm::vec3(0.0f); + } } - - pathSegments[idx] = segment; - } } __global__ void generateGBuffer ( @@ -282,9 +379,18 @@ __global__ void generateGBuffer ( if (idx < num_paths) { gBuffer[idx].t = shadeableIntersections[idx].t; + gBuffer[idx].normal = shadeableIntersections[idx].surfaceNormal; + gBuffer[idx].position = shadeableIntersections[idx].t * pathSegments[idx].ray.direction + pathSegments[idx].ray.origin; } } +__global__ void setGbufferColor(int pixelCount, glm::vec3* image, GBufferPixel* gbuffer) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < pixelCount) { + gbuffer[idx].color = image[idx]; + } +} + // Add the current iteration's output to the overall image __global__ void finalGather(int nPaths, glm::vec3 * image, PathSegment * iterationPaths) { @@ -297,56 +403,151 @@ __global__ void finalGather(int nPaths, glm::vec3 * image, PathSegment * iterati } } +__global__ void denoiseNaive(glm::vec3* image, glm::vec3* buffer, float* kernel, int pixelCount, glm::ivec2 resolution, int distance) { + int idxX = blockDim.x * blockIdx.x + threadIdx.x; + int idxY = blockDim.y * blockIdx.y + threadIdx.y; + int index = idxY * resolution.y + idxX; + if (idxX < resolution.x && idxY < resolution.y) { + float totalWeight = 0; + glm::vec3 color(0.f); + + for (int dy = -2; dy <= 2; ++dy) { + int y = idxY + dy * distance; + if (y < 0 || y >= resolution.y) { + continue; + } + + for (int dx = -2; dx <= 2; ++dx) { + int x = idxX + dx * distance; + if (x < 0 || x >= resolution.x) { + continue; + } + + float weight = kernel[(dy + 2) * 5 + dx + 2]; + color += image[y * resolution.y + x] * weight; + totalWeight += weight; + } + } + if (totalWeight == 0) { + color = glm::vec3(0.f); + } + else { + color /= totalWeight; + } + buffer[index] = color; + } +} + +__device__ float computeLogWeight(glm::vec3& v0, glm::vec3& v1, float sigma2) { + return sigma2 == 0 ? 
0 : -glm::length(v1 - v0) / sigma2; +} + +__global__ void denoiseEdgeAvoiding(GBufferPixel* gBuffer, DenoiseParm dParm, glm::vec3* image, glm::vec3* buffer, float* kernel, int pixelCount, glm::ivec2 resolution, int distance) { + int idxX = blockDim.x * blockIdx.x + threadIdx.x; + int idxY = blockDim.y * blockIdx.y + threadIdx.y; + int index = idxY * resolution.y + idxX; + if (idxX < resolution.x && idxY < resolution.y) { + float totalWeight = 0; + glm::vec3 color(0.f); + GBufferPixel p0 = gBuffer[index]; + + for (int dy = -2; dy <= 2; ++dy) { + int y = idxY + dy * distance; + if (y < 0 || y >= resolution.y) { + continue; + } + + for (int dx = -2; dx <= 2; ++dx) { + int x = idxX + dx * distance; + if (x < 0 || x >= resolution.x) { + continue; + } + GBufferPixel p1 = gBuffer[y * resolution.y + x]; + + float weight = kernel[(dy + 2) * 5 + dx + 2]; + float logColorWeight = computeLogWeight(p0.color, p1.color, dParm.colorWeight * dParm.colorWeight); + float logNormalWeight = computeLogWeight(p0.normal, p1.normal, dParm.normalWeight * dParm.normalWeight); + float logPosWeight = computeLogWeight(p0.position, p1.position, dParm.positionWeight * dParm.positionWeight); + weight *= __expf(logColorWeight + logNormalWeight + logPosWeight); + color += image[y * resolution.y + x] * weight; + totalWeight += weight; + } + } + if (totalWeight == 0) { + color = glm::vec3(0.f); + } + else { + color /= totalWeight; + } + buffer[index] = color; + } +} + +void denoise(int pixelCount, glm::ivec2 resolution, DenoiseParm dParm, dim3 blockSize2d, dim3 blocksPerGrid2d) { + for (int distance = 1; distance <= dParm.filterSize; distance *= 2) { + if (dParm.denoise == 1) { + denoiseNaive << > > (dev_image, dev_denoiseBuffer, dev_denoiseKernel, pixelCount, resolution, distance); + } + else { + denoiseEdgeAvoiding << > > (dev_gBuffer, dParm, dev_image, dev_denoiseBuffer, dev_denoiseKernel, pixelCount, resolution, distance); + } + glm::vec3* tmp = dev_image; + dev_image = dev_denoiseBuffer; + dev_denoiseBuffer = tmp; + } + +} + /** * Wrapper for the __global__ call that sets up the kernel calls and does a ton * of memory management */ -void pathtrace(int frame, int iter) { - const int traceDepth = hst_scene->state.traceDepth; - const Camera &cam = hst_scene->state.camera; - const int pixelcount = cam.resolution.x * cam.resolution.y; +void pathtrace(int frame, int iter, DenoiseParm dParm) { + const int traceDepth = hst_scene->state.traceDepth; + const Camera& cam = hst_scene->state.camera; + const int pixelcount = cam.resolution.x * cam.resolution.y; // 2D block for generating ray from camera - const dim3 blockSize2d(8, 8); - const dim3 blocksPerGrid2d( - (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x, - (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y); + const dim3 blockSize2d(8, 8); + const dim3 blocksPerGrid2d( + (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x, + (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y); // 1D block for path tracing const int blockSize1d = 128; - /////////////////////////////////////////////////////////////////////////// - - // Pathtracing Recap: - // * Initialize array of path rays (using rays that come out of the camera) - // * You can pass the Camera object to that kernel. - // * Each path ray must carry at minimum a (ray, color) pair, - // * where color starts as the multiplicative identity, white = (1, 1, 1). - // * This has already been done for you. 
- // * NEW: For the first depth, generate geometry buffers (gbuffers) - // * For each depth: - // * Compute an intersection in the scene for each path ray. - // A very naive version of this has been implemented for you, but feel - // free to add more primitives and/or a better algorithm. - // Currently, intersection distance is recorded as a parametric distance, - // t, or a "distance along the ray." t = -1.0 indicates no intersection. - // * Color is attenuated (multiplied) by reflections off of any object - // * Stream compact away all of the terminated paths. - // You may use either your implementation or `thrust::remove_if` or its - // cousins. - // * Note that you can't really use a 2D kernel launch any more - switch - // to 1D. - // * Shade the rays that intersected something or didn't bottom out. - // That is, color the ray by performing a color computation according - // to the shader, then generate a new ray to continue the ray path. - // We recommend just updating the ray's PathSegment in place. - // Note that this step may come before or after stream compaction, - // since some shaders you write may also cause a path to terminate. - // * Finally: - // * if not denoising, add this iteration's results to the image - // * TODO: if denoising, run kernels that take both the raw pathtraced result and the gbuffer, and put the result in the "pbo" from opengl - - generateRayFromCamera <<>>(cam, iter, traceDepth, dev_paths); + /////////////////////////////////////////////////////////////////////////// + + // Recap: + // * Initialize array of path rays (using rays that come out of the camera) + // * You can pass the Camera object to that kernel. + // * Each path ray must carry at minimum a (ray, color) pair, + // * where color starts as the multiplicative identity, white = (1, 1, 1). + // * This has already been done for you. + // * For each depth: + // * Compute an intersection in the scene for each path ray. + // A very naive version of this has been implemented for you, but feel + // free to add more primitives and/or a better algorithm. + // Currently, intersection distance is recorded as a parametric distance, + // t, or a "distance along the ray." t = -1.0 indicates no intersection. + // * Color is attenuated (multiplied) by reflections off of any object + // * TODO: Stream compact away all of the terminated paths. + // You may use either your implementation or `thrust::remove_if` or its + // cousins. + // * Note that you can't really use a 2D kernel launch any more - switch + // to 1D. + // * TODO: Shade the rays that intersected something or didn't bottom out. + // That is, color the ray by performing a color computation according + // to the shader, then generate a new ray to continue the ray path. + // We recommend just updating the ray's PathSegment in place. + // Note that this step may come before or after stream compaction, + // since some shaders you write may also cause a path to terminate. + // * Finally, add this iteration's results to the image. This has been done + // for you. 
+ + // TODO: perform one iteration of path tracing + + generateRayFromCamera << > > (cam, iter, traceDepth, dev_paths); checkCUDAError("generate camera ray"); int depth = 0; @@ -356,57 +557,134 @@ void pathtrace(int frame, int iter) { // --- PathSegment Tracing Stage --- // Shoot ray into scene, bounce between objects, push shading chunks - // Empty gbuffer - cudaMemset(dev_gBuffer, 0, pixelcount * sizeof(GBufferPixel)); + // Empty gbuffer + cudaMemset(dev_gBuffer, 0, pixelcount * sizeof(GBufferPixel)); - // clean shading chunks - cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection)); - - bool iterationComplete = false; + auto start = system_clock::now(); + bool iterationComplete = false; while (!iterationComplete) { - // tracing - dim3 numblocksPathSegmentTracing = (num_paths + blockSize1d - 1) / blockSize1d; - computeIntersections <<>> ( - depth - , num_paths - , dev_paths - , dev_geoms - , hst_scene->geoms.size() - , dev_intersections - ); - checkCUDAError("trace one bounce"); - cudaDeviceSynchronize(); - - if (depth == 0) { - generateGBuffer<<>>(num_paths, dev_intersections, dev_paths, dev_gBuffer); - } + // clean shading chunks + cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection)); + + dim3 numblocksPathSegmentTracing = (num_paths + blockSize1d - 1) / blockSize1d; + +#if ENABLE_CACHE_FIRST_INTERSECTION + if (depth == 0 && iter != 1) { + cudaMemcpy(dev_intersections, dev_cacheIntersections, sizeof(ShadeableIntersection) * num_paths, cudaMemcpyDeviceToDevice); + checkCUDAError("loadIntersections"); + } + else { + computeIntersections << > > ( + depth + , num_paths + , dev_paths + , dev_geoms + , hst_scene->geoms.size() + , dev_intersections + ); + checkCUDAError("trace one bounce"); + + if (depth == 0 && iter == 1) { + cudaMemcpy(dev_cacheIntersections, dev_intersections, sizeof(ShadeableIntersection) * num_paths, cudaMemcpyDeviceToDevice); + checkCUDAError("cacheIntersections"); + } + } + +#else + // tracing + computeIntersections << > > ( + depth + , num_paths + , dev_paths + , dev_geoms + , hst_scene->geoms.size() + , dev_intersections + ); + checkCUDAError("trace one bounce"); +#endif // CACHE_FIRST_INTERSECTION + + cudaDeviceSynchronize(); + if (depth == 0) { + generateGBuffer<<>>(num_paths, dev_intersections, dev_paths, dev_gBuffer); + } + depth++; + + //thrust ptr + thrust::device_ptr thrust_dev_paths(dev_paths); + thrust::device_ptr thrust_dev_intersection(dev_intersections); + + auto start = system_clock::now(); +#if MATERIAL_CONTIGUOUS + thrust::sort_by_key(thrust_dev_intersection, thrust_dev_intersection + num_paths, thrust_dev_paths); +#endif // MATERIAL_CONTIGUOUS + auto end = system_clock::now(); + if (iter == 10) { + //cout << "sort: " << duration_cast(end - start).count() << endl; + } + + // TODO: + // --- Shading Stage --- + // Shade path segments based on intersections and generate new rays by + // evaluating the BSDF. + // Start off with just a big kernel that handles all the different + // materials you have in the scenefile. + // TODO: compare between directly shading the path segments and shading + // path segments that have been reshuffled to be contiguous in memory. 
+ + start = system_clock::now(); + shadeSimpleMaterials << > > ( + iter, + num_paths, + dev_intersections, + dev_paths, + dev_materials, + depth, + dev_gBuffer + ); + end = system_clock::now(); + if (iter == 10) { + //cout << "shade: " << duration_cast(end - start).count() << endl; + } + + start = system_clock::now(); + thrust::device_ptr thrust_dev_paths_end = thrust::partition(thrust_dev_paths, thrust_dev_paths + num_paths, shouldContinue()); + end = system_clock::now(); + if (iter == 10) { + //cout << "partition: " << duration_cast(end - start).count() << endl; + } + dev_path_end = thrust_dev_paths_end.get(); + num_paths = dev_path_end - dev_paths; + iterationComplete = depth >= traceDepth || num_paths == 0; // TODO: should be based off stream compaction results. + + } + auto end = system_clock::now(); + cout << "shading: " << duration_cast(end - start).count() << endl; + + + // Assemble this iteration and apply it to the image + dim3 numBlocksPixels = (pixelcount + blockSize1d - 1) / blockSize1d; + finalGather << > > (pixelcount, dev_image, dev_paths); - depth++; + /////////////////////////////////////////////////////////////////////////// - shadeSimpleMaterials<<>> ( - iter, - num_paths, - dev_intersections, - dev_paths, - dev_materials - ); - iterationComplete = depth == traceDepth; + //setGbufferColor << > > (pixelcount, dev_image, dev_gBuffer); + start = system_clock::now(); + if (dParm.denoise) { + denoise(pixelcount, cam.resolution, dParm, blockSize2d, blocksPerGrid2d); } + end = system_clock::now(); + cout << "denoise: " << duration_cast(end - start).count() << endl; - // Assemble this iteration and apply it to the image - dim3 numBlocksPixels = (pixelcount + blockSize1d - 1) / blockSize1d; - finalGather<<>>(num_paths, dev_image, dev_paths); - /////////////////////////////////////////////////////////////////////////// // CHECKITOUT: use dev_image as reference if you want to implement saving denoised images. // Otherwise, screenshots are also acceptable. - // Retrieve image from GPU - cudaMemcpy(hst_scene->state.image.data(), dev_image, - pixelcount * sizeof(glm::vec3), cudaMemcpyDeviceToHost); + // Retrieve image from GPU + cudaMemcpy(hst_scene->state.image.data(), dev_image, + pixelcount * sizeof(glm::vec3), cudaMemcpyDeviceToHost); - checkCUDAError("pathtrace"); + checkCUDAError("pathtrace"); } // CHECKITOUT: this kernel "post-processes" the gbuffer/gbuffers into something that you can visualize for debugging. 
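Note on the `shading:`/`denoise:` timings printed above: they wrap asynchronous kernel launches with `std::chrono`, so without a device synchronization they mostly capture launch overhead. If exact GPU time for the denoise pass is wanted, CUDA event timing is the usual pattern; a minimal sketch (variable names illustrative) around the existing `denoise()` call:

```cuda
// Sketch: measure GPU time of the denoise pass with CUDA events.
cudaEvent_t beg, fin;
cudaEventCreate(&beg);
cudaEventCreate(&fin);
cudaEventRecord(beg);
denoise(pixelcount, cam.resolution, dParm, blockSize2d, blocksPerGrid2d);
cudaEventRecord(fin);
cudaEventSynchronize(fin);             // wait until all denoise kernels have finished
float elapsedMs = 0.f;
cudaEventElapsedTime(&elapsedMs, beg, fin);  // elapsed GPU time in milliseconds
cudaEventDestroy(beg);
cudaEventDestroy(fin);
```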
@@ -418,7 +696,7 @@ void showGBuffer(uchar4* pbo) { (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y); // CHECKITOUT: process the gbuffer results and send them to OpenGL buffer for visualization - gbufferToPBO<<>>(pbo, cam.resolution, dev_gBuffer); + gbufferToPBO<<>>(pbo, cam.resolution, dev_gBuffer, 2); } void showImage(uchar4* pbo, int iter) { diff --git a/src/pathtrace.h b/src/pathtrace.h index 9e12f440..6ee6a103 100644 --- a/src/pathtrace.h +++ b/src/pathtrace.h @@ -5,6 +5,6 @@ void pathtraceInit(Scene *scene); void pathtraceFree(); -void pathtrace(int frame, int iteration); +void pathtrace(int frame, int iteration, DenoiseParm dParm); void showGBuffer(uchar4 *pbo); void showImage(uchar4 *pbo, int iter); diff --git a/src/preview.cpp b/src/preview.cpp index 3ca27180..0674ec06 100644 --- a/src/preview.cpp +++ b/src/preview.cpp @@ -212,7 +212,11 @@ void drawGui(int windowWidth, int windowHeight) { ImGui::SliderInt("Iterations", &ui_iterations, 1, startupIterations); - ImGui::Checkbox("Denoise", &ui_denoise); + ImGui::RadioButton("No denoise", &ui_denoise, 0); + ImGui::SameLine(); + ImGui::RadioButton("Naive denoise", &ui_denoise, 1); + ImGui::SameLine(); + ImGui::RadioButton("Edge Avoiding denoise", &ui_denoise, 2); ImGui::SliderInt("Filter Size", &ui_filterSize, 0, 100); ImGui::SliderFloat("Color Weight", &ui_colorWeight, 0.0f, 10.0f); @@ -225,10 +229,12 @@ void drawGui(int windowWidth, int windowHeight) { ImGui::Separator(); - if (ImGui::Button("Save image and exit")) { + if (ImGui::Button("Save image")) { ui_saveAndExit = true; } - + if (ImGui::Button("Regenerate")) { + ui_regenerate = true; + } ImGui::End(); ImGui::Render(); diff --git a/src/sceneStructs.h b/src/sceneStructs.h index da7e558a..fe018514 100644 --- a/src/sceneStructs.h +++ b/src/sceneStructs.h @@ -38,6 +38,7 @@ struct Material { float hasRefractive; float indexOfRefraction; float emittance; + glm::vec3 refractionColor; }; struct Camera { @@ -70,6 +71,10 @@ struct PathSegment { // 1) color contribution computation // 2) BSDF evaluation: generate a new ray struct ShadeableIntersection { + __host__ __device__ bool operator<(const ShadeableIntersection& s) const { + return materialId < s.materialId; + } + float t; glm::vec3 surfaceNormal; int materialId; @@ -79,4 +84,15 @@ struct ShadeableIntersection { // What information might be helpful for guiding a denoising filter? struct GBufferPixel { float t; + glm::vec3 normal; + glm::vec3 position; + glm::vec3 color; }; + +struct DenoiseParm { + int denoise; + int filterSize; + float colorWeight; + float normalWeight; + float positionWeight; +}; \ No newline at end of file diff --git a/src/utilities.h b/src/utilities.h index abb4f27c..6562cdc8 100644 --- a/src/utilities.h +++ b/src/utilities.h @@ -13,6 +13,7 @@ #define TWO_PI 6.2831853071795864769252867665590057683943f #define SQRT_OF_ONE_THIRD 0.5773502691896257645091487805019574556476f #define EPSILON 0.00001f +#define INV_PI 0.31830988618379067154f namespace utilityCore { extern float clamp(float f, float min, float max);
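As a closing note on the `INV_PI` constant added to `utilities.h`: for a Lambertian bounce sampled with the cosine-weighted hemisphere routine above, the cosine and `INV_PI` terms cancel in the Monte Carlo estimator, which is what the diffuse branch of `scatterRay` effectively computes. A minimal sketch of that simplification (the helper name is illustrative, not part of the repo):

```cuda
// Sketch: throughput update for one cosine-weighted diffuse bounce.
// f = albedo * INV_PI (Lambertian BRDF), pdf = cosTheta * INV_PI, so
// f * cosTheta / pdf reduces to just the albedo when cosTheta > 0.
__device__ glm::vec3 diffuseThroughput(glm::vec3 albedo, glm::vec3 n, glm::vec3 wi) {
    float cosTheta = fmaxf(glm::dot(n, wi), 0.f);
    float pdf = cosTheta * INV_PI;
    glm::vec3 f = albedo * INV_PI;
    return pdf > 0.f ? f * cosTheta / pdf : glm::vec3(0.f);  // == albedo for valid samples
}
```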