From 64016384e55d9d6dd7bcfef23e947606466d0bd6 Mon Sep 17 00:00:00 2001
From: Shubham Sharma <sharmashubham9800@gmail.com>
Date: Tue, 19 Oct 2021 20:01:10 -0400
Subject: [PATCH 01/14] Add surface normals to the G-buffer and a normals debug view

---
 src/pathtrace.cu   | 21 +++++++++++++++++++--
 src/sceneStructs.h |  1 +
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/src/pathtrace.cu b/src/pathtrace.cu
index 23e5f90..ecb0258 100644
--- a/src/pathtrace.cu
+++ b/src/pathtrace.cu
@@ -73,7 +73,7 @@ __global__ void gbufferToPBO(uchar4* pbo, glm::ivec2 resolution, GBufferPixel* g
 
     if (x < resolution.x && y < resolution.y) {
         int index = x + (y * resolution.x);
-        float timeToIntersect = gBuffer[index].t * 256.0;
+        float timeToIntersect = gBuffer[index].t * 256.0f;
 
         pbo[index].w = 0;
         pbo[index].x = timeToIntersect;
@@ -82,6 +82,21 @@ __global__ void gbufferToPBO(uchar4* pbo, glm::ivec2 resolution, GBufferPixel* g
     }
 }
 
+__global__ void gbufferToPBO_Normals(uchar4* pbo, glm::ivec2 resolution, GBufferPixel* gBuffer) {
+    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+
+    if (x < resolution.x && y < resolution.y) {
+        int index = x + (y * resolution.x);
+
+        glm::vec3 normal = glm::normalize(gBuffer[index].normals) * 255.0f;
+        pbo[index].w = 0;
+        pbo[index].x = normal.x;
+        pbo[index].y = normal.y;
+        pbo[index].z = normal.z;
+    }
+}
+
 static Scene * hst_scene = NULL;
 static glm::vec3 * dev_image = NULL;
 static Geom * dev_geoms = NULL;
@@ -282,6 +297,7 @@ __global__ void generateGBuffer (
   if (idx < num_paths)
   {
     gBuffer[idx].t = shadeableIntersections[idx].t;
+    gBuffer[idx].normals = shadeableIntersections[idx].surfaceNormal;
   }
 }
 
@@ -418,7 +434,8 @@ void showGBuffer(uchar4* pbo) {
             (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
 
     // CHECKITOUT: process the gbuffer results and send them to OpenGL buffer for visualization
-    gbufferToPBO<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, dev_gBuffer);
+    //gbufferToPBO<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, dev_gBuffer);
+    gbufferToPBO_Normals<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, dev_gBuffer);
 }
 
 void showImage(uchar4* pbo, int iter) {
diff --git a/src/sceneStructs.h b/src/sceneStructs.h
index da7e558..ecedf09 100644
--- a/src/sceneStructs.h
+++ b/src/sceneStructs.h
@@ -79,4 +79,5 @@ struct ShadeableIntersection {
 // What information might be helpful for guiding a denoising filter?
 struct GBufferPixel {
   float t;
+  glm::vec3 normals;
 };

From 0fc7844d76ba36dd2538b26da53fb82813b2d17b Mon Sep 17 00:00:00 2001
From: Shubham Sharma <sharmashubham9800@gmail.com>
Date: Tue, 19 Oct 2021 21:01:58 -0400
Subject: [PATCH 02/14] Add hit position to the G-buffer and a position debug view

---
 src/pathtrace.cu   | 22 +++++++++++++++++++---
 src/sceneStructs.h |  3 ++-
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/src/pathtrace.cu b/src/pathtrace.cu
index ecb0258..087000a 100644
--- a/src/pathtrace.cu
+++ b/src/pathtrace.cu
@@ -89,7 +89,7 @@ __global__ void gbufferToPBO_Normals(uchar4* pbo, glm::ivec2 resolution, GBuffer
     if (x < resolution.x && y < resolution.y) {
         int index = x + (y * resolution.x);
 
-        glm::vec3 normal = glm::normalize(gBuffer[index].normals) * 255.0f;
+        glm::vec3 normal = gBuffer[index].normal * 256.0f;
         pbo[index].w = 0;
         pbo[index].x = normal.x;
         pbo[index].y = normal.y;
@@ -97,6 +97,21 @@ __global__ void gbufferToPBO_Normals(uchar4* pbo, glm::ivec2 resolution, GBuffer
     }
 }
 
+__global__ void gbufferToPBO_Position(uchar4* pbo, glm::ivec2 resolution, GBufferPixel* gBuffer) {
+    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+
+    if (x < resolution.x && y < resolution.y) {
+        int index = x + (y * resolution.x);
+
+        glm::vec3 position = glm::abs(gBuffer[index].position) * 20.0f;
+        pbo[index].w = 0;
+        pbo[index].x = position[0];
+        pbo[index].y = position[1];
+        pbo[index].z = position[2];
+    }
+}
+
 static Scene * hst_scene = NULL;
 static glm::vec3 * dev_image = NULL;
 static Geom * dev_geoms = NULL;
@@ -297,7 +312,8 @@ __global__ void generateGBuffer (
   if (idx < num_paths)
   {
     gBuffer[idx].t = shadeableIntersections[idx].t;
-    gBuffer[idx].normals = shadeableIntersections[idx].surfaceNormal;
+    gBuffer[idx].normal = shadeableIntersections[idx].surfaceNormal;
+    gBuffer[idx].position = getPointOnRay(pathSegments[idx].ray, shadeableIntersections[idx].t);
   }
 }
 
@@ -435,7 +451,7 @@ void showGBuffer(uchar4* pbo) {
 
     // CHECKITOUT: process the gbuffer results and send them to OpenGL buffer for visualization
     //gbufferToPBO<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, dev_gBuffer);
-    gbufferToPBO_Normals<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, dev_gBuffer);
+    gbufferToPBO_Position <<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, dev_gBuffer);
 }
 
 void showImage(uchar4* pbo, int iter) {
diff --git a/src/sceneStructs.h b/src/sceneStructs.h
index ecedf09..131f211 100644
--- a/src/sceneStructs.h
+++ b/src/sceneStructs.h
@@ -79,5 +79,6 @@ struct ShadeableIntersection {
 // What information might be helpful for guiding a denoising filter?
 struct GBufferPixel {
   float t;
-  glm::vec3 normals;
+  glm::vec3 normal;
+  glm::vec3 position;
 };

From cf49f402ae29db76ff45d9d6eb4c1d09fca27e82 Mon Sep 17 00:00:00 2001
From: Shubham Sharma <sharmashubham9800@gmail.com>
Date: Wed, 20 Oct 2021 00:57:00 -0400
Subject: [PATCH 03/14] Add A-Trous filter scaffolding and reindent pathtrace.cu with tabs

---
 src/main.cpp       |   1 +
 src/pathtrace.cu   | 802 ++++++++++++++++++++++++++-------------------
 src/sceneStructs.h |   1 +
 3 files changed, 464 insertions(+), 340 deletions(-)

diff --git a/src/main.cpp b/src/main.cpp
index 4092ae4..1654770 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -45,6 +45,7 @@ int iteration;
 int width;
 int height;
 
+
 //-------------------------------
 //-------------MAIN--------------
 //-------------------------------
diff --git a/src/pathtrace.cu b/src/pathtrace.cu
index 087000a..2ae338c 100644
--- a/src/pathtrace.cu
+++ b/src/pathtrace.cu
@@ -18,146 +18,210 @@
 
 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
 #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
-void checkCUDAErrorFn(const char *msg, const char *file, int line) {
+void checkCUDAErrorFn(const char* msg, const char* file, int line) {
 #if ERRORCHECK
-    cudaDeviceSynchronize();
-    cudaError_t err = cudaGetLastError();
-    if (cudaSuccess == err) {
-        return;
-    }
-
-    fprintf(stderr, "CUDA error");
-    if (file) {
-        fprintf(stderr, " (%s:%d)", file, line);
-    }
-    fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err));
+	cudaDeviceSynchronize();
+	cudaError_t err = cudaGetLastError();
+	if (cudaSuccess == err) {
+		return;
+	}
+
+	fprintf(stderr, "CUDA error");
+	if (file) {
+		fprintf(stderr, " (%s:%d)", file, line);
+	}
+	fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err));
 #  ifdef _WIN32
-    getchar();
+	getchar();
 #  endif
-    exit(EXIT_FAILURE);
+	exit(EXIT_FAILURE);
 #endif
 }
 
+
+float gaussianKernel[25] = { 0.003765, 0.015019, 0.023792, 0.015019, 0.003765,
+0.015019, 0.059912, 0.094907, 0.059912, 0.015019,
+0.023792, 0.094907, 0.150342, 0.094907, 0.023792,
+0.015019, 0.059912, 0.094907, 0.059912, 0.015019,
+0.003765, 0.015019, 0.023792, 0.015019, 0.003765, };
+
+
+glm::vec2 offsetKernel[25];
+
 __host__ __device__
 thrust::default_random_engine makeSeededRandomEngine(int iter, int index, int depth) {
-    int h = utilhash((1 << 31) | (depth << 22) | iter) ^ utilhash(index);
-    return thrust::default_random_engine(h);
+	int h = utilhash((1 << 31) | (depth << 22) | iter) ^ utilhash(index);
+	return thrust::default_random_engine(h);
 }
 
 //Kernel that writes the image to the OpenGL PBO directly.
 __global__ void sendImageToPBO(uchar4* pbo, glm::ivec2 resolution,
-        int iter, glm::vec3* image) {
-    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-
-    if (x < resolution.x && y < resolution.y) {
-        int index = x + (y * resolution.x);
-        glm::vec3 pix = image[index];
-
-        glm::ivec3 color;
-        color.x = glm::clamp((int) (pix.x / iter * 255.0), 0, 255);
-        color.y = glm::clamp((int) (pix.y / iter * 255.0), 0, 255);
-        color.z = glm::clamp((int) (pix.z / iter * 255.0), 0, 255);
-
-        // Each thread writes one pixel location in the texture (textel)
-        pbo[index].w = 0;
-        pbo[index].x = color.x;
-        pbo[index].y = color.y;
-        pbo[index].z = color.z;
-    }
+	int iter, glm::vec3* image) {
+	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+	int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+
+	if (x < resolution.x && y < resolution.y) {
+		int index = x + (y * resolution.x);
+		glm::vec3 pix = image[index];
+
+		glm::ivec3 color;
+		color.x = glm::clamp((int)((pix.x / iter) * 255.0), 0, 255);
+		color.y = glm::clamp((int)((pix.y / iter) * 255.0), 0, 255);
+		color.z = glm::clamp((int)((pix.z / iter) * 255.0), 0, 255);
+
+		// Each thread writes one pixel location in the texture (textel)
+		pbo[index].w = 0;
+		pbo[index].x = color.x;
+		pbo[index].y = color.y;
+		pbo[index].z = color.z;
+	}
 }
 
 __global__ void gbufferToPBO(uchar4* pbo, glm::ivec2 resolution, GBufferPixel* gBuffer) {
-    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-
-    if (x < resolution.x && y < resolution.y) {
-        int index = x + (y * resolution.x);
-        float timeToIntersect = gBuffer[index].t * 256.0f;
-
-        pbo[index].w = 0;
-        pbo[index].x = timeToIntersect;
-        pbo[index].y = timeToIntersect;
-        pbo[index].z = timeToIntersect;
-    }
+	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+	int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+
+	if (x < resolution.x && y < resolution.y) {
+		int index = x + (y * resolution.x);
+		float timeToIntersect = gBuffer[index].t * 256.0f;
+
+		pbo[index].w = 0;
+		pbo[index].x = timeToIntersect;
+		pbo[index].y = timeToIntersect;
+		pbo[index].z = timeToIntersect;
+	}
 }
 
 __global__ void gbufferToPBO_Normals(uchar4* pbo, glm::ivec2 resolution, GBufferPixel* gBuffer) {
-    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-
-    if (x < resolution.x && y < resolution.y) {
-        int index = x + (y * resolution.x);
-
-        glm::vec3 normal = gBuffer[index].normal * 256.0f;
-        pbo[index].w = 0;
-        pbo[index].x = normal.x;
-        pbo[index].y = normal.y;
-        pbo[index].z = normal.z;
-    }
+	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+	int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+
+	if (x < resolution.x && y < resolution.y) {
+		int index = x + (y * resolution.x);
+
+		glm::vec3 normal = gBuffer[index].normal * 255.0f;
+		pbo[index].w = 0;
+		pbo[index].x = normal.x;
+		pbo[index].y = normal.y;
+		pbo[index].z = normal.z;
+	}
 }
 
 __global__ void gbufferToPBO_Position(uchar4* pbo, glm::ivec2 resolution, GBufferPixel* gBuffer) {
-    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-    int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-
-    if (x < resolution.x && y < resolution.y) {
-        int index = x + (y * resolution.x);
-
-        glm::vec3 position = glm::abs(gBuffer[index].position) * 20.0f;
-        pbo[index].w = 0;
-        pbo[index].x = position[0];
-        pbo[index].y = position[1];
-        pbo[index].z = position[2];
-    }
+	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+	int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+
+	if (x < resolution.x && y < resolution.y) {
+		int index = x + (y * resolution.x);
+
+		glm::vec3 position = glm::abs(gBuffer[index].position) * 20.0f;
+		pbo[index].w = 0;
+		pbo[index].x = position[0];
+		pbo[index].y = position[1];
+		pbo[index].z = position[2];
+	}
+}
+
+__global__ void gbufferToPBO_Atrous(uchar4* pbo, glm::ivec2 resolution, GBufferPixel* gBuffer, glm::vec3* TrousImage) {
+	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+	int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+
+	if (x < resolution.x && y < resolution.y) {
+		int index = x + (y * resolution.x);
+
+
+		glm::vec3 pix = TrousImage[index];
+		glm::ivec3 color;
+
+
+		color.x = glm::clamp((int)(pix.x  * 255.0), 0, 255);
+		color.y = glm::clamp((int)(pix.y * 255.0), 0, 255);
+		color.z = glm::clamp((int)(pix.z * 255.0), 0, 255);
+;
+		pbo[index].w = 0;
+		pbo[index].x = color.x;
+		pbo[index].y = color.y;
+		pbo[index].z = color.z;
+	}
 }
 
-static Scene * hst_scene = NULL;
-static glm::vec3 * dev_image = NULL;
-static Geom * dev_geoms = NULL;
-static Material * dev_materials = NULL;
-static PathSegment * dev_paths = NULL;
-static ShadeableIntersection * dev_intersections = NULL;
+static Scene* hst_scene = NULL;
+static glm::vec3* dev_image = NULL;
+static Geom* dev_geoms = NULL;
+static Material* dev_materials = NULL;
+static PathSegment* dev_paths = NULL;
+static ShadeableIntersection* dev_intersections = NULL;
 static GBufferPixel* dev_gBuffer = NULL;
 // TODO: static variables for device memory, any extra info you need, etc
 // ...
+static float* dev_gausKernel = NULL;
+static glm::vec2* dev_offsetKernel = NULL;
+static glm::vec3* dev_TrousImage = NULL;
+static glm::vec3* dev_IntermediaryImage = NULL;
 
-void pathtraceInit(Scene *scene) {
-    hst_scene = scene;
-    const Camera &cam = hst_scene->state.camera;
-    const int pixelcount = cam.resolution.x * cam.resolution.y;
+void generateOffsetKern()
+{
+	int index = 0;
+	for (int y = -2; y <= 2; y++)
+	{
+		for (int x = -2; x <= 2; x++)
+		{
+			offsetKernel[index] = glm::vec2(x, y);
+			index++;
+		}
+	}
+}
+
+void pathtraceInit(Scene* scene) {
+	hst_scene = scene;
+	const Camera& cam = hst_scene->state.camera;
+	const int pixelcount = cam.resolution.x * cam.resolution.y;
 
-    cudaMalloc(&dev_image, pixelcount * sizeof(glm::vec3));
-    cudaMemset(dev_image, 0, pixelcount * sizeof(glm::vec3));
+	cudaMalloc(&dev_image, pixelcount * sizeof(glm::vec3));
+	cudaMemset(dev_image, 0, pixelcount * sizeof(glm::vec3));
 
-  	cudaMalloc(&dev_paths, pixelcount * sizeof(PathSegment));
+	cudaMalloc(&dev_paths, pixelcount * sizeof(PathSegment));
 
-  	cudaMalloc(&dev_geoms, scene->geoms.size() * sizeof(Geom));
-  	cudaMemcpy(dev_geoms, scene->geoms.data(), scene->geoms.size() * sizeof(Geom), cudaMemcpyHostToDevice);
+	cudaMalloc(&dev_geoms, scene->geoms.size() * sizeof(Geom));
+	cudaMemcpy(dev_geoms, scene->geoms.data(), scene->geoms.size() * sizeof(Geom), cudaMemcpyHostToDevice);
+
+	cudaMalloc(&dev_materials, scene->materials.size() * sizeof(Material));
+	cudaMemcpy(dev_materials, scene->materials.data(), scene->materials.size() * sizeof(Material), cudaMemcpyHostToDevice);
+
+	cudaMalloc(&dev_intersections, pixelcount * sizeof(ShadeableIntersection));
+	cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
 
-  	cudaMalloc(&dev_materials, scene->materials.size() * sizeof(Material));
-  	cudaMemcpy(dev_materials, scene->materials.data(), scene->materials.size() * sizeof(Material), cudaMemcpyHostToDevice);
+	cudaMalloc(&dev_gBuffer, pixelcount * sizeof(GBufferPixel));
 
-  	cudaMalloc(&dev_intersections, pixelcount * sizeof(ShadeableIntersection));
-  	cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
+	// TODO: initialize any extra device memeory you need
 
-    cudaMalloc(&dev_gBuffer, pixelcount * sizeof(GBufferPixel));
+	cudaMalloc(&dev_gausKernel, 25 * sizeof(float));
+	cudaMemcpy(dev_gausKernel, gaussianKernel, 25 * sizeof(float), cudaMemcpyHostToDevice);
 
-    // TODO: initialize any extra device memeory you need
+	generateOffsetKern();
+	cudaMalloc(&dev_offsetKernel, 25 * sizeof(glm::vec2));
+	cudaMemcpy(dev_offsetKernel, offsetKernel, 25 * sizeof(glm::vec2), cudaMemcpyHostToDevice);
 
-    checkCUDAError("pathtraceInit");
+	cudaMalloc(&dev_TrousImage, pixelcount * sizeof(glm::vec3));
+	cudaMalloc(&dev_IntermediaryImage, pixelcount * sizeof(glm::vec3));
+
+	checkCUDAError("pathtraceInit");
 }
 
 void pathtraceFree() {
-    cudaFree(dev_image);  // no-op if dev_image is null
-  	cudaFree(dev_paths);
-  	cudaFree(dev_geoms);
-  	cudaFree(dev_materials);
-  	cudaFree(dev_intersections);
-    cudaFree(dev_gBuffer);
-    // TODO: clean up any extra device memory you created
-
-    checkCUDAError("pathtraceFree");
+	cudaFree(dev_image);  // no-op if dev_image is null
+	cudaFree(dev_paths);
+	cudaFree(dev_geoms);
+	cudaFree(dev_materials);
+	cudaFree(dev_intersections);
+	cudaFree(dev_gBuffer);
+	// TODO: clean up any extra device memory you created
+
+	cudaFree(dev_gausKernel);
+	cudaFree(dev_offsetKernel);
+	cudaFree(dev_TrousImage);
+	cudaFree(dev_IntermediaryImage);
+	checkCUDAError("pathtraceFree");
 }
 
 /**
@@ -175,292 +239,350 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path
 
 	if (x < cam.resolution.x && y < cam.resolution.y) {
 		int index = x + (y * cam.resolution.x);
-		PathSegment & segment = pathSegments[index];
+		PathSegment& segment = pathSegments[index];
 
 		segment.ray.origin = cam.position;
-    segment.color = glm::vec3(1.0f, 1.0f, 1.0f);
+		segment.color = glm::vec3(1.0f, 1.0f, 1.0f);
 
 		segment.ray.direction = glm::normalize(cam.view
 			- cam.right * cam.pixelLength.x * ((float)x - (float)cam.resolution.x * 0.5f)
 			- cam.up * cam.pixelLength.y * ((float)y - (float)cam.resolution.y * 0.5f)
-			);
+		);
 
 		segment.pixelIndex = index;
 		segment.remainingBounces = traceDepth;
 	}
 }
 
-__global__ void computeIntersections(
-	int depth
-	, int num_paths
-	, PathSegment * pathSegments
-	, Geom * geoms
-	, int geoms_size
-	, ShadeableIntersection * intersections
-	)
+
+__global__ void CopyDataToInterImage(
+	int iter, int num_paths,
+	PathSegment* pathSegments, glm::vec3* dev_interImage)
 {
 	int path_index = blockIdx.x * blockDim.x + threadIdx.x;
 
 	if (path_index < num_paths)
 	{
-		PathSegment pathSegment = pathSegments[path_index];
 
-		float t;
-		glm::vec3 intersect_point;
-		glm::vec3 normal;
-		float t_min = FLT_MAX;
-		int hit_geom_index = -1;
-		bool outside = true;
+		PathSegment iterationPath = pathSegments[path_index];
+		glm::vec3 currColor = iterationPath.color;
+		dev_interImage[iterationPath.pixelIndex] = iterationPath.color / (float)iter;
+	}
+}
+
+	__global__ void GenerateATrousImage(
+		int iter, int num_paths,
+		float* dev_gausKernel, glm::vec2 *dev_offsetKernel,
+		glm::vec3* dev_interImage, glm::vec3 *dev_TrousImage,
+		GBufferPixel * gbuf, const Camera cam
+	)
+	{
+
+		int index = blockIdx.x * blockDim.x + threadIdx.x;
+
+		if (index < num_paths)
+		{
+			//glm::vec3 currColor =glm::vec3(0.0f);
+			glm::vec3 currColor = dev_interImage[index];
+		/*	for (int i = 0; i < 25; i++)
+			{
+
+				float offsetX = dev_offsetKernel[i].x;
+				float offsetY = dev_offsetKernel[i].y;
+				float gausValue = dev_gausKernel[i];
+				int offsetColorIdx = index + (offsetY * cam.resolution.x + offsetX);
+				if (offsetColorIdx >= 0 && offsetColorIdx < num_paths)
+				{
+					glm::vec3 newColor = dev_interImage[offsetColorIdx];
+					currColor += newColor * dev_gausKernel[i];
+				}
+			}*/
+			dev_TrousImage[index] = currColor;
+		}
 
-		glm::vec3 tmp_intersect;
-		glm::vec3 tmp_normal;
+	}
 
-		// naive parse through global geoms
 
-		for (int i = 0; i < geoms_size; i++)
+	__global__ void computeIntersections(
+		int depth
+		, int num_paths
+		, PathSegment * pathSegments
+		, Geom * geoms
+		, int geoms_size
+		, ShadeableIntersection * intersections
+	)
+	{
+		int path_index = blockIdx.x * blockDim.x + threadIdx.x;
+
+		if (path_index < num_paths)
 		{
-			Geom & geom = geoms[i];
+			PathSegment pathSegment = pathSegments[path_index];
+
+			float t;
+			glm::vec3 intersect_point;
+			glm::vec3 normal;
+			float t_min = FLT_MAX;
+			int hit_geom_index = -1;
+			bool outside = true;
 
-			if (geom.type == CUBE)
+			glm::vec3 tmp_intersect;
+			glm::vec3 tmp_normal;
+
+			// naive parse through global geoms
+
+			for (int i = 0; i < geoms_size; i++)
 			{
-				t = boxIntersectionTest(geom, pathSegment.ray, tmp_intersect, tmp_normal, outside);
+				Geom& geom = geoms[i];
+
+				if (geom.type == CUBE)
+				{
+					t = boxIntersectionTest(geom, pathSegment.ray, tmp_intersect, tmp_normal, outside);
+				}
+				else if (geom.type == SPHERE)
+				{
+					t = sphereIntersectionTest(geom, pathSegment.ray, tmp_intersect, tmp_normal, outside);
+				}
+
+				// Compute the minimum t from the intersection tests to determine what
+				// scene geometry object was hit first.
+				if (t > 0.0f && t_min > t)
+				{
+					t_min = t;
+					hit_geom_index = i;
+					intersect_point = tmp_intersect;
+					normal = tmp_normal;
+				}
 			}
-			else if (geom.type == SPHERE)
+
+			if (hit_geom_index == -1)
 			{
-				t = sphereIntersectionTest(geom, pathSegment.ray, tmp_intersect, tmp_normal, outside);
+				intersections[path_index].t = -1.0f;
 			}
-
-			// Compute the minimum t from the intersection tests to determine what
-			// scene geometry object was hit first.
-			if (t > 0.0f && t_min > t)
+			else
 			{
-				t_min = t;
-				hit_geom_index = i;
-				intersect_point = tmp_intersect;
-				normal = tmp_normal;
+				//The ray hits something
+				intersections[path_index].t = t_min;
+				intersections[path_index].materialId = geoms[hit_geom_index].materialid;
+				intersections[path_index].surfaceNormal = normal;
 			}
 		}
+	}
 
-		if (hit_geom_index == -1)
+	__global__ void shadeSimpleMaterials(
+		int iter
+		, int num_paths
+		, ShadeableIntersection * shadeableIntersections
+		, PathSegment * pathSegments
+		, Material * materials
+	)
+	{
+		int idx = blockIdx.x * blockDim.x + threadIdx.x;
+		if (idx < num_paths)
 		{
-			intersections[path_index].t = -1.0f;
+			ShadeableIntersection intersection = shadeableIntersections[idx];
+			PathSegment segment = pathSegments[idx];
+			if (segment.remainingBounces == 0) {
+				return;
+			}
+
+			if (intersection.t > 0.0f) { // if the intersection exists...
+				segment.remainingBounces--;
+				// Set up the RNG
+				thrust::default_random_engine rng = makeSeededRandomEngine(iter, idx, segment.remainingBounces);
+
+				Material material = materials[intersection.materialId];
+				glm::vec3 materialColor = material.color;
+
+				// If the material indicates that the object was a light, "light" the ray
+				if (material.emittance > 0.0f) {
+					segment.color *= (materialColor * material.emittance);
+					segment.remainingBounces = 0;
+				}
+				else {
+					segment.color *= materialColor;
+					glm::vec3 intersectPos = intersection.t * segment.ray.direction + segment.ray.origin;
+					scatterRay(segment, intersectPos, intersection.surfaceNormal, material, rng);
+				}
+				// If there was no intersection, color the ray black.
+				// Lots of renderers use 4 channel color, RGBA, where A = alpha, often
+				// used for opacity, in which case they can indicate "no opacity".
+				// This can be useful for post-processing and image compositing.
+			}
+			else {
+				segment.color = glm::vec3(0.0f);
+				segment.remainingBounces = 0;
+			}
+
+			pathSegments[idx] = segment;
 		}
-		else
+	}
+
+	__global__ void generateGBuffer(
+		int num_paths,
+		ShadeableIntersection * shadeableIntersections,
+		PathSegment * pathSegments,
+		GBufferPixel * gBuffer) {
+		int idx = blockIdx.x * blockDim.x + threadIdx.x;
+		if (idx < num_paths)
 		{
-			//The ray hits something
-			intersections[path_index].t = t_min;
-			intersections[path_index].materialId = geoms[hit_geom_index].materialid;
-			intersections[path_index].surfaceNormal = normal;
+			int pixelPosition = pathSegments[idx].pixelIndex;
+			gBuffer[pixelPosition].t = shadeableIntersections[idx].t;
+			gBuffer[pixelPosition].normal = shadeableIntersections[idx].surfaceNormal;
+			gBuffer[pixelPosition].position = getPointOnRay(pathSegments[idx].ray, shadeableIntersections[idx].t);
 		}
 	}
-}
-
-__global__ void shadeSimpleMaterials (
-  int iter
-  , int num_paths
-	, ShadeableIntersection * shadeableIntersections
-	, PathSegment * pathSegments
-	, Material * materials
-	)
-{
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < num_paths)
-  {
-    ShadeableIntersection intersection = shadeableIntersections[idx];
-    PathSegment segment = pathSegments[idx];
-    if (segment.remainingBounces == 0) {
-      return;
-    }
-
-    if (intersection.t > 0.0f) { // if the intersection exists...
-      segment.remainingBounces--;
-      // Set up the RNG
-      thrust::default_random_engine rng = makeSeededRandomEngine(iter, idx, segment.remainingBounces);
-
-      Material material = materials[intersection.materialId];
-      glm::vec3 materialColor = material.color;
-
-      // If the material indicates that the object was a light, "light" the ray
-      if (material.emittance > 0.0f) {
-        segment.color *= (materialColor * material.emittance);
-        segment.remainingBounces = 0;
-      }
-      else {
-        segment.color *= materialColor;
-        glm::vec3 intersectPos = intersection.t * segment.ray.direction + segment.ray.origin;
-        scatterRay(segment, intersectPos, intersection.surfaceNormal, material, rng);
-      }
-    // If there was no intersection, color the ray black.
-    // Lots of renderers use 4 channel color, RGBA, where A = alpha, often
-    // used for opacity, in which case they can indicate "no opacity".
-    // This can be useful for post-processing and image compositing.
-    } else {
-      segment.color = glm::vec3(0.0f);
-      segment.remainingBounces = 0;
-    }
-
-    pathSegments[idx] = segment;
-  }
-}
-
-__global__ void generateGBuffer (
-  int num_paths,
-  ShadeableIntersection* shadeableIntersections,
-	PathSegment* pathSegments,
-  GBufferPixel* gBuffer) {
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < num_paths)
-  {
-    gBuffer[idx].t = shadeableIntersections[idx].t;
-    gBuffer[idx].normal = shadeableIntersections[idx].surfaceNormal;
-    gBuffer[idx].position = getPointOnRay(pathSegments[idx].ray, shadeableIntersections[idx].t);
-  }
-}
 
-// Add the current iteration's output to the overall image
-__global__ void finalGather(int nPaths, glm::vec3 * image, PathSegment * iterationPaths)
-{
-	int index = (blockIdx.x * blockDim.x) + threadIdx.x;
-
-	if (index < nPaths)
+	// Add the current iteration's output to the overall image
+	__global__ void finalGather(int nPaths, glm::vec3 * image, PathSegment * iterationPaths)
 	{
-		PathSegment iterationPath = iterationPaths[index];
-		image[iterationPath.pixelIndex] += iterationPath.color;
+		int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+
+		if (index < nPaths)
+		{
+			PathSegment iterationPath = iterationPaths[index];
+			image[iterationPath.pixelIndex] += iterationPath.color;
+		}
 	}
-}
 
-/**
- * Wrapper for the __global__ call that sets up the kernel calls and does a ton
- * of memory management
- */
-void pathtrace(int frame, int iter) {
-    const int traceDepth = hst_scene->state.traceDepth;
-    const Camera &cam = hst_scene->state.camera;
-    const int pixelcount = cam.resolution.x * cam.resolution.y;
-
-	// 2D block for generating ray from camera
-    const dim3 blockSize2d(8, 8);
-    const dim3 blocksPerGrid2d(
-            (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x,
-            (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
-
-	// 1D block for path tracing
-	const int blockSize1d = 128;
-
-    ///////////////////////////////////////////////////////////////////////////
-
-    // Pathtracing Recap:
-    // * Initialize array of path rays (using rays that come out of the camera)
-    //   * You can pass the Camera object to that kernel.
-    //   * Each path ray must carry at minimum a (ray, color) pair,
-    //   * where color starts as the multiplicative identity, white = (1, 1, 1).
-    //   * This has already been done for you.
-    // * NEW: For the first depth, generate geometry buffers (gbuffers)
-    // * For each depth:
-    //   * Compute an intersection in the scene for each path ray.
-    //     A very naive version of this has been implemented for you, but feel
-    //     free to add more primitives and/or a better algorithm.
-    //     Currently, intersection distance is recorded as a parametric distance,
-    //     t, or a "distance along the ray." t = -1.0 indicates no intersection.
-    //     * Color is attenuated (multiplied) by reflections off of any object
-    //   * Stream compact away all of the terminated paths.
-    //     You may use either your implementation or `thrust::remove_if` or its
-    //     cousins.
-    //     * Note that you can't really use a 2D kernel launch any more - switch
-    //       to 1D.
-    //   * Shade the rays that intersected something or didn't bottom out.
-    //     That is, color the ray by performing a color computation according
-    //     to the shader, then generate a new ray to continue the ray path.
-    //     We recommend just updating the ray's PathSegment in place.
-    //     Note that this step may come before or after stream compaction,
-    //     since some shaders you write may also cause a path to terminate.
-    // * Finally:
-    //     * if not denoising, add this iteration's results to the image
-    //     * TODO: if denoising, run kernels that take both the raw pathtraced result and the gbuffer, and put the result in the "pbo" from opengl
-
-	generateRayFromCamera <<<blocksPerGrid2d, blockSize2d >>>(cam, iter, traceDepth, dev_paths);
-	checkCUDAError("generate camera ray");
-
-	int depth = 0;
-	PathSegment* dev_path_end = dev_paths + pixelcount;
-	int num_paths = dev_path_end - dev_paths;
-
-	// --- PathSegment Tracing Stage ---
-	// Shoot ray into scene, bounce between objects, push shading chunks
-
-  // Empty gbuffer
-  cudaMemset(dev_gBuffer, 0, pixelcount * sizeof(GBufferPixel));
-
-	// clean shading chunks
-	cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
+	/**
+	 * Wrapper for the __global__ call that sets up the kernel calls and does a ton
+	 * of memory management
+	 */
+	void pathtrace(int frame, int iter) {
+		const int traceDepth = hst_scene->state.traceDepth;
+		const Camera& cam = hst_scene->state.camera;
+		const int pixelcount = cam.resolution.x * cam.resolution.y;
+
+		// 2D block for generating ray from camera
+		const dim3 blockSize2d(8, 8);
+		const dim3 blocksPerGrid2d(
+			(cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x,
+			(cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
+
+		// 1D block for path tracing
+		const int blockSize1d = 128;
+
+		///////////////////////////////////////////////////////////////////////////
+
+		// Pathtracing Recap:
+		// * Initialize array of path rays (using rays that come out of the camera)
+		//   * You can pass the Camera object to that kernel.
+		//   * Each path ray must carry at minimum a (ray, color) pair,
+		//   * where color starts as the multiplicative identity, white = (1, 1, 1).
+		//   * This has already been done for you.
+		// * NEW: For the first depth, generate geometry buffers (gbuffers)
+		// * For each depth:
+		//   * Compute an intersection in the scene for each path ray.
+		//     A very naive version of this has been implemented for you, but feel
+		//     free to add more primitives and/or a better algorithm.
+		//     Currently, intersection distance is recorded as a parametric distance,
+		//     t, or a "distance along the ray." t = -1.0 indicates no intersection.
+		//     * Color is attenuated (multiplied) by reflections off of any object
+		//   * Stream compact away all of the terminated paths.
+		//     You may use either your implementation or `thrust::remove_if` or its
+		//     cousins.
+		//     * Note that you can't really use a 2D kernel launch any more - switch
+		//       to 1D.
+		//   * Shade the rays that intersected something or didn't bottom out.
+		//     That is, color the ray by performing a color computation according
+		//     to the shader, then generate a new ray to continue the ray path.
+		//     We recommend just updating the ray's PathSegment in place.
+		//     Note that this step may come before or after stream compaction,
+		//     since some shaders you write may also cause a path to terminate.
+		// * Finally:
+		//     * if not denoising, add this iteration's results to the image
+		//     * TODO: if denoising, run kernels that take both the raw pathtraced result and the gbuffer, and put the result in the "pbo" from opengl
+
+		generateRayFromCamera << <blocksPerGrid2d, blockSize2d >> > (cam, iter, traceDepth, dev_paths);
+		checkCUDAError("generate camera ray");
+
+		int depth = 0;
+		PathSegment* dev_path_end = dev_paths + pixelcount;
+		int num_paths = dev_path_end - dev_paths;
+
+		// --- PathSegment Tracing Stage ---
+		// Shoot ray into scene, bounce between objects, push shading chunks
+
+	  // Empty gbuffer
+		cudaMemset(dev_gBuffer, 0, pixelcount * sizeof(GBufferPixel));
+
+		// clean shading chunks
+		cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection));
+
+		bool iterationComplete = false;
+		while (!iterationComplete) {
+
+			// tracing
+			dim3 numblocksPathSegmentTracing = (num_paths + blockSize1d - 1) / blockSize1d;
+			computeIntersections << <numblocksPathSegmentTracing, blockSize1d >> > (
+				depth
+				, num_paths
+				, dev_paths
+				, dev_geoms
+				, hst_scene->geoms.size()
+				, dev_intersections
+				);
+			checkCUDAError("trace one bounce");
+			cudaDeviceSynchronize();
+
+			if (depth == 0) {
+				generateGBuffer << <numblocksPathSegmentTracing, blockSize1d >> > (num_paths, dev_intersections, dev_paths, dev_gBuffer);
+			}
 
-  bool iterationComplete = false;
-	while (!iterationComplete) {
-
-	// tracing
-	dim3 numblocksPathSegmentTracing = (num_paths + blockSize1d - 1) / blockSize1d;
-	computeIntersections <<<numblocksPathSegmentTracing, blockSize1d>>> (
-		depth
-		, num_paths
-		, dev_paths
-		, dev_geoms
-		, hst_scene->geoms.size()
-		, dev_intersections
-		);
-	checkCUDAError("trace one bounce");
-	cudaDeviceSynchronize();
+			CopyDataToInterImage << <numblocksPathSegmentTracing, blockSize1d >> > (iter, num_paths, dev_paths,  dev_IntermediaryImage);
 
-  if (depth == 0) {
-    generateGBuffer<<<numblocksPathSegmentTracing, blockSize1d>>>(num_paths, dev_intersections, dev_paths, dev_gBuffer);
-  }
+			GenerateATrousImage << <numblocksPathSegmentTracing, blockSize1d >> > (depth, num_paths, dev_gausKernel, dev_offsetKernel,
+				dev_IntermediaryImage, dev_TrousImage, dev_gBuffer, cam);
 
-	depth++;
+			depth++;
+			shadeSimpleMaterials << <numblocksPathSegmentTracing, blockSize1d >> > (
+				iter,
+				num_paths,
+				dev_intersections,
+				dev_paths,
+				dev_materials
+				);
+			iterationComplete = depth == traceDepth;
+		}
 
-  shadeSimpleMaterials<<<numblocksPathSegmentTracing, blockSize1d>>> (
-    iter,
-    num_paths,
-    dev_intersections,
-    dev_paths,
-    dev_materials
-  );
-  iterationComplete = depth == traceDepth;
-	}
+		// Assemble this iteration and apply it to the image
+		dim3 numBlocksPixels = (pixelcount + blockSize1d - 1) / blockSize1d;
+		finalGather << <numBlocksPixels, blockSize1d >> > (num_paths, dev_image, dev_paths);
 
-  // Assemble this iteration and apply it to the image
-  dim3 numBlocksPixels = (pixelcount + blockSize1d - 1) / blockSize1d;
-	finalGather<<<numBlocksPixels, blockSize1d>>>(num_paths, dev_image, dev_paths);
+		///////////////////////////////////////////////////////////////////////////
 
-    ///////////////////////////////////////////////////////////////////////////
+		// CHECKITOUT: use dev_image as reference if you want to implement saving denoised images.
+		// Otherwise, screenshots are also acceptable.
+		// Retrieve image from GPU
+		cudaMemcpy(hst_scene->state.image.data(), dev_image,
+			pixelcount * sizeof(glm::vec3), cudaMemcpyDeviceToHost);
 
-    // CHECKITOUT: use dev_image as reference if you want to implement saving denoised images.
-    // Otherwise, screenshots are also acceptable.
-    // Retrieve image from GPU
-    cudaMemcpy(hst_scene->state.image.data(), dev_image,
-            pixelcount * sizeof(glm::vec3), cudaMemcpyDeviceToHost);
+		checkCUDAError("pathtrace");
+	}
 
-    checkCUDAError("pathtrace");
-}
+	// CHECKITOUT: this kernel "post-processes" the gbuffer/gbuffers into something that you can visualize for debugging.
+	void showGBuffer(uchar4 * pbo) {
+		const Camera& cam = hst_scene->state.camera;
+		const dim3 blockSize2d(8, 8);
+		const dim3 blocksPerGrid2d(
+			(cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x,
+			(cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
 
-// CHECKITOUT: this kernel "post-processes" the gbuffer/gbuffers into something that you can visualize for debugging.
-void showGBuffer(uchar4* pbo) {
-    const Camera &cam = hst_scene->state.camera;
-    const dim3 blockSize2d(8, 8);
-    const dim3 blocksPerGrid2d(
-            (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x,
-            (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
-
-    // CHECKITOUT: process the gbuffer results and send them to OpenGL buffer for visualization
-    //gbufferToPBO<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, dev_gBuffer);
-    gbufferToPBO_Position <<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, dev_gBuffer);
-}
 
-void showImage(uchar4* pbo, int iter) {
-const Camera &cam = hst_scene->state.camera;
-    const dim3 blockSize2d(8, 8);
-    const dim3 blocksPerGrid2d(
-            (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x,
-            (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
+		// CHECKITOUT: process the gbuffer results and send them to OpenGL buffer for visualization
+		//gbufferToPBO<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, dev_gBuffer);
+		//gbufferToPBO_Normals<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, dev_gBuffer);
+		//gbufferToPBO_Position <<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, dev_gBuffer);
+		gbufferToPBO_Atrous << <blocksPerGrid2d, blockSize2d >> > (pbo, cam.resolution, dev_gBuffer, dev_TrousImage);
+	}
+
+	void showImage(uchar4 * pbo, int iter) {
+		const Camera& cam = hst_scene->state.camera;
+		const dim3 blockSize2d(8, 8);
+		const dim3 blocksPerGrid2d(
+			(cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x,
+			(cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
 
-    // Send results to OpenGL buffer for rendering
-    sendImageToPBO<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, iter, dev_image);
-}
+		// Send results to OpenGL buffer for rendering
+		sendImageToPBO << <blocksPerGrid2d, blockSize2d >> > (pbo, cam.resolution, iter, dev_image);
+	}
diff --git a/src/sceneStructs.h b/src/sceneStructs.h
index 131f211..d455543 100644
--- a/src/sceneStructs.h
+++ b/src/sceneStructs.h
@@ -81,4 +81,5 @@ struct GBufferPixel {
   float t;
   glm::vec3 normal;
   glm::vec3 position;
+  glm::vec3 pathColor;
 };

From 7800325ba0997ea99f39d94ae1d41369dae865c9 Mon Sep 17 00:00:00 2001
From: Shubham Sharma <sharmashubham9800@gmail.com>
Date: Wed, 20 Oct 2021 13:46:21 -0400
Subject: [PATCH 04/14] normals Fixed

---
 src/pathtrace.cu | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/src/pathtrace.cu b/src/pathtrace.cu
index 2ae338c..7775735 100644
--- a/src/pathtrace.cu
+++ b/src/pathtrace.cu
@@ -99,11 +99,16 @@ __global__ void gbufferToPBO_Normals(uchar4* pbo, glm::ivec2 resolution, GBuffer
 	if (x < resolution.x && y < resolution.y) {
 		int index = x + (y * resolution.x);
 
-		glm::vec3 normal = gBuffer[index].normal * 255.0f;
+		glm::vec3 normal = glm::abs(gBuffer[index].normal); // component magnitudes; scaled to 0..255 below
+		glm::ivec3 color;
+		color.x = glm::clamp((int)(normal.x * 255.0f), 0, 255); // float literal: no promotion to double on the device
+		color.y = glm::clamp((int)(normal.y * 255.0f), 0, 255);
+		color.z = glm::clamp((int)(normal.z * 255.0f), 0, 255);
+
 		pbo[index].w = 0;
-		pbo[index].x = normal.x;
-		pbo[index].y = normal.y;
-		pbo[index].z = normal.z;
+		pbo[index].x = color.x;
+		pbo[index].y = color.y;
+		pbo[index].z = color.z;
 	}
 }
 
@@ -571,9 +576,9 @@ __global__ void CopyDataToInterImage(
 
 		// CHECKITOUT: process the gbuffer results and send them to OpenGL buffer for visualization
 		//gbufferToPBO<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, dev_gBuffer);
-		//gbufferToPBO_Normals<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, dev_gBuffer);
+		gbufferToPBO_Normals<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, dev_gBuffer);
 		//gbufferToPBO_Position <<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, dev_gBuffer);
-		gbufferToPBO_Atrous << <blocksPerGrid2d, blockSize2d >> > (pbo, cam.resolution, dev_gBuffer, dev_TrousImage);
+		//gbufferToPBO_Atrous << <blocksPerGrid2d, blockSize2d >> > (pbo, cam.resolution, dev_gBuffer, dev_TrousImage);
 	}
 
 	void showImage(uchar4 * pbo, int iter) {

From bc0e250ab33170d69ed1dbf9d7070828b55cb343 Mon Sep 17 00:00:00 2001
From: Shubham Sharma <sharmashubham9800@gmail.com>
Date: Wed, 20 Oct 2021 13:48:47 -0400
Subject: [PATCH 05/14] position Updated

---
 src/pathtrace.cu | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/src/pathtrace.cu b/src/pathtrace.cu
index 7775735..0860f76 100644
--- a/src/pathtrace.cu
+++ b/src/pathtrace.cu
@@ -119,11 +119,16 @@ __global__ void gbufferToPBO_Position(uchar4* pbo, glm::ivec2 resolution, GBuffe
 	if (x < resolution.x && y < resolution.y) {
 		int index = x + (y * resolution.x);
 
-		glm::vec3 position = glm::abs(gBuffer[index].position) * 20.0f;
+		glm::vec3 position = glm::abs(gBuffer[index].position); // component magnitudes; scaled and clamped below
+		glm::ivec3 color;
+		color.x = glm::clamp((int)(position.x * 20.0f), 0, 255); // float literal: no promotion to double on the device
+		color.y = glm::clamp((int)(position.y * 20.0f), 0, 255);
+		color.z = glm::clamp((int)(position.z * 20.0f), 0, 255);
+
 		pbo[index].w = 0;
-		pbo[index].x = position[0];
-		pbo[index].y = position[1];
-		pbo[index].z = position[2];
+		pbo[index].x = color.x;
+		pbo[index].y = color.y;
+		pbo[index].z = color.z;
 	}
 }
 
@@ -576,8 +581,8 @@ __global__ void CopyDataToInterImage(
 
 		// CHECKITOUT: process the gbuffer results and send them to OpenGL buffer for visualization
 		//gbufferToPBO<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, dev_gBuffer);
-		gbufferToPBO_Normals<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, dev_gBuffer);
-		//gbufferToPBO_Position <<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, dev_gBuffer);
+		//gbufferToPBO_Normals<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, dev_gBuffer);
+		gbufferToPBO_Position <<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, dev_gBuffer);
 		//gbufferToPBO_Atrous << <blocksPerGrid2d, blockSize2d >> > (pbo, cam.resolution, dev_gBuffer, dev_TrousImage);
 	}
 

From 0d7a05e1524a668b8a98fc0145f3d6e44944a9dc Mon Sep 17 00:00:00 2001
From: Shubham Sharma <sharmashubham9800@gmail.com>
Date: Wed, 20 Oct 2021 16:40:13 -0400
Subject: [PATCH 06/14] Gaussian Blur Fixed

---
 src/pathtrace.cu | 84 ++++++++++++++++++++++++++++++++++--------------
 1 file changed, 60 insertions(+), 24 deletions(-)

diff --git a/src/pathtrace.cu b/src/pathtrace.cu
index 0860f76..cfd0e67 100644
--- a/src/pathtrace.cu
+++ b/src/pathtrace.cu
@@ -144,9 +144,9 @@ __global__ void gbufferToPBO_Atrous(uchar4* pbo, glm::ivec2 resolution, GBufferP
 		glm::ivec3 color;
 
 
-		color.x = glm::clamp((int)(pix.x  * 255.0), 0, 255);
-		color.y = glm::clamp((int)(pix.y * 255.0), 0, 255);
-		color.z = glm::clamp((int)(pix.z * 255.0), 0, 255);
+		color.x = glm::clamp((int)(pix.x / 10 * 255.0f), 0, 255); // float literal: no promotion to double on the device
+		color.y = glm::clamp((int)(pix.y / 10 * 255.0f), 0, 255);
+		color.z = glm::clamp((int)(pix.z / 10 * 255.0f), 0, 255);
 ;
 		pbo[index].w = 0;
 		pbo[index].x = color.x;
@@ -167,7 +167,7 @@ static GBufferPixel* dev_gBuffer = NULL;
 static float* dev_gausKernel = NULL;
 static glm::vec2* dev_offsetKernel = NULL;
 static glm::vec3* dev_TrousImage = NULL;
-static glm::vec3* dev_IntermediaryImage = NULL;
+//static glm::vec3* dev_IntermediaryImage = NULL;
 
 void generateOffsetKern()
 {
@@ -213,7 +213,7 @@ void pathtraceInit(Scene* scene) {
 	cudaMemcpy(dev_offsetKernel, offsetKernel, 25 * sizeof(glm::vec2), cudaMemcpyHostToDevice);
 
 	cudaMalloc(&dev_TrousImage, pixelcount * sizeof(glm::vec3));
-	cudaMalloc(&dev_IntermediaryImage, pixelcount * sizeof(glm::vec3));
+	/*cudaMalloc(&dev_IntermediaryImage, pixelcount * sizeof(glm::vec3));*/
 
 	checkCUDAError("pathtraceInit");
 }
@@ -230,7 +230,7 @@ void pathtraceFree() {
 	cudaFree(dev_gausKernel);
 	cudaFree(dev_offsetKernel);
 	cudaFree(dev_TrousImage);
-	cudaFree(dev_IntermediaryImage);
+	/*cudaFree(dev_IntermediaryImage);*/
 	checkCUDAError("pathtraceFree");
 }
 
@@ -275,16 +275,16 @@ __global__ void CopyDataToInterImage(
 	{
 
 		PathSegment iterationPath = pathSegments[path_index];
-		glm::vec3 currColor = iterationPath.color;
-		dev_interImage[iterationPath.pixelIndex] = iterationPath.color / (float)iter;
+		glm::vec3 currColor = dev_interImage[iterationPath.pixelIndex] + iterationPath.color;
+		dev_interImage[iterationPath.pixelIndex] += iterationPath.color ;
 	}
 }
 
-	__global__ void GenerateATrousImage(
-		int iter, int num_paths,
+	__global__ void GenerateGaussianBlur(
+		int num_paths,
 		float* dev_gausKernel, glm::vec2 *dev_offsetKernel,
-		glm::vec3* dev_interImage, glm::vec3 *dev_TrousImage,
-		GBufferPixel * gbuf, const Camera cam
+		glm::vec3* dev_colorImage, glm::vec3 *dev_TrousImage,
+		const Camera cam
 	)
 	{
 
@@ -292,9 +292,9 @@ __global__ void CopyDataToInterImage(
 
 		if (index < num_paths)
 		{
-			//glm::vec3 currColor =glm::vec3(0.0f);
-			glm::vec3 currColor = dev_interImage[index];
-		/*	for (int i = 0; i < 25; i++)
+			glm::vec3 currColor =glm::vec3(0.0f);
+			//glm::vec3 currColor = dev_colorImage[index];
+			for (int i = 0; i < 25; i++)
 			{
 
 				float offsetX = dev_offsetKernel[i].x;
@@ -303,15 +303,47 @@ __global__ void CopyDataToInterImage(
 				int offsetColorIdx = index + (offsetY * cam.resolution.x + offsetX);
 				if (offsetColorIdx >= 0 && offsetColorIdx < num_paths)
 				{
-					glm::vec3 newColor = dev_interImage[offsetColorIdx];
+					glm::vec3 newColor = dev_colorImage[offsetColorIdx];
 					currColor += newColor * dev_gausKernel[i];
 				}
-			}*/
+			}
 			dev_TrousImage[index] = currColor;
 		}
 
 	}
 
+	//__global__ void GenerateGaussianBlur(
+	//	int num_paths,
+	//	float* dev_gausKernel, glm::vec2* dev_offsetKernel,
+	//	glm::vec3* dev_colorImage, glm::vec3* dev_TrousImage,
+	//	GBufferPixel* gbuf, const Camera cam
+	//)
+	//{
+
+	//	int index = blockIdx.x * blockDim.x + threadIdx.x;
+
+	//	if (index < num_paths)
+	//	{
+	//		glm::vec3 currColor = glm::vec3(0.0f);
+	//		//glm::vec3 currColor = dev_colorImage[index];
+	//		for (int i = 0; i < 25; i++)
+	//		{
+
+	//			float offsetX = dev_offsetKernel[i].x;
+	//			float offsetY = dev_offsetKernel[i].y;
+	//			float gausValue = dev_gausKernel[i];
+	//			int offsetColorIdx = index + (offsetY * cam.resolution.x + offsetX);
+	//			if (offsetColorIdx >= 0 && offsetColorIdx < num_paths)
+	//			{
+	//				glm::vec3 newColor = dev_colorImage[offsetColorIdx];
+	//				currColor += newColor * dev_gausKernel[i];
+	//			}
+	//		}
+	//		dev_TrousImage[index] = currColor;
+	//	}
+
+	//}
+
 
 	__global__ void computeIntersections(
 		int depth
@@ -538,11 +570,12 @@ __global__ void CopyDataToInterImage(
 			if (depth == 0) {
 				generateGBuffer << <numblocksPathSegmentTracing, blockSize1d >> > (num_paths, dev_intersections, dev_paths, dev_gBuffer);
 			}
+		/*	if (depth == traceDepth) {
+				CopyDataToInterImage << <numblocksPathSegmentTracing, blockSize1d >> > (iter, num_paths, dev_paths, dev_IntermediaryImage);
 
-			CopyDataToInterImage << <numblocksPathSegmentTracing, blockSize1d >> > (iter, num_paths, dev_paths,  dev_IntermediaryImage);
-
-			GenerateATrousImage << <numblocksPathSegmentTracing, blockSize1d >> > (depth, num_paths, dev_gausKernel, dev_offsetKernel,
-				dev_IntermediaryImage, dev_TrousImage, dev_gBuffer, cam);
+				GenerateATrousImage << <numblocksPathSegmentTracing, blockSize1d >> > (depth, num_paths, dev_gausKernel, dev_offsetKernel,
+					dev_IntermediaryImage, dev_TrousImage, dev_gBuffer, cam);
+			}*/
 
 			depth++;
 			shadeSimpleMaterials << <numblocksPathSegmentTracing, blockSize1d >> > (
@@ -554,11 +587,14 @@ __global__ void CopyDataToInterImage(
 				);
 			iterationComplete = depth == traceDepth;
 		}
-
 		// Assemble this iteration and apply it to the image
 		dim3 numBlocksPixels = (pixelcount + blockSize1d - 1) / blockSize1d;
 		finalGather << <numBlocksPixels, blockSize1d >> > (num_paths, dev_image, dev_paths);
+		GenerateGaussianBlur << <numBlocksPixels, blockSize1d >> > (num_paths, dev_gausKernel, dev_offsetKernel,
+			dev_image, dev_TrousImage, cam);
 
+		//GenerateATrousImage << <numBlocksPixels, blockSize1d >> > (depth, num_paths, dev_gausKernel, dev_offsetKernel,
+		//	dev_image, dev_TrousImage, dev_gBuffer, cam);
 		///////////////////////////////////////////////////////////////////////////
 
 		// CHECKITOUT: use dev_image as reference if you want to implement saving denoised images.
@@ -582,8 +618,8 @@ __global__ void CopyDataToInterImage(
 		// CHECKITOUT: process the gbuffer results and send them to OpenGL buffer for visualization
 		//gbufferToPBO<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, dev_gBuffer);
 		//gbufferToPBO_Normals<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, dev_gBuffer);
-		gbufferToPBO_Position <<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, dev_gBuffer);
-		//gbufferToPBO_Atrous << <blocksPerGrid2d, blockSize2d >> > (pbo, cam.resolution, dev_gBuffer, dev_TrousImage);
+		//gbufferToPBO_Position <<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, dev_gBuffer);
+		gbufferToPBO_Atrous << <blocksPerGrid2d, blockSize2d >> > (pbo, cam.resolution, dev_gBuffer, dev_TrousImage);
 	}
 
 	void showImage(uchar4 * pbo, int iter) {

From 6a26cb493cec01d1e80054157aa6c36ff8e2ed9c Mon Sep 17 00:00:00 2001
From: Shubham Sharma <sharmashubham9800@gmail.com>
Date: Wed, 20 Oct 2021 17:50:05 -0400
Subject: [PATCH 07/14] Gaussian Updated

---
 src/pathtrace.cu | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/src/pathtrace.cu b/src/pathtrace.cu
index cfd0e67..2d572ef 100644
--- a/src/pathtrace.cu
+++ b/src/pathtrace.cu
@@ -258,7 +258,6 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path
 			- cam.right * cam.pixelLength.x * ((float)x - (float)cam.resolution.x * 0.5f)
 			- cam.up * cam.pixelLength.y * ((float)y - (float)cam.resolution.y * 0.5f)
 		);
-
 		segment.pixelIndex = index;
 		segment.remainingBounces = traceDepth;
 	}
@@ -296,15 +295,24 @@ __global__ void CopyDataToInterImage(
 			//glm::vec3 currColor = dev_colorImage[index];
 			for (int i = 0; i < 25; i++)
 			{
+				int index2D_y = index / cam.resolution.x;
+				int index2D_x = (int)(index % cam.resolution.x);
+
+				int offsetX = dev_offsetKernel[i].x;
+				int offsetY = dev_offsetKernel[i].y;
+
+				int finalValue_X = index2D_x + offsetX;
+				int finalValue_Y = index2D_y + offsetY;
 
-				float offsetX = dev_offsetKernel[i].x;
-				float offsetY = dev_offsetKernel[i].y;
-				float gausValue = dev_gausKernel[i];
-				int offsetColorIdx = index + (offsetY * cam.resolution.x + offsetX);
-				if (offsetColorIdx >= 0 && offsetColorIdx < num_paths)
+				if (finalValue_X >= 0 && finalValue_X <= (cam.resolution.x - 1) && finalValue_Y >= 0 && finalValue_Y <= (cam.resolution.y - 1))
 				{
-					glm::vec3 newColor = dev_colorImage[offsetColorIdx];
-					currColor += newColor * dev_gausKernel[i];
+					float gausValue = dev_gausKernel[i];
+					int offsetColorIdx = finalValue_Y * cam.resolution.x + finalValue_X;
+					if (offsetColorIdx >= 0 && offsetColorIdx < num_paths)
+					{
+						glm::vec3 newColor = dev_colorImage[offsetColorIdx];
+						currColor += newColor * dev_gausKernel[i];
+					}
 				}
 			}
 			dev_TrousImage[index] = currColor;

From 42c725df2e118cd0a0367cf662a980b6f1e07a83 Mon Sep 17 00:00:00 2001
From: Shubham Sharma <sharmashubham9800@gmail.com>
Date: Thu, 21 Oct 2021 02:01:07 -0400
Subject: [PATCH 08/14] A lot of changes

---
 src/main.cpp       |  31 ++++++-
 src/pathtrace.cu   | 197 ++++++++++++++++++++++++++++++++-------------
 src/pathtrace.h    |   2 +-
 src/sceneStructs.h |   1 -
 4 files changed, 169 insertions(+), 62 deletions(-)

diff --git a/src/main.cpp b/src/main.cpp
index 1654770..aa92675 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -45,6 +45,30 @@ int iteration;
 int width;
 int height;
 
+// Fills kernel with a normalized filter_size x filter_size Gaussian (sigma = 1).
+// kernel must hold filter_size * filter_size floats; exactly that many are written.
+void FilterCreation(int filter_size, float *kernel)
+{
+    const double sigma = 1.0;
+    const double s = 2.0 * sigma * sigma;
+    const int half = filter_size / 2;
+    const int count = filter_size > 0 ? filter_size * filter_size : 0;
+    double sum = 0.0; // running total used for the normalization pass
+
+    // Index by tap instead of looping -half..half on both axes: those bounds
+    // produced (filter_size + 1)^2 values for an even size and overran kernel.
+    for (int itr = 0; itr < count; itr++) {
+        const int x = itr / filter_size - half;
+        const int y = itr % filter_size - half;
+        const double r = sqrt(x * x + y * y);
+        kernel[itr] = (exp(-(r * r) / s)) / (PI * s);
+        sum += kernel[itr];
+    }
+
+    for (int i = 0; i < count; ++i) {
+        kernel[i] /= sum;
+    }
+}
 
 //-------------------------------
 //-------------MAIN--------------
@@ -57,7 +81,6 @@ int main(int argc, char** argv) {
         printf("Usage: %s SCENEFILE.txt\n", argv[0]);
         return 1;
     }
-
     const char *sceneFile = argv[1];
 
     // Load scene file
@@ -151,8 +174,12 @@ void runCuda() {
     // No data is moved (Win & Linux). When mapped to CUDA, OpenGL should not use this buffer
 
     if (iteration == 0) {
+        int filter_size = glm::sqrt(ui_filterSize);
+        filter_size = filter_size % 2 == 0 ? filter_size + 1 : filter_size; // force an odd side length
+        std::vector<float> gKernel(filter_size * filter_size); // owned buffer: freed on scope exit instead of leaking per restart
+        FilterCreation(filter_size, gKernel.data());
         pathtraceFree();
-        pathtraceInit(scene);
+        pathtraceInit(scene, ui_colorWeight, ui_normalWeight, ui_positionWeight, gKernel.data(), filter_size);
     }
 
     uchar4 *pbo_dptr = NULL;
diff --git a/src/pathtrace.cu b/src/pathtrace.cu
index 2d572ef..02b7da6 100644
--- a/src/pathtrace.cu
+++ b/src/pathtrace.cu
@@ -39,14 +39,14 @@ void checkCUDAErrorFn(const char* msg, const char* file, int line) {
 }
 
 
-float gaussianKernel[25] = { 0.003765, 0.015019, 0.023792, 0.015019, 0.003765,
-0.015019, 0.059912, 0.094907, 0.059912, 0.015019,
-0.023792, 0.094907, 0.150342, 0.094907, 0.023792,
-0.015019, 0.059912, 0.094907, 0.059912, 0.015019,
-0.003765, 0.015019, 0.023792, 0.015019, 0.003765, };
+//float gaussianKernel[25] = { 0.003765, 0.015019, 0.023792, 0.015019, 0.003765,
+//0.015019, 0.059912, 0.094907, 0.059912, 0.015019,
+//0.023792, 0.094907, 0.150342, 0.094907, 0.023792,
+//0.015019, 0.059912, 0.094907, 0.059912, 0.015019,
+//0.003765, 0.015019, 0.023792, 0.015019, 0.003765, };
 
 
-glm::vec2 offsetKernel[25];
+//glm::vec2 offsetKernel[25];
 
 __host__ __device__
 thrust::default_random_engine makeSeededRandomEngine(int iter, int index, int depth) {
@@ -167,22 +167,28 @@ static GBufferPixel* dev_gBuffer = NULL;
 static float* dev_gausKernel = NULL;
 static glm::vec2* dev_offsetKernel = NULL;
 static glm::vec3* dev_TrousImage = NULL;
+
+static float *dev_ui_colorWeight;
+static float *dev_ui_normalWeight ;
+static float *dev_ui_positionWeight;
+static float *dev_ui_filterSize;
 //static glm::vec3* dev_IntermediaryImage = NULL;
 
-void generateOffsetKern()
+void generateOffsetKern(int filterSize, vector<glm::vec2> &offsetKernel)
 {
 	int index = 0;
-	for (int y = -2; y <= 2; y++)
+	const int half = filterSize / 2; // append exactly filterSize * filterSize offsets, also for an even filterSize
+	for (int y = -half; y < filterSize - half; y++)
 	{
-		for (int x = -2; x <= 2; x++)
+		for (int x = -half; x < filterSize - half; x++)
 		{
-			offsetKernel[index] = glm::vec2(x, y);
+			offsetKernel.push_back(glm::vec2(x, y));
 			index++;
 		}
 	}
 }
 
-void pathtraceInit(Scene* scene) {
+void pathtraceInit(Scene* scene, float ui_colorWeight, float ui_normalWeight, float ui_positionWeight, float *gausKernel, float filterSize) {
 	hst_scene = scene;
 	const Camera& cam = hst_scene->state.camera;
 	const int pixelcount = cam.resolution.x * cam.resolution.y;
@@ -205,14 +211,39 @@ void pathtraceInit(Scene* scene) {
 
 	// TODO: initialize any extra device memeory you need
 
-	cudaMalloc(&dev_gausKernel, 25 * sizeof(float));
-	cudaMemcpy(dev_gausKernel, gaussianKernel, 25 * sizeof(float), cudaMemcpyHostToDevice);
+	cudaMalloc(&dev_gausKernel, filterSize * filterSize * sizeof(float));
+	cudaMemcpy(dev_gausKernel, gausKernel, filterSize * filterSize * sizeof(float), cudaMemcpyHostToDevice);
+
+	for (int i = 0; i < filterSize * filterSize; i++)
+	{
+		std::cout << gausKernel[i];
+	}
+
+	vector< glm::vec2> offKern;
 
-	generateOffsetKern();
-	cudaMalloc(&dev_offsetKernel, 25 * sizeof(glm::vec2));
-	cudaMemcpy(dev_offsetKernel, offsetKernel, 25 * sizeof(glm::vec2), cudaMemcpyHostToDevice);
+	generateOffsetKern(filterSize, offKern);
+	cudaMalloc(&dev_offsetKernel, filterSize * filterSize * sizeof(glm::vec2));
+	cudaMemcpy(dev_offsetKernel, offKern.data(), filterSize * filterSize * sizeof(glm::vec2), cudaMemcpyHostToDevice);
 
 	cudaMalloc(&dev_TrousImage, pixelcount * sizeof(glm::vec3));
+
+	cudaMalloc(&dev_ui_colorWeight, sizeof(float));
+	cudaMalloc(&dev_ui_normalWeight, sizeof(float));
+	cudaMalloc(&dev_ui_positionWeight, sizeof(float));
+	cudaMalloc(&dev_ui_filterSize, sizeof(float));
+
+
+
+	// Upload the UI parameters, one float each. cudaMemcpy is used rather than
+	// cudaMemset because cudaMemset fills bytes and cannot store a float value.
+	// Each device buffer receives the parameter it is named after.
+
+	cudaMemcpy(dev_ui_colorWeight, &ui_colorWeight, sizeof(float), cudaMemcpyHostToDevice);
+	cudaMemcpy(dev_ui_normalWeight, &ui_normalWeight, sizeof(float), cudaMemcpyHostToDevice);
+	cudaMemcpy(dev_ui_positionWeight, &ui_positionWeight, sizeof(float), cudaMemcpyHostToDevice);
+	cudaMemcpy(dev_ui_filterSize, &filterSize, sizeof(float), cudaMemcpyHostToDevice);
+
+
 	/*cudaMalloc(&dev_IntermediaryImage, pixelcount * sizeof(glm::vec3));*/
 
 	checkCUDAError("pathtraceInit");
@@ -230,7 +261,11 @@ void pathtraceFree() {
 	cudaFree(dev_gausKernel);
 	cudaFree(dev_offsetKernel);
 	cudaFree(dev_TrousImage);
-	/*cudaFree(dev_IntermediaryImage);*/
+
+	cudaFree(dev_ui_colorWeight);
+	cudaFree(dev_ui_normalWeight);
+	cudaFree(dev_ui_positionWeight);
+	cudaFree(dev_ui_filterSize);
 	checkCUDAError("pathtraceFree");
 }
 
@@ -320,37 +355,89 @@ __global__ void CopyDataToInterImage(
 
 	}
 
-	//__global__ void GenerateGaussianBlur(
-	//	int num_paths,
-	//	float* dev_gausKernel, glm::vec2* dev_offsetKernel,
-	//	glm::vec3* dev_colorImage, glm::vec3* dev_TrousImage,
-	//	GBufferPixel* gbuf, const Camera cam
-	//)
-	//{
-
-	//	int index = blockIdx.x * blockDim.x + threadIdx.x;
-
-	//	if (index < num_paths)
-	//	{
-	//		glm::vec3 currColor = glm::vec3(0.0f);
-	//		//glm::vec3 currColor = dev_colorImage[index];
-	//		for (int i = 0; i < 25; i++)
-	//		{
-
-	//			float offsetX = dev_offsetKernel[i].x;
-	//			float offsetY = dev_offsetKernel[i].y;
-	//			float gausValue = dev_gausKernel[i];
-	//			int offsetColorIdx = index + (offsetY * cam.resolution.x + offsetX);
-	//			if (offsetColorIdx >= 0 && offsetColorIdx < num_paths)
-	//			{
-	//				glm::vec3 newColor = dev_colorImage[offsetColorIdx];
-	//				currColor += newColor * dev_gausKernel[i];
-	//			}
-	//		}
-	//		dev_TrousImage[index] = currColor;
-	//	}
-
-	//}
+	/**
+	 * Edge-aware blur: writes to dev_TrousImage a weighted average of the
+	 * dev_colorImage pixels around each pixel. Every tap is weighted by its Gaussian
+	 * coefficient and by how close its color, normal and position are to the center.
+	 * Launch: 1D grid, one thread per pixel; threads with index >= num_paths exit.
+	 *
+	 * @param num_paths        number of pixels to filter
+	 * @param filterSize       side length of the square filter; dev_gausKernel and
+	 *                         dev_offsetKernel are read for filterSize * filterSize taps
+	 * @param dev_gausKernel   Gaussian coefficient of each tap
+	 * @param dev_offsetKernel (x, y) pixel offset of each tap
+	 * @param dev_colorImage   input colors, one per pixel
+	 * @param dev_TrousImage   output colors, one per pixel
+	 * @param gbuf             per-pixel normal and position
+	 * @param cam              provides the image resolution
+	 * @param dev_ui_colorWeight, dev_ui_normalWeight, dev_ui_positionWeight
+	 *        element 0 of each is squared and divides the matching squared distance
+	 */
+	__global__ void GenerateAtrousImage(
+		int num_paths, int filterSize,
+		float* dev_gausKernel, glm::vec2* dev_offsetKernel,
+		glm::vec3* dev_colorImage, glm::vec3* dev_TrousImage,
+		GBufferPixel* gbuf, const Camera cam, float* dev_ui_colorWeight,
+		float* dev_ui_normalWeight,float* dev_ui_positionWeight
+	)
+	{
+		int index = blockIdx.x * blockDim.x + threadIdx.x;
+		if (index >= num_paths)
+		{
+			return;
+		}
+
+		const glm::vec3 cval = dev_colorImage[index];
+		const glm::vec3 nval = gbuf[index].normal;
+		const glm::vec3 pval = gbuf[index].position;
+
+		const float cphi = dev_ui_colorWeight[0] * dev_ui_colorWeight[0];
+		const float nphi = dev_ui_normalWeight[0] * dev_ui_normalWeight[0];
+		const float pphi = dev_ui_positionWeight[0] * dev_ui_positionWeight[0];
+
+		// The pixel coordinates are the same for every tap, so compute them once.
+		const int pixelX = index % cam.resolution.x;
+		const int pixelY = index / cam.resolution.x;
+
+		// Visit every tap described by filterSize instead of a hard-coded 25.
+		const int tapCount = filterSize * filterSize;
+
+		glm::vec3 sum = glm::vec3(0.0f);
+		float cum_w = 0.0f;
+		for (int i = 0; i < tapCount; i++)
+		{
+			const int sampleX = pixelX + (int)dev_offsetKernel[i].x;
+			const int sampleY = pixelY + (int)dev_offsetKernel[i].y;
+			if (sampleX < 0 || sampleX >= cam.resolution.x || sampleY < 0 || sampleY >= cam.resolution.y)
+			{
+				continue; // tap falls outside the image
+			}
+
+			const int sampleIdx = sampleY * cam.resolution.x + sampleX;
+			if (sampleIdx >= num_paths)
+			{
+				continue; // tap is beyond the pixels being filtered
+			}
+
+			const glm::vec3 ctmp = dev_colorImage[sampleIdx];
+			glm::vec3 t = cval - ctmp;
+			const float c_w = glm::min(glm::exp(-glm::dot(t, t) / cphi), 1.0f);
+
+			t = nval - gbuf[sampleIdx].normal;
+			const float nDist2 = glm::max(glm::dot(t, t), 0.0f);
+			const float n_w = glm::min(glm::exp(-nDist2 / nphi), 1.0f);
+
+			t = pval - gbuf[sampleIdx].position;
+			const float p_w = glm::min(glm::exp(-glm::dot(t, t) / pphi), 1.0f);
+
+			const float weight = c_w * n_w * p_w * dev_gausKernel[i];
+			sum += ctmp * weight;
+			cum_w += weight;
+		}
+
+		// Fall back to the input color rather than divide by a zero weight sum.
+		dev_TrousImage[index] = cum_w > 0.0f ? sum / cum_w : cval;
+	}
 
 
 	__global__ void computeIntersections(
@@ -578,12 +665,6 @@ __global__ void CopyDataToInterImage(
 			if (depth == 0) {
 				generateGBuffer << <numblocksPathSegmentTracing, blockSize1d >> > (num_paths, dev_intersections, dev_paths, dev_gBuffer);
 			}
-		/*	if (depth == traceDepth) {
-				CopyDataToInterImage << <numblocksPathSegmentTracing, blockSize1d >> > (iter, num_paths, dev_paths, dev_IntermediaryImage);
-
-				GenerateATrousImage << <numblocksPathSegmentTracing, blockSize1d >> > (depth, num_paths, dev_gausKernel, dev_offsetKernel,
-					dev_IntermediaryImage, dev_TrousImage, dev_gBuffer, cam);
-			}*/
 
 			depth++;
 			shadeSimpleMaterials << <numblocksPathSegmentTracing, blockSize1d >> > (
@@ -598,11 +679,11 @@ __global__ void CopyDataToInterImage(
 		// Assemble this iteration and apply it to the image
 		dim3 numBlocksPixels = (pixelcount + blockSize1d - 1) / blockSize1d;
 		finalGather << <numBlocksPixels, blockSize1d >> > (num_paths, dev_image, dev_paths);
-		GenerateGaussianBlur << <numBlocksPixels, blockSize1d >> > (num_paths, dev_gausKernel, dev_offsetKernel,
-			dev_image, dev_TrousImage, cam);
+		//GenerateGaussianBlur << <numBlocksPixels, blockSize1d >> > (num_paths, dev_gausKernel, dev_offsetKernel,
+		//	dev_image, dev_TrousImage, cam);
 
-		//GenerateATrousImage << <numBlocksPixels, blockSize1d >> > (depth, num_paths, dev_gausKernel, dev_offsetKernel,
-		//	dev_image, dev_TrousImage, dev_gBuffer, cam);
+		GenerateAtrousImage << <numBlocksPixels, blockSize1d >> > (num_paths, dev_ui_filterSize[0],dev_gausKernel, dev_offsetKernel,
+			dev_image, dev_TrousImage, dev_gBuffer, cam, dev_ui_colorWeight, dev_ui_normalWeight, dev_ui_positionWeight);
 		///////////////////////////////////////////////////////////////////////////
 
 		// CHECKITOUT: use dev_image as reference if you want to implement saving denoised images.
diff --git a/src/pathtrace.h b/src/pathtrace.h
index 9e12f44..28721c7 100644
--- a/src/pathtrace.h
+++ b/src/pathtrace.h
@@ -3,7 +3,7 @@
 #include <vector>
 #include "scene.h"
 
-void pathtraceInit(Scene *scene);
+void pathtraceInit(Scene *scene, float a, float b, float c, float* gausKernel, float filterSize);
 void pathtraceFree();
 void pathtrace(int frame, int iteration);
 void showGBuffer(uchar4 *pbo);
diff --git a/src/sceneStructs.h b/src/sceneStructs.h
index d455543..131f211 100644
--- a/src/sceneStructs.h
+++ b/src/sceneStructs.h
@@ -81,5 +81,4 @@ struct GBufferPixel {
   float t;
   glm::vec3 normal;
   glm::vec3 position;
-  glm::vec3 pathColor;
 };

From 208126cf5a3e4cd925e7869eb283380d5acc0146 Mon Sep 17 00:00:00 2001
From: Shubham Sharma <sharmashubham9800@gmail.com>
Date: Thu, 21 Oct 2021 02:34:47 -0400
Subject: [PATCH 09/14] A trou Implemented

---
 src/main.cpp     |  4 +++-
 src/pathtrace.cu | 51 +++++++++++++++++++-----------------------------
 2 files changed, 23 insertions(+), 32 deletions(-)

diff --git a/src/main.cpp b/src/main.cpp
index aa92675..3561ee6 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -53,10 +53,12 @@ void FilterCreation(int filter_size, float *kernel)
     // sum is for normalization
     double sum = 0.0;
     int itr = 0;
+
+    int center = filter_size / 2.0f;
     // generating filter_sizexfilter_size kernel
     for (int x = -filter_size/2; x <= filter_size/2; x++) {
         for (int y = -filter_size/2; y <= filter_size/2; y++) {
-            r = sqrt(x * x + y * y);
+            r = sqrt( x * x + y * y );
             kernel[itr] = (exp(-(r * r) / s)) / (PI * s);
             sum += kernel[itr];
             itr++;
diff --git a/src/pathtrace.cu b/src/pathtrace.cu
index 02b7da6..020cbfd 100644
--- a/src/pathtrace.cu
+++ b/src/pathtrace.cu
@@ -144,9 +144,9 @@ __global__ void gbufferToPBO_Atrous(uchar4* pbo, glm::ivec2 resolution, GBufferP
 		glm::ivec3 color;
 
 
-		color.x = glm::clamp((int)(pix.x/10  * 255.0), 0, 255);
-		color.y = glm::clamp((int)(pix.y/10  * 255.0), 0, 255);
-		color.z = glm::clamp((int)(pix.z/10  * 255.0), 0, 255);
+		color.x = glm::clamp((int)(pix.x/2  * 255.0), 0, 255);
+		color.y = glm::clamp((int)(pix.y/2  * 255.0), 0, 255);
+		color.z = glm::clamp((int)(pix.z/2  * 255.0), 0, 255);
 ;
 		pbo[index].w = 0;
 		pbo[index].x = color.x;
@@ -168,10 +168,10 @@ static float* dev_gausKernel = NULL;
 static glm::vec2* dev_offsetKernel = NULL;
 static glm::vec3* dev_TrousImage = NULL;
 
-static float *dev_ui_colorWeight;
-static float *dev_ui_normalWeight ;
-static float *dev_ui_positionWeight;
-static float *dev_ui_filterSize;
+static float ui_colorWeight = 0.0f;
+static float ui_normalWeight = 0.0f;
+static float ui_positionWeight = 0.0f;
+static float ui_filterSize = 0.0f;
 //static glm::vec3* dev_IntermediaryImage = NULL;
 
 void generateOffsetKern(int filterSize, vector<glm::vec2> &offsetKernel)
@@ -188,7 +188,7 @@ void generateOffsetKern(int filterSize, vector<glm::vec2> &offsetKernel)
 	}
 }
 
-void pathtraceInit(Scene* scene, float ui_colorWeight, float ui_normalWeight, float ui_positionWeight, float *gausKernel, float filterSize) {
+void pathtraceInit(Scene* scene, float a_ui_colorWeight, float a_ui_normalWeight, float a_ui_positionWeight, float *gausKernel, float filterSize) {
 	hst_scene = scene;
 	const Camera& cam = hst_scene->state.camera;
 	const int pixelcount = cam.resolution.x * cam.resolution.y;
@@ -227,21 +227,16 @@ void pathtraceInit(Scene* scene, float ui_colorWeight, float ui_normalWeight, fl
 
 	cudaMalloc(&dev_TrousImage, pixelcount * sizeof(glm::vec3));
 
-	cudaMalloc(&dev_ui_colorWeight, sizeof(float));
-	cudaMalloc(&dev_ui_normalWeight, sizeof(float));
-	cudaMalloc(&dev_ui_positionWeight, sizeof(float));
-	cudaMalloc(&dev_ui_filterSize, sizeof(float));
-
 
 
 	//cudaMemset(dev_ui_colorWeight, ui_colorWeight, sizeof(float));
 	//cudaMemset(dev_ui_normalWeight, ui_normalWeight, sizeof(float));
 	//cudaMemset(dev_ui_positionWeight, ui_positionWeight, sizeof(float));
 
-	cudaMemcpy(dev_ui_colorWeight, &ui_colorWeight, sizeof(float), cudaMemcpyHostToDevice);
-	cudaMemcpy(dev_ui_normalWeight, &ui_normalWeight, sizeof(float), cudaMemcpyHostToDevice);
-	cudaMemcpy(dev_ui_positionWeight, &ui_normalWeight, sizeof(float), cudaMemcpyHostToDevice);
-	cudaMemcpy(dev_ui_filterSize, &filterSize, sizeof(float), cudaMemcpyHostToDevice);
+	ui_colorWeight = a_ui_colorWeight;
+	ui_normalWeight = a_ui_normalWeight;
+	ui_positionWeight = a_ui_positionWeight;
+	ui_filterSize = filterSize;
 
 
 	/*cudaMalloc(&dev_IntermediaryImage, pixelcount * sizeof(glm::vec3));*/
@@ -261,11 +256,6 @@ void pathtraceFree() {
 	cudaFree(dev_gausKernel);
 	cudaFree(dev_offsetKernel);
 	cudaFree(dev_TrousImage);
-
-	cudaFree(dev_ui_colorWeight);
-	cudaFree(dev_ui_normalWeight);
-	cudaFree(dev_ui_positionWeight);
-	cudaFree(dev_ui_filterSize);
 	checkCUDAError("pathtraceFree");
 }
 
@@ -359,8 +349,8 @@ __global__ void CopyDataToInterImage(
 		int num_paths, int filterSize,
 		float* dev_gausKernel, glm::vec2* dev_offsetKernel,
 		glm::vec3* dev_colorImage, glm::vec3* dev_TrousImage,
-		GBufferPixel* gbuf, const Camera cam, float* dev_ui_colorWeight,
-		float* dev_ui_normalWeight,float* dev_ui_positionWeight
+		GBufferPixel* gbuf, const Camera cam, float ui_colorWeight,
+		float ui_normalWeight,float ui_positionWeight
 	)
 	{
 
@@ -373,12 +363,12 @@ __global__ void CopyDataToInterImage(
 			glm::vec3 nval = gbuf[index].normal;
 			glm::vec3 pval = gbuf[index].position;
 
-			float cphi = dev_ui_colorWeight[0] * dev_ui_colorWeight[0];
-			float nphi = dev_ui_normalWeight[0] * dev_ui_normalWeight[0];
-			float pphi = dev_ui_positionWeight[0] * dev_ui_positionWeight[0];
+			float cphi = ui_colorWeight * ui_colorWeight;
+			float nphi = ui_normalWeight * ui_normalWeight;
+			float pphi = ui_positionWeight * ui_positionWeight;
 
 			float cum_w = 0.0f;
-			for (int stepIter = 0; stepIter < 1; stepIter++)
+			for (int stepIter = 0; stepIter < 10; stepIter++)
 			{
 				for (int i = 0; i < 25; i++)
 				{
@@ -681,9 +671,8 @@ __global__ void CopyDataToInterImage(
 		finalGather << <numBlocksPixels, blockSize1d >> > (num_paths, dev_image, dev_paths);
 		//GenerateGaussianBlur << <numBlocksPixels, blockSize1d >> > (num_paths, dev_gausKernel, dev_offsetKernel,
 		//	dev_image, dev_TrousImage, cam);
-
-		GenerateAtrousImage << <numBlocksPixels, blockSize1d >> > (num_paths, dev_ui_filterSize[0],dev_gausKernel, dev_offsetKernel,
-			dev_image, dev_TrousImage, dev_gBuffer, cam, dev_ui_colorWeight, dev_ui_normalWeight, dev_ui_positionWeight);
+		GenerateAtrousImage << <numBlocksPixels, blockSize1d >> > (num_paths, ui_filterSize ,dev_gausKernel, dev_offsetKernel,
+			dev_image, dev_TrousImage, dev_gBuffer, cam, ui_colorWeight, ui_normalWeight, ui_positionWeight);
 		///////////////////////////////////////////////////////////////////////////
 
 		// CHECKITOUT: use dev_image as reference if you want to implement saving denoised images.

From 06d27198c88022b06b866aa136ab173a0957b578 Mon Sep 17 00:00:00 2001
From: Shubham Sharma <sharmashubham9800@gmail.com>
Date: Thu, 21 Oct 2021 11:52:45 -0400
Subject: [PATCH 10/14] Updates

---
 src/main.cpp     |  20 +++--
 src/pathtrace.cu | 188 +++++++++++++++++++++++++++++++++--------------
 src/pathtrace.h  |   2 +
 3 files changed, 148 insertions(+), 62 deletions(-)

diff --git a/src/main.cpp b/src/main.cpp
index 3561ee6..e6a49c5 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -48,18 +48,16 @@ int height;
 void FilterCreation(int filter_size, float *kernel)
 {
     // initialising standard deviation to 1.0
-    double sigma = 1.0;
-    double r, s = 2.0 * sigma * sigma;
+    float sigma = 1.0;
+    float r, s = 2.0 * sigma * sigma;
     // sum is for normalization
-    double sum = 0.0;
+    float sum = 0.0;
     int itr = 0;
-
-    int center = filter_size / 2.0f;
     // generating filter_sizexfilter_size kernel
     for (int x = -filter_size/2; x <= filter_size/2; x++) {
         for (int y = -filter_size/2; y <= filter_size/2; y++) {
-            r = sqrt( x * x + y * y );
-            kernel[itr] = (exp(-(r * r) / s)) / (PI * s);
+            r =  x * x + y * y ;
+            kernel[itr] = (glm::exp(-(r) / s)) / (PI * s);
             sum += kernel[itr];
             itr++;
         }
@@ -72,6 +70,7 @@ void FilterCreation(int filter_size, float *kernel)
     }
 }
 
+
 //-------------------------------
 //-------------MAIN--------------
 //-------------------------------
@@ -197,7 +196,12 @@ void runCuda() {
 
     if (ui_showGbuffer) {
       showGBuffer(pbo_dptr);
-    } else {
+    }
+    else if (ui_denoise)
+    {
+        showDenoise(pbo_dptr, iteration);
+    }
+    else {
       showImage(pbo_dptr, iteration);
     }
 
diff --git a/src/pathtrace.cu b/src/pathtrace.cu
index 020cbfd..37a62fc 100644
--- a/src/pathtrace.cu
+++ b/src/pathtrace.cu
@@ -132,28 +132,6 @@ __global__ void gbufferToPBO_Position(uchar4* pbo, glm::ivec2 resolution, GBuffe
 	}
 }
 
-__global__ void gbufferToPBO_Atrous(uchar4* pbo, glm::ivec2 resolution, GBufferPixel* gBuffer, glm::vec3* TrousImage) {
-	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-	int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-
-	if (x < resolution.x && y < resolution.y) {
-		int index = x + (y * resolution.x);
-
-
-		glm::vec3 pix = TrousImage[index];
-		glm::ivec3 color;
-
-
-		color.x = glm::clamp((int)(pix.x/2  * 255.0), 0, 255);
-		color.y = glm::clamp((int)(pix.y/2  * 255.0), 0, 255);
-		color.z = glm::clamp((int)(pix.z/2  * 255.0), 0, 255);
-;
-		pbo[index].w = 0;
-		pbo[index].x = color.x;
-		pbo[index].y = color.y;
-		pbo[index].z = color.z;
-	}
-}
 
 static Scene* hst_scene = NULL;
 static glm::vec3* dev_image = NULL;
@@ -214,11 +192,6 @@ void pathtraceInit(Scene* scene, float a_ui_colorWeight, float a_ui_normalWeight
 	cudaMalloc(&dev_gausKernel, filterSize * filterSize * sizeof(float));
 	cudaMemcpy(dev_gausKernel, gausKernel, filterSize * filterSize * sizeof(float), cudaMemcpyHostToDevice);
 
-	for (int i = 0; i < filterSize * filterSize; i++)
-	{
-		std::cout << gausKernel[i];
-	}
-
 	vector< glm::vec2> offKern;
 
 	generateOffsetKern(filterSize, offKern);
@@ -289,23 +262,8 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path
 }
 
 
-__global__ void CopyDataToInterImage(
-	int iter, int num_paths,
-	PathSegment* pathSegments, glm::vec3* dev_interImage)
-{
-	int path_index = blockIdx.x * blockDim.x + threadIdx.x;
-
-	if (path_index < num_paths)
-	{
-
-		PathSegment iterationPath = pathSegments[path_index];
-		glm::vec3 currColor = dev_interImage[iterationPath.pixelIndex] + iterationPath.color;
-		dev_interImage[iterationPath.pixelIndex] += iterationPath.color ;
-	}
-}
-
 	__global__ void GenerateGaussianBlur(
-		int num_paths,
+		int num_paths, int filterSize,
 		float* dev_gausKernel, glm::vec2 *dev_offsetKernel,
 		glm::vec3* dev_colorImage, glm::vec3 *dev_TrousImage,
 		const Camera cam
@@ -318,7 +276,7 @@ __global__ void CopyDataToInterImage(
 		{
 			glm::vec3 currColor =glm::vec3(0.0f);
 			//glm::vec3 currColor = dev_colorImage[index];
-			for (int i = 0; i < 25; i++)
+			for (int i = 0; i < filterSize ; i++)
 			{
 				int index2D_y = index / cam.resolution.x;
 				int index2D_x = (int)(index % cam.resolution.x);
@@ -345,6 +303,47 @@ __global__ void CopyDataToInterImage(
 
 	}
 
+	__global__ void GenerateGaussianBlur2(
+		int num_paths, int filterSize,
+		float* dev_gausKernel, glm::vec2* dev_offsetKernel,
+		glm::vec3* dev_colorImage, glm::vec3* dev_TrousImage,
+		const Camera cam
+	)
+	{
+
+		int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+		int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+
+		if (x < cam.resolution.x && y < cam.resolution.y)
+		{
+			int index = x + (y * cam.resolution.x);
+			glm::vec3 currColor = glm::vec3(0.0f);
+			//glm::vec3 currColor = dev_colorImage[index];
+			for (int i = 0; i < filterSize; i++)
+			{
+
+				int offsetX = dev_offsetKernel[i].x;
+				int offsetY = dev_offsetKernel[i].y;
+
+				int finalValue_X = x + offsetX;
+				int finalValue_Y = y + offsetY;
+
+				finalValue_X = glm::clamp(finalValue_X, 0, cam.resolution.x - 1);
+				finalValue_Y = glm::clamp(finalValue_Y, 0, cam.resolution.y - 1);
+					float gausValue = dev_gausKernel[i];
+					int offsetColorIdx = finalValue_Y * cam.resolution.x + finalValue_X;
+				/*	if (offsetColorIdx >= 0 && offsetColorIdx < num_paths)
+					{*/
+						glm::vec3 newColor = dev_colorImage[offsetColorIdx];
+						currColor += newColor * dev_gausKernel[i];
+					/*}*/
+				
+			}
+			dev_TrousImage[index] = currColor;
+		}
+
+	}
+
 	__global__ void GenerateAtrousImage(
 		int num_paths, int filterSize,
 		float* dev_gausKernel, glm::vec2* dev_offsetKernel,
@@ -370,7 +369,7 @@ __global__ void CopyDataToInterImage(
 			float cum_w = 0.0f;
 			for (int stepIter = 0; stepIter < 10; stepIter++)
 			{
-				for (int i = 0; i < 25; i++)
+				for (int i = 0; i < filterSize; i++)
 				{
 					int stepWidth = 1 << stepIter;
 					// Calculate Offseted Index
@@ -428,7 +427,77 @@ __global__ void CopyDataToInterImage(
 		}
 
 	}
+	
+	__global__ void GenerateAtrousImage2(
+		int num_paths, int filterSize,
+		float* dev_gausKernel, glm::vec2* dev_offsetKernel,
+		glm::vec3* dev_colorImage, glm::vec3* dev_TrousImage,
+		GBufferPixel* gbuf, const Camera cam, float ui_colorWeight,
+		float ui_normalWeight, float ui_positionWeight
+	)
+	{
 
+		int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+		int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+
+		if (x < cam.resolution.x && y < cam.resolution.y)
+		{
+			int index = x + (y * cam.resolution.x);
+			glm::vec3 sum = glm::vec3(0.0f);
+			glm::vec3 cval = dev_colorImage[index];
+			glm::vec3 nval = gbuf[index].normal;
+			glm::vec3 pval = gbuf[index].position;
+
+			float cphi = ui_colorWeight * ui_colorWeight;
+			float nphi = ui_normalWeight * ui_normalWeight;
+			float pphi = ui_positionWeight * ui_positionWeight;
+
+			float cum_w = 0.0f;
+			for (int stepIter = 0; stepIter < 10; stepIter++)
+			{
+				for (int i = 0; i < 25; i++)
+				{
+					int stepWidth = 1 << stepIter;
+					// Calculate Offseted Index
+
+					int offsetX = dev_offsetKernel[i].x;
+					int offsetY = dev_offsetKernel[i].y;
+
+					int finalValue_X = glm::clamp ((float)x + (float)offsetX * stepWidth, 0.0f, (float)cam.resolution.x - 1.0f);
+					int finalValue_Y = glm::clamp((float)y + (float)offsetY * stepWidth, 0.0f, (float)cam.resolution.y - 1.0f);
+
+						int offsetColorIdx = finalValue_Y * cam.resolution.x + finalValue_X;
+						if (offsetColorIdx >= 0 && offsetColorIdx < num_paths)
+						{
+							glm::vec3 ctmp = dev_colorImage[offsetColorIdx];
+							glm::vec3 t = cval - ctmp;
+							float dist2 = glm::dot(t, t);
+							float newVal = glm::exp(-1 * (dist2) / cphi);
+							float c_w = glm::min(newVal, 1.0f);
+
+							glm::vec3 ntmp = gbuf[offsetColorIdx].normal;
+							t = nval - ntmp;
+							dist2 = glm::max(glm::dot(t, t) / (stepWidth * stepWidth), 0.0f);
+							newVal = glm::exp(-1 * (dist2) / nphi);
+							float n_w = glm::min(newVal, 1.0f);
+
+							glm::vec3 ptmp = gbuf[offsetColorIdx].position;
+							t = pval - ptmp;
+							dist2 = glm::dot(t, t);
+							newVal = glm::exp(-1 * (dist2) / pphi);
+							float p_w = glm::min(newVal, 1.0f);
+							float weight = c_w * n_w * p_w;
+							sum += ctmp * weight * dev_gausKernel[i];
+							cum_w += weight * dev_gausKernel[i];
+
+						}
+					
+				}
+			}
+			dev_TrousImage[index] = sum / cum_w;
+		}
+
+	}
 
 	__global__ void computeIntersections(
 		int depth
@@ -553,9 +622,9 @@ __global__ void CopyDataToInterImage(
 		if (idx < num_paths)
 		{
 			int pixelPosition = pathSegments[idx].pixelIndex;
-			gBuffer[pixelPosition].t = shadeableIntersections[idx].t;
-			gBuffer[pixelPosition].normal = shadeableIntersections[idx].surfaceNormal;
-			gBuffer[pixelPosition].position = getPointOnRay(pathSegments[idx].ray, shadeableIntersections[idx].t);
+			gBuffer[idx].t = shadeableIntersections[idx].t;
+			gBuffer[idx].normal = shadeableIntersections[idx].surfaceNormal;
+			gBuffer[idx].position = getPointOnRay(pathSegments[idx].ray, shadeableIntersections[idx].t);
 		}
 	}
 
@@ -669,10 +738,10 @@ __global__ void CopyDataToInterImage(
 		// Assemble this iteration and apply it to the image
 		dim3 numBlocksPixels = (pixelcount + blockSize1d - 1) / blockSize1d;
 		finalGather << <numBlocksPixels, blockSize1d >> > (num_paths, dev_image, dev_paths);
-		//GenerateGaussianBlur << <numBlocksPixels, blockSize1d >> > (num_paths, dev_gausKernel, dev_offsetKernel,
-		//	dev_image, dev_TrousImage, cam);
-		GenerateAtrousImage << <numBlocksPixels, blockSize1d >> > (num_paths, ui_filterSize ,dev_gausKernel, dev_offsetKernel,
-			dev_image, dev_TrousImage, dev_gBuffer, cam, ui_colorWeight, ui_normalWeight, ui_positionWeight);
+		GenerateGaussianBlur2 << <blocksPerGrid2d, blockSize2d >> > (num_paths, ui_filterSize,dev_gausKernel, dev_offsetKernel,
+			dev_image, dev_TrousImage, cam);
+//		GenerateAtrousImage << <numBlocksPixels, blockSize1d >> > (num_paths, ui_filterSize ,dev_gausKernel, dev_offsetKernel,
+//			dev_image, dev_TrousImage, dev_gBuffer, cam, ui_colorWeight, ui_normalWeight, ui_positionWeight);
 		///////////////////////////////////////////////////////////////////////////
 
 		// CHECKITOUT: use dev_image as reference if you want to implement saving denoised images.
@@ -696,8 +765,7 @@ __global__ void CopyDataToInterImage(
 		// CHECKITOUT: process the gbuffer results and send them to OpenGL buffer for visualization
 		//gbufferToPBO<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, dev_gBuffer);
 		//gbufferToPBO_Normals<<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, dev_gBuffer);
-		//gbufferToPBO_Position <<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, dev_gBuffer);
-		gbufferToPBO_Atrous << <blocksPerGrid2d, blockSize2d >> > (pbo, cam.resolution, dev_gBuffer, dev_TrousImage);
+		gbufferToPBO_Position <<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, dev_gBuffer);
 	}
 
 	void showImage(uchar4 * pbo, int iter) {
@@ -710,3 +778,15 @@ __global__ void CopyDataToInterImage(
 		// Send results to OpenGL buffer for rendering
 		sendImageToPBO << <blocksPerGrid2d, blockSize2d >> > (pbo, cam.resolution, iter, dev_image);
 	}
+
+
+	void showDenoise(uchar4* pbo, int iter) {
+		const Camera& cam = hst_scene->state.camera;
+		const dim3 blockSize2d(8, 8);
+		const dim3 blocksPerGrid2d(
+			(cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x,
+			(cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
+
+		// Send results to OpenGL buffer for rendering
+		sendImageToPBO << <blocksPerGrid2d, blockSize2d >> > (pbo, cam.resolution, iter, dev_TrousImage);
+	}
diff --git a/src/pathtrace.h b/src/pathtrace.h
index 28721c7..5ae7dfc 100644
--- a/src/pathtrace.h
+++ b/src/pathtrace.h
@@ -8,3 +8,5 @@ void pathtraceFree();
 void pathtrace(int frame, int iteration);
 void showGBuffer(uchar4 *pbo);
 void showImage(uchar4 *pbo, int iter);
+void showDenoise(uchar4 *pbo, int iter);
+

From 03e87bc447e5175c3a1e9c2d22966df2b6700611 Mon Sep 17 00:00:00 2001
From: Shubham Sharma <sharmashubham9800@gmail.com>
Date: Thu, 21 Oct 2021 18:06:39 -0400
Subject: [PATCH 11/14] Fixed Filter Size

---
 src/pathtrace.cu | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/src/pathtrace.cu b/src/pathtrace.cu
index 37a62fc..2a07675 100644
--- a/src/pathtrace.cu
+++ b/src/pathtrace.cu
@@ -317,14 +317,13 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path
 		if (x < cam.resolution.x && y < cam.resolution.y)
 		{
 			int index = x + (y * cam.resolution.x);
+			glm::vec3 color = dev_colorImage[index];
 			glm::vec3 currColor = glm::vec3(0.0f);
 			//glm::vec3 currColor = dev_colorImage[index];
-			for (int i = 0; i < filterSize; i++)
+			for (int i = 0; i < filterSize * filterSize; i++)
 			{
-
 				int offsetX = dev_offsetKernel[i].x;
 				int offsetY = dev_offsetKernel[i].y;
-
 				int finalValue_X = x + offsetX;
 				int finalValue_Y = y + offsetY;
 
@@ -367,9 +366,9 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path
 			float pphi = ui_positionWeight * ui_positionWeight;
 
 			float cum_w = 0.0f;
-			for (int stepIter = 0; stepIter < 10; stepIter++)
+			for (int stepIter = 0; stepIter < 1; stepIter++)
 			{
-				for (int i = 0; i < filterSize; i++)
+				for (int i = 0; i < filterSize * filterSize; i++)
 				{
 					int stepWidth = 1 << stepIter;
 					// Calculate Offseted Index
@@ -453,9 +452,9 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path
 			float pphi = ui_positionWeight * ui_positionWeight;
 
 			float cum_w = 0.0f;
-			for (int stepIter = 0; stepIter < 10; stepIter++)
+			for (int stepIter = 0; stepIter < 1; stepIter++)
 			{
-				for (int i = 0; i < 25; i++)
+				for (int i = 0; i < filterSize *filterSize; i++)
 				{
 					int stepWidth = 1 << stepIter;
 					// Calculate Offseted Index
@@ -472,12 +471,12 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path
 							glm::vec3 ctmp = dev_colorImage[offsetColorIdx];
 							glm::vec3 t = cval - ctmp;
 							float dist2 = glm::dot(t, t);
-							float newVal = glm::exp(-1 * (dist2) / cphi);
-							float c_w = glm::min(newVal, 1.0f);
+							float newVal = glm::exp(-(dist2) / cphi);
+							float c_w = glm::min(newVal, 1.f);
 
 							glm::vec3 ntmp = gbuf[offsetColorIdx].normal;
 							t = nval - ntmp;
-							dist2 = glm::max(glm::dot(t, t) / (stepWidth * stepWidth), 0.0f);
+							dist2 = glm::max(glm::dot(t, t) / (stepWidth * stepWidth), 0.f);
 							newVal = glm::exp(-1 * (dist2) / nphi);
 							float n_w = glm::min(newVal, 1.0f);
 
@@ -485,7 +484,7 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path
 							t = pval - ptmp;
 							dist2 = glm::dot(t, t);
 							newVal = glm::exp(-1 * (dist2) / pphi);
-							float p_w = glm::min(newVal, 1.0f);
+							float p_w = glm::min(newVal, 1.f);
 							float weight = c_w * n_w * p_w;
 							sum += ctmp * weight * dev_gausKernel[i];
 							cum_w += weight * dev_gausKernel[i];
@@ -495,6 +494,7 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path
 				}
 			}
 			dev_TrousImage[index] = sum / cum_w;
+			//dev_TrousImage[index] = cval;
 		}
 
 	}
@@ -738,10 +738,12 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path
 		// Assemble this iteration and apply it to the image
 		dim3 numBlocksPixels = (pixelcount + blockSize1d - 1) / blockSize1d;
 		finalGather << <numBlocksPixels, blockSize1d >> > (num_paths, dev_image, dev_paths);
-		GenerateGaussianBlur2 << <blocksPerGrid2d, blockSize2d >> > (num_paths, ui_filterSize,dev_gausKernel, dev_offsetKernel,
-			dev_image, dev_TrousImage, cam);
-//		GenerateAtrousImage << <numBlocksPixels, blockSize1d >> > (num_paths, ui_filterSize ,dev_gausKernel, dev_offsetKernel,
-//			dev_image, dev_TrousImage, dev_gBuffer, cam, ui_colorWeight, ui_normalWeight, ui_positionWeight);
+//		GenerateGaussianBlur2 << <blocksPerGrid2d, blockSize2d >> > (num_paths, ui_filterSize,dev_gausKernel, dev_offsetKernel,
+//			dev_image, dev_TrousImage, cam);
+		//GenerateAtrousImage << <numBlocksPixels, blockSize1d >> > (num_paths, ui_filterSize ,dev_gausKernel, dev_offsetKernel,
+		//	dev_image, dev_TrousImage, dev_gBuffer, cam, ui_colorWeight, ui_normalWeight, ui_positionWeight);
+		 		GenerateAtrousImage2 << <blocksPerGrid2d, blockSize2d >> > (num_paths, ui_filterSize ,dev_gausKernel, dev_offsetKernel,
+		dev_image, dev_TrousImage, dev_gBuffer, cam, ui_colorWeight, ui_normalWeight, ui_positionWeight);
 		///////////////////////////////////////////////////////////////////////////
 
 		// CHECKITOUT: use dev_image as reference if you want to implement saving denoised images.

From e6449827032fbdd515ba014044bda527e3f6f12e Mon Sep 17 00:00:00 2001
From: Shubham Sharma <sharmashubham9800@gmail.com>
Date: Fri, 22 Oct 2021 12:36:08 -0400
Subject: [PATCH 12/14] Fix Atrous Update

---
 src/main.cpp     |  19 +++--
 src/pathtrace.cu | 204 ++++++++++++++++++++++++++++-------------------
 src/pathtrace.h  |   3 +-
 src/preview.cpp  |   2 +-
 4 files changed, 138 insertions(+), 90 deletions(-)

diff --git a/src/main.cpp b/src/main.cpp
index e6a49c5..d25d5cb 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -29,6 +29,7 @@ float ui_colorWeight = 0.45f;
 float ui_normalWeight = 0.35f;
 float ui_positionWeight = 0.2f;
 bool ui_saveAndExit = false;
+bool imageDenoised = false;
 
 static bool camchanged = true;
 static float dtheta = 0, dphi = 0;
@@ -175,12 +176,11 @@ void runCuda() {
     // No data is moved (Win & Linux). When mapped to CUDA, OpenGL should not use this buffer
 
     if (iteration == 0) {
-        int filter_size = glm::sqrt(ui_filterSize);
-        filter_size = filter_size % 2 == 0 ? filter_size + 1 : filter_size;
-        float *gKernel = new float[filter_size * filter_size];
-        FilterCreation(filter_size, gKernel);
+        float *gKernel = new float[5 * 5];
+        FilterCreation(5, gKernel);
         pathtraceFree();
-        pathtraceInit(scene, ui_colorWeight, ui_normalWeight, ui_positionWeight, gKernel, filter_size);
+        pathtraceInit(scene, gKernel);
+        imageDenoised = false;
     }
 
     uchar4 *pbo_dptr = NULL;
@@ -191,14 +191,19 @@ void runCuda() {
 
         // execute the kernel
         int frame = 0;
-        pathtrace(frame, iteration);
+        pathtrace(frame, iteration); 
     }
 
     if (ui_showGbuffer) {
       showGBuffer(pbo_dptr);
     }
-    else if (ui_denoise)
+    else if (ui_denoise && iteration == ui_iterations)
     {
+        if (!imageDenoised)
+        {
+            imageDenoised = DenoiseImage(renderState->camera.resolution.x, renderState->camera.resolution.y, iteration, ui_filterSize,
+                ui_colorWeight, ui_normalWeight, ui_positionWeight);
+        }
         showDenoise(pbo_dptr, iteration);
     }
     else {
diff --git a/src/pathtrace.cu b/src/pathtrace.cu
index 2a07675..57f4ecd 100644
--- a/src/pathtrace.cu
+++ b/src/pathtrace.cu
@@ -39,11 +39,11 @@ void checkCUDAErrorFn(const char* msg, const char* file, int line) {
 }
 
 
-//float gaussianKernel[25] = { 0.003765, 0.015019, 0.023792, 0.015019, 0.003765,
-//0.015019, 0.059912, 0.094907, 0.059912, 0.015019,
-//0.023792, 0.094907, 0.150342, 0.094907, 0.023792,
-//0.015019, 0.059912, 0.094907, 0.059912, 0.015019,
-//0.003765, 0.015019, 0.023792, 0.015019, 0.003765, };
+float gaussianKernel[25] = { 0.003765, 0.015019, 0.023792, 0.015019, 0.003765,
+0.015019, 0.059912, 0.094907, 0.059912, 0.015019,
+0.023792, 0.094907, 0.150342, 0.094907, 0.023792,
+0.015019, 0.059912, 0.094907, 0.059912, 0.015019,
+0.003765, 0.015019, 0.023792, 0.015019, 0.003765, };
 
 
 //glm::vec2 offsetKernel[25];
@@ -77,6 +77,29 @@ __global__ void sendImageToPBO(uchar4* pbo, glm::ivec2 resolution,
 	}
 }
 
+//Kernel that writes the image to the OpenGL PBO directly.
+__global__ void sendImageToPBOCopy(uchar4* pbo, glm::ivec2 resolution,
+	int iter, glm::vec3* image) {
+	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
+	int y = (blockIdx.y * blockDim.y) + threadIdx.y;
+
+	if (x < resolution.x && y < resolution.y) {
+		int index = x + (y * resolution.x);
+		glm::vec3 pix = image[index];
+
+		glm::ivec3 color;
+		color.x = glm::clamp((int)((pix.x / iter) * 255.0), 0, 255);
+		color.y = glm::clamp((int)((pix.y / iter) * 255.0), 0, 255);
+		color.z = glm::clamp((int)((pix.z / iter) * 255.0), 0, 255);
+
+		// Each thread writes one pixel location in the texture (textel)
+		pbo[index].w = 0;
+		pbo[index].x = color.x;
+		pbo[index].y = color.y;
+		pbo[index].z = color.z;
+	}
+}
+
 __global__ void gbufferToPBO(uchar4* pbo, glm::ivec2 resolution, GBufferPixel* gBuffer) {
 	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
 	int y = (blockIdx.y * blockDim.y) + threadIdx.y;
@@ -145,11 +168,8 @@ static GBufferPixel* dev_gBuffer = NULL;
 static float* dev_gausKernel = NULL;
 static glm::vec2* dev_offsetKernel = NULL;
 static glm::vec3* dev_TrousImage = NULL;
+static glm::vec3* dev_pingPongImage = NULL;
 
-static float ui_colorWeight = 0.0f;
-static float ui_normalWeight = 0.0f;
-static float ui_positionWeight = 0.0f;
-static float ui_filterSize = 0.0f;
 //static glm::vec3* dev_IntermediaryImage = NULL;
 
 void generateOffsetKern(int filterSize, vector<glm::vec2> &offsetKernel)
@@ -166,7 +186,7 @@ void generateOffsetKern(int filterSize, vector<glm::vec2> &offsetKernel)
 	}
 }
 
-void pathtraceInit(Scene* scene, float a_ui_colorWeight, float a_ui_normalWeight, float a_ui_positionWeight, float *gausKernel, float filterSize) {
+void pathtraceInit(Scene* scene,float *gausKernel) {
 	hst_scene = scene;
 	const Camera& cam = hst_scene->state.camera;
 	const int pixelcount = cam.resolution.x * cam.resolution.y;
@@ -189,31 +209,19 @@ void pathtraceInit(Scene* scene, float a_ui_colorWeight, float a_ui_normalWeight
 
 	// TODO: initialize any extra device memeory you need
 
-	cudaMalloc(&dev_gausKernel, filterSize * filterSize * sizeof(float));
-	cudaMemcpy(dev_gausKernel, gausKernel, filterSize * filterSize * sizeof(float), cudaMemcpyHostToDevice);
+	cudaMalloc(&dev_gausKernel,  25 * sizeof(float));
+	cudaMemcpy(dev_gausKernel, gaussianKernel, 25 * sizeof(float), cudaMemcpyHostToDevice);
 
 	vector< glm::vec2> offKern;
 
-	generateOffsetKern(filterSize, offKern);
-	cudaMalloc(&dev_offsetKernel, filterSize * filterSize * sizeof(glm::vec2));
-	cudaMemcpy(dev_offsetKernel, offKern.data(), filterSize * filterSize * sizeof(glm::vec2), cudaMemcpyHostToDevice);
+	generateOffsetKern(5, offKern);
+	cudaMalloc(&dev_offsetKernel, 25 * sizeof(glm::vec2));
+	cudaMemcpy(dev_offsetKernel, offKern.data(), 25 * sizeof(glm::vec2), cudaMemcpyHostToDevice);
 
 	cudaMalloc(&dev_TrousImage, pixelcount * sizeof(glm::vec3));
+	cudaMalloc(&dev_pingPongImage, pixelcount * sizeof(glm::vec3));
 
 
-
-	//cudaMemset(dev_ui_colorWeight, ui_colorWeight, sizeof(float));
-	//cudaMemset(dev_ui_normalWeight, ui_normalWeight, sizeof(float));
-	//cudaMemset(dev_ui_positionWeight, ui_positionWeight, sizeof(float));
-
-	ui_colorWeight = a_ui_colorWeight;
-	ui_normalWeight = a_ui_normalWeight;
-	ui_positionWeight = a_ui_positionWeight;
-	ui_filterSize = filterSize;
-
-
-	/*cudaMalloc(&dev_IntermediaryImage, pixelcount * sizeof(glm::vec3));*/
-
 	checkCUDAError("pathtraceInit");
 }
 
@@ -229,6 +237,7 @@ void pathtraceFree() {
 	cudaFree(dev_gausKernel);
 	cudaFree(dev_offsetKernel);
 	cudaFree(dev_TrousImage);
+	cudaFree(dev_pingPongImage);
 	checkCUDAError("pathtraceFree");
 }
 
@@ -344,17 +353,17 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path
 	}
 
 	__global__ void GenerateAtrousImage(
-		int num_paths, int filterSize,
+		int pixelCount, int stepWidth,
 		float* dev_gausKernel, glm::vec2* dev_offsetKernel,
 		glm::vec3* dev_colorImage, glm::vec3* dev_TrousImage,
-		GBufferPixel* gbuf, const Camera cam, float ui_colorWeight,
-		float ui_normalWeight,float ui_positionWeight
+		GBufferPixel* gbuf, int resolutionX, int resolutionY, float ui_colorWeight,
+		float ui_normalWeight, float ui_positionWeight
 	)
 	{
 
 		int index = blockIdx.x * blockDim.x + threadIdx.x;
 
-		if (index < num_paths)
+		if (index < pixelCount)
 		{
 			glm::vec3 sum = glm::vec3(0.0f);
 			glm::vec3 cval = dev_colorImage[index];
@@ -366,14 +375,11 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path
 			float pphi = ui_positionWeight * ui_positionWeight;
 
 			float cum_w = 0.0f;
-			for (int stepIter = 0; stepIter < 1; stepIter++)
-			{
-				for (int i = 0; i < filterSize * filterSize; i++)
+				for (int i = 0; i < 25; i++)
 				{
-					int stepWidth = 1 << stepIter;
 					// Calculate Offseted Index
-					int index2D_y = index / cam.resolution.x;
-					int index2D_x = (int)(index % cam.resolution.x);
+					int index2D_y = index / resolutionX ;
+					int index2D_x = (int)(index % resolutionX);
 
 					int offsetX = dev_offsetKernel[i].x;
 					int offsetY = dev_offsetKernel[i].y;
@@ -381,57 +387,50 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path
 					int finalValue_X = index2D_x + offsetX * stepWidth; // Final Offset Values
 					int finalValue_Y = index2D_y + offsetY * stepWidth; // Final Offset Values
 
-					if (finalValue_X >= 0 && finalValue_X <= (cam.resolution.x - 1) && finalValue_Y >= 0 && finalValue_Y <= (cam.resolution.y - 1))
+					//finalValue_X = glm::clamp((float)finalValue_X, 0.0f, (float)resolutionX - 1.0f);
+					//finalValue_Y = glm::clamp((float)finalValue_Y, 0.0f, (float)resolutionY - 1.0f);
+
+					if (finalValue_X >= 0 && finalValue_X <= (resolutionX - 1) && finalValue_Y >= 0 && finalValue_Y <= (resolutionY - 1))
 					{
-						int offsetColorIdx = finalValue_Y * cam.resolution.x + finalValue_X;
-						if (offsetColorIdx >= 0 && offsetColorIdx < num_paths)
+						int offsetColorIdx = finalValue_Y * resolutionX + finalValue_X;
+						if (offsetColorIdx >= 0 && offsetColorIdx < pixelCount)
 						{
 							glm::vec3 ctmp = dev_colorImage[offsetColorIdx];
 							glm::vec3 t = cval - ctmp;
 							float dist2 = glm::dot(t, t);
-							if (dist2 != 0.0f)
-							{
-								dist2 = dist2;
-							}
 							float newVal = glm::exp(-1 * (dist2) / cphi);
-							float c_w = glm::min(newVal, 1.0f);
+							float c_w = glm::min(glm::exp(-(dist2) / cphi), 1.0f);
 
 							glm::vec3 ntmp = gbuf[offsetColorIdx].normal;
 							t = nval - ntmp;
 							dist2 = glm::max(glm::dot(t, t)/ (stepWidth * stepWidth), 0.0f);
 							newVal = glm::exp(-1 * (dist2) / nphi );
-							float n_w = glm::min(newVal, 1.0f);
+							float n_w = glm::min(glm::exp(-(dist2) / cphi), 1.0f);
 
 							glm::vec3 ptmp = gbuf[offsetColorIdx].position;
 							t = pval - ptmp;
 							dist2 = glm::dot(t, t);
 							newVal = glm::exp(-1 * (dist2) / pphi);
-							float p_w = glm::min(newVal, 1.0f);
+							float p_w = glm::min(glm::exp(-(dist2) / cphi), 1.0f);
 							float weight = c_w * n_w * p_w;
 
-
-							if (weight < 0.9f && weight >0.1f)
-							{
-								c_w = c_w;
-							}
-
 							sum += ctmp * weight * dev_gausKernel[i];
 							cum_w += weight * dev_gausKernel[i];
 
 						}
 					}
 				}
-			}
+			
 				dev_TrousImage[index] = sum / cum_w;
 		}
 
 	}
 	
 	__global__ void GenerateAtrousImage2(
-		int num_paths, int filterSize,
+		int pixelCount, int stepWidth,
 		float* dev_gausKernel, glm::vec2* dev_offsetKernel,
 		glm::vec3* dev_colorImage, glm::vec3* dev_TrousImage,
-		GBufferPixel* gbuf, const Camera cam, float ui_colorWeight,
+		GBufferPixel* gbuf, int resolutionX, int resolutionY, float ui_colorWeight,
 		float ui_normalWeight, float ui_positionWeight
 	)
 	{
@@ -439,62 +438,64 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path
 		int x = (blockIdx.x * blockDim.x) + threadIdx.x;
 		int y = (blockIdx.y * blockDim.y) + threadIdx.y;
 
-		if (x < cam.resolution.x && y < cam.resolution.y)
+		if (x < resolutionX && y < resolutionY)
 		{
-			int index = x + (y * cam.resolution.x);
+			int index = x + (y * resolutionX);
 			glm::vec3 sum = glm::vec3(0.0f);
 			glm::vec3 cval = dev_colorImage[index];
 			glm::vec3 nval = gbuf[index].normal;
 			glm::vec3 pval = gbuf[index].position;
 
-			float cphi = ui_colorWeight * ui_colorWeight;
-			float nphi = ui_normalWeight * ui_normalWeight;
-			float pphi = ui_positionWeight * ui_positionWeight;
+			float cphi = ui_colorWeight ;
+			float nphi = ui_normalWeight ;
+			float pphi = ui_positionWeight ;
 
 			float cum_w = 0.0f;
-			for (int stepIter = 0; stepIter < 1; stepIter++)
-			{
-				for (int i = 0; i < filterSize *filterSize; i++)
+				for (int i = 0; i < 5 * 5; i++)
 				{
-					int stepWidth = 1 << stepIter;
 					// Calculate Offseted Index
 
 					int offsetX = dev_offsetKernel[i].x;
 					int offsetY = dev_offsetKernel[i].y;
 
-					int finalValue_X = glm::clamp ((float)x + (float)offsetX * stepWidth, 0.0f, (float)cam.resolution.x - 1.0f);
-					int finalValue_Y = glm::clamp((float)y + (float)offsetY * stepWidth, 0.0f, (float)cam.resolution.y - 1.0f);
-
-						int offsetColorIdx = finalValue_Y * cam.resolution.x + finalValue_X;
-						if (offsetColorIdx >= 0 && offsetColorIdx < num_paths)
+					int finalValue_X  = x + offsetX * stepWidth;
+					int finalValue_Y = y + offsetY * stepWidth;
+					//int finalValue_X = glm::clamp ((float)x + (float)offsetX * stepWidth, 0.0f, (float)resolutionX - 1.0f);
+					//int finalValue_Y = glm::clamp((float)y + (float)offsetY * stepWidth, 0.0f, (float)resolutionY - 1.0f);
+					if (finalValue_X >= 0 && finalValue_X <= (resolutionX - 1) && finalValue_Y >= 0 && finalValue_Y <= (resolutionY - 1))
+					{
+						int offsetColorIdx = finalValue_Y * resolutionX + finalValue_X;
+						if (offsetColorIdx >= 0 && offsetColorIdx < pixelCount)
 						{
 							glm::vec3 ctmp = dev_colorImage[offsetColorIdx];
 							glm::vec3 t = cval - ctmp;
-							float dist2 = glm::dot(t, t);
+							float dist2 = glm::length(t) * glm::length(t);
 							float newVal = glm::exp(-(dist2) / cphi);
-							float c_w = glm::min(newVal, 1.f);
+							float c_w = glm::min(newVal, 1.0f);
 
 							glm::vec3 ntmp = gbuf[offsetColorIdx].normal;
 							t = nval - ntmp;
-							dist2 = glm::max(glm::dot(t, t) / (stepWidth * stepWidth), 0.f);
+							dist2 = glm::max( (glm::length(t) * glm::length(t)) / (stepWidth * stepWidth), 0.f);
 							newVal = glm::exp(-1 * (dist2) / nphi);
 							float n_w = glm::min(newVal, 1.0f);
 
 							glm::vec3 ptmp = gbuf[offsetColorIdx].position;
 							t = pval - ptmp;
-							dist2 = glm::dot(t, t);
+							dist2 = glm::length(t) * glm::length(t);
 							newVal = glm::exp(-1 * (dist2) / pphi);
-							float p_w = glm::min(newVal, 1.f);
+							float p_w = glm::min(newVal, 1.0f);
+
+
 							float weight = c_w * n_w * p_w;
 							sum += ctmp * weight * dev_gausKernel[i];
 							cum_w += weight * dev_gausKernel[i];
 
 						}
+					}
 					
 				}
-			}
+			
 			dev_TrousImage[index] = sum / cum_w;
-			//dev_TrousImage[index] = cval;
 		}
 
 	}
@@ -742,8 +743,8 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path
 //			dev_image, dev_TrousImage, cam);
 		//GenerateAtrousImage << <numBlocksPixels, blockSize1d >> > (num_paths, ui_filterSize ,dev_gausKernel, dev_offsetKernel,
 		//	dev_image, dev_TrousImage, dev_gBuffer, cam, ui_colorWeight, ui_normalWeight, ui_positionWeight);
-		 		GenerateAtrousImage2 << <blocksPerGrid2d, blockSize2d >> > (num_paths, ui_filterSize ,dev_gausKernel, dev_offsetKernel,
-		dev_image, dev_TrousImage, dev_gBuffer, cam, ui_colorWeight, ui_normalWeight, ui_positionWeight);
+		// 		GenerateAtrousImage2 << <blocksPerGrid2d, blockSize2d >> > (num_paths, ui_filterSize ,dev_gausKernel, dev_offsetKernel,
+		//dev_image, dev_TrousImage, dev_gBuffer, cam, ui_colorWeight, ui_normalWeight, ui_positionWeight);
 		///////////////////////////////////////////////////////////////////////////
 
 		// CHECKITOUT: use dev_image as reference if you want to implement saving denoised images.
@@ -770,6 +771,47 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path
 		gbufferToPBO_Position <<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, dev_gBuffer);
 	}
 
+	__global__ void GeneratePingPongImage(int pixelCount, glm::vec3* devImage, glm::vec3* pingPongImage)
+	{
+		int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+		if (index < pixelCount)
+		{
+			pingPongImage[index] = devImage[index];
+		}
+	}
+
+
+	bool DenoiseImage(int resolutionX, int resolutionY, int iteration, int filterSize, int colWeight, int norWeight, int posWeight)
+	{
+		int pixelCount = resolutionX * resolutionY;
+		const int blockSize1d = 128;
+		dim3 numblocksPathSegmentTracing = (pixelCount + blockSize1d - 1) / blockSize1d;
+
+		// 2D block for generating ray from camera
+		const dim3 blockSize2d(8, 8);
+		const dim3 blocksPerGrid2d(
+			(resolutionX + blockSize2d.x - 1) / blockSize2d.x,
+			(resolutionY + blockSize2d.y - 1) / blockSize2d.y);
+
+		int stepWidth = 1;
+		int blur_iterations = ceil(glm::log2((filterSize) / 4.f));
+		GeneratePingPongImage << < numblocksPathSegmentTracing, blockSize1d >> > (pixelCount, dev_image, dev_pingPongImage);
+		for (int i = 1; i < blur_iterations; i ++)
+		{
+
+			// 1D block for path tracing
+			GenerateAtrousImage << <numblocksPathSegmentTracing, blockSize1d >> > (pixelCount, stepWidth, dev_gausKernel, dev_offsetKernel,
+				dev_pingPongImage, dev_TrousImage, dev_gBuffer, resolutionX, resolutionY, colWeight, norWeight, posWeight);
+			stepWidth *= 2;
+			//GenerateAtrousImage2 << <blocksPerGrid2d, blockSize2d >> > (pixelCount, i,dev_gausKernel, dev_offsetKernel,
+			//	dev_pingPongImage, dev_TrousImage, dev_gBuffer, resolutionX, resolutionY, colWeight, norWeight, posWeight);
+			swap(dev_pingPongImage, dev_TrousImage);
+		}
+		swap(dev_pingPongImage, dev_TrousImage); // Getting the data back to dev_TrousImage
+		cudaDeviceSynchronize();
+		return true;
+	}
+
 	void showImage(uchar4 * pbo, int iter) {
 		const Camera& cam = hst_scene->state.camera;
 		const dim3 blockSize2d(8, 8);
@@ -790,5 +832,5 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path
 			(cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
 
 		// Send results to OpenGL buffer for rendering
-		sendImageToPBO << <blocksPerGrid2d, blockSize2d >> > (pbo, cam.resolution, iter, dev_TrousImage);
+		sendImageToPBOCopy << <blocksPerGrid2d, blockSize2d >> > (pbo, cam.resolution, iter, dev_TrousImage);
 	}
diff --git a/src/pathtrace.h b/src/pathtrace.h
index 5ae7dfc..f9a7366 100644
--- a/src/pathtrace.h
+++ b/src/pathtrace.h
@@ -3,10 +3,11 @@
 #include <vector>
 #include "scene.h"
 
-void pathtraceInit(Scene *scene, float a, float b, float c, float* gausKernel, float filterSize);
+void pathtraceInit(Scene *scene, float* gausKernel);
 void pathtraceFree();
 void pathtrace(int frame, int iteration);
 void showGBuffer(uchar4 *pbo);
 void showImage(uchar4 *pbo, int iter);
 void showDenoise(uchar4 *pbo, int iter);
 
+bool DenoiseImage(int resolutionX, int resolutionY, int iteration, int filterSize, int colWeight, int norWeight, int posWeight);
\ No newline at end of file
diff --git a/src/preview.cpp b/src/preview.cpp
index 3ca2718..994f657 100644
--- a/src/preview.cpp
+++ b/src/preview.cpp
@@ -214,7 +214,7 @@ void drawGui(int windowWidth, int windowHeight) {
 
     ImGui::Checkbox("Denoise", &ui_denoise);
 
-    ImGui::SliderInt("Filter Size", &ui_filterSize, 0, 100);
+    ImGui::SliderInt("Filter Size", &ui_filterSize, 0, 300);
     ImGui::SliderFloat("Color Weight", &ui_colorWeight, 0.0f, 10.0f);
     ImGui::SliderFloat("Normal Weight", &ui_normalWeight, 0.0f, 10.0f);
     ImGui::SliderFloat("Position Weight", &ui_positionWeight, 0.0f, 10.0f);

From b8163e48aeb0a5a607fb78b68fb54576f8a08aad Mon Sep 17 00:00:00 2001
From: Shubham Sharma <sharmashubham9800@gmail.com>
Date: Fri, 22 Oct 2021 23:04:53 -0400
Subject: [PATCH 13/14] refactored code

---
 src/pathtrace.cu | 189 ++++++++++++++++++++++-------------------------
 src/pathtrace.h  |   2 +-
 2 files changed, 90 insertions(+), 101 deletions(-)

diff --git a/src/pathtrace.cu b/src/pathtrace.cu
index 57f4ecd..c295723 100644
--- a/src/pathtrace.cu
+++ b/src/pathtrace.cu
@@ -16,6 +16,14 @@
 
 #define ERRORCHECK 1
 
+
+//Flags for Denoising
+#define SimpleGaussianDenoise 0
+#define ATrous1 1
+#define ATrous2 0
+
+
+
 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
 #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
 void checkCUDAErrorFn(const char* msg, const char* file, int line) {
@@ -77,29 +85,6 @@ __global__ void sendImageToPBO(uchar4* pbo, glm::ivec2 resolution,
 	}
 }
 
-//Kernel that writes the image to the OpenGL PBO directly.
-__global__ void sendImageToPBOCopy(uchar4* pbo, glm::ivec2 resolution,
-	int iter, glm::vec3* image) {
-	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-	int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-
-	if (x < resolution.x && y < resolution.y) {
-		int index = x + (y * resolution.x);
-		glm::vec3 pix = image[index];
-
-		glm::ivec3 color;
-		color.x = glm::clamp((int)((pix.x / iter) * 255.0), 0, 255);
-		color.y = glm::clamp((int)((pix.y / iter) * 255.0), 0, 255);
-		color.z = glm::clamp((int)((pix.z / iter) * 255.0), 0, 255);
-
-		// Each thread writes one pixel location in the texture (textel)
-		pbo[index].w = 0;
-		pbo[index].x = color.x;
-		pbo[index].y = color.y;
-		pbo[index].z = color.z;
-	}
-}
-
 __global__ void gbufferToPBO(uchar4* pbo, glm::ivec2 resolution, GBufferPixel* gBuffer) {
 	int x = (blockIdx.x * blockDim.x) + threadIdx.x;
 	int y = (blockIdx.y * blockDim.y) + threadIdx.y;
@@ -272,35 +257,35 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path
 
 
 	__global__ void GenerateGaussianBlur(
-		int num_paths, int filterSize,
+		int pixelCount, int stepWidth,
 		float* dev_gausKernel, glm::vec2 *dev_offsetKernel,
 		glm::vec3* dev_colorImage, glm::vec3 *dev_TrousImage,
-		const Camera cam
+		int resolutionX, int resolutionY
 	)
 	{
 
 		int index = blockIdx.x * blockDim.x + threadIdx.x;
 
-		if (index < num_paths)
+		if (index < pixelCount)
 		{
 			glm::vec3 currColor =glm::vec3(0.0f);
 			//glm::vec3 currColor = dev_colorImage[index];
-			for (int i = 0; i < filterSize ; i++)
+			for (int i = 0; i < 25 ; i++)
 			{
-				int index2D_y = index / cam.resolution.x;
-				int index2D_x = (int)(index % cam.resolution.x);
+				int index2D_y = index / resolutionX;
+				int index2D_x = (int)(index % resolutionX);
 
 				int offsetX = dev_offsetKernel[i].x;
 				int offsetY = dev_offsetKernel[i].y;
 
-				int finalValue_X = index2D_x + offsetX;
-				int finalValue_Y = index2D_y + offsetY;
+				int finalValue_X = index2D_x + offsetX * stepWidth;
+				int finalValue_Y = index2D_y + offsetY * stepWidth;
 
-				if (finalValue_X >= 0 && finalValue_X <= (cam.resolution.x - 1) && finalValue_Y >= 0 && finalValue_Y <= (cam.resolution.y - 1))
+				if (finalValue_X >= 0 && finalValue_X <= (resolutionX - 1) && finalValue_Y >= 0 && finalValue_Y <= (resolutionY - 1))
 				{
 					float gausValue = dev_gausKernel[i];
-					int offsetColorIdx = finalValue_Y * cam.resolution.x + finalValue_X;
-					if (offsetColorIdx >= 0 && offsetColorIdx < num_paths)
+					int offsetColorIdx = finalValue_Y * resolutionX + finalValue_X;
+					if (offsetColorIdx >= 0 && offsetColorIdx < pixelCount)
 					{
 						glm::vec3 newColor = dev_colorImage[offsetColorIdx];
 						currColor += newColor * dev_gausKernel[i];
@@ -312,46 +297,22 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path
 
 	}
 
-	__global__ void GenerateGaussianBlur2(
-		int num_paths, int filterSize,
-		float* dev_gausKernel, glm::vec2* dev_offsetKernel,
-		glm::vec3* dev_colorImage, glm::vec3* dev_TrousImage,
-		const Camera cam
-	)
-	{
-
-		int x = (blockIdx.x * blockDim.x) + threadIdx.x;
-		int y = (blockIdx.y * blockDim.y) + threadIdx.y;
-
-		if (x < cam.resolution.x && y < cam.resolution.y)
-		{
-			int index = x + (y * cam.resolution.x);
-			glm::vec3 color = dev_colorImage[index];
-			glm::vec3 currColor = glm::vec3(0.0f);
-			//glm::vec3 currColor = dev_colorImage[index];
-			for (int i = 0; i < filterSize * filterSize; i++)
-			{
-				int offsetX = dev_offsetKernel[i].x;
-				int offsetY = dev_offsetKernel[i].y;
-				int finalValue_X = x + offsetX;
-				int finalValue_Y = y + offsetY;
-
-				finalValue_X = glm::clamp(finalValue_X, 0, cam.resolution.x - 1);
-				finalValue_Y = glm::clamp(finalValue_Y, 0, cam.resolution.y - 1);
-					float gausValue = dev_gausKernel[i];
-					int offsetColorIdx = finalValue_Y * cam.resolution.x + finalValue_X;
-				/*	if (offsetColorIdx >= 0 && offsetColorIdx < num_paths)
-					{*/
-						glm::vec3 newColor = dev_colorImage[offsetColorIdx];
-						currColor += newColor * dev_gausKernel[i];
-					/*}*/
-				
-			}
-			dev_TrousImage[index] = currColor;
-		}
-
-	}
-
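+	/// <summary>
+	/// Edge-avoiding A-Trous filter pass, one thread per pixel; the 1D thread index is converted to 2D pixel coordinates.
+	/// </summary>
+	/// <param name="pixelCount">Number of pixels; threads with index >= pixelCount do nothing.</param>
+	/// <param name="stepWidth">Multiplier applied to each kernel offset for this pass.</param>
+	/// <param name="dev_gausKernel">25 filter weights, one per tap.</param>
+	/// <param name="dev_offsetKernel">25 (x, y) pixel offsets, one per tap.</param>
+	/// <param name="dev_colorImage">Input color image.</param>
+	/// <param name="dev_TrousImage">Output image; receives sum / cum_w, or the input color when cum_w is 0.</param>
+	/// <param name="gbuf">Per-pixel G-buffer supplying normal and position.</param>
+	/// <param name="resolutionX">Image width in pixels (row stride used to split the 1D index).</param>
+	/// <param name="resolutionY">Image height in pixels.</param>
+	/// <param name="ui_colorWeight">Color weight; sets the falloff (cphi) used by the color term.</param>
+	/// <param name="ui_normalWeight">Normal weight. NOTE(review): the normal term currently divides by cphi, not nphi - confirm.</param>
+	/// <param name="ui_positionWeight">Position weight. NOTE(review): the position term currently divides by cphi, not pphi - confirm.</param>
+	/// <returns>Nothing; the filtered image is written to dev_TrousImage.</returns>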
 	__global__ void GenerateAtrousImage(
 		int pixelCount, int stepWidth,
 		float* dev_gausKernel, glm::vec2* dev_offsetKernel,
@@ -386,10 +347,6 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path
 
 					int finalValue_X = index2D_x + offsetX * stepWidth; // Final Offset Values
 					int finalValue_Y = index2D_y + offsetY * stepWidth; // Final Offset Values
-
-					//finalValue_X = glm::clamp((float)finalValue_X, 0.0f, (float)resolutionX - 1.0f);
-					//finalValue_Y = glm::clamp((float)finalValue_Y, 0.0f, (float)resolutionY - 1.0f);
-
 					if (finalValue_X >= 0 && finalValue_X <= (resolutionX - 1) && finalValue_Y >= 0 && finalValue_Y <= (resolutionY - 1))
 					{
 						int offsetColorIdx = finalValue_Y * resolutionX + finalValue_X;
@@ -398,19 +355,16 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path
 							glm::vec3 ctmp = dev_colorImage[offsetColorIdx];
 							glm::vec3 t = cval - ctmp;
 							float dist2 = glm::dot(t, t);
-							float newVal = glm::exp(-1 * (dist2) / cphi);
 							float c_w = glm::min(glm::exp(-(dist2) / cphi), 1.0f);
 
 							glm::vec3 ntmp = gbuf[offsetColorIdx].normal;
 							t = nval - ntmp;
 							dist2 = glm::max(glm::dot(t, t)/ (stepWidth * stepWidth), 0.0f);
-							newVal = glm::exp(-1 * (dist2) / nphi );
 							float n_w = glm::min(glm::exp(-(dist2) / cphi), 1.0f);
 
 							glm::vec3 ptmp = gbuf[offsetColorIdx].position;
 							t = pval - ptmp;
 							dist2 = glm::dot(t, t);
-							newVal = glm::exp(-1 * (dist2) / pphi);
 							float p_w = glm::min(glm::exp(-(dist2) / cphi), 1.0f);
 							float weight = c_w * n_w * p_w;
 
@@ -420,13 +374,35 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path
 						}
 					}
 				}
-			
+				if (cum_w == 0.f)
+				{
+					dev_TrousImage[index] = cval;
+					return;
+				}
+				
 				dev_TrousImage[index] = sum / cum_w;
+				
 		}
 
 	}
-	
-	__global__ void GenerateAtrousImage2(
+	  
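+	/// <summary>
+	/// Edge-avoiding A-Trous filter pass for a 2D launch: each thread takes its pixel (x, y) directly from the 2D block/thread index.
+	/// </summary>
+	/// <param name="pixelCount">Number of pixels; bounds the flattened neighbor index.</param>
+	/// <param name="stepWidth">Multiplier applied to each kernel offset for this pass.</param>
+	/// <param name="dev_gausKernel">25 filter weights, one per tap.</param>
+	/// <param name="dev_offsetKernel">25 (x, y) pixel offsets, one per tap.</param>
+	/// <param name="dev_colorImage">Input color image.</param>
+	/// <param name="dev_TrousImage">Output image; receives sum / cum_w.</param>
+	/// <param name="gbuf">Per-pixel G-buffer supplying normal and position.</param>
+	/// <param name="resolutionX">Image width in pixels; threads with x >= resolutionX do nothing.</param>
+	/// <param name="resolutionY">Image height in pixels; threads with y >= resolutionY do nothing.</param>
+	/// <param name="ui_colorWeight">Divisor of the squared color distance in the color weight (used as-is, not squared).</param>
+	/// <param name="ui_normalWeight">Divisor of the squared normal distance in the normal weight (used as-is, not squared).</param>
+	/// <param name="ui_positionWeight">Divisor of the squared position distance in the position weight (used as-is, not squared).</param>
+	/// <returns>Nothing; the filtered image is written to dev_TrousImage.</returns>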
+	__global__ void GenerateAtrousImageGeneral(
 		int pixelCount, int stepWidth,
 		float* dev_gausKernel, glm::vec2* dev_offsetKernel,
 		glm::vec3* dev_colorImage, glm::vec3* dev_TrousImage,
@@ -460,8 +436,6 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path
 
 					int finalValue_X  = x + offsetX * stepWidth;
 					int finalValue_Y = y + offsetY * stepWidth;
-					//int finalValue_X = glm::clamp ((float)x + (float)offsetX * stepWidth, 0.0f, (float)resolutionX - 1.0f);
-					//int finalValue_Y = glm::clamp((float)y + (float)offsetY * stepWidth, 0.0f, (float)resolutionY - 1.0f);
 					if (finalValue_X >= 0 && finalValue_X <= (resolutionX - 1) && finalValue_Y >= 0 && finalValue_Y <= (resolutionY - 1))
 					{
 						int offsetColorIdx = finalValue_Y * resolutionX + finalValue_X;
@@ -739,12 +713,6 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path
 		// Assemble this iteration and apply it to the image
 		dim3 numBlocksPixels = (pixelcount + blockSize1d - 1) / blockSize1d;
 		finalGather << <numBlocksPixels, blockSize1d >> > (num_paths, dev_image, dev_paths);
-//		GenerateGaussianBlur2 << <blocksPerGrid2d, blockSize2d >> > (num_paths, ui_filterSize,dev_gausKernel, dev_offsetKernel,
-//			dev_image, dev_TrousImage, cam);
-		//GenerateAtrousImage << <numBlocksPixels, blockSize1d >> > (num_paths, ui_filterSize ,dev_gausKernel, dev_offsetKernel,
-		//	dev_image, dev_TrousImage, dev_gBuffer, cam, ui_colorWeight, ui_normalWeight, ui_positionWeight);
-		// 		GenerateAtrousImage2 << <blocksPerGrid2d, blockSize2d >> > (num_paths, ui_filterSize ,dev_gausKernel, dev_offsetKernel,
-		//dev_image, dev_TrousImage, dev_gBuffer, cam, ui_colorWeight, ui_normalWeight, ui_positionWeight);
 		///////////////////////////////////////////////////////////////////////////
 
 		// CHECKITOUT: use dev_image as reference if you want to implement saving denoised images.
@@ -771,7 +739,7 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path
 		gbufferToPBO_Position <<<blocksPerGrid2d, blockSize2d>>>(pbo, cam.resolution, dev_gBuffer);
 	}
 
-	__global__ void GeneratePingPongImage(int pixelCount, glm::vec3* devImage, glm::vec3* pingPongImage)
+	__global__ void GeneratePingPongImage(int pixelCount, glm::vec3* devImage, glm::vec3* pingPongImage, int iter)
 	{
 		int index = (blockIdx.x * blockDim.x) + threadIdx.x;
 		if (index < pixelCount)
@@ -781,30 +749,51 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path
 	}
 
 
-	bool DenoiseImage(int resolutionX, int resolutionY, int iteration, int filterSize, int colWeight, int norWeight, int posWeight)
+	bool DenoiseImage(int resolutionX, int resolutionY, int iteration, int filterSize, float colWeight, float norWeight, float posWeight)
 	{
 		int pixelCount = resolutionX * resolutionY;
 		const int blockSize1d = 128;
 		dim3 numblocksPathSegmentTracing = (pixelCount + blockSize1d - 1) / blockSize1d;
 
-		// 2D block for generating ray from camera
+	
+
 		const dim3 blockSize2d(8, 8);
 		const dim3 blocksPerGrid2d(
 			(resolutionX + blockSize2d.x - 1) / blockSize2d.x,
 			(resolutionY + blockSize2d.y - 1) / blockSize2d.y);
 
 		int stepWidth = 1;
-		int blur_iterations = ceil(glm::log2((filterSize) / 4.f));
-		GeneratePingPongImage << < numblocksPathSegmentTracing, blockSize1d >> > (pixelCount, dev_image, dev_pingPongImage);
+		//int blur_iterations = ceil(glm::log2( (filterSize/25.f) * (filterSize / 25.f)) ); // This wavelet scaling is the correct option though
+		int blur_iterations = ceil(glm::log2( (filterSize* filterSize) / 25.f) ); // This wavelet scaling worked best for me
+		blur_iterations = blur_iterations <= 0 ? 1 : blur_iterations;
+		float colorWeight = colWeight; // must stay float: an int copy truncated sub-1 weights to 0 and turned the per-pass "/= 2" into integer division
+
+		GeneratePingPongImage << < numblocksPathSegmentTracing, blockSize1d >> > (pixelCount, dev_image, dev_pingPongImage, iteration);
 		for (int i = 1; i < blur_iterations; i ++)
 		{
 
+#if SimpleGaussianDenoise
+
+
+			GenerateGaussianBlur <<< numblocksPathSegmentTracing, blockSize1d >> > (pixelCount, stepWidth, dev_gausKernel, dev_offsetKernel,
+				dev_pingPongImage, dev_TrousImage, resolutionX, resolutionY);
+
+#endif
+
+#if ATrous1
 			// 1D block for path tracing
 			GenerateAtrousImage << <numblocksPathSegmentTracing, blockSize1d >> > (pixelCount, stepWidth, dev_gausKernel, dev_offsetKernel,
+				dev_pingPongImage, dev_TrousImage, dev_gBuffer, resolutionX, resolutionY, colorWeight, norWeight, posWeight);
+#endif
+
+
+#if ATrous2
+			GenerateAtrousImageGeneral << <blocksPerGrid2d, blockSize2d >> > (pixelCount, stepWidth, dev_gausKernel, dev_offsetKernel, // 2D-launch variant (was GenerateAtrousImage2)
 				dev_pingPongImage, dev_TrousImage, dev_gBuffer, resolutionX, resolutionY, colWeight, norWeight, posWeight);
+#endif
+
 			stepWidth *= 2;
-			//GenerateAtrousImage2 << <blocksPerGrid2d, blockSize2d >> > (pixelCount, i,dev_gausKernel, dev_offsetKernel,
-			//	dev_pingPongImage, dev_TrousImage, dev_gBuffer, resolutionX, resolutionY, colWeight, norWeight, posWeight);
+			colorWeight /= 2;
 			swap(dev_pingPongImage, dev_TrousImage);
 		}
 		swap(dev_pingPongImage, dev_TrousImage); // Getting the data back to dev_TrousImage
@@ -832,5 +821,5 @@ __global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, Path
 			(cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y);
 
 		// Send results to OpenGL buffer for rendering
-		sendImageToPBOCopy << <blocksPerGrid2d, blockSize2d >> > (pbo, cam.resolution, iter, dev_TrousImage);
+		sendImageToPBO << <blocksPerGrid2d, blockSize2d >> > (pbo, cam.resolution, iter, dev_TrousImage);
 	}
diff --git a/src/pathtrace.h b/src/pathtrace.h
index f9a7366..a2791df 100644
--- a/src/pathtrace.h
+++ b/src/pathtrace.h
@@ -10,4 +10,4 @@ void showGBuffer(uchar4 *pbo);
 void showImage(uchar4 *pbo, int iter);
 void showDenoise(uchar4 *pbo, int iter);
 
-bool DenoiseImage(int resolutionX, int resolutionY, int iteration, int filterSize, int colWeight, int norWeight, int posWeight);
\ No newline at end of file
+bool DenoiseImage(int resolutionX, int resolutionY, int iteration, int filterSize, float colWeight, float norWeight, float posWeight);
\ No newline at end of file

From 2f5b4c9f5b317dd35ccf2e5ea4832e6e21aa0099 Mon Sep 17 00:00:00 2001
From: Shubham Sharma <sharmashubham9800@gmail.com>
Date: Sun, 24 Oct 2021 01:55:28 -0400
Subject: [PATCH 14/14] Added Timer

---
 CMakeLists.txt |   2 +
 src/common.cu  |  40 +++++++++++++++
 src/common.h   | 132 +++++++++++++++++++++++++++++++++++++++++++++++++
 src/main.cpp   |  44 ++++++++++++++++-
 4 files changed, 217 insertions(+), 1 deletion(-)
 create mode 100644 src/common.cu
 create mode 100644 src/common.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 162568b..ca9eb84 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -73,6 +73,7 @@ set(headers
     src/sceneStructs.h
     src/preview.h
     src/utilities.h
+src/common.h
     )
 
 set(sources
@@ -84,6 +85,7 @@ set(sources
     src/scene.cpp
     src/preview.cpp
     src/utilities.cpp
+src/common.cu
     )
 
 set(imgui
diff --git a/src/common.cu b/src/common.cu
new file mode 100644
index 0000000..7a83529
--- /dev/null
+++ b/src/common.cu
@@ -0,0 +1,40 @@
+#include "common.h"
+
+
+namespace StreamCompaction {
+    namespace Common {
+
+        /**
+         * Maps idata to 0/1 flags for stream compaction (0 = remove, 1 = keep): each of the first n
+         * elements yields 1 if it is nonzero, else 0, and that flag is written to both bools1 and bools2.
+         */
+        __global__ void kernMapToBoolean(int n, int* bools1, int* bools2, const int* idata) {
+            // One thread per element; threads with index >= n exit without writing.
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+
+            int result = idata[index] != 0;
+            bools1[index] = result;
+            bools2[index] = result;
+        }
+
+        /**
+         * Performs scatter on an array. That is, for each of the first n elements in idata,
+         * if bools[idx] is nonzero, it copies idata[idx] to odata[indices[idx]].
+         */
+        __global__ void kernScatter(int n, int* odata,
+            const int* idata, const int* bools, const int* indices) {
+            // One thread per element; threads with index >= n exit without writing.
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) {
+                return;
+            }
+
+            if (bools[index]) {
+                odata[indices[index]] = idata[index];
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/common.h b/src/common.h
new file mode 100644
index 0000000..d2c1fed
--- /dev/null
+++ b/src/common.h
@@ -0,0 +1,132 @@
+#pragma once
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <cstdio>
+#include <cstring>
+#include <cmath>
+#include <algorithm>
+#include <chrono>
+#include <stdexcept>
+
+#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
+#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
+
+/**
+ * Check for CUDA errors; print and exit if there was a problem.
+ */
+void checkCUDAErrorFn(const char *msg, const char *file = NULL, int line = -1);
+
+// floor(log2(x)) for x >= 1; returns 0 for x <= 0 (a signed shift of a negative value typically never reaches 0, which used to hang).
+inline int ilog2(int x) {
+    int lg = 0;
+    for (unsigned v = (x > 0) ? static_cast<unsigned>(x) : 0u; v >>= 1;) ++lg;
+    return lg;
+}
+
+// ceil(log2(x)) for x >= 1; returns 0 for x <= 1 so non-positive input cannot hang or overflow in x - 1.
+inline int ilog2ceil(int x) {
+    return x <= 1 ? 0 : ilog2(x - 1) + 1;
+}
+
+namespace StreamCompaction {
+    namespace Common {
+        __global__ void kernMapToBoolean(int n, int *bools1, int *bools2, const int *idata); // matches the two-output definition in common.cu
+
+        __global__ void kernScatter(int n, int *odata,
+                const int *idata, const int *bools, const int *indices);
+
+        /**
+        * Stopwatch with a CPU (std::chrono) timer and a GPU (cudaEvent) timer; each end call
+        * stores the elapsed milliseconds of its start/end pair. Uncopyable and unmovable.
+        * Starting a running timer or ending a stopped one throws std::runtime_error.
+        * Adapted from WindyDarian(https://github.com/WindyDarian)
+        */
+        class PerformanceTimer
+        {
+        public:
+            PerformanceTimer()
+            {
+                cudaEventCreate(&event_start);
+                cudaEventCreate(&event_end);
+            }
+
+            ~PerformanceTimer()
+            {
+                cudaEventDestroy(event_start);
+                cudaEventDestroy(event_end);
+            }
+
+            void startCpuTimer()
+            {
+                if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); }
+                cpu_timer_started = true;
+
+                time_start_cpu = std::chrono::high_resolution_clock::now();
+            }
+
+            void endCpuTimer()
+            {
+                time_end_cpu = std::chrono::high_resolution_clock::now();
+
+                if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); }
+
+                std::chrono::duration<double, std::milli> duro = time_end_cpu - time_start_cpu;
+                prev_elapsed_time_cpu_milliseconds =
+                    static_cast<decltype(prev_elapsed_time_cpu_milliseconds)>(duro.count());
+
+                cpu_timer_started = false;
+            }
+
+            void startGpuTimer()
+            {
+                if (gpu_timer_started) { throw std::runtime_error("GPU timer already started"); }
+                gpu_timer_started = true;
+
+                cudaEventRecord(event_start);
+            }
+
+            void endGpuTimer()
+            {
+                cudaEventRecord(event_end);
+                cudaEventSynchronize(event_end);
+
+                if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); }
+
+                cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end);
+                gpu_timer_started = false;
+            }
+
+            float getCpuElapsedTimeForPreviousOperation() // milliseconds stored by the last endCpuTimer() (0 before the first one)
+            {
+                return prev_elapsed_time_cpu_milliseconds;
+            }
+
+            float getGpuElapsedTimeForPreviousOperation() // milliseconds stored by the last endGpuTimer() (0 before the first one)
+            {
+                return prev_elapsed_time_gpu_milliseconds;
+            }
+
+            // Copy and move are deleted, so an instance cannot be copied or moved.
+            PerformanceTimer(const PerformanceTimer&) = delete;
+            PerformanceTimer(PerformanceTimer&&) = delete;
+            PerformanceTimer& operator=(const PerformanceTimer&) = delete;
+            PerformanceTimer& operator=(PerformanceTimer&&) = delete;
+
+        private:
+            cudaEvent_t event_start = nullptr;
+            cudaEvent_t event_end = nullptr;
+
+            using time_point_t = std::chrono::high_resolution_clock::time_point;
+            time_point_t time_start_cpu;
+            time_point_t time_end_cpu;
+
+            bool cpu_timer_started = false;
+            bool gpu_timer_started = false;
+
+            float prev_elapsed_time_cpu_milliseconds = 0.f;
+            float prev_elapsed_time_gpu_milliseconds = 0.f;
+        };
+    }
+}
diff --git a/src/main.cpp b/src/main.cpp
index d25d5cb..ce6124c 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -5,7 +5,7 @@
 #include "../imgui/imgui.h"
 #include "../imgui/imgui_impl_glfw.h"
 #include "../imgui/imgui_impl_opengl3.h"
-
+#include "common.h"
 static std::string startTimeString;
 
 // For camera controls
@@ -46,6 +46,18 @@ int iteration;
 int width;
 int height;
 
+static float timePT; // path-trace time summed over iterations (ms); runCuda resets it when iteration == 1
+static float timeAT; // denoise time (ms); runCuda resets it to 0 before each timed denoise call
+static bool  hasPrinted; // NOTE(review): only read (gates the denoise timer start) in the visible code; never assigned
+using StreamCompaction::Common::PerformanceTimer;
+#define TIMER 1 // set to 0 to compile out the timing code guarded by #if TIMER
+
+PerformanceTimer& timer() // Returns the single PerformanceTimer used by the timing code in this file.
+{
+    static PerformanceTimer shared_timer; // function-local static: constructed on first call, then reused
+    return shared_timer;
+}
+
 void FilterCreation(int filter_size, float *kernel)
 {
     // initialising standard deviation to 1.0
@@ -189,9 +201,26 @@ void runCuda() {
     if (iteration < ui_iterations) {
         iteration++;
 
+#if TIMER
+        // Start Timer
+        if (iteration == 1)
+        {
+            timePT = 0.f;
+        }
+        timer().startCpuTimer();
+#endif // TIMER
+
         // execute the kernel
         int frame = 0;
         pathtrace(frame, iteration); 
+
+#if TIMER
+        timer().endCpuTimer();
+        timePT += timer().getCpuElapsedTimeForPreviousOperation();
+        if (iteration == ui_iterations) {
+            std::cout << "Path-trace time for " << iteration << " iterations: " << timePT << "ms" << std::endl;
+        }
+#endif // TIMER
     }
 
     if (ui_showGbuffer) {
@@ -201,8 +230,21 @@ void runCuda() {
     {
         if (!imageDenoised)
         {
+#if TIMER
+            // Start Timer
+            timeAT = 0.f;
+            if (!hasPrinted) {
+                timer().startCpuTimer();
+            }
+#endif // TIMER
             imageDenoised = DenoiseImage(renderState->camera.resolution.x, renderState->camera.resolution.y, iteration, ui_filterSize,
                 ui_colorWeight, ui_normalWeight, ui_positionWeight);
+
+#if TIMER
+                timer().endCpuTimer();
+                timeAT += timer().getCpuElapsedTimeForPreviousOperation();
+                std::cout << "Denoise time for " << iteration << " iterations: " << timeAT << "ms\n\n" << std::endl;
+#endif // TIMER
         }
         showDenoise(pbo_dptr, iteration);
     }