diff --git a/CMakeLists.txt b/CMakeLists.txt index 162568b..1aa258e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -73,6 +73,7 @@ set(headers src/sceneStructs.h src/preview.h src/utilities.h + stream_compaction/common.h ) set(sources @@ -84,6 +85,7 @@ set(sources src/scene.cpp src/preview.cpp src/utilities.cpp + stream_compaction/common.cu ) set(imgui diff --git a/README.md b/README.md index f044c82..aecf29c 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,240 @@ -CUDA Denoiser For CUDA Path Tracer -================================== +Project 4 CUDA A Trous Denoiser +====================== **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 4** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Raymond Yang + * [LinkedIn](https://www.linkedin.com/in/raymond-yang-b85b19168) + * Tested on: + * 10/22/2021 + * Windows 10 + * NVIDIA GeForce GTX 1080 Ti. + * Submitted on: 10/22/2021 -### (TODO: Your README) -*DO NOT* leave the README to the last minute! It is a crucial part of the -project, and we will not be able to grade you without a good README. +## Introduction +The objective of this project was to implement A Trous denoiser detailed in this [paper](https://jo.dreggn.org/home/2010_atrous.pdf). To construct the denoised image, we modulate a noisy path traced image (represented as a color buffer) with a geometry buffer/G buffer (represented as an array of positions and an array of normals). Both elements of the G buffer are cahced on first bounce and only done once per render. +### Components of A Trous Denoiser +Scene Colors: +

+ drawing +

+ +Scene Positions (relative to camera origin): +

+ drawing +

+ +Scene Normals: +

+ drawing +

+ +Scene Time to Intersect (unused): +

+ drawing +

+ +### Denoised Output +

+ drawing +

+ +## Performance Analysis +In this section, we compare the runtime costs of A Trous denoising, recorded in microseconds. For Simple Scene, we will use `scene/cornell_ceiling_light.txt`. For Slightly Less Simple Scene, we will use `scene/demo.txt`. For the purposes of consistency between runs, the following values will be hardcoded to the window unless otherwise specified: +* `Filter Size = 80` +* `Color Weight = 8.f` +* `Normal Weight = 1.f` +* `Position Weight = 1.5f` + +### Simple Scene +Baseline image for reference at 1000 iterations: +

+ drawing +

+Path trace time for 1000 iterations: 33679.50 ms +Denoise time for 1000 iterations: 62.97 ms + +Path traced image after 10 iterations: +

+ drawing +

+ +Path traced image after 10 iterations with A Trous denoiser: +

+ drawing +

+ +#### Cost of Running Denoiser vs Path Tracer +| Trial | 1 | 2 | 3 | 4 | 5 | +|-----------------|----------|----------|----------|----------|----------| +| Path Trace Time | 326.05 | 338.88 | 336.49 | 345.53 | 338.01 | +| Denoise Time | 62.95 | 64.02 | 62.62 | 64.47 | 64.56 | + +Lower is better: +

+ drawing +

+ +#### Cost of Running Denoiser vs Resolution +| Resolution | 256x256 | 512x512 | 1024x1024 | +|------------|-----------|-----------|-----------| +| Time (ms) | 4.76 | 20.41 | 116.61 | + +Lower is better: +

+ drawing +

+ +Lower is better: +

+ drawing +

+* The increase in time is proportional to the increase in pixelcount for smaller resolutions. + +#### Cost of Running Denoiser vs Filter Size +| Filter Size | 25 | 50 | 100 | +|-------------|-------|-------|-------| +| Time (ms) | 45.66 | 60.41 | 65.53 | + +Lower is better: +

+ drawing +

+ +* Note that the increase in time is not substanctial indicating that I am likely calculating the number of iterations required for A Trous denoise incorrectly. + +### Slightly Less Simple Scene +Baseline image for reference at 1000 iterations: +

+ drawing +

+Path trace time for 1000 iterations: 237650.00 ms +Denoise time for 1000 iterations: 86.21 ms + +Path traced image after 10 iterations: +

+ drawing +

+ +Path traced image after 10 iterations with A Trous denoiser: +

+ drawing +

+ +#### Cost of Running Denoiser vs Path Tracer +| Trial | 1 | 2 | 3 | 4 | 5 | +|-----------------|-----------|-----------|-----------|-----------|-----------| +| Path Trace Time | 2431.57 | 2365.28 | 2374.86 | 2480.54 | 2387.57 | +| Denoise Time | 86.39 | 87.75 | 87.64 | 87.86 | 85.97 | + +Lower is better: +

+ drawing +

+ +#### Cost of Running Denoiser vs Resolution +| Resolution | 256x256 | 512x512 | 1024x1024 | +|------------|-----------|-----------|-----------| +| Time (ms) | 6.61 | 30.41 | 134.44 | + +Lower is better: +

+ drawing +

+ +Lower is better: +

+ drawing +

+* The increase in time is proportional to the increase in pixelcount for tested resolutions. + +#### Cost of Running Denoiser vs Filter Size +| Filter Size | 25 | 50 | 100 | +|-------------|-------|-------|-------| +| Time (ms) | 64.50 | 82.70 | 91.55 | + +Lower is better: +

+ drawing +

+ +* Note that the increase in time is not substanctial indicating that I am likely calculating the number of iterations required for A Trous denoise incorrectly. + +## Visual Analysis +In this section, we compare visual effects of A Trous denoising. + +### Visual Results vs Filter Size +Filter size is the two dimensional area around a pixel. That is, for a filter size of 25 and a pixel located at `(i,j)` of the image, the surrounding `5x5` grid centered around the pixel are factored into the weighting A Trous denoiser. +* Since I am using a gaussian weighting spread of `5x5`, filter sizes below 25 are not considered because the area is smaller than the gaussian spread. +* Filter sizes in `[25, 81)`, are mapped `1:1` to the gaussian spread. Filter sizes in `[81, 100]` are mapped 2:1 to the gaussian spread. That is, the gaussian weight at `(-2, -2)` relative to the pixel will actually be the pixel located at `(-4, -4)` relative to the pixel. Likewise, `(1, 1)` is mapped to `(2, 2)`. +* NOTE: that this assumes I understand and implemented filter size correction. Given the observations above (runtime for a filter size of 50 is greater than runtime for a filter size of 25), it is very likely I have not implemented filter size correctly. + +`filterSize = 25`: +

+ drawing +

+ +`filterSize = 50`: +

+ drawing +

+ +`filterSize = 75`: +

+ drawing +

+ +`filterSize = 100`: +

+ drawing +

+ +The only substantive change seems to be between `filterSize = 25` and `filterSize = 50`. + +### Visual Results vs Material +From left to right, the spheres are of properties reflective, refractive (with index of 1.5), and diffuse. Scene can be found in file `/scenes/cornell_ceiling_light_plus.txt`. + +1000 iterations: +

+ drawing +

+ +10 iterations: +

+ drawing +

+ +10 iterations with A Trous diffuse: +

+ drawing +

+ +With my implementation, A Trous denoiser fails to adequately simulate refractive properties. It is moderately capable of simulating diffuse properties. It is capable of simulating reflective properties. + +### Visual Results vs Lighting +Small light uses scene: `/scenes/cornell.txt` +Large light uses scene: `/scenes/cornell_ceiling_light.txt` + +Cornell Box with small ceiling light: +

+ drawing +

+ +Cornell Box with large ceiling light: +

+ drawing +

+ +Cornell Box with small ceiling light and A Trous denoise: +

+ drawing +

+ +Cornell Box with large ceiling light and A Trous denoise: +

+ drawing +

+ +Given poorer lighting conditions, more "holes" are left in the image. As the A Trous denoiser interpolates neighboring colors for each pixel, it emphasizes darker colors leading to poorer results. \ No newline at end of file diff --git a/img/a0.PNG b/img/a0.PNG new file mode 100644 index 0000000..84c4ddd Binary files /dev/null and b/img/a0.PNG differ diff --git a/img/a1.PNG b/img/a1.PNG new file mode 100644 index 0000000..bb2e760 Binary files /dev/null and b/img/a1.PNG differ diff --git a/img/a2.PNG b/img/a2.PNG new file mode 100644 index 0000000..3190dbf Binary files /dev/null and b/img/a2.PNG differ diff --git a/img/a3.PNG b/img/a3.PNG new file mode 100644 index 0000000..81df20b Binary files /dev/null and b/img/a3.PNG differ diff --git a/img/a4.PNG b/img/a4.PNG new file mode 100644 index 0000000..f2f05c9 Binary files /dev/null and b/img/a4.PNG differ diff --git a/img/b0.PNG b/img/b0.PNG new file mode 100644 index 0000000..4c55315 Binary files /dev/null and b/img/b0.PNG differ diff --git a/img/b1.PNG b/img/b1.PNG new file mode 100644 index 0000000..ff9cf86 Binary files /dev/null and b/img/b1.PNG differ diff --git a/img/b2.PNG b/img/b2.PNG new file mode 100644 index 0000000..4638966 Binary files /dev/null and b/img/b2.PNG differ diff --git a/img/c0.PNG b/img/c0.PNG new file mode 100644 index 0000000..be94eec Binary files /dev/null and b/img/c0.PNG differ diff --git a/img/c1.PNG b/img/c1.PNG new file mode 100644 index 0000000..a142f83 Binary files /dev/null and b/img/c1.PNG differ diff --git a/img/c2.PNG b/img/c2.PNG new file mode 100644 index 0000000..76fc9f6 Binary files /dev/null and b/img/c2.PNG differ diff --git a/img/c3.PNG b/img/c3.PNG new file mode 100644 index 0000000..ed9b7d1 Binary files /dev/null and b/img/c3.PNG differ diff --git a/img/d0.PNG b/img/d0.PNG new file mode 100644 index 0000000..4022ab2 Binary files /dev/null and b/img/d0.PNG differ diff --git a/img/d1.PNG b/img/d1.PNG new file mode 100644 index 0000000..dc9d94c Binary files /dev/null and b/img/d1.PNG differ diff --git a/img/d2.PNG b/img/d2.PNG new file mode 100644 index 0000000..79a2eb6 Binary files /dev/null and b/img/d2.PNG differ diff --git a/img/e0.PNG b/img/e0.PNG new file mode 100644 index 0000000..128a81e Binary files /dev/null and b/img/e0.PNG differ diff --git a/img/e1.PNG b/img/e1.PNG new file mode 100644 index 0000000..fb17cfc Binary files /dev/null and b/img/e1.PNG differ diff --git a/img/e2.PNG b/img/e2.PNG new file mode 100644 index 0000000..7e122f5 Binary files /dev/null and b/img/e2.PNG differ diff --git a/img/e3.PNG b/img/e3.PNG new file mode 100644 index 0000000..0dd71e8 Binary files /dev/null and b/img/e3.PNG differ diff --git a/img/f0.PNG b/img/f0.PNG new file mode 100644 index 0000000..a2b7913 Binary files /dev/null and b/img/f0.PNG differ diff --git a/img/f1.PNG b/img/f1.PNG new file mode 100644 index 0000000..a35d9f5 Binary files /dev/null and b/img/f1.PNG differ diff --git a/img/f2.PNG b/img/f2.PNG new file mode 100644 index 0000000..4b0c1bb Binary files /dev/null and b/img/f2.PNG differ diff --git a/img/f3.PNG b/img/f3.PNG new file mode 100644 index 0000000..da4cc69 Binary files /dev/null and b/img/f3.PNG differ diff --git a/img/g0.PNG b/img/g0.PNG new file mode 100644 index 0000000..ed94eb8 Binary files /dev/null and b/img/g0.PNG differ diff --git a/img/g1.PNG b/img/g1.PNG new file mode 100644 index 0000000..b75bfe4 Binary files /dev/null and b/img/g1.PNG differ diff --git a/img/g2.PNG b/img/g2.PNG new file mode 100644 index 0000000..095929f Binary files /dev/null and b/img/g2.PNG differ diff --git a/img/h0.PNG b/img/h0.PNG new file mode 100644 index 0000000..4542536 Binary files /dev/null and b/img/h0.PNG differ diff --git a/img/h1.PNG b/img/h1.PNG new file mode 100644 index 0000000..4274c97 Binary files /dev/null and b/img/h1.PNG differ diff --git a/img/h2.PNG b/img/h2.PNG new file mode 100644 index 0000000..78d2d80 Binary files /dev/null and b/img/h2.PNG differ diff --git a/img/h3.PNG b/img/h3.PNG new file mode 100644 index 0000000..7214bcb Binary files /dev/null and b/img/h3.PNG differ diff --git a/scenes/cornell.txt b/scenes/cornell.txt index 83ff820..77ad551 100644 --- a/scenes/cornell.txt +++ b/scenes/cornell.txt @@ -52,7 +52,7 @@ EMITTANCE 0 CAMERA RES 800 800 FOVY 45 -ITERATIONS 5000 +ITERATIONS 10 DEPTH 8 FILE cornell EYE 0.0 5 10.5 diff --git a/scenes/cornell_ceiling_light_plus.txt b/scenes/cornell_ceiling_light_plus.txt new file mode 100644 index 0000000..54a596b --- /dev/null +++ b/scenes/cornell_ceiling_light_plus.txt @@ -0,0 +1,153 @@ +// Emissive material (light) +MATERIAL 0 +RGB 1 1 1 +SPECEX 0 +SPECRGB 0 0 0 +REFL 0 +REFR 0 +REFRIOR 0 +EMITTANCE 1 + +// Diffuse white +MATERIAL 1 +RGB .98 .98 .98 +SPECEX 0 +SPECRGB 0 0 0 +REFL 0 +REFR 0 +REFRIOR 0 +EMITTANCE 0 + +// Diffuse red +MATERIAL 2 +RGB .85 .35 .35 +SPECEX 0 +SPECRGB 0 0 0 +REFL 0 +REFR 0 +REFRIOR 0 +EMITTANCE 0 + +// Diffuse green +MATERIAL 3 +RGB .35 .85 .35 +SPECEX 0 +SPECRGB 0 0 0 +REFL 0 +REFR 0 +REFRIOR 0 +EMITTANCE 0 + +// Specular white +MATERIAL 4 +RGB .98 .98 .98 +SPECEX 0 +SPECRGB .98 .98 .98 +REFL 1 +REFR 0 +REFRIOR 0 +EMITTANCE 0 + +// Refractive white +MATERIAL 5 +RGB .98 .98 .98 +SPECEX 0 +SPECRGB .98 .98 .98 +REFL 0 +REFR 1 +REFRIOR 1.5 +EMITTANCE 0 + +// Diffuse white +MATERIAL 6 +RGB .98 .98 .98 +SPECEX 0 +SPECRGB .98 .98 .98 +REFL 0 +REFR 0 +REFRIOR 0 +EMITTANCE 0 + +// Camera +CAMERA +RES 800 800 +FOVY 45 +ITERATIONS 10 +DEPTH 8 +FILE cornell +EYE 0.0 5 10.5 +LOOKAT 0 5 0 +UP 0 1 0 + + +// Ceiling light +OBJECT 0 +cube +material 0 +TRANS 0 10 0 +ROTAT 0 0 0 +SCALE 10 .3 10 + +// Floor +OBJECT 1 +cube +material 1 +TRANS 0 0 0 +ROTAT 0 0 0 +SCALE 10 .01 10 + +// Ceiling +OBJECT 2 +cube +material 1 +TRANS 0 10 0 +ROTAT 0 0 90 +SCALE .01 10 10 + +// Back wall +OBJECT 3 +cube +material 1 +TRANS 0 5 -5 +ROTAT 0 90 0 +SCALE .01 10 10 + +// Left wall +OBJECT 4 +cube +material 2 +TRANS -5 5 0 +ROTAT 0 0 0 +SCALE .01 10 10 + +// Right wall +OBJECT 5 +cube +material 3 +TRANS 5 5 0 +ROTAT 0 0 0 +SCALE .01 10 10 + +// Sphere +OBJECT 6 +sphere +material 4 +TRANS -4 4 -1 +ROTAT 0 0 0 +SCALE 3 3 3 + +// Sphere +OBJECT 7 +sphere +material 5 +TRANS 0 4 -1 +ROTAT 0 0 0 +SCALE 3 3 3 + +// Sphere +OBJECT 8 +sphere +material 6 +TRANS 4 4 -1 +ROTAT 0 0 0 +SCALE 3 3 3 diff --git a/scenes/demo.txt b/scenes/demo.txt new file mode 100644 index 0000000..5534bba --- /dev/null +++ b/scenes/demo.txt @@ -0,0 +1,528 @@ +///////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////// + +// Light White +MATERIAL 0 +RGB 1 1 1 +SPECEX 0 +SPECRGB 0 0 0 +REFL 0 +REFR 0 +REFRIOR 0 +EMITTANCE 7 + +// Reflective +MATERIAL 1 +RGB 1 1 1 +SPECEX 0 +SPECRGB 0 0 0 +REFL 1 +REFR 0 +REFRIOR 0 +EMITTANCE 0 + +// Diffuse +MATERIAL 2 +RGB 0.35 0.85 0.85 +SPECEX 0 +SPECRGB 0 0 0 +REFL 0 +REFR 0 +REFRIOR 0 +EMITTANCE 0 + +// Diffuse +MATERIAL 3 +RGB 0.85 0.35 0.85 +SPECEX 0 +SPECRGB 0 0 0 +REFL 0 +REFR 0 +REFRIOR 0 +EMITTANCE 0 + +// Diffuse +MATERIAL 4 +RGB 0.85 0.85 0.35 +SPECEX 0 +SPECRGB 0 0 0 +REFL 0 +REFR 0 +REFRIOR 0 +EMITTANCE 0 + +// Diffuse +MATERIAL 5 +RGB 0.35 0.85 0.35 +SPECEX 0 +SPECRGB 0 0 0 +REFL 0 +REFR 0 +REFRIOR 0 +EMITTANCE 0 + +// Diffuse +MATERIAL 6 +RGB 0.35 0.35 0.85 +SPECEX 0 +SPECRGB 0 0 0 +REFL 0 +REFR 0 +REFRIOR 0 +EMITTANCE 0 + +// Refractive +MATERIAL 7 +RGB 1 1 1 +SPECEX 0 +SPECRGB 0 0 0 +REFL 0 +REFR 1 +REFRIOR 1.5 +EMITTANCE 0 + +// Diffuse +MATERIAL 8 +RGB 0.98 0.98 0.98 +SPECEX 0 +SPECRGB 0 0 0 +REFL 0 +REFR 0 +REFRIOR 0 +EMITTANCE 0 + +// Diffuse +MATERIAL 9 +RGB 0.43 0.34 0.24 +SPECEX 0 +SPECRGB 0 0 0 +REFL 0 +REFR 0 +REFRIOR 0 +EMITTANCE 0 + +// Diffuse +MATERIAL 10 +RGB 0.5 0.5 0.5 +SPECEX 0 +SPECRGB 0 0 0 +REFL 0 +REFR 0 +REFRIOR 0 +EMITTANCE 0 + +// Light White +MATERIAL 11 +RGB 1 0.1 1 +SPECEX 0 +SPECRGB 0 0 0 +REFL 0 +REFR 0 +REFRIOR 0 +EMITTANCE 7 + +// Light White +MATERIAL 12 +RGB 1 0.1 0.1 +SPECEX 0 +SPECRGB 0 0 0 +REFL 0 +REFR 0 +REFRIOR 0 +EMITTANCE 7 + +// Light White +MATERIAL 13 +RGB 0.1 0.1 1 +SPECEX 0 +SPECRGB 0 0 0 +REFL 0 +REFR 0 +REFRIOR 0 +EMITTANCE 7 + +// Light White +MATERIAL 14 +RGB 0.1 1 0.1 +SPECEX 0 +SPECRGB 0 0 0 +REFL 0 +REFR 0 +REFRIOR 0 +EMITTANCE 7 + +// Diffuse Reflective +MATERIAL 15 +RGB 0.35 0.75 0.35 +SPECEX 0 +SPECRGB 0 0 0 +REFL 0.4 +REFR 0 +REFRIOR 0 +EMITTANCE 0 + +///////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////// + +// Camera +CAMERA +RES 800 800 +FOVY 45 +ITERATIONS 10 +DEPTH 12 +FILE cornell +EYE 0 -10 50 +LOOKAT 0 30 0 +UP 0 1 0 + +///////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////// + +// Ceiling light +OBJECT 0 +cube +material 11 +TRANS -20 80 0 +ROTAT 0 0 0 +SCALE 10 1 10 + +// Ceiling light +OBJECT 1 +cube +material 11 +TRANS 20 80 0 +ROTAT 0 0 0 +SCALE 10 1 10 + +// Ceiling light +OBJECT 2 +cube +material 0 +TRANS 0 80 0 +ROTAT 0 0 0 +SCALE 10 1 10 + +// Ceiling light +OBJECT 3 +cube +material 12 +TRANS -20 80 -20 +ROTAT 0 0 0 +SCALE 10 1 10 + +// Ceiling light +OBJECT 4 +cube +material 13 +TRANS 20 80 -20 +ROTAT 0 0 0 +SCALE 10 1 10 + +// Ceiling light +OBJECT 5 +cube +material 14 +TRANS 0 80 -20 +ROTAT 0 0 0 +SCALE 10 1 10 + +// Ceiling light +OBJECT 6 +cube +material 13 +TRANS -20 80 20 +ROTAT 0 0 0 +SCALE 10 1 10 + +// Ceiling light +OBJECT 7 +cube +material 12 +TRANS 20 80 20 +ROTAT 0 0 0 +SCALE 10 1 10 + +// Ceiling light +OBJECT 8 +cube +material 14 +TRANS 0 80 20 +ROTAT 0 0 0 +SCALE 10 1 10 + +///////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////// + +// Cube +OBJECT 8 +cube +material 7 +TRANS -12 38.960193405 0 +ROTAT 4 60 53 +SCALE 3.375 3.375 3.375 + +// Cube +OBJECT 9 +cube +material 1 +TRANS -12 32.9939799387 0 +ROTAT 42 25 28 +SCALE 5.0625 5.0625 5.0625 + +// Cube +OBJECT 10 +cube +material 7 +TRANS -12 24.0446597393 0 +ROTAT 19 63 72 +SCALE 7.59375 7.59375 7.59375 + +// Cube +OBJECT 11 +cube +material 7 +TRANS -12 15.0953395399 0 +ROTAT 32 0 58 +SCALE 5.0625 5.0625 5.0625 + +// Cube +OBJECT 12 +cube +material 7 +TRANS -12 9.12912607362 0 +ROTAT 80 14 27 +SCALE 3.375 3.375 3.375 + +// Cube +OBJECT 13 +cube +material 1 +TRANS -12 5.15165042945 0 +ROTAT 67 69 47 +SCALE 2.25 2.25 2.25 + +// Cube +OBJECT 14 +cube +material 7 +TRANS -12 2.5 0 +ROTAT 73 3 57 +SCALE 1.5 1.5 1.5 + +///////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////// + +// Cube +OBJECT 15 +cube +material 1 +TRANS 12 47.6954830935 0 +ROTAT 65 29 48 +SCALE 3.375 3.375 3.375 + +// Cube +OBJECT 16 +cube +material 1 +TRANS 12 41.7292696272 0 +ROTAT 44 16 24 +SCALE 5.0625 5.0625 5.0625 + +// Cube +OBJECT 17 +cube +material 7 +TRANS 12 35.7630561609 0 +ROTAT 34 41 88 +SCALE 3.375 3.375 3.375 + +// Cube +OBJECT 18 +cube +material 7 +TRANS 12 31.7855805167 0 +ROTAT 52 59 49 +SCALE 2.25 2.25 2.25 + +// Cube +OBJECT 19 +cube +material 7 +TRANS 12 29.1339300873 0 +ROTAT 84 70 54 +SCALE 1.5 1.5 1.5 + +///////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////// + +// Cube +OBJECT 20 +cube +material 7 +TRANS 0 43.3278382492 12 +ROTAT 18 21 68 +SCALE 7.529536 7.529536 7.529536 + +// Cube +OBJECT 21 +cube +material 7 +TRANS 0 35.6914949854 12 +ROTAT 71 41 16 +SCALE 5.37824 5.37824 5.37824 + +// Cube +OBJECT 22 +cube +material 7 +TRANS 0 30.2369640827 12 +ROTAT 36 20 1 +SCALE 3.8416 3.8416 3.8416 + +// Cube +OBJECT 23 +cube +material 1 +TRANS 0 26.3408705807 12 +ROTAT 87 76 24 +SCALE 2.744 2.744 2.744 + +// Cube +OBJECT 24 +cube +material 1 +TRANS 0 23.5579466507 12 +ROTAT 34 54 6 +SCALE 1.96 1.96 1.96 + +// Cube +OBJECT 25 +cube +material 7 +TRANS 0 21.5701438436 12 +ROTAT 62 74 44 +SCALE 1.4 1.4 1.4 + +///////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////// + +// Cube +OBJECT 26 +cube +material 7 +TRANS 0 43.3278382492 -12 +ROTAT 55 26 0 +SCALE 7.529536 7.529536 7.529536 + +// Cube +OBJECT 27 +cube +material 1 +TRANS 0 35.6914949854 -12 +ROTAT 29 75 49 +SCALE 5.37824 5.37824 5.37824 + +// Cube +OBJECT 28 +cube +material 7 +TRANS 0 30.2369640827 -12 +ROTAT 13 40 53 +SCALE 3.8416 3.8416 3.8416 + +// Cube +OBJECT 29 +cube +material 7 +TRANS 0 26.3408705807 -12 +ROTAT 82 83 8 +SCALE 2.744 2.744 2.744 + +// Cube +OBJECT 30 +cube +material 7 +TRANS 0 23.5579466507 -12 +ROTAT 39 88 60 +SCALE 1.96 1.96 1.96 + +// Cube +OBJECT 31 +cube +material 1 +TRANS 0 21.5701438436 -12 +ROTAT 4 15 84 +SCALE 1.4 1.4 1.4 + +///////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////// + +// Cube +OBJECT 32 +cube +material 9 +TRANS 0 53.3278382492 0 +ROTAT 0 0 20 +SCALE 27 1.3 3 + +// Cube +OBJECT 33 +cube +material 9 +TRANS 0 54.5278382492 0 +ROTAT 0 0 20 +SCALE 3 1.3 27 + +// Cube +OBJECT 34 +cube +material 8 +TRANS -12 27.9139191246 0 +ROTAT 0 0 0 +SCALE 0.5 45.8278382492 0.5 + +// Cube +OBJECT 35 +cube +material 8 +TRANS 12 45.2308841683 0 +ROTAT 0 0 0 +SCALE 0.5 28.1939081619 0.5 + +// Cube +OBJECT 36 +cube +material 8 +TRANS 0 38.2308841683 12 +ROTAT 0 0 0 +SCALE 0.5 35.1939081619 0.5 + +// Cube +OBJECT 37 +cube +material 8 +TRANS 0 38.2308841683 -12 +ROTAT 0 0 0 +SCALE 0.5 35.1939081619 0.5 + + + +///////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////// + +// World +OBJECT 38 +cube +material 15 +TRANS 0 -10 0 +ROTAT 0 0 0 +SCALE 200 200 200 \ No newline at end of file diff --git a/src/interactions.h b/src/interactions.h index 144a9f5..f11c08f 100644 --- a/src/interactions.h +++ b/src/interactions.h @@ -1,5 +1,6 @@ #pragma once +#define EPSILON_SCALE 10 #include "intersections.h" /** @@ -45,18 +46,52 @@ glm::vec3 calculateRandomDirectionInHemisphere( */ __host__ __device__ void scatterRay( - PathSegment & pathSegment, - glm::vec3 intersect, - glm::vec3 normal, - const Material &m, - thrust::default_random_engine &rng) { - glm::vec3 newDirection; - if (m.hasReflective) { - newDirection = glm::reflect(pathSegment.ray.direction, normal); - } else { - newDirection = calculateRandomDirectionInHemisphere(normal, rng); + PathSegment& pathSegment, + glm::vec3 intersect, + glm::vec3 normal, + const Material& m, + thrust::default_random_engine& rng) { + // TODO: implement this. + // A basic implementation of pure-diffuse shading will just call the + // calculateRandomDirectionInHemisphere defined above. + + thrust::uniform_real_distribution u01(0, 1); + float prob = u01(rng); + glm::vec3 rayVec = glm::normalize(pathSegment.ray.direction); + glm::vec3 norVec = glm::normalize(normal); + + if (prob < m.hasReflective) { + // Reflection + pathSegment.ray.origin = intersect + (float)EPSILON * EPSILON_SCALE * norVec; + pathSegment.ray.direction = glm::normalize(glm::reflect(rayVec, norVec)); } + else if (prob < (m.hasReflective + m.hasRefractive)) { + // Refraction + // Reference: https://raytracing.github.io/books/RayTracingInOneWeekend.html#dielectrics + float refractionRatio = (glm::dot(rayVec, norVec) > 0) ? m.indexOfRefraction : (1.0f / m.indexOfRefraction); // inside sphere : outside sphere - pathSegment.ray.direction = newDirection; - pathSegment.ray.origin = intersect + (newDirection * 0.0001f); -} + float cosTheta = glm::min(glm::dot(-1.0f * rayVec, norVec), 1.0f); + float sinTheta = sqrt(1.0 - cosTheta * cosTheta); + bool cannotRefract = refractionRatio * sinTheta > 1.0f; + + float r0 = (1 - refractionRatio) / (1 + refractionRatio); + r0 *= r0; + float schlickAppro = r0 + (1 - r0) * pow(1 - cosTheta, 5); + bool schlickBool = schlickAppro > u01(rng); + + pathSegment.ray.origin = intersect + (float)EPSILON * EPSILON_SCALE * rayVec; + if (cannotRefract || schlickBool) { + pathSegment.ray.direction = glm::reflect(rayVec, norVec); + } + else { + // Snell's Law + pathSegment.ray.direction = glm::refract(rayVec, norVec, refractionRatio); + } + } + else { + // Diffusion + pathSegment.ray.origin = intersect + (float)EPSILON * EPSILON_SCALE * norVec; + pathSegment.ray.direction = glm::normalize(calculateRandomDirectionInHemisphere(norVec, rng)); + } + pathSegment.color *= m.color; +} \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index 4092ae4..d7a10d7 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -6,6 +6,13 @@ #include "../imgui/imgui_impl_glfw.h" #include "../imgui/imgui_impl_opengl3.h" +/******************************************************************* +* TIMER 0 for no timer +* TIMER 1 for timing path tracer and denoise +*******************************************************************/ +#define TIMER 1 + + static std::string startTimeString; // For camera controls @@ -25,9 +32,9 @@ int lastLoopIterations = 0; bool ui_showGbuffer = false; bool ui_denoise = false; int ui_filterSize = 80; -float ui_colorWeight = 0.45f; -float ui_normalWeight = 0.35f; -float ui_positionWeight = 0.2f; +float ui_colorWeight = 8.f; +float ui_normalWeight = 1.f; +float ui_positionWeight = 1.5f; bool ui_saveAndExit = false; static bool camchanged = true; @@ -45,6 +52,17 @@ int iteration; int width; int height; +static float timePT; +static float timeAT; +static bool hasPrinted; +using StreamCompaction::Common::PerformanceTimer; +PerformanceTimer& timer() +{ + static PerformanceTimer timer; + return timer; +} + + //------------------------------- //-------------MAIN-------------- //------------------------------- @@ -144,12 +162,15 @@ void runCuda() { cameraPosition += cam.lookAt; cam.position = cameraPosition; camchanged = false; + + //std::cout << cam.position.x << " " << cam.position.y << " " << cam.position.z << " " << std::endl; } // Map OpenGL buffer object for writing from CUDA on a single GPU - // No data is moved (Win & Linux). When mapped to CUDA, OpenGL should not use this buffer + // No data is moved (Win & Linux). When mapped to CUDA, OpenGL should not use this buffer if (iteration == 0) { + hasPrinted = false; pathtraceFree(); pathtraceInit(scene); } @@ -157,18 +178,53 @@ void runCuda() { uchar4 *pbo_dptr = NULL; cudaGLMapBufferObject((void**)&pbo_dptr, pbo); + int num_paths = -1; if (iteration < ui_iterations) { iteration++; // execute the kernel int frame = 0; - pathtrace(frame, iteration); +#if TIMER + // Start Timer + if (iteration == 1) + { + timePT = 0.f; + } + timer().startCpuTimer(); +#endif // TIMER + num_paths = pathtrace(frame, iteration, iteration == ui_iterations); +#if TIMER + timer().endCpuTimer(); + timePT += timer().getCpuElapsedTimeForPreviousOperation(); + if (iteration == ui_iterations) { + std::cout << "Path-trace time for " << iteration << " iterations: " << timePT << "ms" << std::endl; + } +#endif // TIMER + } + + if (iteration == ui_iterations) { +#if TIMER + // Start Timer + timeAT = 0.f; + if (!hasPrinted) { + timer().startCpuTimer(); + } +#endif // TIMER + denoise(ui_filterSize, ui_colorWeight, ui_positionWeight, ui_normalWeight); +#if TIMER + if (!hasPrinted) { + hasPrinted = true; + timer().endCpuTimer(); + timeAT += timer().getCpuElapsedTimeForPreviousOperation(); + std::cout << "Denoise time for " << iteration << " iterations: " << timeAT << "ms\n\n" << std::endl; + } +#endif // TIMER } if (ui_showGbuffer) { showGBuffer(pbo_dptr); } else { - showImage(pbo_dptr, iteration); + showImage(pbo_dptr, iteration, ui_denoise); } // unmap buffer object diff --git a/src/main.h b/src/main.h index 06d311a..8e3f281 100644 --- a/src/main.h +++ b/src/main.h @@ -19,6 +19,7 @@ #include "pathtrace.h" #include "utilities.h" #include "scene.h" +#include "../stream_compaction/common.h" using namespace std; @@ -46,3 +47,4 @@ void runCuda(); void keyCallback(GLFWwindow *window, int key, int scancode, int action, int mods); void mousePositionCallback(GLFWwindow* window, double xpos, double ypos); void mouseButtonCallback(GLFWwindow* window, int button, int action, int mods); + diff --git a/src/pathtrace.cu b/src/pathtrace.cu index 23e5f90..f2dcbd7 100644 --- a/src/pathtrace.cu +++ b/src/pathtrace.cu @@ -1,433 +1,584 @@ -#include -#include -#include -#include -#include -#include - -#include "sceneStructs.h" -#include "scene.h" -#include "glm/glm.hpp" -#include "glm/gtx/norm.hpp" -#include "utilities.h" -#include "pathtrace.h" -#include "intersections.h" -#include "interactions.h" - -#define ERRORCHECK 1 - -#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) -#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) -void checkCUDAErrorFn(const char *msg, const char *file, int line) { -#if ERRORCHECK - cudaDeviceSynchronize(); - cudaError_t err = cudaGetLastError(); - if (cudaSuccess == err) { - return; - } - - fprintf(stderr, "CUDA error"); - if (file) { - fprintf(stderr, " (%s:%d)", file, line); - } - fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err)); -# ifdef _WIN32 - getchar(); -# endif - exit(EXIT_FAILURE); -#endif -} - -__host__ __device__ -thrust::default_random_engine makeSeededRandomEngine(int iter, int index, int depth) { - int h = utilhash((1 << 31) | (depth << 22) | iter) ^ utilhash(index); - return thrust::default_random_engine(h); -} - -//Kernel that writes the image to the OpenGL PBO directly. -__global__ void sendImageToPBO(uchar4* pbo, glm::ivec2 resolution, - int iter, glm::vec3* image) { - int x = (blockIdx.x * blockDim.x) + threadIdx.x; - int y = (blockIdx.y * blockDim.y) + threadIdx.y; - - if (x < resolution.x && y < resolution.y) { - int index = x + (y * resolution.x); - glm::vec3 pix = image[index]; - - glm::ivec3 color; - color.x = glm::clamp((int) (pix.x / iter * 255.0), 0, 255); - color.y = glm::clamp((int) (pix.y / iter * 255.0), 0, 255); - color.z = glm::clamp((int) (pix.z / iter * 255.0), 0, 255); - - // Each thread writes one pixel location in the texture (textel) - pbo[index].w = 0; - pbo[index].x = color.x; - pbo[index].y = color.y; - pbo[index].z = color.z; - } -} - -__global__ void gbufferToPBO(uchar4* pbo, glm::ivec2 resolution, GBufferPixel* gBuffer) { - int x = (blockIdx.x * blockDim.x) + threadIdx.x; - int y = (blockIdx.y * blockDim.y) + threadIdx.y; - - if (x < resolution.x && y < resolution.y) { - int index = x + (y * resolution.x); - float timeToIntersect = gBuffer[index].t * 256.0; - - pbo[index].w = 0; - pbo[index].x = timeToIntersect; - pbo[index].y = timeToIntersect; - pbo[index].z = timeToIntersect; - } -} - -static Scene * hst_scene = NULL; -static glm::vec3 * dev_image = NULL; -static Geom * dev_geoms = NULL; -static Material * dev_materials = NULL; -static PathSegment * dev_paths = NULL; -static ShadeableIntersection * dev_intersections = NULL; -static GBufferPixel* dev_gBuffer = NULL; -// TODO: static variables for device memory, any extra info you need, etc -// ... - -void pathtraceInit(Scene *scene) { - hst_scene = scene; - const Camera &cam = hst_scene->state.camera; - const int pixelcount = cam.resolution.x * cam.resolution.y; - - cudaMalloc(&dev_image, pixelcount * sizeof(glm::vec3)); - cudaMemset(dev_image, 0, pixelcount * sizeof(glm::vec3)); - - cudaMalloc(&dev_paths, pixelcount * sizeof(PathSegment)); - - cudaMalloc(&dev_geoms, scene->geoms.size() * sizeof(Geom)); - cudaMemcpy(dev_geoms, scene->geoms.data(), scene->geoms.size() * sizeof(Geom), cudaMemcpyHostToDevice); - - cudaMalloc(&dev_materials, scene->materials.size() * sizeof(Material)); - cudaMemcpy(dev_materials, scene->materials.data(), scene->materials.size() * sizeof(Material), cudaMemcpyHostToDevice); - - cudaMalloc(&dev_intersections, pixelcount * sizeof(ShadeableIntersection)); - cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection)); - - cudaMalloc(&dev_gBuffer, pixelcount * sizeof(GBufferPixel)); - - // TODO: initialize any extra device memeory you need - - checkCUDAError("pathtraceInit"); -} - -void pathtraceFree() { - cudaFree(dev_image); // no-op if dev_image is null - cudaFree(dev_paths); - cudaFree(dev_geoms); - cudaFree(dev_materials); - cudaFree(dev_intersections); - cudaFree(dev_gBuffer); - // TODO: clean up any extra device memory you created - - checkCUDAError("pathtraceFree"); -} - -/** -* Generate PathSegments with rays from the camera through the screen into the -* scene, which is the first bounce of rays. -* -* Antialiasing - add rays for sub-pixel sampling -* motion blur - jitter rays "in time" -* lens effect - jitter ray origin positions based on a lens -*/ -__global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, PathSegment* pathSegments) -{ - int x = (blockIdx.x * blockDim.x) + threadIdx.x; - int y = (blockIdx.y * blockDim.y) + threadIdx.y; - - if (x < cam.resolution.x && y < cam.resolution.y) { - int index = x + (y * cam.resolution.x); - PathSegment & segment = pathSegments[index]; - - segment.ray.origin = cam.position; - segment.color = glm::vec3(1.0f, 1.0f, 1.0f); - - segment.ray.direction = glm::normalize(cam.view - - cam.right * cam.pixelLength.x * ((float)x - (float)cam.resolution.x * 0.5f) - - cam.up * cam.pixelLength.y * ((float)y - (float)cam.resolution.y * 0.5f) - ); - - segment.pixelIndex = index; - segment.remainingBounces = traceDepth; - } -} - -__global__ void computeIntersections( - int depth - , int num_paths - , PathSegment * pathSegments - , Geom * geoms - , int geoms_size - , ShadeableIntersection * intersections - ) -{ - int path_index = blockIdx.x * blockDim.x + threadIdx.x; - - if (path_index < num_paths) - { - PathSegment pathSegment = pathSegments[path_index]; - - float t; - glm::vec3 intersect_point; - glm::vec3 normal; - float t_min = FLT_MAX; - int hit_geom_index = -1; - bool outside = true; - - glm::vec3 tmp_intersect; - glm::vec3 tmp_normal; - - // naive parse through global geoms - - for (int i = 0; i < geoms_size; i++) - { - Geom & geom = geoms[i]; - - if (geom.type == CUBE) - { - t = boxIntersectionTest(geom, pathSegment.ray, tmp_intersect, tmp_normal, outside); - } - else if (geom.type == SPHERE) - { - t = sphereIntersectionTest(geom, pathSegment.ray, tmp_intersect, tmp_normal, outside); - } - - // Compute the minimum t from the intersection tests to determine what - // scene geometry object was hit first. - if (t > 0.0f && t_min > t) - { - t_min = t; - hit_geom_index = i; - intersect_point = tmp_intersect; - normal = tmp_normal; - } - } - - if (hit_geom_index == -1) - { - intersections[path_index].t = -1.0f; - } - else - { - //The ray hits something - intersections[path_index].t = t_min; - intersections[path_index].materialId = geoms[hit_geom_index].materialid; - intersections[path_index].surfaceNormal = normal; - } - } -} - -__global__ void shadeSimpleMaterials ( - int iter - , int num_paths - , ShadeableIntersection * shadeableIntersections - , PathSegment * pathSegments - , Material * materials - ) -{ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < num_paths) - { - ShadeableIntersection intersection = shadeableIntersections[idx]; - PathSegment segment = pathSegments[idx]; - if (segment.remainingBounces == 0) { - return; - } - - if (intersection.t > 0.0f) { // if the intersection exists... - segment.remainingBounces--; - // Set up the RNG - thrust::default_random_engine rng = makeSeededRandomEngine(iter, idx, segment.remainingBounces); - - Material material = materials[intersection.materialId]; - glm::vec3 materialColor = material.color; - - // If the material indicates that the object was a light, "light" the ray - if (material.emittance > 0.0f) { - segment.color *= (materialColor * material.emittance); - segment.remainingBounces = 0; - } - else { - segment.color *= materialColor; - glm::vec3 intersectPos = intersection.t * segment.ray.direction + segment.ray.origin; - scatterRay(segment, intersectPos, intersection.surfaceNormal, material, rng); - } - // If there was no intersection, color the ray black. - // Lots of renderers use 4 channel color, RGBA, where A = alpha, often - // used for opacity, in which case they can indicate "no opacity". - // This can be useful for post-processing and image compositing. - } else { - segment.color = glm::vec3(0.0f); - segment.remainingBounces = 0; - } - - pathSegments[idx] = segment; - } -} - -__global__ void generateGBuffer ( - int num_paths, - ShadeableIntersection* shadeableIntersections, - PathSegment* pathSegments, - GBufferPixel* gBuffer) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < num_paths) - { - gBuffer[idx].t = shadeableIntersections[idx].t; - } -} - -// Add the current iteration's output to the overall image -__global__ void finalGather(int nPaths, glm::vec3 * image, PathSegment * iterationPaths) -{ - int index = (blockIdx.x * blockDim.x) + threadIdx.x; - - if (index < nPaths) - { - PathSegment iterationPath = iterationPaths[index]; - image[iterationPath.pixelIndex] += iterationPath.color; - } -} - -/** - * Wrapper for the __global__ call that sets up the kernel calls and does a ton - * of memory management - */ -void pathtrace(int frame, int iter) { - const int traceDepth = hst_scene->state.traceDepth; - const Camera &cam = hst_scene->state.camera; - const int pixelcount = cam.resolution.x * cam.resolution.y; - - // 2D block for generating ray from camera - const dim3 blockSize2d(8, 8); - const dim3 blocksPerGrid2d( - (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x, - (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y); - - // 1D block for path tracing - const int blockSize1d = 128; - - /////////////////////////////////////////////////////////////////////////// - - // Pathtracing Recap: - // * Initialize array of path rays (using rays that come out of the camera) - // * You can pass the Camera object to that kernel. - // * Each path ray must carry at minimum a (ray, color) pair, - // * where color starts as the multiplicative identity, white = (1, 1, 1). - // * This has already been done for you. - // * NEW: For the first depth, generate geometry buffers (gbuffers) - // * For each depth: - // * Compute an intersection in the scene for each path ray. - // A very naive version of this has been implemented for you, but feel - // free to add more primitives and/or a better algorithm. - // Currently, intersection distance is recorded as a parametric distance, - // t, or a "distance along the ray." t = -1.0 indicates no intersection. - // * Color is attenuated (multiplied) by reflections off of any object - // * Stream compact away all of the terminated paths. - // You may use either your implementation or `thrust::remove_if` or its - // cousins. - // * Note that you can't really use a 2D kernel launch any more - switch - // to 1D. - // * Shade the rays that intersected something or didn't bottom out. - // That is, color the ray by performing a color computation according - // to the shader, then generate a new ray to continue the ray path. - // We recommend just updating the ray's PathSegment in place. - // Note that this step may come before or after stream compaction, - // since some shaders you write may also cause a path to terminate. - // * Finally: - // * if not denoising, add this iteration's results to the image - // * TODO: if denoising, run kernels that take both the raw pathtraced result and the gbuffer, and put the result in the "pbo" from opengl - - generateRayFromCamera <<>>(cam, iter, traceDepth, dev_paths); - checkCUDAError("generate camera ray"); - - int depth = 0; - PathSegment* dev_path_end = dev_paths + pixelcount; - int num_paths = dev_path_end - dev_paths; - - // --- PathSegment Tracing Stage --- - // Shoot ray into scene, bounce between objects, push shading chunks - - // Empty gbuffer - cudaMemset(dev_gBuffer, 0, pixelcount * sizeof(GBufferPixel)); - - // clean shading chunks - cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection)); - - bool iterationComplete = false; - while (!iterationComplete) { - - // tracing - dim3 numblocksPathSegmentTracing = (num_paths + blockSize1d - 1) / blockSize1d; - computeIntersections <<>> ( - depth - , num_paths - , dev_paths - , dev_geoms - , hst_scene->geoms.size() - , dev_intersections - ); - checkCUDAError("trace one bounce"); - cudaDeviceSynchronize(); - - if (depth == 0) { - generateGBuffer<<>>(num_paths, dev_intersections, dev_paths, dev_gBuffer); - } - - depth++; - - shadeSimpleMaterials<<>> ( - iter, - num_paths, - dev_intersections, - dev_paths, - dev_materials - ); - iterationComplete = depth == traceDepth; - } - - // Assemble this iteration and apply it to the image - dim3 numBlocksPixels = (pixelcount + blockSize1d - 1) / blockSize1d; - finalGather<<>>(num_paths, dev_image, dev_paths); - - /////////////////////////////////////////////////////////////////////////// - - // CHECKITOUT: use dev_image as reference if you want to implement saving denoised images. - // Otherwise, screenshots are also acceptable. - // Retrieve image from GPU - cudaMemcpy(hst_scene->state.image.data(), dev_image, - pixelcount * sizeof(glm::vec3), cudaMemcpyDeviceToHost); - - checkCUDAError("pathtrace"); -} - -// CHECKITOUT: this kernel "post-processes" the gbuffer/gbuffers into something that you can visualize for debugging. -void showGBuffer(uchar4* pbo) { - const Camera &cam = hst_scene->state.camera; - const dim3 blockSize2d(8, 8); - const dim3 blocksPerGrid2d( - (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x, - (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y); - - // CHECKITOUT: process the gbuffer results and send them to OpenGL buffer for visualization - gbufferToPBO<<>>(pbo, cam.resolution, dev_gBuffer); -} - -void showImage(uchar4* pbo, int iter) { -const Camera &cam = hst_scene->state.camera; - const dim3 blockSize2d(8, 8); - const dim3 blocksPerGrid2d( - (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x, - (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y); - - // Send results to OpenGL buffer for rendering - sendImageToPBO<<>>(pbo, cam.resolution, iter, dev_image); -} +#include +#include +#include +#include +#include +#include + +#include "sceneStructs.h" +#include "scene.h" +#include "glm/glm.hpp" +#include "glm/gtx/norm.hpp" +#include "utilities.h" +#include "pathtrace.h" +#include "intersections.h" +#include "interactions.h" + +#define ERRORCHECK 0 + +/******************************************************************* +* GBUFFER_RENDER 0 for gbufferToPBO to render intersects as color +* GBUFFER_RENDER 1 for gbufferToPBO to render positions as color +* GBUFFER_RENDER 2 for gbufferToPBO to render normals as color +*******************************************************************/ +#define GBUFFER_RENDER 0 + +/******************************************************************* +* BLOCK_LENGTH is 1D lenght of blocks per grid +*******************************************************************/ +#define BLOCK_LENGTH 8 +// TODO: update this to be variable length given FILTER_LENGTH +const float gaussian[25] = { + 0.003765, 0.015019, 0.023792, 0.015019, 0.003765, + 0.015019, 0.059912, 0.094907, 0.059912, 0.015019, + 0.023792, 0.094907, 0.150342, 0.094907, 0.023792, + 0.015019, 0.059912, 0.094907, 0.059912, 0.015019, + 0.003765, 0.015019, 0.023792, 0.015019, 0.003765 }; + +#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) +#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) +void checkCUDAErrorFn(const char *msg, const char *file, int line) { +#if ERRORCHECK + cudaDeviceSynchronize(); + cudaError_t err = cudaGetLastError(); + if (cudaSuccess == err) { + return; + } + + fprintf(stderr, "CUDA error"); + if (file) { + fprintf(stderr, " (%s:%d)", file, line); + } + fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err)); +# ifdef _WIN32 + getchar(); +# endif + exit(EXIT_FAILURE); +#endif +} + +__host__ __device__ +thrust::default_random_engine makeSeededRandomEngine(int iter, int index, int depth) { + int h = utilhash((1 << 31) | (depth << 22) | iter) ^ utilhash(index); + return thrust::default_random_engine(h); +} + +//Kernel that writes the image to the OpenGL PBO directly. +__global__ void sendImageToPBO(uchar4* pbo, glm::ivec2 resolution, int iter, glm::vec3* image) +{ + int x = (blockIdx.x * blockDim.x) + threadIdx.x; + int y = (blockIdx.y * blockDim.y) + threadIdx.y; + + if (x < resolution.x && y < resolution.y) { + int index = x + (y * resolution.x); + glm::vec3 pix = image[index]; + + glm::ivec3 color; + color.x = glm::clamp((int) (pix.x / iter * 255.0), 0, 255); + color.y = glm::clamp((int) (pix.y / iter * 255.0), 0, 255); + color.z = glm::clamp((int) (pix.z / iter * 255.0), 0, 255); + + // Each thread writes one pixel location in the texture (textel) + pbo[index].w = 0; + pbo[index].x = color.x; + pbo[index].y = color.y; + pbo[index].z = color.z; + } +} + +__global__ void gbufferToPBO(uchar4* pbo, glm::ivec2 resolution, GBufferPixel* gBuffer) { + int x = (blockIdx.x * blockDim.x) + threadIdx.x; + int y = (blockIdx.y * blockDim.y) + threadIdx.y; + + if (x < resolution.x && y < resolution.y) { + int index = x + (y * resolution.x); + +#if GBUFFER_RENDER == 0 + // Intersect + float timeToIntersect = gBuffer[index].t * 255.f; + pbo[index].w = 0; + pbo[index].x = timeToIntersect; + pbo[index].y = timeToIntersect; + pbo[index].z = timeToIntersect; +#elif GBUFFER_RENDER == 1 + glm::vec4 posToColor = glm::vec4(glm::normalize(glm::abs(gBuffer[index].pos)) * 255.f, 0.f); + pbo[index].w = posToColor.w; + pbo[index].x = posToColor.x; + pbo[index].y = posToColor.y; + pbo[index].z = posToColor.z; +#elif GBUFFER_RENDER == 2 + glm::vec4 norToColor = glm::vec4(glm::normalize(glm::abs(gBuffer[index].nor)) * 255.f, 0.f); + pbo[index].w = norToColor.w; + pbo[index].x = norToColor.x; + pbo[index].y = norToColor.y; + pbo[index].z = norToColor.z; +#endif // GBUFFER_RENDER + } +} + +static Scene* hst_scene = NULL; +static glm::vec3* dev_image = NULL; +static glm::vec3* dev_imageDenoise = NULL; +static glm::vec3* dev_imageDenoiseDup = NULL; +static Geom* dev_geoms = NULL; +static Material* dev_materials = NULL; +static PathSegment* dev_paths = NULL; +static ShadeableIntersection* dev_intersections = NULL; +static GBufferPixel* dev_gBuffer = NULL; +static float* dev_gaussian = NULL; +// TODO: static variables for device memory, any extra info you need, etc +// ... + +void pathtraceInit(Scene *scene) { + hst_scene = scene; + const Camera &cam = hst_scene->state.camera; + const int pixelcount = cam.resolution.x * cam.resolution.y; + + cudaMalloc(&dev_image, pixelcount * sizeof(glm::vec3)); + cudaMemset(dev_image, 0, pixelcount * sizeof(glm::vec3)); + + cudaMalloc(&dev_imageDenoise, pixelcount * sizeof(glm::vec3)); + cudaMemset(dev_imageDenoise, 0, pixelcount * sizeof(glm::vec3)); + + cudaMalloc(&dev_imageDenoiseDup, pixelcount * sizeof(glm::vec3)); + cudaMemset(dev_imageDenoiseDup, 0, pixelcount * sizeof(glm::vec3)); + + cudaMalloc(&dev_paths, pixelcount * sizeof(PathSegment)); + + cudaMalloc(&dev_geoms, scene->geoms.size() * sizeof(Geom)); + cudaMemcpy(dev_geoms, scene->geoms.data(), scene->geoms.size() * sizeof(Geom), cudaMemcpyHostToDevice); + + cudaMalloc(&dev_materials, scene->materials.size() * sizeof(Material)); + cudaMemcpy(dev_materials, scene->materials.data(), scene->materials.size() * sizeof(Material), cudaMemcpyHostToDevice); + + cudaMalloc(&dev_intersections, pixelcount * sizeof(ShadeableIntersection)); + cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection)); + + cudaMalloc(&dev_gBuffer, pixelcount * sizeof(GBufferPixel)); + + cudaMalloc(&dev_gaussian, 25 * sizeof(float)); + cudaMemcpy(dev_gaussian, gaussian, 25 * sizeof(float), cudaMemcpyHostToDevice); + + // TODO: initialize any extra device memeory you need + + checkCUDAError("pathtraceInit"); +} + +void pathtraceFree() { + cudaFree(dev_image); // no-op if dev_image is null + cudaFree(dev_imageDenoise); + cudaFree(dev_imageDenoiseDup); + cudaFree(dev_paths); + cudaFree(dev_geoms); + cudaFree(dev_materials); + cudaFree(dev_intersections); + cudaFree(dev_gBuffer); + cudaFree(dev_gaussian); + // TODO: clean up any extra device memory you created + + checkCUDAError("pathtraceFree"); +} + +/** +* Generate PathSegments with rays from the camera through the screen into the +* scene, which is the first bounce of rays. +* +* Antialiasing - add rays for sub-pixel sampling +* motion blur - jitter rays "in time" +* lens effect - jitter ray origin positions based on a lens +*/ +__global__ void generateRayFromCamera(Camera cam, int iter, int traceDepth, PathSegment* pathSegments) +{ + int x = (blockIdx.x * blockDim.x) + threadIdx.x; + int y = (blockIdx.y * blockDim.y) + threadIdx.y; + + if (x < cam.resolution.x && y < cam.resolution.y) { + int index = x + (y * cam.resolution.x); + PathSegment & segment = pathSegments[index]; + + thrust::default_random_engine rng = makeSeededRandomEngine(iter, index, 0); + thrust::uniform_real_distribution uhh(-0.5f, 0.5f); + + segment.ray.origin = cam.position; + segment.color = glm::vec3(1.0f, 1.0f, 1.0f); + + segment.ray.direction = glm::normalize( + cam.view - + cam.right * cam.pixelLength.x * ((float)x + uhh(rng) - (float)cam.resolution.x * 0.5f) - + cam.up * cam.pixelLength.y * ((float)y + uhh(rng) - (float)cam.resolution.y * 0.5f)); + + segment.pixelIndex = index; + segment.remainingBounces = traceDepth; + } +} + +__global__ void computeIntersections( + int depth, + int num_paths, + PathSegment * pathSegments, + Geom * geoms, + int geoms_size, + ShadeableIntersection * intersections) +{ + int path_index = blockIdx.x * blockDim.x + threadIdx.x; + + if (path_index < num_paths) + { + PathSegment pathSegment = pathSegments[path_index]; + + float t; + glm::vec3 intersect_point; + glm::vec3 normal; + float t_min = FLT_MAX; + int hit_geom_index = -1; + bool outside = true; + + glm::vec3 tmp_intersect; + glm::vec3 tmp_normal; + + // naive parse through global geoms + + for (int i = 0; i < geoms_size; i++) + { + Geom & geom = geoms[i]; + + if (geom.type == CUBE) + { + t = boxIntersectionTest(geom, pathSegment.ray, tmp_intersect, tmp_normal, outside); + } + else if (geom.type == SPHERE) + { + t = sphereIntersectionTest(geom, pathSegment.ray, tmp_intersect, tmp_normal, outside); + } + + // Compute the minimum t from the intersection tests to determine what + // scene geometry object was hit first. + if (t > 0.0f && t_min > t) + { + t_min = t; + hit_geom_index = i; + intersect_point = tmp_intersect; + normal = tmp_normal; + } + } + + if (hit_geom_index == -1) + { + intersections[path_index].t = -1.0f; + } + else + { + //The ray hits something + intersections[path_index].t = t_min; + intersections[path_index].materialId = geoms[hit_geom_index].materialid; + intersections[path_index].surfaceNormal = normal; + } + } +} + +__global__ void shadeSimpleMaterials ( + int iter, + int num_paths, + ShadeableIntersection * shadeableIntersections, + PathSegment * pathSegments, + Material * materials) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_paths) + { + ShadeableIntersection intersection = shadeableIntersections[idx]; + PathSegment segment = pathSegments[idx]; + if (segment.remainingBounces == 0) { + return; + } + + if (intersection.t > 0.0f) { // if the intersection exists... + segment.remainingBounces--; + // Set up the RNG + thrust::default_random_engine rng = makeSeededRandomEngine(iter, idx, segment.remainingBounces); + + Material material = materials[intersection.materialId]; + glm::vec3 materialColor = material.color; + + // If the material indicates that the object was a light, "light" the ray + if (material.emittance > 0.0f) { + segment.color *= (materialColor * material.emittance); + segment.remainingBounces = 0; + } + else { + glm::vec3 intersectPos = intersection.t * segment.ray.direction + segment.ray.origin; + scatterRay(segment, intersectPos, intersection.surfaceNormal, material, rng); + } + // If there was no intersection, color the ray black. + // Lots of renderers use 4 channel color, RGBA, where A = alpha, often + // used for opacity, in which case they can indicate "no opacity". + // This can be useful for post-processing and image compositing. + } else { + segment.color = glm::vec3(0.0f); + segment.remainingBounces = 0; + } + + pathSegments[idx] = segment; + } +} + +__global__ void generateGBuffer ( + int num_paths, + ShadeableIntersection* shadeableIntersections, + PathSegment* pathSegments, + GBufferPixel* gBuffer) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < num_paths) + { + ShadeableIntersection& i = shadeableIntersections[index]; + Ray& r = pathSegments[index].ray; + + gBuffer[index].t = i.t; + gBuffer[index].pos = r.origin + i.t * r.direction; + gBuffer[index].nor = i.surfaceNormal; + + } +} + +// Add the current iteration's output to the overall image +__global__ void finalGather(int nPaths, glm::vec3 * image, PathSegment * iterationPaths) +{ + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (index < nPaths) + { + PathSegment iterationPath = iterationPaths[index]; + image[iterationPath.pixelIndex] += iterationPath.color; + } + + //showImage(uchar4 * pbo, int iter, bool ui_denoise) +} + +/** + * Wrapper for the __global__ call that sets up the kernel calls and does a ton + * of memory management + */ +int pathtrace(int frame, int iter, int lastIter) { + const int traceDepth = hst_scene->state.traceDepth; + const Camera &cam = hst_scene->state.camera; + const int pixelcount = cam.resolution.x * cam.resolution.y; + + // 2D block for generating ray from camera + const dim3 blockSize2d(8, 8); + const dim3 blocksPerGrid2d( + (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x, + (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y); + + // 1D block for path tracing + const int blockSize1d = 128; + + /////////////////////////////////////////////////////////////////////////// + + // Pathtracing Recap: + // * Initialize array of path rays (using rays that come out of the camera) + // * You can pass the Camera object to that kernel. + // * Each path ray must carry at minimum a (ray, color) pair, + // * where color starts as the multiplicative identity, white = (1, 1, 1). + // * This has already been done for you. + // * NEW: For the first depth, generate geometry buffers (gbuffers) + // * For each depth: + // * Compute an intersection in the scene for each path ray. + // A very naive version of this has been implemented for you, but feel + // free to add more primitives and/or a better algorithm. + // Currently, intersection distance is recorded as a parametric distance, + // t, or a "distance along the ray." t = -1.0 indicates no intersection. + // * Color is attenuated (multiplied) by reflections off of any object + // * Stream compact away all of the terminated paths. + // You may use either your implementation or `thrust::remove_if` or its + // cousins. + // * Note that you can't really use a 2D kernel launch any more - switch + // to 1D. + // * Shade the rays that intersected something or didn't bottom out. + // That is, color the ray by performing a color computation according + // to the shader, then generate a new ray to continue the ray path. + // We recommend just updating the ray's PathSegment in place. + // Note that this step may come before or after stream compaction, + // since some shaders you write may also cause a path to terminate. + // * Finally: + // * if not denoising, add this iteration's results to the image + // * TODO: if denoising, run kernels that take both the raw pathtraced result and the gbuffer, and put the result in the "pbo" from opengl + + generateRayFromCamera <<>>(cam, iter, traceDepth, dev_paths); + checkCUDAError("generate camera ray"); + + int depth = 0; + PathSegment* dev_path_end = dev_paths + pixelcount; + int num_paths = dev_path_end - dev_paths; + int ret = (lastIter) ? num_paths : -1; + + // --- PathSegment Tracing Stage --- + // Shoot ray into scene, bounce between objects, push shading chunks + + // Empty gbuffer + cudaMemset(dev_gBuffer, 0, pixelcount * sizeof(GBufferPixel)); + + // clean shading chunks + cudaMemset(dev_intersections, 0, pixelcount * sizeof(ShadeableIntersection)); + + bool iterationComplete = false; + while (!iterationComplete) { + + dim3 numblocksPathSegmentTracing = (num_paths + blockSize1d - 1) / blockSize1d; + computeIntersections <<>> ( + depth, + num_paths, + dev_paths, + dev_geoms, + hst_scene->geoms.size(), + dev_intersections); + checkCUDAError("trace one bounce"); + cudaDeviceSynchronize(); + + if (depth == 0 && lastIter) { + generateGBuffer<<>>(num_paths, dev_intersections, dev_paths, dev_gBuffer); + } + depth++; + + shadeSimpleMaterials<<>> ( + iter, + num_paths, + dev_intersections, + dev_paths, + dev_materials); + iterationComplete = depth == traceDepth; + + } + + // Assemble this iteration and apply it to the image + dim3 numBlocksPixels = (pixelcount + blockSize1d - 1) / blockSize1d; + finalGather<<>>(num_paths, dev_image, dev_paths); + /////////////////////////////////////////////////////////////////////////// + + // CHECKITOUT: use dev_image as reference if you want to implement saving denoised images. + // Otherwise, screenshots are also acceptable. + // Retrieve image from GPU + cudaMemcpy(hst_scene->state.image.data(), dev_image, + pixelcount * sizeof(glm::vec3), cudaMemcpyDeviceToHost); + + checkCUDAError("pathtrace"); + cudaDeviceSynchronize(); + return ret; +} + +// CHECKITOUT: this kernel "post-processes" the gbuffer/gbuffers into something that you can visualize for debugging. +void showGBuffer(uchar4* pbo) { + const Camera &cam = hst_scene->state.camera; + const dim3 blockSize2d(8, 8); + const dim3 blocksPerGrid2d( + (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x, + (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y); + + // CHECKITOUT: process the gbuffer results and send them to OpenGL buffer for visualization + gbufferToPBO<<>>(pbo, cam.resolution, dev_gBuffer); +} + +void showImage(uchar4* pbo, int iter, bool ui_denoise) { +const Camera &cam = hst_scene->state.camera; + const dim3 blockSize2d(8, 8); + const dim3 blocksPerGrid2d( + (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x, + (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y); + + // Send results to OpenGL buffer for rendering + if (ui_denoise) { + sendImageToPBO << > > (pbo, cam.resolution, iter, dev_imageDenoise); + } + else { + sendImageToPBO << > > (pbo, cam.resolution, iter, dev_image); + } +} + +__global__ void denoiseIter( + const Camera cam, + const int step, + const float c_phi, + const float p_phi, + const float n_phi, + const float* gaussian, + glm::vec3* imageDenoise, + const glm::vec3* image, + const GBufferPixel* gBuffer) +{ + int x = (blockIdx.x * blockDim.x) + threadIdx.x; + int y = (blockIdx.y * blockDim.y) + threadIdx.y; + + if (x < cam.resolution.x && y < cam.resolution.y) { + + glm::vec3 sumColor = glm::vec3(0.f); + float sumWeight = 0.f; + + int index = x + (y * cam.resolution.x); + + glm::vec3 col = image[index]; + glm::vec3 pos = gBuffer[index].pos; + glm::vec3 nor = gBuffer[index].nor; + + for (int j = -2; j <= 2; j++) { + + //int relY = glm::clamp(y + j * step, 0, cam.resolution.y); + int relY = y + j * step; + if (relY < 0 || relY >= cam.resolution.y) continue; + + for (int i = -2; i <= 2; i++) { + + //int relX = glm::clamp(x + i * step, 0, cam.resolution.x); + int relX = x + i * step; + if (relX < 0 || relX >= cam.resolution.x) continue; + + int relIndex = relX + cam.resolution.x * relY; + + glm::vec3 t; glm::vec3 colTemp; float dist2; + // Color weighting + colTemp = image[relIndex]; + t = col - colTemp; + dist2 = glm::dot(t, t); + float c_w = glm::min(std::exp(-(dist2) / (c_phi + EPSILON)), 1.f); + + // Position weighting + t = pos - gBuffer[relIndex].pos; + dist2 = glm::dot(t, t); + float p_w = glm::min(std::exp(-(dist2) / (p_phi + EPSILON)), 1.f); + + // Normal weighting + t = nor - gBuffer[relIndex].nor; + dist2 = glm::max(glm::dot(t, t), 0.f); + float n_w = glm::min(std::exp(-(dist2) / (n_phi + EPSILON)), 1.f); + + //float weight = c_w * c_w * p_w * p_w * n_w * n_w; + float weight = c_w * p_w * n_w; + float influence = weight * gaussian[((i + 2) + 5 * (j + 2))]; + sumColor += (colTemp * influence); + sumWeight += influence; + } + } + + imageDenoise[index] = sumColor / sumWeight; + + } +} + +void denoise(const int filterSize, const float cPhi, const float pPhi, const float nPhi) { + + if (filterSize < 25) return; + + const Camera& cam = hst_scene->state.camera; + + const dim3 blockSize2d(BLOCK_LENGTH, BLOCK_LENGTH); + const dim3 blocksPerGrid2d( + (cam.resolution.x + blockSize2d.x - 1) / blockSize2d.x, + (cam.resolution.y + blockSize2d.y - 1) / blockSize2d.y); + + cudaMemcpy(dev_imageDenoiseDup, dev_image, cam.resolution.x * cam.resolution.y * sizeof(glm::vec3), cudaMemcpyDeviceToDevice); + + for (int step = 1; step <= std::floor(std::sqrt(filterSize)); step++) { + denoiseIter << > > (cam, 1 << step, cPhi, pPhi, nPhi, dev_gaussian, dev_imageDenoise, dev_imageDenoiseDup, dev_gBuffer); + std::swap(dev_imageDenoise, dev_imageDenoiseDup); + } + cudaDeviceSynchronize(); +} \ No newline at end of file diff --git a/src/pathtrace.h b/src/pathtrace.h index 9e12f44..5a26cdf 100644 --- a/src/pathtrace.h +++ b/src/pathtrace.h @@ -5,6 +5,9 @@ void pathtraceInit(Scene *scene); void pathtraceFree(); -void pathtrace(int frame, int iteration); +int pathtrace(int frame, int iteration, int lastIter); void showGBuffer(uchar4 *pbo); -void showImage(uchar4 *pbo, int iter); +void showImage(uchar4 *pbo, int iter, bool ui_denoise); + +void denoise(const int filterSize, const float cPhi, const float pPhi, const float nPhi); + diff --git a/src/sceneStructs.h b/src/sceneStructs.h index da7e558..7f1ddcb 100644 --- a/src/sceneStructs.h +++ b/src/sceneStructs.h @@ -78,5 +78,8 @@ struct ShadeableIntersection { // CHECKITOUT - a simple struct for storing scene geometry information per-pixel. // What information might be helpful for guiding a denoising filter? struct GBufferPixel { - float t; + float t; + glm::vec3 pos; // Positions + glm::vec3 nor; // Normals + // Optional diffuse determined from pathtrace.cu's finalGather }; diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu new file mode 100644 index 0000000..062427d --- /dev/null +++ b/stream_compaction/common.cu @@ -0,0 +1,40 @@ +#include "common.h" + + +namespace StreamCompaction { + namespace Common { + + /** + * Maps an array to an array of 0s and 1s for stream compaction. Elements + * which map to 0 will be removed, and elements which map to 1 will be kept. + */ + __global__ void kernMapToBoolean(int n, int *bools1, int *bools2, const int *idata) { + // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + + int result = idata[index] != 0; + bools1[index] = result; + bools2[index] = result; + } + + /** + * Performs scatter on an array. That is, for each element in idata, + * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. + */ + __global__ void kernScatter(int n, int *odata, + const int *idata, const int *bools, const int *indices) { + // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) { + return; + } + + if (bools[index]) { + odata[indices[index]] = idata[index]; + } + } + } +} diff --git a/stream_compaction/common.h b/stream_compaction/common.h new file mode 100644 index 0000000..74fe808 --- /dev/null +++ b/stream_compaction/common.h @@ -0,0 +1,129 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include + +#define useCommon 1 + +/** + * Check for CUDA errors; print and exit if there was a problem. + */ +inline int ilog2(int x) { + int lg = 0; + while (x >>= 1) { + ++lg; + } + return lg; +} + +inline int ilog2ceil(int x) { + return x == 1 ? 0 : ilog2(x - 1) + 1; +} + +namespace StreamCompaction { + namespace Common { + __global__ void kernMapToBoolean(int n, int *bools1, int *bools2, const int *idata); + + __global__ void kernScatter(int n, int *odata, + const int *idata, const int *bools, const int *indices); + + /** + * This class is used for timing the performance + * Uncopyable and unmovable + * + * Adapted from WindyDarian(https://github.com/WindyDarian) + */ + class PerformanceTimer + { + public: + PerformanceTimer() + { + cudaEventCreate(&event_start); + cudaEventCreate(&event_end); + } + + ~PerformanceTimer() + { + cudaEventDestroy(event_start); + cudaEventDestroy(event_end); + } + + void startCpuTimer() + { + if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); } + cpu_timer_started = true; + + time_start_cpu = std::chrono::high_resolution_clock::now(); + } + + void endCpuTimer() + { + time_end_cpu = std::chrono::high_resolution_clock::now(); + + if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); } + + std::chrono::duration duro = time_end_cpu - time_start_cpu; + prev_elapsed_time_cpu_milliseconds = + static_cast(duro.count()); + + cpu_timer_started = false; + } + + void startGpuTimer() + { + if (gpu_timer_started) { throw std::runtime_error("GPU timer already started"); } + gpu_timer_started = true; + + cudaEventRecord(event_start); + } + + void endGpuTimer() + { + cudaEventRecord(event_end); + cudaEventSynchronize(event_end); + + if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); } + + cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end); + gpu_timer_started = false; + } + + float getCpuElapsedTimeForPreviousOperation() //noexcept //(damn I need VS 2015 + { + return prev_elapsed_time_cpu_milliseconds; + } + + float getGpuElapsedTimeForPreviousOperation() //noexcept + { + return prev_elapsed_time_gpu_milliseconds; + } + + // remove copy and move functions + PerformanceTimer(const PerformanceTimer&) = delete; + PerformanceTimer(PerformanceTimer&&) = delete; + PerformanceTimer& operator=(const PerformanceTimer&) = delete; + PerformanceTimer& operator=(PerformanceTimer&&) = delete; + + private: + cudaEvent_t event_start = nullptr; + cudaEvent_t event_end = nullptr; + + using time_point_t = std::chrono::high_resolution_clock::time_point; + time_point_t time_start_cpu; + time_point_t time_end_cpu; + + bool cpu_timer_started = false; + bool gpu_timer_started = false; + + float prev_elapsed_time_cpu_milliseconds = 0.f; + float prev_elapsed_time_gpu_milliseconds = 0.f; + }; + } +}