Skip to content

Commit bda7495

Browse files
authored
OptiX testrender overhaul (take two) (#1897)
This PR is a continuation of #1829, updated to include the recently added triangle mesh support. It enables full path tracing support for the OptiX backend in testrender. We have tried to share code between the CPU and OptiX backends where practical. There is more sharing in this PR than there was in #1829, which should reduce the maintenance burden a bit.

ID-based dispatch

Virtual function calls aren't well supported in OptiX, so rather than using regular C++ polymorphism to invoke the sample(), eval(), and get_albedo() functions for each of the BSDF sub-types, we manually invoke the correct function based on the closure ID (which we have added as a member of the BSDF class).

```
#define BSDF_CAST(BSDF_TYPE, bsdf) reinterpret_cast<const BSDF_TYPE*>(bsdf)

OSL_HOSTDEVICE Color3
CompositeBSDF::get_albedo(const BSDF* bsdf, const Vec3& wo) const
{
    Color3 albedo(0);
    switch (bsdf->id) {
    case DIFFUSE_ID:
        albedo = BSDF_CAST(Diffuse<0>, bsdf)->get_albedo(wo);
        break;
    case TRANSPARENT_ID:
    case MX_TRANSPARENT_ID:
        albedo = BSDF_CAST(Transparent, bsdf)->get_albedo(wo);
        break;
```

Iterative closure evaluation

Another key change is the non-recursive closure evaluation. We apply the same style of iterative tree traversal used in the previous OptiX version of process_closure() to the shared implementations of process_closure(), evaluate_layer_opacity(), process_medium_closure(), and process_background_closure().

Background sampling

We've included support for background closures. This includes an OptiX implementation of the Background::prepare() function. We've broken that function into three phases, where phases 1 and 3 are parallelized across a warp and phase 2 is executed on a single thread. This offers a decent speedup over a single-threaded implementation without the complexity of a more sophisticated implementation.

```
// from background.h
template<typename F>
OSL_HOSTDEVICE void prepare_cuda(int stride, int idx, F cb)
{
    prepare_cuda_01(stride, idx, cb);
    if (idx == 0)
        prepare_cuda_02();
    prepare_cuda_03(stride, idx);
}
```

Tests

I have enabled the render-* tests for OptiX mode. I've added alternative reference images, since the GPU output exceeds the difference threshold on many of the tests. But in most cases the difference between the CPU and GPU output is very small.

---------

Signed-off-by: Tim Grant <[email protected]>
1 parent 0d3e9d2 commit bda7495

File tree

91 files changed

+2293
-1446
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

91 files changed

+2293
-1446
lines changed

src/cmake/testing.cmake

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,8 @@ macro ( TESTSUITE )
179179
AND NOT EXISTS "${_testsrcdir}/NOOPTIX-FIXME"
180180
AND NOT EXISTS "${_testsrcdir}/BATCHED_REGRESSION")
181181
# Unoptimized
182-
if (NOT EXISTS "${_testsrcdir}/OPTIMIZEONLY")
182+
if (NOT EXISTS "${_testsrcdir}/OPTIMIZEONLY"
183+
AND NOT EXISTS "${_testsrcdir}/OPTIX_OPTIMIZEONLY")
183184
add_one_testsuite ("${_testname}.optix" "${_testsrcdir}"
184185
ENV TESTSHADE_OPT=0 TESTSHADE_OPTIX=1 )
185186
endif ()

src/include/OSL/platform.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -481,7 +481,11 @@
481481
/// to use regular assert() for this purpose if you need to eliminate the
482482
/// dependency on this header from a particular place (and don't mind that
483483
/// assert won't format identically on all platforms).
484-
#ifndef NDEBUG
484+
///
485+
/// These macros are no-ops when compiling for CUDA because they were found
486+
/// to cause strange issues in device code (e.g., function bodies being
487+
/// eliminated when OSL_DASSERT is used).
488+
#if !defined(NDEBUG) && !defined(__CUDACC__)
485489
# define OSL_DASSERT OSL_ASSERT
486490
# define OSL_DASSERT_MSG OSL_ASSERT_MSG
487491
#else

src/testrender/CMakeLists.txt

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ if (OSL_USE_OPTIX)
1616
list (APPEND testrender_srcs optixraytracer.cpp)
1717
set (testrender_cuda_srcs
1818
cuda/optix_raytracer.cu
19-
cuda/wrapper.cu
2019
)
2120

2221
set (testrender_rend_lib_srcs
@@ -25,17 +24,22 @@ if (OSL_USE_OPTIX)
2524
)
2625

2726
# We need to make sure that the PTX files are regenerated whenever these
28-
# headers change.
27+
# files change.
2928
set (testrender_cuda_headers
3029
cuda/rend_lib.h
31-
render_params.h)
32-
33-
set ( extra_cuda_headers
34-
render_params.h )
30+
background.h
31+
optics.h
32+
render_params.h
33+
raytracer.h
34+
sampling.h
35+
shading.h
36+
shading.cpp
37+
simpleraytracer.cpp
38+
)
3539

3640
# Generate PTX for all of the CUDA files
3741
foreach (cudasrc ${testrender_cuda_srcs})
38-
NVCC_COMPILE ( ${cudasrc} ${extra_cuda_headers} ptx_generated "" )
42+
NVCC_COMPILE ( ${cudasrc} "${testrender_cuda_headers}" ptx_generated "" )
3943
list (APPEND ptx_list ${ptx_generated})
4044
endforeach ()
4145

@@ -55,7 +59,7 @@ if (OSL_USE_OPTIX)
5559
list (APPEND ptx_list ${rend_lib_ptx})
5660

5761
add_custom_target (testrender_ptx ALL
58-
DEPENDS ${ptx_list}
62+
DEPENDS ${ptx_list} ${testrender_cuda_headers}
5963
SOURCES ${testrender_cuda_srcs} )
6064

6165
# Install the PTX files in a fixed location so that they can be

src/testrender/background.h

Lines changed: 141 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -10,17 +10,48 @@
1010

1111
OSL_NAMESPACE_ENTER
1212

13+
14+
// std::upper_bound is not supported in device code, so define a version of it here.
15+
// Adapted from the LLVM Project, see https://llvm.org/LICENSE.txt for license information.
16+
// Binary search over the `count` elements starting at `data`, which must be
// sorted in ascending order with respect to operator<.
//
//   data   pointer to the first element of the sorted range
//   count  number of elements in the range; values <= 0 are treated as an
//          empty range
//   value  value to search for
//
// Returns a pointer to the first element that compares greater than `value`,
// or `data + count` if there is no such element (`data` itself for an empty
// range).
template<typename T>
inline OSL_HOSTDEVICE const T*
upper_bound(const T* data, int count, const T value)
{
    const T* first = data;
    int len        = count;
    // Loop on `len > 0` rather than `len != 0` so that a negative count ends
    // the search immediately instead of reading outside the range.
    while (len > 0) {
        const int half = len / 2;
        const T* mid   = first + half;
        if (value < *mid) {
            // the answer lies in [first, mid)
            len = half;
        } else {
            // the answer lies strictly after mid
            first = mid + 1;
            len -= half + 1;
        }
    }
    return first;
}
36+
37+
1338
struct Background {
39+
// Construct an empty background with all three table pointers null.
OSL_HOSTDEVICE
Background() : values(nullptr), rows(nullptr), cols(nullptr) {}
41+
42+
// Free the three tables. When compiling for CUDA nothing is released here:
// in that mode the pointers are handed in through set_variables().
OSL_HOSTDEVICE
~Background()
{
#if !defined(__CUDACC__)
    delete[] values;
    delete[] rows;
    delete[] cols;
#endif
}
2151

2252
template<typename F, typename T> void prepare(int resolution, F cb, T* data)
2353
{
54+
// These values are set via set_variables() in CUDA
2455
res = resolution;
2556
if (res < 32)
2657
res = 32; // validate
@@ -29,6 +60,7 @@ struct Background {
2960
values = new Vec3[res * res];
3061
rows = new float[res];
3162
cols = new float[res * res];
63+
3264
for (int y = 0, i = 0; y < res; y++) {
3365
for (int x = 0; x < res; x++, i++) {
3466
values[i] = cb(map(x + 0.5f, y + 0.5f), data);
@@ -43,8 +75,9 @@ struct Background {
4375
cols[i - res + x] /= cols[i - 1];
4476
}
4577
// normalize the pdf across all scanlines
46-
for (int y = 0; y < res; y++)
78+
for (int y = 0; y < res; y++) {
4779
rows[y] /= rows[res - 1];
80+
}
4881

4982
// both eval and sample below return a "weight" that is
5083
// value[i] / row*col_pdf, so might as well bake it into the table
@@ -65,6 +98,7 @@ struct Background {
6598
#endif
6699
}
67100

101+
OSL_HOSTDEVICE
68102
Vec3 eval(const Vec3& dir, float& pdf) const
69103
{
70104
// map from sphere to unit-square
@@ -90,6 +124,7 @@ struct Background {
90124
return values[i];
91125
}
92126

127+
OSL_HOSTDEVICE
93128
Vec3 sample(float rx, float ry, Dual2<Vec3>& dir, float& pdf) const
94129
{
95130
float row_pdf, col_pdf;
@@ -101,8 +136,98 @@ struct Background {
101136
return values[y * res + x];
102137
}
103138

139+
#ifdef __CUDACC__
140+
// Point this object at externally provided tables and derive the cached
// scale factors from the table resolution (CUDA compilation only).
//
//   values_in  storage for the res_in * res_in map entries
//   rows_in    storage for the res_in row CDF entries
//   cols_in    storage for the res_in * res_in column CDF entries
//   res_in     table resolution; asserted to be at least 32
OSL_HOSTDEVICE
void set_variables(Vec3* values_in, float* rows_in, float* cols_in,
                   int res_in)
{
    // Check the resolution before anything is derived from it.
    assert(res_in >= 32);

    values = values_in;
    rows   = rows_in;
    cols   = cols_in;
    res    = res_in;

    // Convert to float once, up front: squaring in float cannot overflow
    // the way the int product can for large resolutions, and it makes the
    // int -> float conversions feeding the intrinsics explicit.
    const float fres = float(res_in);
    invres           = __frcp_rn(fres);
    invjacobian      = __fdiv_rn(__fmul_rn(fres, fres), float(4 * M_PI));
}
152+
153+
template<typename F>
154+
OSL_HOSTDEVICE void prepare_cuda(int stride, int idx, F cb)
155+
{
156+
// N.B. This needs to run on a single-warp launch, since there is no
157+
// synchronization across warps in OptiX.
158+
prepare_cuda_01(stride, idx, cb);
159+
if (idx == 0)
160+
prepare_cuda_02();
161+
prepare_cuda_03(stride, idx);
162+
}
163+
164+
// Pre-compute the 'values' table in parallel
165+
template<typename F>
166+
OSL_HOSTDEVICE void prepare_cuda_01(int stride, int idx, F cb)
167+
{
168+
for (int y = 0; y < res; y++) {
169+
const int row_start = y * res;
170+
const int row_end = row_start + res;
171+
int i = row_start + idx;
172+
for (int x = idx; x < res; x += stride, i += stride) {
173+
if (i >= row_end)
174+
continue;
175+
values[i] = cb(map(x + 0.5f, y + 0.5f));
176+
}
177+
}
178+
}
179+
180+
// Compute 'cols' and 'rows' using a single thread
181+
OSL_HOSTDEVICE void prepare_cuda_02()
182+
{
183+
for (int y = 0, i = 0; y < res; y++) {
184+
for (int x = 0; x < res; x++, i++) {
185+
cols[i] = std::max(std::max(values[i].x, values[i].y),
186+
values[i].z)
187+
+ ((x > 0) ? cols[i - 1] : 0.0f);
188+
}
189+
rows[y] = cols[i - 1] + ((y > 0) ? rows[y - 1] : 0.0f);
190+
// normalize the pdf for this scanline (if it was non-zero)
191+
if (cols[i - 1] > 0) {
192+
for (int x = 0; x < res; x++) {
193+
cols[i - res + x] = __fdiv_rn(cols[i - res + x],
194+
cols[i - 1]);
195+
}
196+
}
197+
}
198+
}
199+
200+
// Normalize the row PDFs and finalize the 'values' table
201+
OSL_HOSTDEVICE void prepare_cuda_03(int stride, int idx)
202+
{
203+
// normalize the pdf across all scanlines
204+
for (int y = idx; y < res; y += stride) {
205+
rows[y] = __fdiv_rn(rows[y], rows[res - 1]);
206+
}
207+
208+
// both eval and sample below return a "weight" that is
209+
// value[i] / row*col_pdf, so might as well bake it into the table
210+
for (int y = 0; y < res; y++) {
211+
float row_pdf = rows[y] - (y > 0 ? rows[y - 1] : 0.0f);
212+
const int row_start = y * res;
213+
const int row_end = row_start + res;
214+
int i = row_start + idx;
215+
for (int x = idx; x < res; x += stride, i += stride) {
216+
if (i >= row_end)
217+
continue;
218+
float col_pdf = cols[i] - (x > 0 ? cols[i - 1] : 0.0f);
219+
const float divisor = __fmul_rn(__fmul_rn(row_pdf, col_pdf),
220+
invjacobian);
221+
values[i].x = __fdiv_rn(values[i].x, divisor);
222+
values[i].y = __fdiv_rn(values[i].y, divisor);
223+
values[i].z = __fdiv_rn(values[i].z, divisor);
224+
}
225+
}
226+
}
227+
#endif
228+
104229
private:
105-
Dual2<Vec3> map(float x, float y) const
230+
OSL_HOSTDEVICE Dual2<Vec3> map(float x, float y) const
106231
{
107232
// pixel coordinates of entry (x,y)
108233
Dual2<float> u = Dual2<float>(x, 1, 0) * invres;
@@ -115,14 +240,16 @@ struct Background {
115240
return make_Vec3(sin_phi * ct, sin_phi * st, cos_phi);
116241
}
117242

118-
static float sample_cdf(const float* data, unsigned int n, float x,
119-
unsigned int* idx, float* pdf)
243+
static OSL_HOSTDEVICE float sample_cdf(const float* data, unsigned int n,
244+
float x, unsigned int* idx,
245+
float* pdf)
120246
{
121-
OSL_DASSERT(x >= 0);
122-
OSL_DASSERT(x < 1);
123-
*idx = std::upper_bound(data, data + n, x) - data;
247+
OSL_DASSERT(x >= 0.0f);
248+
OSL_DASSERT(x < 1.0f);
249+
*idx = OSL::upper_bound(data, n, x) - data;
124250
OSL_DASSERT(*idx < n);
125251
OSL_DASSERT(x < data[*idx]);
252+
126253
float scaled_sample;
127254
if (*idx == 0) {
128255
*pdf = data[0];
@@ -137,12 +264,13 @@ struct Background {
137264
return std::min(scaled_sample, 0.99999994f);
138265
}
139266

140-
Vec3* values; // actual map
141-
float* rows; // probability of choosing a given row 'y'
142-
float* cols; // probability of choosing a given column 'x', given that we've chosen row 'y'
143-
int res; // resolution in pixels of the precomputed table
144-
float invres; // 1 / resolution
145-
float invjacobian;
267+
Vec3* values = nullptr; // actual map
268+
float* rows = nullptr; // probability of choosing a given row 'y'
269+
float* cols
270+
= nullptr; // probability of choosing a given column 'x', given that we've chosen row 'y'
271+
int res = -1; // resolution in pixels of the precomputed table
272+
float invres = 0.0f; // 1 / resolution
273+
float invjacobian = 0.0f;
146274
};
147275

148276
OSL_NAMESPACE_EXIT

0 commit comments

Comments
 (0)