Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
ee6b6a7
Make the `process_closure` functions iterative, rather than recursive.
tgrant-nv Oct 2, 2024
d9c845f
Use ID-based dispatch for get_albedo/eval/sample.
tgrant-nv Oct 2, 2024
8ace2ab
Enable pathtracing in OptiX mode.
tgrant-nv Oct 3, 2024
2552f8c
Add a padding field to the BSDF struct to avoid a misaligned address …
tgrant-nv Oct 4, 2024
5a7ade3
Update the reference images for the existing OptiX tests. Remove the …
tgrant-nv Oct 3, 2024
987c9f4
Enable the render-* tests for OptiX. Add alternative reference images…
tgrant-nv Oct 4, 2024
bd99c26
clang-format.
tgrant-nv Oct 7, 2024
096fab4
Don't need to pass the ShaderGlobals to Scene::intersect.
tgrant-nv Oct 7, 2024
6d10195
Don't use TraceData, just use payload registers. Don't use designated…
tgrant-nv Oct 7, 2024
35eb193
clang-format
tgrant-nv Oct 30, 2024
7a94afb
Fix the pixel offset in the "no jitter" case. Adjust the reference im…
tgrant-nv Oct 30, 2024
0c42c12
Eliminate vec_math.h.
tgrant-nv Oct 30, 2024
6a1052e
Get rid of the shading.h include.
tgrant-nv Oct 30, 2024
f2df957
Add a note about the single-warp requirement in prepare_cuda().
tgrant-nv Oct 30, 2024
d43b48c
Use the integer representation to nudge tmin instead of a fixed epsil…
tgrant-nv Oct 30, 2024
c1acb68
Remove the unneeded half.h include.
tgrant-nv Oct 31, 2024
daf816e
Remove unneeded defines for the primitive hit types.
tgrant-nv Nov 4, 2024
46e9b76
Wrap the cudaMalloc and cudaMemcpy calls.
tgrant-nv Nov 6, 2024
3f511f9
Rename ref images with the usual convention
lgritz Nov 12, 2024
a5f418b
Address platform-to-platform test result variation
lgritz Nov 12, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/cmake/testing.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,8 @@ macro ( TESTSUITE )
AND NOT EXISTS "${_testsrcdir}/NOOPTIX-FIXME"
AND NOT EXISTS "${_testsrcdir}/BATCHED_REGRESSION")
# Unoptimized
if (NOT EXISTS "${_testsrcdir}/OPTIMIZEONLY")
if (NOT EXISTS "${_testsrcdir}/OPTIMIZEONLY"
AND NOT EXISTS "${_testsrcdir}/OPTIX_OPTIMIZEONLY")
add_one_testsuite ("${_testname}.optix" "${_testsrcdir}"
ENV TESTSHADE_OPT=0 TESTSHADE_OPTIX=1 )
endif ()
Expand Down
6 changes: 5 additions & 1 deletion src/include/OSL/platform.h
Original file line number Diff line number Diff line change
Expand Up @@ -481,7 +481,11 @@
/// to use regular assert() for this purpose if you need to eliminate the
/// dependency on this header from a particular place (and don't mind that
/// assert won't format identically on all platforms).
#ifndef NDEBUG
///
/// These macros are no-ops when compiling for CUDA because they were found
/// to cause strange issues in device code (e.g., function bodies being
/// eliminated when OSL_DASSERT is used).
#if !defined(NDEBUG) && !defined(__CUDACC__)
# define OSL_DASSERT OSL_ASSERT
# define OSL_DASSERT_MSG OSL_ASSERT_MSG
#else
Expand Down
20 changes: 12 additions & 8 deletions src/testrender/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ if (OSL_USE_OPTIX)
list (APPEND testrender_srcs optixraytracer.cpp)
set (testrender_cuda_srcs
cuda/optix_raytracer.cu
cuda/wrapper.cu
)

set (testrender_rend_lib_srcs
Expand All @@ -25,17 +24,22 @@ if (OSL_USE_OPTIX)
)

# We need to make sure that the PTX files are regenerated whenever these
# headers change.
# files change.
set (testrender_cuda_headers
cuda/rend_lib.h
render_params.h)

set ( extra_cuda_headers
render_params.h )
background.h
optics.h
render_params.h
raytracer.h
sampling.h
shading.h
shading.cpp
simpleraytracer.cpp
)

# Generate PTX for all of the CUDA files
foreach (cudasrc ${testrender_cuda_srcs})
NVCC_COMPILE ( ${cudasrc} ${extra_cuda_headers} ptx_generated "" )
NVCC_COMPILE ( ${cudasrc} "${testrender_cuda_headers}" ptx_generated "" )
list (APPEND ptx_list ${ptx_generated})
endforeach ()

Expand All @@ -55,7 +59,7 @@ if (OSL_USE_OPTIX)
list (APPEND ptx_list ${rend_lib_ptx})

add_custom_target (testrender_ptx ALL
DEPENDS ${ptx_list}
DEPENDS ${ptx_list} ${testrender_cuda_headers}
SOURCES ${testrender_cuda_srcs} )

# Install the PTX files in a fixed location so that they can be
Expand Down
154 changes: 141 additions & 13 deletions src/testrender/background.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,48 @@

OSL_NAMESPACE_ENTER


// std::upper_bound is not supported in device code, so define a version of it here.
// Adapted from the LLVM Project, see https://llvm.org/LICENSE.txt for license information.
//
// Binary search over the 'count' elements starting at 'data', which must be
// sorted in ascending order with respect to operator<.
//
// Returns a pointer to the first element that compares greater than 'value',
// or data + count when no element does. A count <= 0 is treated as an empty
// range and returns 'data' without reading any element.
template<typename T>
inline OSL_HOSTDEVICE const T*
upper_bound(const T* data, int count, const T value)
{
    const T* first = data;
    int len        = count;
    // 'len > 0' (rather than '!= 0') keeps a negative count from walking
    // outside the array.
    while (len > 0) {
        const int half = len / 2;
        const T* mid   = first + half;
        if (value < *mid) {
            // The answer is at or before 'mid': keep the lower half
            len = half;
        } else {
            // Everything up to and including 'mid' is <= value: skip it
            first = mid + 1;
            len -= half + 1;
        }
    }
    return first;
}


struct Background {
// Construct an empty Background: no tables are allocated yet, so all three
// table pointers start out null.
OSL_HOSTDEVICE
Background() : values(nullptr), rows(nullptr), cols(nullptr) {}

// Release the tables. Only builds without __CUDACC__ free anything here;
// when compiling for CUDA the destructor leaves the pointers alone.
OSL_HOSTDEVICE
~Background()
{
#ifndef __CUDACC__
    // delete[] on a null pointer is a no-op, so an unprepared Background
    // is fine here.
    delete[] cols;
    delete[] rows;
    delete[] values;
#endif
}

template<typename F, typename T> void prepare(int resolution, F cb, T* data)
{
// These values are set via set_variables() in CUDA
res = resolution;
if (res < 32)
res = 32; // validate
Expand All @@ -29,6 +60,7 @@ struct Background {
values = new Vec3[res * res];
rows = new float[res];
cols = new float[res * res];

for (int y = 0, i = 0; y < res; y++) {
for (int x = 0; x < res; x++, i++) {
values[i] = cb(map(x + 0.5f, y + 0.5f), data);
Expand All @@ -43,8 +75,9 @@ struct Background {
cols[i - res + x] /= cols[i - 1];
}
// normalize the pdf across all scanlines
for (int y = 0; y < res; y++)
for (int y = 0; y < res; y++) {
rows[y] /= rows[res - 1];
}

// both eval and sample below return a "weight" that is
// value[i] / row*col_pdf, so might as well bake it into the table
Expand All @@ -65,6 +98,7 @@ struct Background {
#endif
}

OSL_HOSTDEVICE
Vec3 eval(const Vec3& dir, float& pdf) const
{
// map from sphere to unit-square
Expand All @@ -90,6 +124,7 @@ struct Background {
return values[i];
}

OSL_HOSTDEVICE
Vec3 sample(float rx, float ry, Dual2<Vec3>& dir, float& pdf) const
{
float row_pdf, col_pdf;
Expand All @@ -101,8 +136,98 @@ struct Background {
return values[y * res + x];
}

#ifdef __CUDACC__
// Attach caller-provided tables to this Background and derive the cached
// scale factors from the resolution. This is the CUDA-side counterpart to
// the setup that prepare() performs itself.
//
//   values_in  table of res_in * res_in entries
//   rows_in    row table (res_in entries)
//   cols_in    column table (res_in * res_in entries)
//   res_in     table resolution; asserted to be at least 32
//
// NOTE(review): the table sizes above mirror what prepare() allocates;
// confirm the caller sizes its buffers the same way.
OSL_HOSTDEVICE
void set_variables(Vec3* values_in, float* rows_in, float* cols_in,
                   int res_in)
{
    values = values_in;
    rows   = rows_in;
    cols   = cols_in;
    res    = res_in;

    // invres = 1 / res
    const float res_as_float = static_cast<float>(res_in);
    invres                   = __frcp_rn(res_as_float);

    // invjacobian = res^2 / (4 * pi); the product is formed in int first
    const float entry_count = static_cast<float>(res_in * res_in);
    const float four_pi     = float(4 * M_PI);
    invjacobian             = __fdiv_rn(entry_count, four_pi);

    assert(res >= 32);
}

// Build the tables in three stages, called once per participating thread:
//   1. prepare_cuda_01: every caller fills its share of 'values' via cb
//   2. prepare_cuda_02: only the caller with idx == 0 runs this stage
//   3. prepare_cuda_03: every caller runs this stage with its stride/idx
//
//   stride  step between the columns/rows handled by one caller
//   idx     this caller's starting offset; idx == 0 also runs stage 2
//   cb      callable passed through to prepare_cuda_01
template<typename F>
OSL_HOSTDEVICE void prepare_cuda(int stride, int idx, F cb)
{
// N.B. This needs to run on a single-warp launch, since there is no
// synchronization across warps in OptiX.
prepare_cuda_01(stride, idx, cb);
if (idx == 0)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe leave a comment here as well that this is running on a single warp? At first it wasn't clear to me how you can get away with no synchronization -- but it makes sense if there's only a single warp here.

prepare_cuda_02();
prepare_cuda_03(stride, idx);
}

// Pre-compute the 'values' table in parallel.
//
// In every row y, this caller fills columns idx, idx + stride,
// idx + 2 * stride, ... with cb(map(x + 0.5f, y + 0.5f)).
//
//   stride  step between the columns handled by this caller
//   idx     first column handled by this caller
//   cb      callable invoked with the result of map()
template<typename F>
OSL_HOSTDEVICE void prepare_cuda_01(int stride, int idx, F cb)
{
    for (int y = 0; y < res; y++) {
        const int row_start = y * res;
        // The loop condition x < res already keeps row_start + x inside
        // row y, so no separate end-of-row check is needed.
        for (int x = idx; x < res; x += stride) {
            values[row_start + x] = cb(map(x + 0.5f, y + 0.5f));
        }
    }
}

// Compute 'cols' and 'rows' using a single thread.
//
// For each row, cols[] receives the running sum of max(x, y, z) of the
// entries in 'values', and rows[y] receives the running sum of the row
// totals. Each row of cols[] is then divided by its own total when that
// total is positive. rows[] is left unnormalized here.
OSL_HOSTDEVICE void prepare_cuda_02()
{
    float rows_running = 0.0f;  // sum of the totals of all rows so far
    for (int y = 0; y < res; ++y) {
        const int row_begin = y * res;
        const int row_end   = row_begin + res;

        // Running sum along the row of the largest channel of each entry
        float cols_running = 0.0f;
        for (int i = row_begin; i < row_end; ++i) {
            const float peak = std::max(std::max(values[i].x, values[i].y),
                                        values[i].z);
            cols_running     = peak + cols_running;
            cols[i]          = cols_running;
        }

        // After the loop, cols_running is the (unnormalized) row total
        const float row_total = cols_running;
        rows_running          = row_total + rows_running;
        rows[y]               = rows_running;

        // normalize the pdf for this scanline (if it was non-zero)
        if (row_total > 0) {
            for (int i = row_begin; i < row_end; ++i) {
                cols[i] = __fdiv_rn(cols[i], row_total);
            }
        }
    }
}

// Normalize the row PDFs and finalize the 'values' table.
//
// This caller normalizes rows idx, idx + stride, ... and then, in every
// row, rescales columns idx, idx + stride, ... of 'values'.
//
// NOTE(review): the second loop reads rows[] entries that other callers
// write in the first loop; this appears to rely on the single-warp launch
// noted in prepare_cuda() -- confirm.
OSL_HOSTDEVICE void prepare_cuda_03(int stride, int idx)
{
    // normalize the pdf across all scanlines
    for (int y = idx; y < res; y += stride) {
        rows[y] = __fdiv_rn(rows[y], rows[res - 1]);
    }

    // both eval and sample below return a "weight" that is
    // value[i] / row*col_pdf, so might as well bake it into the table
    for (int y = 0; y < res; y++) {
        const float row_pdf = rows[y] - (y > 0 ? rows[y - 1] : 0.0f);
        const int row_start = y * res;
        // The loop condition x < res already keeps row_start + x inside
        // row y, so no separate end-of-row check is needed.
        for (int x = idx; x < res; x += stride) {
            const int i         = row_start + x;
            const float col_pdf = cols[i] - (x > 0 ? cols[i - 1] : 0.0f);
            const float divisor = __fmul_rn(__fmul_rn(row_pdf, col_pdf),
                                            invjacobian);
            values[i].x = __fdiv_rn(values[i].x, divisor);
            values[i].y = __fdiv_rn(values[i].y, divisor);
            values[i].z = __fdiv_rn(values[i].z, divisor);
        }
    }
}
#endif

private:
Dual2<Vec3> map(float x, float y) const
OSL_HOSTDEVICE Dual2<Vec3> map(float x, float y) const
{
// pixel coordinates of entry (x,y)
Dual2<float> u = Dual2<float>(x, 1, 0) * invres;
Expand All @@ -115,14 +240,16 @@ struct Background {
return make_Vec3(sin_phi * ct, sin_phi * st, cos_phi);
}

static float sample_cdf(const float* data, unsigned int n, float x,
unsigned int* idx, float* pdf)
static OSL_HOSTDEVICE float sample_cdf(const float* data, unsigned int n,
float x, unsigned int* idx,
float* pdf)
{
OSL_DASSERT(x >= 0);
OSL_DASSERT(x < 1);
*idx = std::upper_bound(data, data + n, x) - data;
OSL_DASSERT(x >= 0.0f);
OSL_DASSERT(x < 1.0f);
*idx = OSL::upper_bound(data, n, x) - data;
OSL_DASSERT(*idx < n);
OSL_DASSERT(x < data[*idx]);

float scaled_sample;
if (*idx == 0) {
*pdf = data[0];
Expand All @@ -137,12 +264,13 @@ struct Background {
return std::min(scaled_sample, 0.99999994f);
}

Vec3* values; // actual map
float* rows; // probability of choosing a given row 'y'
float* cols; // probability of choosing a given column 'x', given that we've chosen row 'y'
int res; // resolution in pixels of the precomputed table
float invres; // 1 / resolution
float invjacobian;
Vec3* values = nullptr; // actual map
float* rows = nullptr; // probability of choosing a given row 'y'
float* cols
= nullptr; // probability of choosing a given column 'x', given that we've chosen row 'y'
int res = -1; // resolution in pixels of the precomputed table
float invres = 0.0f; // 1 / resolution
float invjacobian = 0.0f;
};

OSL_NAMESPACE_EXIT
Loading