diff --git a/src/cmake/testing.cmake b/src/cmake/testing.cmake index a2de5e315..fd30c4199 100644 --- a/src/cmake/testing.cmake +++ b/src/cmake/testing.cmake @@ -179,7 +179,8 @@ macro ( TESTSUITE ) AND NOT EXISTS "${_testsrcdir}/NOOPTIX-FIXME" AND NOT EXISTS "${_testsrcdir}/BATCHED_REGRESSION") # Unoptimized - if (NOT EXISTS "${_testsrcdir}/OPTIMIZEONLY") + if (NOT EXISTS "${_testsrcdir}/OPTIMIZEONLY" + AND NOT EXISTS "${_testsrcdir}/OPTIX_OPTIMIZEONLY") add_one_testsuite ("${_testname}.optix" "${_testsrcdir}" ENV TESTSHADE_OPT=0 TESTSHADE_OPTIX=1 ) endif () diff --git a/src/include/OSL/platform.h b/src/include/OSL/platform.h index 39fb4496c..33268e732 100644 --- a/src/include/OSL/platform.h +++ b/src/include/OSL/platform.h @@ -481,7 +481,11 @@ /// to use regular assert() for this purpose if you need to eliminate the /// dependency on this header from a particular place (and don't mind that /// assert won't format identically on all platforms). -#ifndef NDEBUG +/// +/// These macros are no-ops when compiling for CUDA because they were found +/// to cause strange issues in device code (e.g., function bodies being +/// eliminated when OSL_DASSERT is used). +#if !defined(NDEBUG) && !defined(__CUDACC__) # define OSL_DASSERT OSL_ASSERT # define OSL_DASSERT_MSG OSL_ASSERT_MSG #else diff --git a/src/testrender/CMakeLists.txt b/src/testrender/CMakeLists.txt index 590f26447..c4ab69e7a 100644 --- a/src/testrender/CMakeLists.txt +++ b/src/testrender/CMakeLists.txt @@ -16,7 +16,6 @@ if (OSL_USE_OPTIX) list (APPEND testrender_srcs optixraytracer.cpp) set (testrender_cuda_srcs cuda/optix_raytracer.cu - cuda/wrapper.cu ) set (testrender_rend_lib_srcs @@ -25,17 +24,22 @@ if (OSL_USE_OPTIX) ) # We need to make sure that the PTX files are regenerated whenever these - # headers change. + # files change. set (testrender_cuda_headers cuda/rend_lib.h - render_params.h) - - set ( extra_cuda_headers - render_params.h ) + background.h + optics.h + render_params.h + raytracer.h + sampling.h + shading.h + shading.cpp + simpleraytracer.cpp + ) # Generate PTX for all of the CUDA files foreach (cudasrc ${testrender_cuda_srcs}) - NVCC_COMPILE ( ${cudasrc} ${extra_cuda_headers} ptx_generated "" ) + NVCC_COMPILE ( ${cudasrc} "${testrender_cuda_headers}" ptx_generated "" ) list (APPEND ptx_list ${ptx_generated}) endforeach () @@ -55,7 +59,7 @@ if (OSL_USE_OPTIX) list (APPEND ptx_list ${rend_lib_ptx}) add_custom_target (testrender_ptx ALL - DEPENDS ${ptx_list} + DEPENDS ${ptx_list} ${testrender_cuda_headers} SOURCES ${testrender_cuda_srcs} ) # Install the PTX files in a fixed location so that they can be diff --git a/src/testrender/background.h b/src/testrender/background.h index ac196cf27..e6daa28af 100644 --- a/src/testrender/background.h +++ b/src/testrender/background.h @@ -10,17 +10,48 @@ OSL_NAMESPACE_ENTER + +// std::upper_bound is not supported in device code, so define a version of it here. +// Adapted from the LLVM Project, see https://llvm.org/LICENSE.txt for license information. +template +inline OSL_HOSTDEVICE const T* +upper_bound(const T* data, int count, const T value) +{ + const T* first = data; + const T value_ = value; + int len = count; + while (len != 0) { + int l2 = len / 2; + const T* m = first; + m += l2; + if (value_ < *m) + len = l2; + else { + first = ++m; + len -= l2 + 1; + } + } + return first; +} + + struct Background { + OSL_HOSTDEVICE Background() : values(0), rows(0), cols(0) {} + + OSL_HOSTDEVICE ~Background() { +#ifndef __CUDACC__ delete[] values; delete[] rows; delete[] cols; +#endif } template void prepare(int resolution, F cb, T* data) { + // These values are set via set_variables() in CUDA res = resolution; if (res < 32) res = 32; // validate @@ -29,6 +60,7 @@ struct Background { values = new Vec3[res * res]; rows = new float[res]; cols = new float[res * res]; + for (int y = 0, i = 0; y < res; y++) { for (int x = 0; x < res; x++, i++) { values[i] = cb(map(x + 0.5f, y + 0.5f), data); @@ -43,8 +75,9 @@ struct Background { cols[i - res + x] /= cols[i - 1]; } // normalize the pdf across all scanlines - for (int y = 0; y < res; y++) + for (int y = 0; y < res; y++) { rows[y] /= rows[res - 1]; + } // both eval and sample below return a "weight" that is // value[i] / row*col_pdf, so might as well bake it into the table @@ -65,6 +98,7 @@ struct Background { #endif } + OSL_HOSTDEVICE Vec3 eval(const Vec3& dir, float& pdf) const { // map from sphere to unit-square @@ -90,6 +124,7 @@ struct Background { return values[i]; } + OSL_HOSTDEVICE Vec3 sample(float rx, float ry, Dual2& dir, float& pdf) const { float row_pdf, col_pdf; @@ -101,8 +136,98 @@ struct Background { return values[y * res + x]; } +#ifdef __CUDACC__ + OSL_HOSTDEVICE + void set_variables(Vec3* values_in, float* rows_in, float* cols_in, + int res_in) + { + values = values_in; + rows = rows_in; + cols = cols_in; + res = res_in; + invres = __frcp_rn(res); + invjacobian = __fdiv_rn(res * res, float(4 * M_PI)); + assert(res >= 32); + } + + template + OSL_HOSTDEVICE void prepare_cuda(int stride, int idx, F cb) + { + // N.B. This needs to run on a single-warp launch, since there is no + // synchronization across warps in OptiX. + prepare_cuda_01(stride, idx, cb); + if (idx == 0) + prepare_cuda_02(); + prepare_cuda_03(stride, idx); + } + + // Pre-compute the 'values' table in parallel + template + OSL_HOSTDEVICE void prepare_cuda_01(int stride, int idx, F cb) + { + for (int y = 0; y < res; y++) { + const int row_start = y * res; + const int row_end = row_start + res; + int i = row_start + idx; + for (int x = idx; x < res; x += stride, i += stride) { + if (i >= row_end) + continue; + values[i] = cb(map(x + 0.5f, y + 0.5f)); + } + } + } + + // Compute 'cols' and 'rows' using a single thread + OSL_HOSTDEVICE void prepare_cuda_02() + { + for (int y = 0, i = 0; y < res; y++) { + for (int x = 0; x < res; x++, i++) { + cols[i] = std::max(std::max(values[i].x, values[i].y), + values[i].z) + + ((x > 0) ? cols[i - 1] : 0.0f); + } + rows[y] = cols[i - 1] + ((y > 0) ? rows[y - 1] : 0.0f); + // normalize the pdf for this scanline (if it was non-zero) + if (cols[i - 1] > 0) { + for (int x = 0; x < res; x++) { + cols[i - res + x] = __fdiv_rn(cols[i - res + x], + cols[i - 1]); + } + } + } + } + + // Normalize the row PDFs and finalize the 'values' table + OSL_HOSTDEVICE void prepare_cuda_03(int stride, int idx) + { + // normalize the pdf across all scanlines + for (int y = idx; y < res; y += stride) { + rows[y] = __fdiv_rn(rows[y], rows[res - 1]); + } + + // both eval and sample below return a "weight" that is + // value[i] / row*col_pdf, so might as well bake it into the table + for (int y = 0; y < res; y++) { + float row_pdf = rows[y] - (y > 0 ? rows[y - 1] : 0.0f); + const int row_start = y * res; + const int row_end = row_start + res; + int i = row_start + idx; + for (int x = idx; x < res; x += stride, i += stride) { + if (i >= row_end) + continue; + float col_pdf = cols[i] - (x > 0 ? cols[i - 1] : 0.0f); + const float divisor = __fmul_rn(__fmul_rn(row_pdf, col_pdf), + invjacobian); + values[i].x = __fdiv_rn(values[i].x, divisor); + values[i].y = __fdiv_rn(values[i].y, divisor); + values[i].z = __fdiv_rn(values[i].z, divisor); + } + } + } +#endif + private: - Dual2 map(float x, float y) const + OSL_HOSTDEVICE Dual2 map(float x, float y) const { // pixel coordinates of entry (x,y) Dual2 u = Dual2(x, 1, 0) * invres; @@ -115,14 +240,16 @@ struct Background { return make_Vec3(sin_phi * ct, sin_phi * st, cos_phi); } - static float sample_cdf(const float* data, unsigned int n, float x, - unsigned int* idx, float* pdf) + static OSL_HOSTDEVICE float sample_cdf(const float* data, unsigned int n, + float x, unsigned int* idx, + float* pdf) { - OSL_DASSERT(x >= 0); - OSL_DASSERT(x < 1); - *idx = std::upper_bound(data, data + n, x) - data; + OSL_DASSERT(x >= 0.0f); + OSL_DASSERT(x < 1.0f); + *idx = OSL::upper_bound(data, n, x) - data; OSL_DASSERT(*idx < n); OSL_DASSERT(x < data[*idx]); + float scaled_sample; if (*idx == 0) { *pdf = data[0]; @@ -137,12 +264,13 @@ struct Background { return std::min(scaled_sample, 0.99999994f); } - Vec3* values; // actual map - float* rows; // probability of choosing a given row 'y' - float* cols; // probability of choosing a given column 'x', given that we've chosen row 'y' - int res; // resolution in pixels of the precomputed table - float invres; // 1 / resolution - float invjacobian; + Vec3* values = nullptr; // actual map + float* rows = nullptr; // probability of choosing a given row 'y' + float* cols + = nullptr; // probability of choosing a given column 'x', given that we've chosen row 'y' + int res = -1; // resolution in pixels of the precomputed table + float invres = 0.0f; // 1 / resolution + float invjacobian = 0.0f; }; OSL_NAMESPACE_EXIT diff --git a/src/testrender/cuda/optix_raytracer.cu b/src/testrender/cuda/optix_raytracer.cu index 631055f48..513dca223 100644 --- a/src/testrender/cuda/optix_raytracer.cu +++ b/src/testrender/cuda/optix_raytracer.cu @@ -4,16 +4,29 @@ #include - -#include "util.h" - #include #include -#include "../render_params.h" +#include "optix_raytracer.h" #include "rend_lib.h" +#include "../background.h" +#include "../bvh.h" +#include "../raytracer.h" +#include "../render_params.h" +#include "../sampling.h" +#include "../shading.cpp" + +#include + + +// Conversion macros for casting between vector types +#define F3_TO_V3(f3) (*reinterpret_cast(&f3)) +#define F3_TO_C3(f3) (*reinterpret_cast(&f3)) +#define V3_TO_F3(v3) (*reinterpret_cast(&v3)) +#define C3_TO_F3(c3) (*reinterpret_cast(&c3)) + OSL_NAMESPACE_ENTER namespace pvt { @@ -34,6 +47,171 @@ __device__ __constant__ RenderParams render_params; } +static inline __device__ void +execute_shader(ShaderGlobalsType& sg, const int shader_id, char* closure_pool) +{ + if (shader_id < 0) { + // TODO: should probably never get here ... + return; + } + + // Pack the "closure pool" into one of the ShaderGlobals pointers + *(int*)&closure_pool[0] = 0; + sg.renderstate = &closure_pool[0]; + + // Pack the pointers to the options structs in a faux "context", + // which is a rough stand-in for the host ShadingContext. + ShadingContextCUDA shading_context; + sg.context = &shading_context; + + // Run the OSL callable + void* interactive_ptr = reinterpret_cast( + render_params.interactive_params)[shader_id]; + const unsigned int shaderIdx = shader_id + 0u; + optixDirectCall( + shaderIdx, &sg /*shaderglobals_ptr*/, nullptr /*groupdata_ptr*/, + nullptr /*userdata_base_ptr*/, nullptr /*output_base_ptr*/, + 0 /*shadeindex - unused*/, interactive_ptr /*interactive_params_ptr*/ + ); +} + + +static inline __device__ void +trace_ray(OptixTraversableHandle handle, Payload& payload, const float3& origin, + const float3& direction, const float tmin) +{ + uint32_t p0 = static_cast(payload.hit_id); + uint32_t p1 = __float_as_uint(payload.hit_t); + uint32_t p2 = __float_as_uint(payload.hit_u); + uint32_t p3 = __float_as_uint(payload.hit_v); + optixTrace(handle, // handle + origin, // origin + direction, // direction + tmin, // tmin + 1e13f, // tmax + 0, // ray time + OptixVisibilityMask(1), // visibility mask + OPTIX_RAY_FLAG_DISABLE_ANYHIT, // ray flags + 0, // SBT offset + 1, // SBT stride + 0, // miss SBT offset + p0, p1, p2, p3); + payload.hit_id = static_cast(p0); + payload.hit_t = __uint_as_float(p1); + payload.hit_u = __uint_as_float(p2); + payload.hit_v = __uint_as_float(p3); +}; + + +Intersection +Scene::intersect(const Ray& r, const float tmax, const unsigned skipID1, + const unsigned /*skipID2*/) const +{ + // Trace the ray against the scene. If the ID for the hit matches skipID1, + // "nudge" the ray forward by adjusting tmin to exclude the hit interval + // and try again. + const int num_attempts = 2; + float tmin = 0.0f; + for (int attempt = 0; attempt < num_attempts; ++attempt) { + Payload payload; + payload.hit_id = ~0u; + trace_ray(handle, payload, V3_TO_F3(r.origin), V3_TO_F3(r.direction), + tmin); + if (payload.hit_id == skipID1) { + tmin = __uint_as_float(__float_as_uint(payload.hit_t) + 1u); + } else if (payload.hit_id != ~0u) { + return { payload.hit_t, payload.hit_u, payload.hit_v, + payload.hit_id }; + } + } + return { std::numeric_limits::infinity() }; +} + + +static inline __device__ void +setupRaytracer(SimpleRaytracer& raytracer, const bool bg_only) +{ + // Background + raytracer.background = {}; + raytracer.background.set_variables((Vec3*)render_params.bg_values, + (float*)render_params.bg_rows, + (float*)render_params.bg_cols, + render_params.bg_res); + + raytracer.backgroundResolution = render_params.bg_id >= 0 + ? render_params.bg_res + : 0; + raytracer.backgroundShaderID = render_params.bg_id; + + if (bg_only) + return; + + // Parameters + raytracer.aa = render_params.aa; + raytracer.no_jitter = render_params.no_jitter; + raytracer.max_bounces = render_params.max_bounces; + raytracer.rr_depth = 5; + raytracer.show_albedo_scale = render_params.show_albedo_scale; + raytracer.show_globals = render_params.show_globals; + + // Pointers + raytracer.lightprims_size = render_params.lightprims_size; + raytracer.m_lightprims = reinterpret_cast( + render_params.lightprims); + raytracer.m_mesh_surfacearea = reinterpret_cast( + render_params.surfacearea); + raytracer.m_meshids = reinterpret_cast(render_params.mesh_ids); + raytracer.m_shader_is_light = reinterpret_cast( + render_params.shader_is_light); + + // Scene + raytracer.scene = {}; + raytracer.scene.verts = reinterpret_cast( + render_params.verts); + raytracer.scene.normals = reinterpret_cast( + render_params.normals); + raytracer.scene.uvs = reinterpret_cast(render_params.uvs); + raytracer.scene.triangles = reinterpret_cast( + render_params.triangles); + raytracer.scene.uv_triangles = reinterpret_cast( + render_params.uv_indices); + raytracer.scene.n_triangles = reinterpret_cast( + render_params.normal_indices); + raytracer.scene.shaderids = reinterpret_cast( + render_params.shader_ids); + raytracer.scene.handle = render_params.traversal_handle; + + // Camera + const Vec3 eye = F3_TO_V3(render_params.eye); + const Vec3 dir = F3_TO_V3(render_params.dir); + const Vec3 up = F3_TO_V3(render_params.up); + const float fov = render_params.fov; + const uint3 launch_dims = optixGetLaunchDimensions(); + raytracer.camera.resolution(launch_dims.x, launch_dims.y); + raytracer.camera.lookat(eye, dir, up, fov); + raytracer.camera.finalize(); +} + +//------------------------------------------------------------------------------ + +// Because clang++ 9.0 seems to have trouble with some of the texturing "intrinsics" +// let's do the texture look-ups in this file. +extern "C" __device__ float4 +osl_tex2DLookup(void* handle, float s, float t, float dsdx, float dtdx, + float dsdy, float dtdy) +{ + const float2 dx = { dsdx, dtdx }; + const float2 dy = { dsdy, dtdy }; + cudaTextureObject_t texID = cudaTextureObject_t(handle); + return tex2DGrad(texID, s, t, dx, dy); +} + + +// +// OptiX Programs +// + + extern "C" __global__ void __miss__() { @@ -51,12 +229,32 @@ __miss__() extern "C" __global__ void __raygen__setglobals() { + uint3 launch_dims = optixGetLaunchDimensions(); + uint3 launch_index = optixGetLaunchIndex(); + // Set global variables - OSL::pvt::osl_printf_buffer_start = render_params.osl_printf_buffer_start; - OSL::pvt::osl_printf_buffer_end = render_params.osl_printf_buffer_end; - OSL::pvt::s_color_system = render_params.color_system; - OSL::pvt::test_str_1 = render_params.test_str_1; - OSL::pvt::test_str_2 = render_params.test_str_2; + if (launch_index.x == 0 && launch_index.y == 0) { + OSL::pvt::osl_printf_buffer_start + = render_params.osl_printf_buffer_start; + OSL::pvt::osl_printf_buffer_end = render_params.osl_printf_buffer_end; + OSL::pvt::s_color_system = render_params.color_system; + OSL::pvt::test_str_1 = render_params.test_str_1; + OSL::pvt::test_str_2 = render_params.test_str_2; + } + + if (render_params.bg_id < 0) + return; + + SimpleRaytracer raytracer; + setupRaytracer(raytracer, /*bg_only=*/true); + + auto evaler = [&](const Dual2& dir) { + return raytracer.eval_background(dir, nullptr); + }; + + // Background::prepare_cuda must run on a single warp + assert(launch_index.x < 32 && launch_index.y == 0); + raytracer.background.prepare_cuda(launch_dims.x, launch_index.x, evaler); } @@ -67,37 +265,50 @@ __miss__setglobals() extern "C" __global__ void -__raygen__() +__closesthit__deferred() { - uint3 launch_dims = optixGetLaunchDimensions(); - uint3 launch_index = optixGetLaunchIndex(); - const float3 eye = render_params.eye; - const float3 dir = render_params.dir; - const float3 cx = render_params.cx; - const float3 cy = render_params.cy; - const float invw = render_params.invw; - const float invh = render_params.invh; - - // Compute the pixel coordinates - const float2 d = make_float2(static_cast(launch_index.x) + 0.5f, - static_cast(launch_index.y) + 0.5f); - - // Make the ray for the current pixel - RayGeometry r; - r.origin = eye; - r.direction = normalize(cx * (d.x * invw - 0.5f) + cy * (0.5f - d.y * invh) - + dir); - optixTrace(render_params.traversal_handle, r.origin, r.direction, 1e-3f, - 1e13f, 0, OptixVisibilityMask(1), OPTIX_RAY_FLAG_DISABLE_ANYHIT, - 0, 1, 0); + const unsigned int hit_idx = optixGetPrimitiveIndex(); + const float3 ray_direction = optixGetWorldRayDirection(); + const float3 ray_origin = optixGetWorldRayOrigin(); + const float hit_t = optixGetRayTmax(); + const float2 barycentrics = optixGetTriangleBarycentrics(); + const float b1 = barycentrics.x; + const float b2 = barycentrics.y; + + Payload payload; + payload.hit_t = hit_t; + payload.hit_u = b1; + payload.hit_v = b2; + payload.hit_id = hit_idx; + payload.set(); } -// Because clang++ 9.0 seems to have trouble with some of the texturing "intrinsics" -// let's do the texture look-ups in this file. -extern "C" __device__ float4 -osl_tex2DLookup(void* handle, float s, float t) +extern "C" __global__ void +__raygen__deferred() { - cudaTextureObject_t texID = cudaTextureObject_t(handle); - return tex2D(texID, s, t); + SimpleRaytracer raytracer; + setupRaytracer(raytracer, /*bg_only=*/false); + + const uint3 launch_index = optixGetLaunchIndex(); + Color3 result = raytracer.antialias_pixel(launch_index.x, launch_index.y, + nullptr); + + // Write the output + { + uint3 launch_dims = optixGetLaunchDimensions(); + uint3 launch_index = optixGetLaunchIndex(); + float3* output_buffer = reinterpret_cast( + render_params.output_buffer); + int pixel = launch_index.y * launch_dims.x + launch_index.x; + output_buffer[pixel] = C3_TO_F3(result); + } } + +//------------------------------------------------------------------------------ + +// We need to pull in the definition of SimpleRaytracer::subpixel_radiance(), +// which is shared between the host and CUDA renderers. +#include "../simpleraytracer.cpp" + +//------------------------------------------------------------------------------ diff --git a/src/testrender/cuda/optix_raytracer.h b/src/testrender/cuda/optix_raytracer.h new file mode 100644 index 000000000..34a8eef90 --- /dev/null +++ b/src/testrender/cuda/optix_raytracer.h @@ -0,0 +1,74 @@ +#pragma once + +#include +#include + +#include "../background.h" +#include "../raytracer.h" +#include "../sampling.h" +#include "rend_lib.h" + +#include + +#ifdef __CUDACC__ + +struct Payload { + uint32_t hit_id; + float hit_t; + float hit_u; + float hit_v; + + __forceinline__ __device__ void set() + { + optixSetPayload_0(hit_id); + optixSetPayload_1(__float_as_uint(hit_t)); + optixSetPayload_2(__float_as_uint(hit_u)); + optixSetPayload_3(__float_as_uint(hit_v)); + } + + __forceinline__ __device__ void get() + { + hit_id = static_cast(optixGetPayload_0()); + hit_t = __uint_as_float(optixGetPayload_1()); + hit_u = __uint_as_float(optixGetPayload_2()); + hit_v = __uint_as_float(optixGetPayload_3()); + } +}; + +OSL_NAMESPACE_ENTER + +struct SimpleRaytracer { + using ShadingContext = ShadingContextCUDA; + + Background background; + Camera camera; + Scene scene; + int aa = 1; + bool no_jitter = false; + int backgroundResolution = 1024; + int backgroundShaderID = -1; + int max_bounces = 1000000; + int rr_depth = 5; + float show_albedo_scale = 0.0f; + int show_globals = 0; + const int* m_shader_is_light = nullptr; + const unsigned* m_lightprims = nullptr; + size_t lightprims_size = 0; + const int* m_shaderids = nullptr; + const int* m_meshids = nullptr; + const float* m_mesh_surfacearea = nullptr; + + OSL_HOSTDEVICE void globals_from_hit(OSL_CUDA::ShaderGlobals& sg, + const Ray& r, const Dual2& t, + int id, float u, float v); + OSL_HOSTDEVICE Vec3 eval_background(const Dual2& dir, + ShadingContext* ctx, int bounce = -1); + OSL_HOSTDEVICE Color3 subpixel_radiance(float x, float y, Sampler& sampler, + ShadingContext* ctx = nullptr); + OSL_HOSTDEVICE Color3 antialias_pixel(int x, int y, + ShadingContext* ctx = nullptr); +}; + +OSL_NAMESPACE_EXIT + +#endif // #ifdef __CUDACC__ diff --git a/src/testrender/cuda/rend_lib.cu b/src/testrender/cuda/rend_lib.cu index 72ee029f3..fb51605e9 100644 --- a/src/testrender/cuda/rend_lib.cu +++ b/src/testrender/cuda/rend_lib.cu @@ -58,7 +58,7 @@ closure_component_allot(void* pool, int id, size_t prim_size, ((OSL::ClosureComponent*)pool)->id = id; ((OSL::ClosureComponent*)pool)->w = w; - size_t needed = (sizeof(OSL::ClosureComponent) - sizeof(void*) + prim_size + size_t needed = (sizeof(OSL::ClosureComponent) + prim_size + (alignof(OSL::ClosureComponent) - 1)) & ~(alignof(OSL::ClosureComponent) - 1); char* char_ptr = (char*)pool; @@ -120,7 +120,7 @@ closure_add_allot(void* pool, OSL::ClosureColor* a, OSL::ClosureColor* b) __device__ void* osl_allocate_closure_component(void* sg_, int id, int size) { - ShaderGlobals* sg_ptr = (ShaderGlobals*)sg_; + OSL_CUDA::ShaderGlobals* sg_ptr = (OSL_CUDA::ShaderGlobals*)sg_; OSL::Color3 w = OSL::Color3(1, 1, 1); // Fix up the alignment @@ -141,7 +141,7 @@ __device__ void* osl_allocate_weighted_closure_component(void* sg_, int id, int size, const void* w) { - ShaderGlobals* sg_ptr = (ShaderGlobals*)sg_; + OSL_CUDA::ShaderGlobals* sg_ptr = (OSL_CUDA::ShaderGlobals*)sg_; const OSL::Color3* wc = (const OSL::Color3*)__builtin_assume_aligned(w, alignof(float)); @@ -166,7 +166,7 @@ osl_allocate_weighted_closure_component(void* sg_, int id, int size, __device__ void* osl_mul_closure_color(void* sg_, void* a, const void* w) { - ShaderGlobals* sg_ptr = (ShaderGlobals*)sg_; + OSL_CUDA::ShaderGlobals* sg_ptr = (OSL_CUDA::ShaderGlobals*)sg_; const OSL::Color3* wc = (const OSL::Color3*)__builtin_assume_aligned(w, alignof(float)); @@ -198,7 +198,7 @@ osl_mul_closure_float(void* sg_, void* a, float w) { a = __builtin_assume_aligned(a, alignof(float)); - ShaderGlobals* sg_ptr = (ShaderGlobals*)sg_; + OSL_CUDA::ShaderGlobals* sg_ptr = (OSL_CUDA::ShaderGlobals*)sg_; if (a == NULL || w == 0.0f) { return NULL; @@ -226,7 +226,7 @@ osl_add_closure_closure(void* sg_, void* a, void* b) a = __builtin_assume_aligned(a, alignof(float)); b = __builtin_assume_aligned(b, alignof(float)); - ShaderGlobals* sg_ptr = (ShaderGlobals*)sg_; + OSL_CUDA::ShaderGlobals* sg_ptr = (OSL_CUDA::ShaderGlobals*)sg_; if (a == NULL) { return b; @@ -383,9 +383,9 @@ osl_range_check_err(int indexvalue, int length, OSL::ustringhash_pod symname, __device__ int osl_get_matrix(void* sg_, void* r, OSL::ustringhash_pod from_) { - r = __builtin_assume_aligned(r, alignof(float)); - OSL::ustringhash from = OSL::ustringhash_from(from_); - ShaderGlobals* sg = (ShaderGlobals*)sg_; + r = __builtin_assume_aligned(r, alignof(float)); + OSL::ustringhash from = OSL::ustringhash_from(from_); + OSL_CUDA::ShaderGlobals* sg = (OSL_CUDA::ShaderGlobals*)sg_; if (from == OSL::Hashes::common) { MAT(r).makeIdentity(); return true; @@ -428,9 +428,9 @@ osl_get_matrix(void* sg_, void* r, OSL::ustringhash_pod from_) __device__ int osl_get_inverse_matrix(void* sg_, void* r, OSL::ustringhash_pod to_) { - r = __builtin_assume_aligned(r, alignof(float)); - OSL::ustringhash to = OSL::ustringhash_from(to_); - ShaderGlobals* sg = (ShaderGlobals*)sg_; + r = __builtin_assume_aligned(r, alignof(float)); + OSL::ustringhash to = OSL::ustringhash_from(to_); + OSL_CUDA::ShaderGlobals* sg = (OSL_CUDA::ShaderGlobals*)sg_; if (to == OSL::Hashes::common) { MAT(r).makeIdentity(); return true; diff --git a/src/testrender/cuda/rend_lib.h b/src/testrender/cuda/rend_lib.h index 1e0f9bcc8..d6aea2750 100644 --- a/src/testrender/cuda/rend_lib.h +++ b/src/testrender/cuda/rend_lib.h @@ -6,11 +6,11 @@ #include -#include - #include #include +#include "../raytracer.h" + OSL_NAMESPACE_ENTER @@ -45,19 +45,20 @@ namespace { // anonymous namespace struct ShadingContextCUDA {}; +namespace OSL_CUDA { struct ShaderGlobals { - float3 P, dPdx, dPdy; - float3 dPdz; - float3 I, dIdx, dIdy; - float3 N; - float3 Ng; + OSL::Vec3 P, dPdx, dPdy; + OSL::Vec3 dPdz; + OSL::Vec3 I, dIdx, dIdy; + OSL::Vec3 N; + OSL::Vec3 Ng; float u, dudx, dudy; float v, dvdx, dvdy; - float3 dPdu, dPdv; + OSL::Vec3 dPdu, dPdv; float time; float dtime; - float3 dPdtime; - float3 Ps, dPsdx, dPsdy; + OSL::Vec3 dPdtime; + OSL::Vec3 Ps, dPsdx, dPsdy; void* renderstate; void* tracedata; void* objdata; @@ -73,88 +74,7 @@ struct ShaderGlobals { int raytype; int flipHandedness; int backfacing; - int shaderID; -}; - - -enum RayType { - CAMERA = 1, - SHADOW = 2, - REFLECTION = 4, - REFRACTION = 8, - DIFFUSE = 16, - GLOSSY = 32, - SUBSURFACE = 64, - DISPLACEMENT = 128 }; - - -// Closures supported by the OSL sample renderer. This list is mostly aspirational. -enum ClosureIDs { - EMISSION_ID = 1, - BACKGROUND_ID, - DIFFUSE_ID, - OREN_NAYAR_ID, - TRANSLUCENT_ID, - PHONG_ID, - WARD_ID, - MICROFACET_ID, - REFLECTION_ID, - FRESNEL_REFLECTION_ID, - REFRACTION_ID, - TRANSPARENT_ID, - DEBUG_ID, - HOLDOUT_ID, -}; - - -// ======================================== -// -// Some helper vector functions -// -static __forceinline__ __device__ float3 -operator*(const float a, const float3& b) -{ - return make_float3(a * b.x, a * b.y, a * b.z); -} - -static __forceinline__ __device__ float3 -operator*(const float3& a, const float b) -{ - return make_float3(a.x * b, a.y * b, a.z * b); -} - -static __forceinline__ __device__ float3 -operator+(const float3& a, const float3& b) -{ - return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); -} - -static __forceinline__ __device__ float3 -operator-(const float3& a, const float3& b) -{ - return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); -} - -static __forceinline__ __device__ float3 -operator-(const float3& a) -{ - return make_float3(-a.x, -a.y, -a.z); -} - -static __forceinline__ __device__ float -dot(const float3& a, const float3& b) -{ - return a.x * b.x + a.y * b.y + a.z * b.z; -} - -static __forceinline__ __device__ float3 -normalize(const float3& v) -{ - float invLen = 1.0f / sqrtf(dot(v, v)); - return invLen * v; -} -// -// ======================================== +} // namespace OSL_CUDA } // anonymous namespace diff --git a/src/testrender/cuda/wrapper.cu b/src/testrender/cuda/wrapper.cu deleted file mode 100644 index a2501ff97..000000000 --- a/src/testrender/cuda/wrapper.cu +++ /dev/null @@ -1,246 +0,0 @@ -// Copyright Contributors to the Open Shading Language project. -// SPDX-License-Identifier: BSD-3-Clause -// https://github.com/AcademySoftwareFoundation/OpenShadingLanguage - -#include -#include - -#include -#include - -#include - -#include "rend_lib.h" -#include "util.h" - -#include "../render_params.h" - - -extern "C" { -__device__ __constant__ RenderParams render_params; -} - - -extern "C" __global__ void -__anyhit__any_hit_shadow() -{ - optixTerminateRay(); -} - - - -static __device__ void -globals_from_hit(ShaderGlobals& sg) -{ - // Setup the ShaderGlobals - const int primID = optixGetPrimitiveIndex(); - const float3 ray_direction = optixGetWorldRayDirection(); - const float3 ray_origin = optixGetWorldRayOrigin(); - const float t_hit = optixGetRayTmax(); - const int shader_id = reinterpret_cast( - render_params.shader_ids)[primID]; - - const OSL::Vec3* verts = reinterpret_cast( - render_params.verts); - const OSL::Vec3* normals = reinterpret_cast( - render_params.normals); - const OSL::Vec2* uvs = reinterpret_cast( - render_params.uvs); - const int3* triangles = reinterpret_cast( - render_params.triangles); - const int3* n_triangles = reinterpret_cast( - render_params.normal_indices); - const int3* uv_triangles = reinterpret_cast( - render_params.uv_indices); - const int* mesh_ids = reinterpret_cast(render_params.mesh_ids); - const float* surfacearea = reinterpret_cast( - render_params.surfacearea); - - // Calculate UV and its derivatives - const float2 barycentrics = optixGetTriangleBarycentrics(); - const float b1 = barycentrics.x; - const float b2 = barycentrics.y; - const float b0 = 1.0f - (b1 + b2); - - const OSL::Vec2 ta = uvs[uv_triangles[primID].x]; - const OSL::Vec2 tb = uvs[uv_triangles[primID].y]; - const OSL::Vec2 tc = uvs[uv_triangles[primID].z]; - const OSL::Vec2 uv = b0 * ta + b1 * tb + b2 * tc; - const float u = uv.x; - const float v = uv.y; - - const OSL::Vec3 va = verts[triangles[primID].x]; - const OSL::Vec3 vb = verts[triangles[primID].y]; - const OSL::Vec3 vc = verts[triangles[primID].z]; - - const OSL::Vec2 dt02 = ta - tc, dt12 = tb - tc; - const OSL::Vec3 dp02 = va - vc, dp12 = vb - vc; - - OSL::Vec3 dPdu, dPdv; - const float det = dt02.x * dt12.y - dt02.y * dt12.x; - if (det != 0.0f) { - float invdet = 1.0f / det; - dPdu = (dt12.y * dp02 - dt02.y * dp12) * invdet; - dPdv = (-dt12.x * dp02 + dt02.x * dp12) * invdet; - } - - // Calculate the normals - const OSL::Vec3 Ng = (va - vb).cross(va - vc).normalize(); - OSL::Vec3 N; - if (n_triangles[primID].x < 0.0f) { - N = Ng; - } else { - const OSL::Vec3 na = normals[n_triangles[primID].x]; - const OSL::Vec3 nb = normals[n_triangles[primID].y]; - const OSL::Vec3 nc = normals[n_triangles[primID].z]; - N = ((1 - u - v) * na + u * nb + v * nc).normalize(); - } - - sg.I = ray_direction; - sg.N = normalize( - optixTransformNormalFromObjectToWorldSpace(*(float3*)(&N))); - sg.Ng = normalize( - optixTransformNormalFromObjectToWorldSpace(*(float3*)(&Ng))); - sg.P = ray_origin + t_hit * ray_direction; - sg.dPdu = *(float3*)(&dPdu); - sg.dPdv = *(float3*)(&dPdv); - sg.u = u; - sg.v = v; - sg.Ci = NULL; - sg.surfacearea = surfacearea[mesh_ids[primID]]; - sg.backfacing = dot(sg.N, sg.I) > 0.0f; - sg.shaderID = shader_id; - - if (sg.backfacing) { - sg.N = -sg.N; - sg.Ng = -sg.Ng; - } - - // NB: These variables are not used in the current iteration of the sample - sg.raytype = CAMERA; - sg.flipHandedness = 0; -} - - - -static __device__ float3 -process_closure(const OSL::ClosureColor* closure_tree) -{ - OSL::Color3 result = OSL::Color3(0.0f); - - if (!closure_tree) { - return make_float3(result.x, result.y, result.z); - } - - // The depth of the closure tree must not exceed the stack size. - // A stack size of 8 is probably quite generous for relatively - // balanced trees. - const int STACK_SIZE = 8; - - // Non-recursive traversal stack - int stack_idx = 0; - const OSL::ClosureColor* ptr_stack[STACK_SIZE]; - OSL::Color3 weight_stack[STACK_SIZE]; - - // Shading accumulator - OSL::Color3 weight = OSL::Color3(1.0f); - - const void* cur = closure_tree; - while (cur) { - switch (((OSL::ClosureColor*)cur)->id) { - case OSL::ClosureColor::ADD: { - ptr_stack[stack_idx] = ((OSL::ClosureAdd*)cur)->closureB; - weight_stack[stack_idx++] = weight; - cur = ((OSL::ClosureAdd*)cur)->closureA; - break; - } - - case OSL::ClosureColor::MUL: { - weight *= ((OSL::ClosureMul*)cur)->weight; - cur = ((OSL::ClosureMul*)cur)->closure; - break; - } - - case EMISSION_ID: { - cur = NULL; - break; - } - - case DIFFUSE_ID: - case OREN_NAYAR_ID: - case PHONG_ID: - case WARD_ID: - case REFLECTION_ID: - case REFRACTION_ID: - case FRESNEL_REFLECTION_ID: { - result += ((OSL::ClosureComponent*)cur)->w * weight; - cur = NULL; - break; - } - - case MICROFACET_ID: { - const char* mem = (const char*)((OSL::ClosureComponent*)cur)->data(); - OSL::ustringhash dist_uh = *(OSL::ustringhash*)&mem[0]; - - if (dist_uh == OSL::Hashes::default_) - return make_float3(0.0f, 1.0f, 1.0f); - else - return make_float3(1.0f, 0.0f, 1.0f); - - break; - } - - default: cur = NULL; break; - } - - if (cur == NULL && stack_idx > 0) { - cur = ptr_stack[--stack_idx]; - weight = weight_stack[stack_idx]; - } - } - - return make_float3(result.x, result.y, result.z); -} - - - -extern "C" __global__ void -__closesthit__closest_hit_osl() -{ - // TODO: Fixed-sized allocations can easily be exceeded by arbitrary shader - // networks, so there should be (at least) some mechanism to issue a - // warning or error if the closure or param storage can possibly be - // exceeded. - alignas(8) char closure_pool[256]; - - ShaderGlobals sg; - globals_from_hit(sg); - - // Pack the "closure pool" into one of the ShaderGlobals pointers - *(int*)&closure_pool[0] = 0; - sg.renderstate = &closure_pool[0]; - - // Pack the pointers to the options structs in a faux "context", - // which is a rough stand-in for the host ShadingContext. - ShadingContextCUDA shading_context; - - sg.context = &shading_context; - - // Run the OSL callable - void* interactive_ptr = reinterpret_cast( - render_params.interactive_params)[sg.shaderID]; - const unsigned int shaderIdx = sg.shaderID + 0u; - optixDirectCall( - shaderIdx, &sg /*shaderglobals_ptr*/, nullptr /*groupdata_ptr*/, - nullptr /*userdata_base_ptr*/, nullptr /*output_base_ptr*/, - 0 /*shadeindex - unused*/, interactive_ptr /*interactive_params_ptr*/); - - float3 result = process_closure((OSL::ClosureColor*)sg.Ci); - uint3 launch_dims = optixGetLaunchDimensions(); - uint3 launch_index = optixGetLaunchIndex(); - - float3* output_buffer = reinterpret_cast( - render_params.output_buffer); - int pixel = launch_index.y * launch_dims.x + launch_index.x; - output_buffer[pixel] = make_float3(result.x, result.y, result.z); -} diff --git a/src/testrender/cuda/wrapper.h b/src/testrender/cuda/wrapper.h deleted file mode 100644 index c939f6064..000000000 --- a/src/testrender/cuda/wrapper.h +++ /dev/null @@ -1,4 +0,0 @@ -#pragma once - -#define RAYTRACER_HIT_QUAD 0 -#define RAYTRACER_HIT_SPHERE 1 diff --git a/src/testrender/optics.h b/src/testrender/optics.h index 5a709a065..a266eef2e 100644 --- a/src/testrender/optics.h +++ b/src/testrender/optics.h @@ -9,7 +9,7 @@ OSL_NAMESPACE_ENTER -inline float +static inline OSL_HOSTDEVICE float fresnel_dielectric(float cosi, float eta) { // special case: ignore fresnel @@ -30,7 +30,7 @@ fresnel_dielectric(float cosi, float eta) return 1.0f; // TIR (no refracted component) } -inline float +static inline OSL_HOSTDEVICE float fresnel_refraction(const Vec3& I, const Vec3& N, float eta, Vec3& T) { // compute refracted direction and fresnel term @@ -63,7 +63,7 @@ fresnel_refraction(const Vec3& I, const Vec3& N, float eta, Vec3& T) return 0; } -Color3 +static inline OSL_HOSTDEVICE Color3 fresnel_conductor(float cos_theta, Color3 n, Color3 k) { cos_theta = OIIO::clamp(cos_theta, 0.0f, 1.0f); @@ -89,7 +89,7 @@ fresnel_conductor(float cos_theta, Color3 n, Color3 k) return 0.5f * (rp + rs); } -inline float +static inline OSL_HOSTDEVICE float fresnel_schlick(float cos_theta, float F0, float F90) { float x = OIIO::clamp(1.0f - cos_theta, 0.0f, 1.0f); @@ -99,7 +99,7 @@ fresnel_schlick(float cos_theta, float F0, float F90) return OIIO::lerp(F0, F90, x5); } -inline Color3 +static inline OSL_HOSTDEVICE Color3 fresnel_generalized_schlick(float cos_theta, Color3 F0, Color3 F90, float exponent) { diff --git a/src/testrender/optixraytracer.cpp b/src/testrender/optixraytracer.cpp index 4310bd05c..ab5aff9d1 100644 --- a/src/testrender/optixraytracer.cpp +++ b/src/testrender/optixraytracer.cpp @@ -14,6 +14,7 @@ #include "render_params.h" #include +#include #include #include #include @@ -82,6 +83,12 @@ OSL_NAMESPACE_ENTER } \ } + +#define DEVICE_ALLOC(size) reinterpret_cast(device_alloc(size)) +#define COPY_TO_DEVICE(dst_device, src_host, size) \ + copy_to_device(reinterpret_cast(dst_device), src_host, size) + + static void context_log_cb(unsigned int level, const char* tag, const char* message, void* /*cbdata */) @@ -115,8 +122,10 @@ OptixRaytracer::~OptixRaytracer() { if (m_optix_ctx) OPTIX_CHECK(optixDeviceContextDestroy(m_optix_ctx)); - for (CUdeviceptr ptr : device_ptrs) + for (CUdeviceptr ptr : m_ptrs_to_free) cudaFree(reinterpret_cast(ptr)); + for (cudaArray_t arr : m_arrays_to_free) + cudaFreeArray(arr); } @@ -130,6 +139,7 @@ OptixRaytracer::device_alloc(size_t size) errhandler().errorfmt("cudaMalloc({}) failed with error: {}\n", size, cudaGetErrorString(res)); } + m_ptrs_to_free.push_back(reinterpret_cast(ptr)); return ptr; } @@ -234,18 +244,13 @@ OptixRaytracer::synch_attributes() const size_t podDataSize = cpuDataSize - sizeof(ustringhash) * numStrings; - CUDA_CHECK( - cudaMalloc(reinterpret_cast(&d_color_system), - podDataSize + sizeof(ustringhash_pod) * numStrings)); - CUDA_CHECK(cudaMemcpy(reinterpret_cast(d_color_system), colorSys, - podDataSize, cudaMemcpyHostToDevice)); - device_ptrs.push_back(d_color_system); + d_color_system = DEVICE_ALLOC(podDataSize + + sizeof(ustringhash_pod) * numStrings); + COPY_TO_DEVICE(d_color_system, colorSys, podDataSize); - CUDA_CHECK(cudaMalloc(reinterpret_cast(&d_osl_printf_buffer), - OSL_PRINTF_BUFFER_SIZE)); + d_osl_printf_buffer = DEVICE_ALLOC(OSL_PRINTF_BUFFER_SIZE); CUDA_CHECK(cudaMemset(reinterpret_cast(d_osl_printf_buffer), 0, OSL_PRINTF_BUFFER_SIZE)); - device_ptrs.push_back(d_osl_printf_buffer); // then copy the device string to the end, first strings starting at dataPtr - (numStrings) // FIXME -- Should probably handle alignment better. @@ -257,8 +262,7 @@ OptixRaytracer::synch_attributes() for (const ustringhash* end = cpuStringHash + numStrings; cpuStringHash < end; ++cpuStringHash) { ustringhash_pod devStr = cpuStringHash->hash(); - CUDA_CHECK(cudaMemcpy(reinterpret_cast(gpuStrings), &devStr, - sizeof(devStr), cudaMemcpyHostToDevice)); + COPY_TO_DEVICE(gpuStrings, &devStr, sizeof(devStr)); gpuStrings += sizeof(ustringhash_pod); } } @@ -337,8 +341,8 @@ OptixRaytracer::create_modules() m_pipeline_compile_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY; m_pipeline_compile_options.usesMotionBlur = false; - m_pipeline_compile_options.numPayloadValues = 0; - m_pipeline_compile_options.numAttributeValues = 2; + m_pipeline_compile_options.numPayloadValues = 4; + m_pipeline_compile_options.numAttributeValues = 3; m_pipeline_compile_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_STACK_OVERFLOW; m_pipeline_compile_options.pipelineLaunchParamsVariableName @@ -360,8 +364,6 @@ OptixRaytracer::create_modules() load_optix_module("optix_raytracer.ptx", &m_module_compile_options, &m_pipeline_compile_options, &m_program_module); - load_optix_module("wrapper.ptx", &m_module_compile_options, - &m_pipeline_compile_options, &m_wrapper_module); load_optix_module("rend_lib_testrender.ptx", &m_module_compile_options, &m_pipeline_compile_options, &m_rend_lib_module); @@ -402,7 +404,7 @@ OptixRaytracer::create_programs() OptixProgramGroupDesc raygen_desc = {}; raygen_desc.kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN; raygen_desc.raygen.module = m_program_module; - raygen_desc.raygen.entryFunctionName = "__raygen__"; + raygen_desc.raygen.entryFunctionName = "__raygen__deferred"; create_optix_pg(&raygen_desc, 1, &m_program_options, &m_raygen_group); // Set Globals Raygen group @@ -439,9 +441,8 @@ OptixRaytracer::create_programs() // Hitgroup -- triangles OptixProgramGroupDesc tri_hitgroup_desc = {}; tri_hitgroup_desc.kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP; - tri_hitgroup_desc.hitgroup.moduleCH = m_wrapper_module; - tri_hitgroup_desc.hitgroup.entryFunctionNameCH - = "__closesthit__closest_hit_osl"; + tri_hitgroup_desc.hitgroup.moduleCH = m_program_module; + tri_hitgroup_desc.hitgroup.entryFunctionNameCH = "__closesthit__deferred"; create_optix_pg(&tri_hitgroup_desc, 1, &m_program_options, &m_closesthit_group); @@ -525,7 +526,7 @@ OptixRaytracer::create_shaders() // Create Programs from the init and group_entry functions, // and set the OSL functions as Callable Programs so that they - // can be executed by the closest hit program in the wrapper + // can be executed by the closest hit program. sizeof_msg_log = sizeof(msg_log); OPTIX_CHECK_MSG(optixModuleCreateFn(m_optix_ctx, &m_module_compile_options, @@ -556,13 +557,10 @@ OptixRaytracer::create_shaders() } // Upload per-material interactive buffer table - CUDA_CHECK(cudaMalloc(reinterpret_cast(&d_interactive_params), - sizeof(void*) * material_interactive_params.size())); - CUDA_CHECK(cudaMemcpy(reinterpret_cast(d_interactive_params), - material_interactive_params.data(), - sizeof(void*) * material_interactive_params.size(), - cudaMemcpyHostToDevice)); - device_ptrs.push_back(d_interactive_params); + d_interactive_params = DEVICE_ALLOC(sizeof(void*) + * material_interactive_params.size()); + COPY_TO_DEVICE(d_interactive_params, material_interactive_params.data(), + sizeof(void*) * material_interactive_params.size()); } @@ -650,12 +648,8 @@ OptixRaytracer::create_sbt() GenericRecord raygen_record; CUdeviceptr d_raygen_record; OPTIX_CHECK(optixSbtRecordPackHeader(m_raygen_group, &raygen_record)); - CUDA_CHECK(cudaMalloc(reinterpret_cast(&d_raygen_record), - sizeof(GenericRecord))); - CUDA_CHECK(cudaMemcpy(reinterpret_cast(d_raygen_record), - &raygen_record, sizeof(GenericRecord), - cudaMemcpyHostToDevice)); - device_ptrs.push_back(d_raygen_record); + d_raygen_record = DEVICE_ALLOC(sizeof(GenericRecord)); + COPY_TO_DEVICE(d_raygen_record, &raygen_record, sizeof(GenericRecord)); m_optix_sbt.raygenRecord = d_raygen_record; } @@ -665,12 +659,8 @@ OptixRaytracer::create_sbt() GenericRecord miss_record; CUdeviceptr d_miss_record; OPTIX_CHECK(optixSbtRecordPackHeader(m_miss_group, &miss_record)); - CUDA_CHECK(cudaMalloc(reinterpret_cast(&d_miss_record), - sizeof(GenericRecord))); - CUDA_CHECK(cudaMemcpy(reinterpret_cast(d_miss_record), - &miss_record, sizeof(GenericRecord), - cudaMemcpyHostToDevice)); - device_ptrs.push_back(d_miss_record); + d_miss_record = DEVICE_ALLOC(sizeof(GenericRecord)); + COPY_TO_DEVICE(d_miss_record, &miss_record, sizeof(GenericRecord)); m_optix_sbt.missRecordBase = d_miss_record; m_optix_sbt.missRecordStrideInBytes = sizeof(GenericRecord); @@ -685,13 +675,9 @@ OptixRaytracer::create_sbt() OPTIX_CHECK( optixSbtRecordPackHeader(m_closesthit_group, &hitgroup_records[0])); - CUDA_CHECK(cudaMalloc(reinterpret_cast(&d_hitgroup_records), - nhitgroups * sizeof(GenericRecord))); - CUDA_CHECK(cudaMemcpy(reinterpret_cast(d_hitgroup_records), - &hitgroup_records[0], - nhitgroups * sizeof(GenericRecord), - cudaMemcpyHostToDevice)); - device_ptrs.push_back(d_hitgroup_records); + d_hitgroup_records = DEVICE_ALLOC(nhitgroups * sizeof(GenericRecord)); + COPY_TO_DEVICE(d_hitgroup_records, &hitgroup_records[0], + nhitgroups * sizeof(GenericRecord)); m_optix_sbt.hitgroupRecordBase = d_hitgroup_records; m_optix_sbt.hitgroupRecordStrideInBytes = sizeof(GenericRecord); @@ -709,17 +695,18 @@ OptixRaytracer::create_sbt() &callable_records[idx])); } - CUDA_CHECK(cudaMalloc(reinterpret_cast(&d_callable_records), - (nshaders) * sizeof(GenericRecord))); - CUDA_CHECK(cudaMemcpy(reinterpret_cast(d_callable_records), - callable_records.data(), - (nshaders) * sizeof(GenericRecord), - cudaMemcpyHostToDevice)); - device_ptrs.push_back(d_callable_records); + d_callable_records = DEVICE_ALLOC((nshaders) * sizeof(GenericRecord)); + COPY_TO_DEVICE(d_callable_records, callable_records.data(), + (nshaders) * sizeof(GenericRecord)); m_optix_sbt.callablesRecordBase = d_callable_records; m_optix_sbt.callablesRecordStrideInBytes = sizeof(GenericRecord); m_optix_sbt.callablesRecordCount = nshaders; + + m_setglobals_optix_sbt.callablesRecordBase = d_callable_records; + m_setglobals_optix_sbt.callablesRecordStrideInBytes = sizeof( + GenericRecord); + m_setglobals_optix_sbt.callablesRecordCount = nshaders; } // SetGlobals raygen @@ -728,13 +715,9 @@ OptixRaytracer::create_sbt() CUdeviceptr d_setglobals_raygen_record; OPTIX_CHECK( optixSbtRecordPackHeader(m_setglobals_raygen_group, &record)); - CUDA_CHECK( - cudaMalloc(reinterpret_cast(&d_setglobals_raygen_record), - sizeof(GenericRecord))); - CUDA_CHECK( - cudaMemcpy(reinterpret_cast(d_setglobals_raygen_record), - &record, sizeof(GenericRecord), cudaMemcpyHostToDevice)); - device_ptrs.push_back(d_setglobals_raygen_record); + d_setglobals_raygen_record = DEVICE_ALLOC(sizeof(GenericRecord)); + COPY_TO_DEVICE(d_setglobals_raygen_record, &record, + sizeof(GenericRecord)); m_setglobals_optix_sbt.raygenRecord = d_setglobals_raygen_record; } @@ -744,13 +727,9 @@ OptixRaytracer::create_sbt() GenericRecord record; CUdeviceptr d_setglobals_miss_record; OPTIX_CHECK(optixSbtRecordPackHeader(m_setglobals_miss_group, &record)); - CUDA_CHECK( - cudaMalloc(reinterpret_cast(&d_setglobals_miss_record), - sizeof(GenericRecord))); - CUDA_CHECK(cudaMemcpy(reinterpret_cast(d_setglobals_miss_record), - &record, sizeof(GenericRecord), - cudaMemcpyHostToDevice)); - device_ptrs.push_back(d_setglobals_miss_record); + d_setglobals_miss_record = DEVICE_ALLOC(sizeof(GenericRecord)); + COPY_TO_DEVICE(d_setglobals_miss_record, &record, + sizeof(GenericRecord)); m_setglobals_optix_sbt.missRecordBase = d_setglobals_miss_record; m_setglobals_optix_sbt.missRecordStrideInBytes = sizeof(GenericRecord); @@ -772,7 +751,6 @@ OptixRaytracer::cleanup_programs() m_shader_modules.clear(); optixModuleDestroy(m_program_module); - optixModuleDestroy(m_wrapper_module); optixModuleDestroy(m_rend_lib_module); optixModuleDestroy(m_shadeops_module); } @@ -791,20 +769,12 @@ OptixRaytracer::build_accel() accel_options.operation = OPTIX_BUILD_OPERATION_BUILD; const size_t vertices_size = sizeof(Vec3) * scene.verts.size(); - CUDA_CHECK( - cudaMalloc(reinterpret_cast(&d_vertices), vertices_size)); - CUDA_CHECK(cudaMemcpy(reinterpret_cast(d_vertices), - scene.verts.data(), vertices_size, - cudaMemcpyHostToDevice)); - device_ptrs.push_back(d_vertices); + d_vertices = DEVICE_ALLOC(vertices_size); + COPY_TO_DEVICE(d_vertices, scene.verts.data(), vertices_size); const size_t indices_size = scene.triangles.size() * sizeof(int32_t) * 3; - CUDA_CHECK( - cudaMalloc(reinterpret_cast(&d_vert_indices), indices_size)); - CUDA_CHECK(cudaMemcpy(reinterpret_cast(d_vert_indices), - scene.triangles.data(), indices_size, - cudaMemcpyHostToDevice)); - device_ptrs.push_back(d_vert_indices); + d_vert_indices = DEVICE_ALLOC(indices_size); + COPY_TO_DEVICE(d_vert_indices, scene.triangles.data(), indices_size); const uint32_t triangle_input_flags[1] = { OPTIX_GEOMETRY_FLAG_NONE }; OptixBuildInput triangle_input = {}; @@ -830,9 +800,7 @@ OptixRaytracer::build_accel() CUDA_CHECK(cudaMalloc(reinterpret_cast(&d_temp_buffer), gas_buffer_sizes.tempSizeInBytes)); - CUDA_CHECK(cudaMalloc(reinterpret_cast(&d_accel_output_buffer), - gas_buffer_sizes.outputSizeInBytes)); - device_ptrs.push_back(d_accel_output_buffer); + d_accel_output_buffer = DEVICE_ALLOC(gas_buffer_sizes.outputSizeInBytes); OPTIX_CHECK(optixAccelBuild( m_optix_ctx, 0, &accel_options, &triangle_input, 1, d_temp_buffer, @@ -844,65 +812,64 @@ OptixRaytracer::build_accel() +void +OptixRaytracer::prepare_background() +{ + if (getBackgroundShaderID() >= 0) { + const int bg_res = std::max(32, getBackgroundResolution()); + d_bg_values = DEVICE_ALLOC(3 * sizeof(float) * bg_res * bg_res); + d_bg_rows = DEVICE_ALLOC(sizeof(float) * bg_res); + d_bg_cols = DEVICE_ALLOC(sizeof(float) * bg_res * bg_res); + } +} + + + void OptixRaytracer::upload_mesh_data() { // Upload the extra geometry data to the device const size_t uvs_size = sizeof(Vec2) * scene.uvs.size(); - CUDA_CHECK(cudaMalloc(reinterpret_cast(&d_uvs), uvs_size)); - CUDA_CHECK(cudaMemcpy(reinterpret_cast(d_uvs), scene.uvs.data(), - uvs_size, cudaMemcpyHostToDevice)); - device_ptrs.push_back(d_uvs); + d_uvs = DEVICE_ALLOC(uvs_size); + COPY_TO_DEVICE(d_uvs, scene.uvs.data(), uvs_size); const size_t uv_indices_size = scene.uv_triangles.size() * sizeof(int32_t) * 3; - CUDA_CHECK( - cudaMalloc(reinterpret_cast(&d_uv_indices), uv_indices_size)); - CUDA_CHECK(cudaMemcpy(reinterpret_cast(d_uv_indices), - scene.uv_triangles.data(), uv_indices_size, - cudaMemcpyHostToDevice)); - device_ptrs.push_back(d_uv_indices); + d_uv_indices = DEVICE_ALLOC(uv_indices_size); + COPY_TO_DEVICE(d_uv_indices, scene.uv_triangles.data(), uv_indices_size); const size_t normals_size = sizeof(Vec3) * scene.normals.size(); if (normals_size > 0) { - CUDA_CHECK( - cudaMalloc(reinterpret_cast(&d_normals), normals_size)); - CUDA_CHECK(cudaMemcpy(reinterpret_cast(d_normals), - scene.normals.data(), normals_size, - cudaMemcpyHostToDevice)); - device_ptrs.push_back(d_normals); + d_normals = DEVICE_ALLOC(normals_size); + COPY_TO_DEVICE(d_normals, scene.normals.data(), normals_size); } const size_t normal_indices_size = scene.n_triangles.size() * sizeof(int32_t) * 3; - CUDA_CHECK(cudaMalloc(reinterpret_cast(&d_normal_indices), - normal_indices_size)); - CUDA_CHECK(cudaMemcpy(reinterpret_cast(d_normal_indices), - scene.n_triangles.data(), normal_indices_size, - cudaMemcpyHostToDevice)); - device_ptrs.push_back(d_normal_indices); + d_normal_indices = DEVICE_ALLOC(normal_indices_size); + COPY_TO_DEVICE(d_normal_indices, scene.n_triangles.data(), + normal_indices_size); const size_t shader_ids_size = scene.shaderids.size() * sizeof(int); - CUDA_CHECK( - cudaMalloc(reinterpret_cast(&d_shader_ids), shader_ids_size)); - CUDA_CHECK(cudaMemcpy(reinterpret_cast(d_shader_ids), - scene.shaderids.data(), shader_ids_size, - cudaMemcpyHostToDevice)); - device_ptrs.push_back(d_shader_ids); + d_shader_ids = DEVICE_ALLOC(shader_ids_size); + COPY_TO_DEVICE(d_shader_ids, scene.shaderids.data(), shader_ids_size); // TODO: These could be packed, but for now just use ints instead of bools std::vector shader_is_light; for (const bool& is_light : OptixRaytracer::shader_is_light()) shader_is_light.push_back(is_light); - const size_t shader_is_light_size = shader_is_light.size() * sizeof(int32_t); - CUDA_CHECK(cudaMalloc(reinterpret_cast(&d_shader_is_light), - shader_is_light_size)); - CUDA_CHECK(cudaMemcpy(reinterpret_cast(d_shader_is_light), - shader_is_light.data(), shader_is_light_size, - cudaMemcpyHostToDevice)); - device_ptrs.push_back(d_shader_is_light); + d_shader_is_light = DEVICE_ALLOC(shader_is_light_size); + COPY_TO_DEVICE(d_shader_is_light, shader_is_light.data(), + shader_is_light_size); + + + const size_t lightprims_size = OptixRaytracer::lightprims().size() + * sizeof(uint32_t); + d_lightprims = DEVICE_ALLOC(lightprims_size); + COPY_TO_DEVICE(d_lightprims, OptixRaytracer::lightprims().data(), + lightprims_size); // Copy the mesh ID for each triangle to the device std::vector mesh_ids; @@ -913,11 +880,8 @@ OptixRaytracer::upload_mesh_data() mesh_ids.push_back(meshid); } const size_t mesh_ids_size = mesh_ids.size() * sizeof(int32_t); - CUDA_CHECK( - cudaMalloc(reinterpret_cast(&d_mesh_ids), mesh_ids_size)); - CUDA_CHECK(cudaMemcpy(reinterpret_cast(d_mesh_ids), mesh_ids.data(), - mesh_ids_size, cudaMemcpyHostToDevice)); - device_ptrs.push_back(d_mesh_ids); + d_mesh_ids = DEVICE_ALLOC(mesh_ids_size); + COPY_TO_DEVICE(d_mesh_ids, mesh_ids.data(), mesh_ids_size); // Copy the mesh surface areas to the device std::vector mesh_surfacearea; @@ -936,12 +900,9 @@ OptixRaytracer::upload_mesh_data() const size_t mesh_surfacearea_size = mesh_surfacearea.size() * sizeof(float); - CUDA_CHECK(cudaMalloc(reinterpret_cast(&d_surfacearea), - mesh_surfacearea_size)); - CUDA_CHECK(cudaMemcpy(reinterpret_cast(d_surfacearea), - mesh_surfacearea.data(), mesh_surfacearea_size, - cudaMemcpyHostToDevice)); - device_ptrs.push_back(d_surfacearea); + d_surfacearea = DEVICE_ALLOC(mesh_surfacearea_size); + COPY_TO_DEVICE(d_surfacearea, mesh_surfacearea.data(), + mesh_surfacearea_size); } @@ -962,54 +923,86 @@ OptixRaytracer::good(TextureHandle* handle OSL_MAYBE_UNUSED) RendererServices::TextureHandle* OptixRaytracer::get_texture_handle(ustring filename, ShadingContext* /*shading_context*/, - const TextureOpt* options) + const TextureOpt* /*options*/) { auto itr = m_samplers.find(filename); if (itr == m_samplers.end()) { - // Open image + // Open image to check the number of mip levels OIIO::ImageBuf image; if (!image.init_spec(filename, 0, 0)) { errhandler().errorfmt("Could not load: {} (hash {})", filename, filename); return (TextureHandle*)nullptr; } - - OIIO::ROI roi = OIIO::get_roi_full(image.spec()); - int32_t width = roi.width(), height = roi.height(); - std::vector pixels(width * height * 4); - - for (int j = 0; j < height; j++) { - for (int i = 0; i < width; i++) { - image.getpixel(i, j, 0, &pixels[((j * width) + i) * 4 + 0]); - } - } - cudaResourceDesc res_desc = {}; + int32_t nmiplevels = std::max(image.nmiplevels(), 1); + int32_t img_width = image.xmax() + 1; + int32_t img_height = image.ymax() + 1; // hard-code textures to 4 channels - int32_t pitch = width * 4 * sizeof(float); cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindFloat); - // TODO: Free this memory + cudaMipmappedArray_t mipmapArray; + cudaExtent extent = make_cudaExtent(img_width, img_height, 0); + CUDA_CHECK(cudaMallocMipmappedArray(&mipmapArray, &channel_desc, extent, + nmiplevels)); + + // Copy the pixel data for each mip level + std::vector> level_pixels(nmiplevels); + for (int32_t level = 0; level < nmiplevels; ++level) { + image.reset(filename, 0, level); + OIIO::ROI roi = OIIO::get_roi_full(image.spec()); + if (!roi.defined()) { + errhandler().errorfmt( + "Could not load mip level {}: {} (hash {})", level, + filename, filename); + return (TextureHandle*)nullptr; + } + + int32_t width = roi.width(), height = roi.height(); + level_pixels[level].resize(width * height * 4); + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + image.getpixel(i, j, 0, + &level_pixels[level][((j * width) + i) * 4]); + } + } + + cudaArray_t miplevelArray; + CUDA_CHECK( + cudaGetMipmappedArrayLevel(&miplevelArray, mipmapArray, level)); + + // Copy the texel data into the miplevel array + int32_t pitch = width * 4 * sizeof(float); + CUDA_CHECK(cudaMemcpy2DToArray(miplevelArray, 0, 0, + level_pixels[level].data(), pitch, + pitch, height, + cudaMemcpyHostToDevice)); + } + + int32_t pitch = img_width * 4 * sizeof(float); cudaArray_t pixelArray; - CUDA_CHECK(cudaMallocArray(&pixelArray, &channel_desc, width, height)); - CUDA_CHECK(cudaMemcpy2DToArray(pixelArray, 0, 0, pixels.data(), pitch, - pitch, height, cudaMemcpyHostToDevice)); - - res_desc.resType = cudaResourceTypeArray; - res_desc.res.array.array = pixelArray; - - cudaTextureDesc tex_desc = {}; - tex_desc.addressMode[0] = cudaAddressModeWrap; - tex_desc.addressMode[1] = cudaAddressModeWrap; - tex_desc.filterMode = cudaFilterModeLinear; - tex_desc.readMode - = cudaReadModeElementType; //cudaReadModeNormalizedFloat; + CUDA_CHECK( + cudaMallocArray(&pixelArray, &channel_desc, img_width, img_height)); + CUDA_CHECK(cudaMemcpy2DToArray(pixelArray, 0, 0, level_pixels[0].data(), + pitch, pitch, img_height, + cudaMemcpyHostToDevice)); + m_arrays_to_free.push_back(pixelArray); + + cudaResourceDesc res_desc = {}; + res_desc.resType = cudaResourceTypeMipmappedArray; + res_desc.res.mipmap.mipmap = mipmapArray; + + cudaTextureDesc tex_desc = {}; + tex_desc.addressMode[0] = cudaAddressModeWrap; + tex_desc.addressMode[1] = cudaAddressModeWrap; + tex_desc.filterMode = cudaFilterModeLinear; + tex_desc.readMode = cudaReadModeElementType; tex_desc.normalizedCoords = 1; tex_desc.maxAnisotropy = 1; - tex_desc.maxMipmapLevelClamp = 99; + tex_desc.maxMipmapLevelClamp = float(nmiplevels - 1); tex_desc.minMipmapLevelClamp = 0; - tex_desc.mipmapFilterMode = cudaFilterModePoint; + tex_desc.mipmapFilterMode = cudaFilterModeLinear; tex_desc.borderColor[0] = 1.0f; tex_desc.sRGB = 0; @@ -1034,8 +1027,10 @@ OptixRaytracer::prepare_render() // Set up the OptiX scene graph build_accel(); + prepare_lights(); upload_mesh_data(); make_optix_materials(); + prepare_background(); } @@ -1054,16 +1049,18 @@ OptixRaytracer::warmup() void OptixRaytracer::render(int xres OSL_MAYBE_UNUSED, int yres OSL_MAYBE_UNUSED) { - CUDA_CHECK(cudaMalloc(reinterpret_cast(&d_output_buffer), - xres * yres * 4 * sizeof(float))); - CUDA_CHECK(cudaMalloc(reinterpret_cast(&d_launch_params), - sizeof(RenderParams))); - device_ptrs.push_back(d_output_buffer); - device_ptrs.push_back(d_launch_params); + d_output_buffer = DEVICE_ALLOC(xres * yres * 4 * sizeof(float)); + d_launch_params = DEVICE_ALLOC(sizeof(RenderParams)); m_xres = xres; m_yres = yres; + const int aa = std::max(1, options.get_int("aa")); + const int max_bounces = options.get_int("max_bounces"); + const bool no_jitter = options.get_int("no_jitter"); + const float show_albedo_scale = options.get_float("show_albedo_scale"); + const int show_globals = options.get_int("show_globals"); + RenderParams params; params.eye.x = camera.eye.x; params.eye.y = camera.eye.y; @@ -1071,14 +1068,15 @@ OptixRaytracer::render(int xres OSL_MAYBE_UNUSED, int yres OSL_MAYBE_UNUSED) params.dir.x = camera.dir.x; params.dir.y = camera.dir.y; params.dir.z = camera.dir.z; - params.cx.x = camera.cx.x; - params.cx.y = camera.cx.y; - params.cx.z = camera.cx.z; - params.cy.x = camera.cy.x; - params.cy.y = camera.cy.y; - params.cy.z = camera.cy.z; - params.invw = 1.0f / m_xres; - params.invh = 1.0f / m_yres; + params.up.x = camera.up.x; + params.up.y = camera.up.y; + params.up.z = camera.up.z; + params.fov = camera.fov; + params.aa = aa; + params.max_bounces = max_bounces; + params.show_albedo_scale = show_albedo_scale; + params.show_globals = show_globals; + params.no_jitter = no_jitter; params.interactive_params = d_interactive_params; params.output_buffer = d_output_buffer; params.traversal_handle = m_travHandle; @@ -1098,16 +1096,24 @@ OptixRaytracer::render(int xres OSL_MAYBE_UNUSED, int yres OSL_MAYBE_UNUSED) params.normal_indices = d_normal_indices; params.shader_ids = d_shader_ids; params.shader_is_light = d_shader_is_light; + params.lightprims = d_lightprims; + params.lightprims_size = OptixRaytracer::lightprims().size(); params.mesh_ids = d_mesh_ids; params.surfacearea = d_surfacearea; - CUDA_CHECK(cudaMemcpy(reinterpret_cast(d_launch_params), ¶ms, - sizeof(RenderParams), cudaMemcpyHostToDevice)); + // For the background shader + params.bg_res = std::max(32, getBackgroundResolution()); + params.bg_id = getBackgroundShaderID(); + params.bg_values = d_bg_values; + params.bg_rows = d_bg_rows; + params.bg_cols = d_bg_cols; + + COPY_TO_DEVICE(d_launch_params, ¶ms, sizeof(RenderParams)); // Set up global variables OPTIX_CHECK(optixLaunch(m_optix_pipeline, m_cuda_stream, d_launch_params, - sizeof(RenderParams), &m_setglobals_optix_sbt, 1, 1, - 1)); + sizeof(RenderParams), &m_setglobals_optix_sbt, 32, + 1, 1)); CUDA_SYNC_CHECK(); // Launch real render diff --git a/src/testrender/optixraytracer.h b/src/testrender/optixraytracer.h index 69d0c6c5b..b15a3147d 100644 --- a/src/testrender/optixraytracer.h +++ b/src/testrender/optixraytracer.h @@ -45,6 +45,7 @@ class OptixRaytracer final : public SimpleRaytracer { void cleanup_programs(); void build_accel(); void upload_mesh_data(); + void prepare_background(); void prepare_render() override; void warmup() override; void render(int xres, int yres) override; @@ -87,7 +88,6 @@ class OptixRaytracer final : public SimpleRaytracer { OptixPipelineLinkOptions m_pipeline_link_options = {}; OptixProgramGroupOptions m_program_options = {}; OptixModule m_program_module = {}; - OptixModule m_wrapper_module = {}; OptixModule m_rend_lib_module = {}; OptixModule m_shadeops_module = {}; OptixProgramGroup m_raygen_group = {}; @@ -115,7 +115,11 @@ class OptixRaytracer final : public SimpleRaytracer { CUdeviceptr d_shader_is_light = 0; CUdeviceptr d_mesh_ids = 0; CUdeviceptr d_surfacearea = 0; + CUdeviceptr d_lightprims = 0; CUdeviceptr d_interactive_params = 0; + CUdeviceptr d_bg_values = 0; + CUdeviceptr d_bg_rows = 0; + CUdeviceptr d_bg_cols = 0; CUdeviceptr d_osl_printf_buffer = 0; CUdeviceptr d_color_system = 0; @@ -137,7 +141,9 @@ class OptixRaytracer final : public SimpleRaytracer { std::string m_materials_ptx; std::unordered_map m_samplers; - std::vector device_ptrs; + // CUdeviceptrs that need to be freed after we are done + std::vector m_ptrs_to_free; + std::vector m_arrays_to_free; }; diff --git a/src/testrender/raytracer.h b/src/testrender/raytracer.h index cf640a8cb..ebf6c94c4 100644 --- a/src/testrender/raytracer.h +++ b/src/testrender/raytracer.h @@ -21,6 +21,10 @@ # include // from CUDA #endif +#ifdef __CUDACC__ +# include "cuda/rend_lib.h" +#endif + // The primitives don't included the intersection routines, etc., from the // versions in testrender, since those operations are performed on the GPU. // @@ -33,7 +37,7 @@ class OptixRenderer; // FIXME -- should not be here // build two vectors orthogonal to the first, assumes n is normalized -inline void +inline OSL_HOSTDEVICE void ortho(const Vec3& n, Vec3& x, Vec3& y) { x = (fabsf(n.x) > .01f ? Vec3(n.z, 0, -n.x) : Vec3(0, -n.z, n.y)) @@ -55,6 +59,7 @@ struct Ray { DISPLACEMENT = 128 }; + OSL_HOSTDEVICE Ray(const Vec3& o, const Vec3& d, float radius, float spread, RayType raytype) : origin(o) @@ -65,7 +70,10 @@ struct Ray { { } + OSL_HOSTDEVICE Vec3 point(float t) const { return origin + direction * t; } + + OSL_HOSTDEVICE Dual2 dual_direction() const { Dual2 v; @@ -76,6 +84,7 @@ struct Ray { return v; } + OSL_HOSTDEVICE Dual2 point(Dual2 t) const { const float r = radius + spread * t.val(); @@ -95,9 +104,10 @@ struct Ray { struct Camera { - Camera() {} + OSL_HOSTDEVICE Camera() {} // Set where the camera sits and looks at. + OSL_HOSTDEVICE void lookat(const Vec3& eye, const Vec3& dir, const Vec3& up, float fov) { this->eye = eye; @@ -108,6 +118,7 @@ struct Camera { } // Set resolution + OSL_HOSTDEVICE void resolution(int w, int h) { xres = w; @@ -118,6 +129,7 @@ struct Camera { } // Compute all derived values based on camera parameters. + OSL_HOSTDEVICE void finalize() { float k = OIIO::fast_tan(fov * float(M_PI / 360)); @@ -127,10 +139,21 @@ struct Camera { } // Get a ray for the given screen coordinates. + OSL_HOSTDEVICE Ray get(float x, float y) const { + // TODO: On CUDA devices, the normalize() operation can result in vector + // components with magnitudes slightly greater than 1.0, which can cause + // downstream computations to blow up and produce NaNs. Normalizing the + // vector again avoids this issue. const Vec3 v = (cx * (x * invw - 0.5f) + cy * (0.5f - y * invh) + dir) +#ifndef __CUDACC__ .normalize(); +#else + .normalize() + .normalized(); +#endif + const float cos_a = dir.dot(v); const float spread = sqrtf(invw * invh * cx.length() * cy.length() * cos_a) * cos_a; @@ -164,6 +187,7 @@ struct LightSample { using ShaderMap = std::unordered_map; struct Scene { +#ifndef __CUDACC__ void add_sphere(const Vec3& c, float r, int shaderID, int resolution); void add_quad(const Vec3& p, const Vec3& ex, const Vec3& ey, int shaderID, @@ -176,11 +200,16 @@ struct Scene { int num_prims() const { return triangles.size(); } void prepare(OIIO::ErrorHandler& errhandler); +#endif + // NB: OptiX needs to populate the ShaderGlobals in the closest-hit program, + // so we need to pass along a pointer to the struct. + OSL_HOSTDEVICE Intersection intersect(const Ray& r, const float tmax, const unsigned skipID1, const unsigned skipID2 = ~0u) const; + OSL_HOSTDEVICE LightSample sample(int primID, const Vec3& x, float xi, float yi) const { // A Low-Distortion Map Between Triangle and Square @@ -205,6 +234,7 @@ struct Scene { return { dir, sqrtf(d2), pdf, xi, yi }; } + OSL_HOSTDEVICE float shapepdf(int primID, const Vec3& x, const Vec3& p) const { const Vec3 va = verts[triangles[primID].a]; @@ -219,6 +249,7 @@ struct Scene { return d2 / (0.5f * fabsf(dir.dot(n))); } + OSL_HOSTDEVICE float primitivearea(int primID) const { const Vec3 va = verts[triangles[primID].a]; @@ -227,6 +258,7 @@ struct Scene { return 0.5f * (va - vb).cross(va - vc).length(); } + OSL_HOSTDEVICE Vec3 normal(const Dual2& p, Vec3& Ng, int primID, float u, float v) const { @@ -246,6 +278,7 @@ struct Scene { return ((1 - u - v) * na + u * nb + v * nc).normalize(); } + OSL_HOSTDEVICE Dual2 uv(const Dual2& p, const Vec3& n, Vec3& dPdu, Vec3& dPdv, int primID, float u, float v) const { @@ -273,8 +306,9 @@ struct Scene { return Dual2((1 - u - v) * ta + u * tb + v * tc); } - int shaderid(int primID) const { return shaderids[primID]; } + OSL_HOSTDEVICE int shaderid(int primID) const { return shaderids[primID]; } +#ifndef __CUDACC__ // basic triangle data std::vector verts; std::vector normals; @@ -287,6 +321,16 @@ struct Scene { last_index; // one entry per mesh, stores the last triangle index (+1) -- also is the start triangle of the next mesh // acceleration structure (built over triangles) std::unique_ptr bvh; +#else + const Vec3* verts; + const Vec3* normals; + const Vec2* uvs; + const TriangleIndices* triangles; + const TriangleIndices* uv_triangles; + const TriangleIndices* n_triangles; + const int* shaderids; + OptixTraversableHandle handle; +#endif }; OSL_NAMESPACE_EXIT diff --git a/src/testrender/render_params.h b/src/testrender/render_params.h index 5d89c83f9..b13059a6c 100644 --- a/src/testrender/render_params.h +++ b/src/testrender/render_params.h @@ -15,11 +15,13 @@ struct RenderParams { float3 eye; float3 dir; - float3 cx; - float3 cy; - - float invw; - float invh; + float3 up; + float fov; + int aa; + int max_bounces; + float show_albedo_scale; + bool no_jitter; + int show_globals; CUdeviceptr traversal_handle; CUdeviceptr output_buffer; @@ -50,6 +52,15 @@ struct RenderParams { CUdeviceptr shader_is_light; CUdeviceptr mesh_ids; CUdeviceptr surfacearea; + CUdeviceptr lightprims; + size_t lightprims_size; + + // for the background + int bg_res; + int bg_id; + CUdeviceptr bg_values; + CUdeviceptr bg_rows; + CUdeviceptr bg_cols; }; diff --git a/src/testrender/sampling.h b/src/testrender/sampling.h index e69f3f5f1..a8484617f 100644 --- a/src/testrender/sampling.h +++ b/src/testrender/sampling.h @@ -14,7 +14,7 @@ OSL_NAMESPACE_ENTER struct TangentFrame { // build frame from unit normal - static TangentFrame from_normal(const Vec3& n) + static OSL_HOSTDEVICE TangentFrame from_normal(const Vec3& n) { // https://graphics.pixar.com/library/OrthonormalB/paper.pdf const float sign = copysignf(1.0f, n.z); @@ -27,7 +27,8 @@ struct TangentFrame { // build frame from unit normal and unit tangent // fallsback to an arbitrary basis if the tangent is 0 or colinear with n - static TangentFrame from_normal_and_tangent(const Vec3& n, const Vec3& t) + static OSL_HOSTDEVICE TangentFrame from_normal_and_tangent(const Vec3& n, + const Vec3& t) { Vec3 x = t - n * dot(n, t); float xlen2 = dot(x, x); @@ -41,18 +42,24 @@ struct TangentFrame { } // transform vector - Vec3 get(float x, float y, float z) const { return x * u + y * v + z * w; } + Vec3 OSL_HOSTDEVICE get(float x, float y, float z) const + { + return x * u + y * v + z * w; + } // untransform vector - float getx(const Vec3& a) const { return a.dot(u); } - float gety(const Vec3& a) const { return a.dot(v); } - float getz(const Vec3& a) const { return a.dot(w); } + float OSL_HOSTDEVICE getx(const Vec3& a) const { return a.dot(u); } + float OSL_HOSTDEVICE gety(const Vec3& a) const { return a.dot(v); } + float OSL_HOSTDEVICE getz(const Vec3& a) const { return a.dot(w); } - Vec3 tolocal(const Vec3& a) const + Vec3 OSL_HOSTDEVICE tolocal(const Vec3& a) const { return Vec3(a.dot(u), a.dot(v), a.dot(w)); } - Vec3 toworld(const Vec3& a) const { return get(a.x, a.y, a.z); } + Vec3 OSL_HOSTDEVICE toworld(const Vec3& a) const + { + return get(a.x, a.y, a.z); + } Vec3 u, v, w; }; @@ -60,7 +67,7 @@ struct TangentFrame { struct Sampling { /// Warp the unit disk onto the unit sphere /// http://psgraphics.blogspot.com/2011/01/improved-code-for-concentric-map.html - static void to_unit_disk(float& x, float& y) + static OSL_HOSTDEVICE void to_unit_disk(float& x, float& y) { const float PI_OVER_4 = float(M_PI_4); const float PI_OVER_2 = float(M_PI_2); @@ -82,8 +89,9 @@ struct Sampling { y *= r; } - static void sample_cosine_hemisphere(const Vec3& N, float rndx, float rndy, - Vec3& out, float& pdf) + static OSL_HOSTDEVICE void sample_cosine_hemisphere(const Vec3& N, + float rndx, float rndy, + Vec3& out, float& pdf) { to_unit_disk(rndx, rndy); float cos_theta = sqrtf(std::max(1 - rndx * rndx - rndy * rndy, 0.0f)); @@ -91,8 +99,9 @@ struct Sampling { pdf = cos_theta * float(M_1_PI); } - static void sample_uniform_hemisphere(const Vec3& N, float rndx, float rndy, - Vec3& out, float& pdf) + static OSL_HOSTDEVICE void sample_uniform_hemisphere(const Vec3& N, + float rndx, float rndy, + Vec3& out, float& pdf) { float phi = float(2 * M_PI) * rndx; float cos_theta = rndy; @@ -118,7 +127,8 @@ struct MIS { // Centralizing the handling of the pdfs this way ensures that all numerical // cases can be enumerated and handled robustly without arbitrary epsilons. template - static inline float power_heuristic(float sampled_pdf, float other_pdf) + static inline OSL_HOSTDEVICE float power_heuristic(float sampled_pdf, + float other_pdf) { // NOTE: inf is ok! assert(sampled_pdf >= 0); @@ -159,9 +169,16 @@ struct MIS { // such as a BRDF mixture. This updates a (weight, pdf) pair with a new one // to represent the sum of both. b is the probability of choosing the provided // weight. A running sum should be started with a weight and pdf of 0. - static inline void update_eval(Color3* w, float* pdf, Color3 ow, float opdf, - float b) + static inline OSL_HOSTDEVICE void + update_eval(Color3* w, float* pdf, Color3 ow, float opdf, float b) { +#ifdef __CUDACC__ + // Check for those pesky NaNs + assert(*pdf == *pdf); + assert(b == b); + assert(opdf == opdf); +#endif + // NOTE: inf is ok! assert(*pdf >= 0); assert(opdf >= 0); @@ -193,6 +210,7 @@ struct MIS { // "Practical Hash-based Owen Scrambling" - Brent Burley - JCGT 2020 // https://jcgt.org/published/0009/04/01/ struct Sampler { + OSL_HOSTDEVICE Sampler(int px, int py, int si) : seed(((px & 2047) << 22) | ((py & 2047) << 11)) , index(reversebits(si)) @@ -200,6 +218,7 @@ struct Sampler { assert(si < (1 << 24)); } + OSL_HOSTDEVICE Vec3 get() { static const uint32_t zmatrix[24] = { @@ -234,7 +253,7 @@ struct Sampler { private: uint32_t seed, index; - static uint32_t hash(uint32_t s) + static OSL_HOSTDEVICE uint32_t hash(uint32_t s) { // https://github.com/skeeto/hash-prospector s ^= s >> 16; @@ -245,7 +264,7 @@ struct Sampler { return s; } - static uint32_t reversebits(uint32_t x) + static OSL_HOSTDEVICE uint32_t reversebits(uint32_t x) { #if defined(__clang__) return __builtin_bitreverse32(x); @@ -259,7 +278,7 @@ struct Sampler { #endif } - static uint32_t owen_scramble(uint32_t p, uint32_t s) + static OSL_HOSTDEVICE uint32_t owen_scramble(uint32_t p, uint32_t s) { // https://psychopath.io/post/2021_01_30_building_a_better_lk_hash // assumes reversed input diff --git a/src/testrender/shading.cpp b/src/testrender/shading.cpp index 78bd2004e..641e5abea 100644 --- a/src/testrender/shading.cpp +++ b/src/testrender/shading.cpp @@ -10,243 +10,39 @@ using namespace OSL; + +#ifndef __CUDACC__ +using ShaderGlobalsType = OSL::ShaderGlobals; +#else +using ShaderGlobalsType = OSL_CUDA::ShaderGlobals; +#endif + + namespace { // anonymous namespace using OIIO::clamp; +using OSL::dot; -Color3 +OSL_HOSTDEVICE Color3 clamp(const Color3& c, float min, float max) { return Color3(clamp(c.x, min, max), clamp(c.y, min, max), clamp(c.z, min, max)); } -bool +OSL_HOSTDEVICE bool is_black(const Color3& c) { return c.x == 0 && c.y == 0 && c.z == 0; } - - -// unique identifier for each closure supported by testrender -enum ClosureIDs { - EMISSION_ID = 1, - BACKGROUND_ID, - DIFFUSE_ID, - OREN_NAYAR_ID, - TRANSLUCENT_ID, - PHONG_ID, - WARD_ID, - MICROFACET_ID, - REFLECTION_ID, - FRESNEL_REFLECTION_ID, - REFRACTION_ID, - TRANSPARENT_ID, - // See MATERIALX_CLOSURES in stdosl.h - MX_OREN_NAYAR_DIFFUSE_ID, - MX_BURLEY_DIFFUSE_ID, - MX_DIELECTRIC_ID, - MX_CONDUCTOR_ID, - MX_GENERALIZED_SCHLICK_ID, - MX_TRANSLUCENT_ID, - MX_TRANSPARENT_ID, - MX_SUBSURFACE_ID, - MX_SHEEN_ID, - MX_UNIFORM_EDF_ID, - MX_ANISOTROPIC_VDF_ID, - MX_MEDIUM_VDF_ID, - MX_LAYER_ID, - // TODO: adding vdfs would require extending testrender with volume support ... -}; - -// these structures hold the parameters of each closure type -// they will be contained inside ClosureComponent -struct EmptyParams {}; -struct DiffuseParams { - Vec3 N; -}; -struct OrenNayarParams { - Vec3 N; - float sigma; -}; -struct PhongParams { - Vec3 N; - float exponent; -}; -struct WardParams { - Vec3 N, T; - float ax, ay; -}; -struct ReflectionParams { - Vec3 N; - float eta; -}; -struct RefractionParams { - Vec3 N; - float eta; -}; -struct MicrofacetParams { - ustringhash dist; - Vec3 N, U; - float xalpha, yalpha, eta; - int refract; -}; - -// MATERIALX_CLOSURES - -struct MxOrenNayarDiffuseParams { - Vec3 N; - Color3 albedo; - float roughness; - // optional - ustringhash label; - int energy_compensation; -}; - -struct MxBurleyDiffuseParams { - Vec3 N; - Color3 albedo; - float roughness; - // optional - ustringhash label; -}; - -// common to all MaterialX microfacet closures -struct MxMicrofacetBaseParams { - Vec3 N, U; - float roughness_x; - float roughness_y; - ustringhash distribution; - // optional - ustringhash label; -}; - -struct MxDielectricParams : public MxMicrofacetBaseParams { - Color3 reflection_tint; - Color3 transmission_tint; - float ior; - // optional - float thinfilm_thickness; - float thinfilm_ior; - - Color3 evalR(float cos_theta) const - { - return reflection_tint * fresnel_dielectric(cos_theta, ior); - } - - Color3 evalT(float cos_theta) const - { - return transmission_tint * (1.0f - fresnel_dielectric(cos_theta, ior)); - } -}; - -struct MxConductorParams : public MxMicrofacetBaseParams { - Color3 ior; - Color3 extinction; - // optional - float thinfilm_thickness; - float thinfilm_ior; - - Color3 evalR(float cos_theta) const - { - return fresnel_conductor(cos_theta, ior, extinction); - } - - Color3 evalT(float cos_theta) const { return Color3(0.0f); } - - // Avoid function was declared but never referenced - // float get_ior() const - // { - // return 0; // no transmission possible - // } -}; - -struct MxGeneralizedSchlickParams : public MxMicrofacetBaseParams { - Color3 reflection_tint; - Color3 transmission_tint; - Color3 f0; - Color3 f90; - float exponent; - // optional - float thinfilm_thickness; - float thinfilm_ior; - - Color3 evalR(float cos_theta) const - { - return reflection_tint - * fresnel_generalized_schlick(cos_theta, f0, f90, exponent); - } - - Color3 evalT(float cos_theta) const - { - return transmission_tint - * (Color3(1.0f) - - fresnel_generalized_schlick(cos_theta, f0, f90, exponent)); - } -}; - -struct MxTranslucentParams { - Vec3 N; - Color3 albedo; - // optional - ustringhash label; -}; - -struct MxSubsurfaceParams { - Vec3 N; - Color3 albedo; - Color3 radius; - float anisotropy; - // optional - ustringhash label; -}; - -struct MxSheenParams { - Vec3 N; - Color3 albedo; - float roughness; - // optional - int mode; - ustringhash label; -}; - -struct MxUniformEdfParams { - Color3 emittance; - // optional - ustringhash label; -}; - -struct MxLayerParams { - OSL::ClosureColor* top; - OSL::ClosureColor* base; -}; - -struct MxAnisotropicVdfParams { - Color3 albedo; - Color3 extinction; - float anisotropy; - // optional - ustringhash label; -}; - -struct MxMediumVdfParams { - Color3 albedo; - float transmission_depth; - Color3 transmission_color; - float anisotropy; - float ior; - int priority; - // optional - ustringhash label; -}; - } // anonymous namespace OSL_NAMESPACE_ENTER +#ifndef __CUDACC__ void register_closures(OSL::ShadingSystem* shadingsys) { @@ -433,24 +229,26 @@ register_closures(OSL::ShadingSystem* shadingsys) for (const BuiltinClosures& b : builtins) shadingsys->register_closure(b.name, b.id, b.params, nullptr, nullptr); } +#endif // ifndef __CUDACC__ OSL_NAMESPACE_EXIT namespace { // anonymous namespace template struct Diffuse final : public BSDF, DiffuseParams { - Diffuse(const DiffuseParams& params) : BSDF(), DiffuseParams(params) + OSL_HOSTDEVICE Diffuse(const DiffuseParams& params) + : BSDF(DIFFUSE_ID), DiffuseParams(params) { if (trans) N = -N; } - Sample eval(const Vec3& /*wo*/, const OSL::Vec3& wi) const override + OSL_HOSTDEVICE Sample eval(const Vec3& /*wo*/, const OSL::Vec3& wi) const { const float pdf = std::max(N.dot(wi), 0.0f) * float(M_1_PI); return { wi, Color3(1.0f), pdf, 1.0f }; } - Sample sample(const Vec3& /*wo*/, float rx, float ry, - float /*rz*/) const override + OSL_HOSTDEVICE Sample sample(const Vec3& /*wo*/, float rx, float ry, + float /*rz*/) const { Vec3 out_dir; float pdf; @@ -460,10 +258,11 @@ template struct Diffuse final : public BSDF, DiffuseParams { }; struct OrenNayar final : public BSDF, OrenNayarParams { - OrenNayar(const OrenNayarParams& params) : BSDF(), OrenNayarParams(params) + OSL_HOSTDEVICE OrenNayar(const OrenNayarParams& params) + : BSDF(OREN_NAYAR_ID), OrenNayarParams(params) { } - Sample eval(const Vec3& wo, const OSL::Vec3& wi) const override + OSL_HOSTDEVICE Sample eval(const Vec3& wo, const OSL::Vec3& wi) const { float NL = N.dot(wi); float NV = N.dot(wo); @@ -485,8 +284,8 @@ struct OrenNayar final : public BSDF, OrenNayarParams { } return {}; } - Sample sample(const Vec3& wo, float rx, float ry, - float /*rz*/) const override + OSL_HOSTDEVICE Sample sample(const Vec3& wo, float rx, float ry, + float /*rz*/) const { Vec3 out_dir; float pdf; @@ -496,11 +295,12 @@ struct OrenNayar final : public BSDF, OrenNayarParams { }; struct EnergyCompensatedOrenNayar : public BSDF, MxOrenNayarDiffuseParams { + OSL_HOSTDEVICE EnergyCompensatedOrenNayar(const MxOrenNayarDiffuseParams& params) - : BSDF(), MxOrenNayarDiffuseParams(params) + : BSDF(MX_OREN_NAYAR_DIFFUSE_ID), MxOrenNayarDiffuseParams(params) { } - Sample eval(const Vec3& wo, const OSL::Vec3& wi) const override + OSL_HOSTDEVICE Sample eval(const Vec3& wo, const OSL::Vec3& wi) const { float NL = N.dot(wi); float NV = N.dot(wo); @@ -537,8 +337,8 @@ struct EnergyCompensatedOrenNayar : public BSDF, MxOrenNayarDiffuseParams { return {}; } - Sample sample(const Vec3& wo, float rx, float ry, - float /*rz*/) const override + OSL_HOSTDEVICE Sample sample(const Vec3& wo, float rx, float ry, + float /*rz*/) const { Vec3 out_dir; float pdf; @@ -551,7 +351,7 @@ struct EnergyCompensatedOrenNayar : public BSDF, MxOrenNayarDiffuseParams { static constexpr float constant2_FON = float(2.0 / 3.0 - 28.0 / (15.0 * M_PI)); - float E_FON_analytic(float mu) const + OSL_HOSTDEVICE float E_FON_analytic(float mu) const { const float sigma = roughness; float AF = 1.0f @@ -567,8 +367,11 @@ struct EnergyCompensatedOrenNayar : public BSDF, MxOrenNayarDiffuseParams { }; struct Phong final : public BSDF, PhongParams { - Phong(const PhongParams& params) : BSDF(), PhongParams(params) {} - Sample eval(const Vec3& wo, const Vec3& wi) const override + OSL_HOSTDEVICE Phong(const PhongParams& params) + : BSDF(PHONG_ID), PhongParams(params) + { + } + OSL_HOSTDEVICE Sample eval(const Vec3& wo, const Vec3& wi) const { float cosNI = N.dot(wi); float cosNO = N.dot(wo); @@ -585,8 +388,8 @@ struct Phong final : public BSDF, PhongParams { } return {}; } - Sample sample(const Vec3& wo, float rx, float ry, - float /*rz*/) const override + OSL_HOSTDEVICE Sample sample(const Vec3& wo, float rx, float ry, + float /*rz*/) const { float cosNO = N.dot(wo); if (cosNO > 0) { @@ -607,8 +410,11 @@ struct Phong final : public BSDF, PhongParams { }; struct Ward final : public BSDF, WardParams { - Ward(const WardParams& params) : BSDF(), WardParams(params) {} - Sample eval(const Vec3& wo, const OSL::Vec3& wi) const override + OSL_HOSTDEVICE Ward(const WardParams& params) + : BSDF(WARD_ID), WardParams(params) + { + } + OSL_HOSTDEVICE Sample eval(const Vec3& wo, const OSL::Vec3& wi) const { float cosNO = N.dot(wo); float cosNI = N.dot(wi); @@ -632,8 +438,8 @@ struct Ward final : public BSDF, WardParams { } return {}; } - Sample sample(const Vec3& wo, float rx, float ry, - float /*rz*/) const override + OSL_HOSTDEVICE Sample sample(const Vec3& wo, float rx, float ry, + float /*rz*/) const { float cosNO = N.dot(wo); if (cosNO > 0) { @@ -710,17 +516,18 @@ struct Ward final : public BSDF, WardParams { * is sufficient). */ struct GGXDist { - static float F(const float tan_m2) + static OSL_HOSTDEVICE float F(const float tan_m2) { return 1 / (float(M_PI) * (1 + tan_m2) * (1 + tan_m2)); } - static float Lambda(const float a2) + static OSL_HOSTDEVICE float Lambda(const float a2) { return 0.5f * (-1.0f + sqrtf(1.0f + 1.0f / a2)); } - static Vec2 sampleSlope(float cos_theta, float randu, float randv) + static OSL_HOSTDEVICE Vec2 sampleSlope(float cos_theta, float randu, + float randv) { // GGX Vec2 slope; @@ -747,12 +554,12 @@ struct GGXDist { }; struct BeckmannDist { - static float F(const float tan_m2) + static OSL_HOSTDEVICE float F(const float tan_m2) { return float(1 / M_PI) * OIIO::fast_exp(-tan_m2); } - static float Lambda(const float a2) + static OSL_HOSTDEVICE float Lambda(const float a2) { const float a = sqrtf(a2); return a < 1.6f ? (1.0f - 1.259f * a + 0.396f * a2) @@ -760,7 +567,8 @@ struct BeckmannDist { : 0.0f; } - static Vec2 sampleSlope(float cos_theta, float randu, float randv) + static OSL_HOSTDEVICE Vec2 sampleSlope(float cos_theta, float randu, + float randv) { const float SQRT_PI_INV = 1 / sqrtf(float(M_PI)); float ct = cos_theta < 1e-6f ? 1e-6f : cos_theta; @@ -808,13 +616,13 @@ struct BeckmannDist { template struct Microfacet final : public BSDF, MicrofacetParams { - Microfacet(const MicrofacetParams& params) - : BSDF() + OSL_HOSTDEVICE Microfacet(const MicrofacetParams& params) + : BSDF(MICROFACET_ID) , MicrofacetParams(params) , tf(TangentFrame::from_normal_and_tangent(N, U)) { } - Color3 get_albedo(const Vec3& wo) const override + OSL_HOSTDEVICE Color3 get_albedo(const Vec3& wo) const { if (Refract == 2) return Color3(1.0f); @@ -823,7 +631,7 @@ struct Microfacet final : public BSDF, MicrofacetParams { float fr = fresnel_dielectric(N.dot(wo), eta); return Color3(Refract ? 1 - fr : fr); } - Sample eval(const Vec3& wo, const OSL::Vec3& wi) const override + OSL_HOSTDEVICE Sample eval(const Vec3& wo, const OSL::Vec3& wi) const { const Vec3 wo_l = tf.tolocal(wo); const Vec3 wi_l = tf.tolocal(wi); @@ -890,7 +698,8 @@ struct Microfacet final : public BSDF, MicrofacetParams { return {}; } - Sample sample(const Vec3& wo, float rx, float ry, float rz) const override + OSL_HOSTDEVICE Sample sample(const Vec3& wo, float rx, float ry, + float rz) const { const Vec3 wo_l = tf.tolocal(wo); const float cosNO = wo_l.z; @@ -948,9 +757,9 @@ struct Microfacet final : public BSDF, MicrofacetParams { } private: - static float SQR(float x) { return x * x; } + static OSL_HOSTDEVICE float SQR(float x) { return x * x; } - float evalLambda(const Vec3 w) const + OSL_HOSTDEVICE float evalLambda(const Vec3 w) const { float cosTheta2 = SQR(w.z); /* Have these two multiplied by sinTheta^2 for convenience */ @@ -959,15 +768,18 @@ struct Microfacet final : public BSDF, MicrofacetParams { return Distribution::Lambda(cosTheta2 / (cosPhi2st2 + sinPhi2st2)); } - static float evalG2(float Lambda_i, float Lambda_o) + static OSL_HOSTDEVICE float evalG2(float Lambda_i, float Lambda_o) { // correlated masking-shadowing return 1 / (Lambda_i + Lambda_o + 1); } - static float evalG1(float Lambda_v) { return 1 / (Lambda_v + 1); } + static OSL_HOSTDEVICE float evalG1(float Lambda_v) + { + return 1 / (Lambda_v + 1); + } - float evalD(const Vec3 Hr) const + OSL_HOSTDEVICE float evalD(const Vec3 Hr) const { float cosThetaM = Hr.z; if (cosThetaM > 0) { @@ -979,12 +791,20 @@ struct Microfacet final : public BSDF, MicrofacetParams { float tanThetaM2 = (cosPhi2st2 + sinPhi2st2) / cosThetaM2; - return Distribution::F(tanThetaM2) / (xalpha * yalpha * cosThetaM4); + const float val = Distribution::F(tanThetaM2) + / (xalpha * yalpha * cosThetaM4); +#ifndef __CUDACC__ + return val; +#else + // Filter out NaNs that can be produced when cosThetaM is very small. + return (val == val) ? val : 0; +#endif } return 0; } - Vec3 sampleMicronormal(const Vec3 wo, float randu, float randv) const + OSL_HOSTDEVICE Vec3 sampleMicronormal(const Vec3 wo, float randu, + float randv) const { /* Project wo and stretch by alpha values */ Vec3 swo = wo; @@ -992,6 +812,10 @@ struct Microfacet final : public BSDF, MicrofacetParams { swo.y *= yalpha; swo = swo.normalize(); +#ifdef __CUDACC__ + swo = swo.normalize(); +#endif + // figure out angles for the incoming vector float cos_theta = std::max(swo.z, 0.0f); float cos_phi = 1; @@ -1029,11 +853,12 @@ typedef Microfacet MicrofacetBeckmannBoth; // We use the CRTP to inherit the parameters because each MaterialX closure uses a different set of parameters -template struct MxMicrofacet final : public BSDF, MxMicrofacetParams { - MxMicrofacet(const MxMicrofacetParams& params, float refraction_ior) - : BSDF() + OSL_HOSTDEVICE MxMicrofacet(const MxMicrofacetParams& params, + float refraction_ior) + : BSDF(ID) , MxMicrofacetParams(params) , tf(TangentFrame::from_normal_and_tangent(MxMicrofacetParams::N, MxMicrofacetParams::U)) @@ -1041,7 +866,7 @@ struct MxMicrofacet final : public BSDF, MxMicrofacetParams { { } - float get_fresnel_angle(float cos_theta) const + OSL_HOSTDEVICE float get_fresnel_angle(float cos_theta) const { if (EnableTransmissionLobe && refraction_ior < 1) { // handle TIR if we are on the backside @@ -1055,7 +880,7 @@ struct MxMicrofacet final : public BSDF, MxMicrofacetParams { return cos_theta; } - Color3 get_albedo(const Vec3& wo) const override + OSL_HOSTDEVICE Color3 get_albedo(const Vec3& wo) const { // if transmission is enabled, punt on if (EnableTransmissionLobe) @@ -1067,7 +892,7 @@ struct MxMicrofacet final : public BSDF, MxMicrofacetParams { get_fresnel_angle(MxMicrofacetParams::N.dot(wo))); } - Sample eval(const Vec3& wo, const OSL::Vec3& wi) const override + OSL_HOSTDEVICE Sample eval(const Vec3& wo, const OSL::Vec3& wi) const { const Vec3 wo_l = tf.tolocal(wo); const Vec3 wi_l = tf.tolocal(wi); @@ -1146,7 +971,8 @@ struct MxMicrofacet final : public BSDF, MxMicrofacetParams { } - Sample sample(const Vec3& wo, float rx, float ry, float rz) const override + OSL_HOSTDEVICE Sample sample(const Vec3& wo, float rx, float ry, + float rz) const { const Vec3 wo_l = tf.tolocal(wo); const float cosNO = wo_l.z; @@ -1221,9 +1047,9 @@ struct MxMicrofacet final : public BSDF, MxMicrofacetParams { } private: - static float SQR(float x) { return x * x; } + static OSL_HOSTDEVICE float SQR(float x) { return x * x; } - float evalLambda(const Vec3 w) const + OSL_HOSTDEVICE float evalLambda(const Vec3 w) const { float cosTheta2 = SQR(w.z); /* Have these two multiplied by sinTheta^2 for convenience */ @@ -1232,15 +1058,18 @@ struct MxMicrofacet final : public BSDF, MxMicrofacetParams { return Distribution::Lambda(cosTheta2 / (cosPhi2st2 + sinPhi2st2)); } - static float evalG2(float Lambda_i, float Lambda_o) + static OSL_HOSTDEVICE float evalG2(float Lambda_i, float Lambda_o) { // correlated masking-shadowing return 1 / (Lambda_i + Lambda_o + 1); } - static float evalG1(float Lambda_v) { return 1 / (Lambda_v + 1); } + static OSL_HOSTDEVICE float evalG1(float Lambda_v) + { + return 1 / (Lambda_v + 1); + } - float evalD(const Vec3 Hr) const + OSL_HOSTDEVICE float evalD(const Vec3 Hr) const { float cosThetaM = Hr.z; if (cosThetaM > 0) { @@ -1252,14 +1081,22 @@ struct MxMicrofacet final : public BSDF, MxMicrofacetParams { float tanThetaM2 = (cosPhi2st2 + sinPhi2st2) / cosThetaM2; - return Distribution::F(tanThetaM2) - / (MxMicrofacetParams::roughness_x - * MxMicrofacetParams::roughness_y * cosThetaM4); + const float val = Distribution::F(tanThetaM2) + / (MxMicrofacetParams::roughness_x + * MxMicrofacetParams::roughness_y + * cosThetaM4); +#ifndef __CUDACC__ + return val; +#else + // Filter out NaNs that can be produced when cosThetaM is very small. + return (val == val) ? val : 0.0; +#endif } return 0; } - Vec3 sampleMicronormal(const Vec3 wo, float randu, float randv) const + OSL_HOSTDEVICE Vec3 sampleMicronormal(const Vec3 wo, float randu, + float randv) const { /* Project wo and stretch by alpha values */ Vec3 swo = wo; @@ -1297,23 +1134,24 @@ struct MxMicrofacet final : public BSDF, MxMicrofacetParams { }; struct Reflection final : public BSDF, ReflectionParams { - Reflection(const ReflectionParams& params) - : BSDF(), ReflectionParams(params) + OSL_HOSTDEVICE Reflection(const ReflectionParams& params) + : BSDF(REFLECTION_ID), ReflectionParams(params) { } - Color3 get_albedo(const Vec3& wo) const override + OSL_HOSTDEVICE Color3 get_albedo(const Vec3& wo) const { float cosNO = N.dot(wo); if (cosNO > 0) return Color3(fresnel_dielectric(cosNO, eta)); return Color3(1); } - Sample eval(const Vec3& /*wo*/, const OSL::Vec3& /*wi*/) const override + OSL_HOSTDEVICE Sample eval(const Vec3& /*wo*/, + const OSL::Vec3& /*wi*/) const { return {}; } - Sample sample(const Vec3& wo, float /*rx*/, float /*ry*/, - float /*rz*/) const override + OSL_HOSTDEVICE Sample sample(const Vec3& wo, float /*rx*/, float /*ry*/, + float /*rz*/) const { // only one direction is possible float cosNO = dot(N, wo); @@ -1327,21 +1165,22 @@ struct Reflection final : public BSDF, ReflectionParams { }; struct Refraction final : public BSDF, RefractionParams { - Refraction(const RefractionParams& params) - : BSDF(), RefractionParams(params) + OSL_HOSTDEVICE Refraction(const RefractionParams& params) + : BSDF(REFRACTION_ID), RefractionParams(params) { } - Color3 get_albedo(const Vec3& wo) const override + OSL_HOSTDEVICE Color3 get_albedo(const Vec3& wo) const { float cosNO = N.dot(wo); return Color3(1 - fresnel_dielectric(cosNO, eta)); } - Sample eval(const Vec3& /*wo*/, const OSL::Vec3& /*wi*/) const override + OSL_HOSTDEVICE Sample eval(const Vec3& /*wo*/, + const OSL::Vec3& /*wi*/) const { return {}; } - Sample sample(const Vec3& wo, float /*rx*/, float /*ry*/, - float /*rz*/) const override + OSL_HOSTDEVICE Sample sample(const Vec3& wo, float /*rx*/, float /*ry*/, + float /*rz*/) const { float pdf = std::numeric_limits::infinity(); Vec3 wi; @@ -1351,13 +1190,13 @@ struct Refraction final : public BSDF, RefractionParams { }; struct Transparent final : public BSDF { - Transparent() : BSDF() {} - Sample eval(const Vec3& /*wo*/, const Vec3& /*wi*/) const override + OSL_HOSTDEVICE Transparent() : BSDF(TRANSPARENT_ID) {} + OSL_HOSTDEVICE Sample eval(const Vec3& /*wo*/, const Vec3& /*wi*/) const { return {}; } - Sample sample(const Vec3& wo, float /*rx*/, float /*ry*/, - float /*rz*/) const override + OSL_HOSTDEVICE Sample sample(const Vec3& wo, float /*rx*/, float /*ry*/, + float /*rz*/) const { Vec3 wi = -wo; float pdf = std::numeric_limits::infinity(); @@ -1366,14 +1205,14 @@ struct Transparent final : public BSDF { }; struct MxBurleyDiffuse final : public BSDF, MxBurleyDiffuseParams { - MxBurleyDiffuse(const MxBurleyDiffuseParams& params) - : BSDF(), MxBurleyDiffuseParams(params) + OSL_HOSTDEVICE MxBurleyDiffuse(const MxBurleyDiffuseParams& params) + : BSDF(MX_BURLEY_DIFFUSE_ID), MxBurleyDiffuseParams(params) { } - Color3 get_albedo(const Vec3& wo) const override { return albedo; } + OSL_HOSTDEVICE Color3 get_albedo(const Vec3& wo) const { return albedo; } - Sample eval(const Vec3& wo, const Vec3& wi) const override + OSL_HOSTDEVICE Sample eval(const Vec3& wo, const Vec3& wi) const { const Vec3 L = wi, V = wo; const Vec3 H = (L + V).normalize(); @@ -1387,7 +1226,8 @@ struct MxBurleyDiffuse final : public BSDF, MxBurleyDiffuseParams { return { wi, albedo * refL * refV, pdf, 1.0f }; } - Sample sample(const Vec3& wo, float rx, float ry, float rz) const override + OSL_HOSTDEVICE Sample sample(const Vec3& wo, float rx, float ry, + float rz) const { Vec3 out_dir; float pdf; @@ -1401,9 +1241,12 @@ struct MxBurleyDiffuse final : public BSDF, MxBurleyDiffuseParams { // To simplify the implementation, the simpler shadowing/masking visibility term below is used: // https://dassaultsystemes-technology.github.io/EnterprisePBRShadingModel/spec-2022x.md.html#components/sheen struct CharlieSheen final : public BSDF, MxSheenParams { - CharlieSheen(const MxSheenParams& params) : BSDF(), MxSheenParams(params) {} + OSL_HOSTDEVICE CharlieSheen(const MxSheenParams& params) + : BSDF(MX_SHEEN_ID), MxSheenParams(params) + { + } - Color3 get_albedo(const Vec3& wo) const override + OSL_HOSTDEVICE Color3 get_albedo(const Vec3& wo) const { const float NdotV = clamp(N.dot(wo), 0.0f, 1.0f); // Rational fit from the Material X project @@ -1417,7 +1260,7 @@ struct CharlieSheen final : public BSDF, MxSheenParams { return clamp(albedo * (r.x / r.y), 0.0f, 1.0f); } - Sample eval(const Vec3& wo, const Vec3& wi) const override + OSL_HOSTDEVICE Sample eval(const Vec3& wo, const Vec3& wi) const { const Vec3 L = wi, V = wo; const Vec3 H = (L + V).normalize(); @@ -1437,7 +1280,8 @@ struct CharlieSheen final : public BSDF, MxSheenParams { pdf, 1.0f }; } - Sample sample(const Vec3& wo, float rx, float ry, float rz) const override + OSL_HOSTDEVICE Sample sample(const Vec3& wo, float rx, float ry, + float rz) const { Vec3 out_dir; float pdf; @@ -1451,20 +1295,20 @@ struct CharlieSheen final : public BSDF, MxSheenParams { // Tizian Zeltner, Brent Burley, Matt Jen-Yuan Chiang - Siggraph 2022 // https://tizianzeltner.com/projects/Zeltner2022Practical/ struct ZeltnerBurleySheen final : public BSDF, MxSheenParams { - ZeltnerBurleySheen(const MxSheenParams& params) - : BSDF(), MxSheenParams(params) + OSL_HOSTDEVICE ZeltnerBurleySheen(const MxSheenParams& params) + : BSDF(MX_SHEEN_ID), MxSheenParams(params) { } #define USE_LTC_SAMPLING 1 - Color3 get_albedo(const Vec3& wo) const override + OSL_HOSTDEVICE Color3 get_albedo(const Vec3& wo) const { const float NdotV = clamp(N.dot(wo), 1e-5f, 1.0f); return Color3(fetch_ltc(NdotV).z); } - Sample eval(const Vec3& wo, const Vec3& wi) const override + OSL_HOSTDEVICE Sample eval(const Vec3& wo, const Vec3& wi) const { const Vec3 L = wi, V = wo; const float NdotV = clamp(N.dot(V), 0.0f, 1.0f); @@ -1492,7 +1336,8 @@ struct ZeltnerBurleySheen final : public BSDF, MxSheenParams { #endif } - Sample sample(const Vec3& wo, float rx, float ry, float rz) const override + OSL_HOSTDEVICE Sample sample(const Vec3& wo, float rx, float ry, + float rz) const { #if USE_LTC_SAMPLING == 1 const Vec3 V = wo; @@ -1523,7 +1368,7 @@ struct ZeltnerBurleySheen final : public BSDF, MxSheenParams { } private: - Vec3 fetch_ltc(float NdotV) const + OSL_HOSTDEVICE Vec3 fetch_ltc(float NdotV) const { // To avoid look-up tables, we use a fit of the LTC coefficients derived by Stephen Hill // for the implementation in MaterialX: @@ -1554,153 +1399,208 @@ struct ZeltnerBurleySheen final : public BSDF, MxSheenParams { }; -Color3 -evaluate_layer_opacity(const OSL::ShaderGlobals& sg, - const ClosureColor* closure) +OSL_HOSTDEVICE Color3 +evaluate_layer_opacity(const ShaderGlobalsType& sg, const ClosureColor* closure) { // Null closure, the layer is fully transparent if (closure == nullptr) return Color3(0); - switch (closure->id) { - case ClosureColor::MUL: - return closure->as_mul()->weight - * evaluate_layer_opacity(sg, closure->as_mul()->closure); - case ClosureColor::ADD: - return evaluate_layer_opacity(sg, closure->as_add()->closureA) - + evaluate_layer_opacity(sg, closure->as_add()->closureB); - default: { - const ClosureComponent* comp = closure->as_comp(); - Color3 w = comp->w; - switch (comp->id) { - case MX_LAYER_ID: { - const MxLayerParams* srcparams = comp->as(); - return w - * (evaluate_layer_opacity(sg, srcparams->top) - + evaluate_layer_opacity(sg, srcparams->base)); - } - case REFLECTION_ID: - case FRESNEL_REFLECTION_ID: { - Reflection bsdf(*comp->as()); - return w * bsdf.get_albedo(-sg.I); - } - case MX_DIELECTRIC_ID: { - const MxDielectricParams& params = *comp->as(); - // Transmissive dielectrics are opaque - if (!is_black(params.transmission_tint)) - return Color3(1); - MxMicrofacet mf(params, 1.0f); - return w * mf.get_albedo(-sg.I); - } - case MX_GENERALIZED_SCHLICK_ID: { - const MxGeneralizedSchlickParams& params - = *comp->as(); - // Transmissive dielectrics are opaque - if (!is_black(params.transmission_tint)) - return Color3(1); - MxMicrofacet mf(params, - 1.0f); - return w * mf.get_albedo(-sg.I); + // Non-recursive traversal stack + const int STACK_SIZE = 16; + int stack_idx = 0; + const ClosureColor* ptr_stack[STACK_SIZE]; + Color3 weight_stack[STACK_SIZE]; + Color3 weight = Color3(1.0f); + + while (closure) { + switch (closure->id) { + case ClosureColor::MUL: + weight *= closure->as_mul()->weight; + closure = closure->as_mul()->closure; + break; + case ClosureColor::ADD: + ptr_stack[stack_idx] = closure->as_add()->closureB; + weight_stack[stack_idx++] = weight; + closure = closure->as_add()->closureA; + break; + default: { + const ClosureComponent* comp = closure->as_comp(); + Color3 w = comp->w; + switch (comp->id) { + case MX_LAYER_ID: { + const MxLayerParams* srcparams = comp->as(); + closure = srcparams->top; + ptr_stack[stack_idx] = srcparams->base; + weight_stack[stack_idx++] = weight * w; + break; + } + case REFLECTION_ID: + case FRESNEL_REFLECTION_ID: { + Reflection bsdf(*comp->as()); + weight *= w * bsdf.get_albedo(-sg.I); + closure = nullptr; + break; + } + case MX_DIELECTRIC_ID: { + const MxDielectricParams& params + = *comp->as(); + // Transmissive dielectrics are opaque + if (!is_black(params.transmission_tint)) { + closure = nullptr; + break; + } + MxMicrofacet + mf(params, 1.0f); + weight *= w * mf.get_albedo(-sg.I); + closure = nullptr; + break; + } + case MX_GENERALIZED_SCHLICK_ID: { + const MxGeneralizedSchlickParams& params + = *comp->as(); + // Transmissive dielectrics are opaque + if (!is_black(params.transmission_tint)) { + closure = nullptr; + break; + } + MxMicrofacet + mf(params, 1.0f); + weight *= w * mf.get_albedo(-sg.I); + closure = nullptr; + break; + } + case MX_SHEEN_ID: { + const MxSheenParams& params = *comp->as(); + if (params.mode == 1) { + weight *= w * ZeltnerBurleySheen(params).get_albedo(-sg.I); + } else { + // otherwise, default to old sheen model + weight *= w * CharlieSheen(params).get_albedo(-sg.I); + } + closure = nullptr; + break; + } + default: // Assume unhandled BSDFs are opaque + closure = nullptr; + break; + } } - case MX_SHEEN_ID: { - const MxSheenParams& params = *comp->as(); - if (params.mode == 1) - return w * ZeltnerBurleySheen(params).get_albedo(-sg.I); - // otherwise, default to old sheen model - return w * CharlieSheen(params).get_albedo(-sg.I); } - default: // Assume unhandled BSDFs are opaque - return Color3(1); + if (closure == nullptr && stack_idx > 0) { + closure = ptr_stack[--stack_idx]; + weight = weight_stack[stack_idx]; } } - } - OSL_ASSERT(false && "Layer opacity evaluation failed"); - return Color3(0); + return weight; } -void -process_medium_closure(const OSL::ShaderGlobals& sg, ShadingResult& result, +OSL_HOSTDEVICE void +process_medium_closure(const ShaderGlobalsType& sg, ShadingResult& result, const ClosureColor* closure, const Color3& w) { if (!closure) return; - switch (closure->id) { - case ClosureColor::MUL: { - process_medium_closure(sg, result, closure->as_mul()->closure, - w * closure->as_mul()->weight); - break; - } - case ClosureColor::ADD: { - process_medium_closure(sg, result, closure->as_add()->closureA, w); - process_medium_closure(sg, result, closure->as_add()->closureB, w); - break; - } - case MX_LAYER_ID: { - const ClosureComponent* comp = closure->as_comp(); - const MxLayerParams* params = comp->as(); - Color3 base_w - = w - * (Color3(1) - - clamp(evaluate_layer_opacity(sg, params->top), 0.f, 1.f)); - process_medium_closure(sg, result, params->top, w); - process_medium_closure(sg, result, params->base, base_w); - break; - } - case MX_ANISOTROPIC_VDF_ID: { - const ClosureComponent* comp = closure->as_comp(); - Color3 cw = w * comp->w; - const auto& params = *comp->as(); - result.sigma_t = cw * params.extinction; - result.sigma_s = params.albedo * result.sigma_t; - result.medium_g = params.anisotropy; - break; - } - case MX_MEDIUM_VDF_ID: { - const ClosureComponent* comp = closure->as_comp(); - Color3 cw = w * comp->w; - const auto& params = *comp->as(); - result.sigma_t = { -OIIO::fast_log(params.transmission_color.x), - -OIIO::fast_log(params.transmission_color.y), - -OIIO::fast_log(params.transmission_color.z) }; - // NOTE: closure weight scales the extinction parameter - result.sigma_t *= cw / params.transmission_depth; - result.sigma_s = params.albedo * result.sigma_t; - result.medium_g = params.anisotropy; - // TODO: properly track a medium stack here ... - result.refraction_ior = sg.backfacing ? 1.0f / params.ior : params.ior; - result.priority = params.priority; - break; - } - case MX_DIELECTRIC_ID: { - const ClosureComponent* comp = closure->as_comp(); - const auto& params = *comp->as(); - if (!is_black(w * comp->w * params.transmission_tint)) { + + // Non-recursive traversal stack + const int STACK_SIZE = 16; + int stack_idx = 0; + const ClosureColor* ptr_stack[STACK_SIZE]; + Color3 weight_stack[STACK_SIZE]; + Color3 weight = w; + + while (closure) { + switch (closure->id) { + case ClosureColor::MUL: { + weight *= closure->as_mul()->weight; + closure = closure->as_mul()->closure; + break; + } + case ClosureColor::ADD: { + weight_stack[stack_idx] = weight; + ptr_stack[stack_idx++] = closure->as_add()->closureB; + closure = closure->as_add()->closureA; + break; + } + case MX_LAYER_ID: { + const ClosureComponent* comp = closure->as_comp(); + const MxLayerParams* params = comp->as(); + Color3 base_w = weight + * (Color3(1) + - clamp(evaluate_layer_opacity(sg, params->top), + 0.f, 1.f)); + closure = params->top; + ptr_stack[stack_idx] = params->base; + weight_stack[stack_idx++] = weight * base_w; + break; + } + case MX_ANISOTROPIC_VDF_ID: { + const ClosureComponent* comp = closure->as_comp(); + Color3 cw = weight * comp->w; + const auto& params = *comp->as(); + result.sigma_t = cw * params.extinction; + result.sigma_s = params.albedo * result.sigma_t; + result.medium_g = params.anisotropy; + closure = nullptr; + break; + } + case MX_MEDIUM_VDF_ID: { + const ClosureComponent* comp = closure->as_comp(); + Color3 cw = weight * comp->w; + const auto& params = *comp->as(); + result.sigma_t = { -OIIO::fast_log(params.transmission_color.x), + -OIIO::fast_log(params.transmission_color.y), + -OIIO::fast_log(params.transmission_color.z) }; + // NOTE: closure weight scales the extinction parameter + result.sigma_t *= cw / params.transmission_depth; + result.sigma_s = params.albedo * result.sigma_t; + result.medium_g = params.anisotropy; // TODO: properly track a medium stack here ... result.refraction_ior = sg.backfacing ? 1.0f / params.ior : params.ior; + result.priority = params.priority; + closure = nullptr; + break; } - break; - } - case MX_GENERALIZED_SCHLICK_ID: { - const ClosureComponent* comp = closure->as_comp(); - const auto& params = *comp->as(); - if (!is_black(w * comp->w * params.transmission_tint)) { - // TODO: properly track a medium stack here ... - float avg_F0 = clamp((params.f0.x + params.f0.y + params.f0.z) - / 3.0f, - 0.0f, 0.99f); - float sqrt_F0 = sqrtf(avg_F0); - float ior = (1 + sqrt_F0) / (1 - sqrt_F0); - result.refraction_ior = sg.backfacing ? 1.0f / ior : ior; + case MX_DIELECTRIC_ID: { + const ClosureComponent* comp = closure->as_comp(); + const auto& params = *comp->as(); + if (!is_black(weight * comp->w * params.transmission_tint)) { + // TODO: properly track a medium stack here ... + result.refraction_ior = sg.backfacing ? 1.0f / params.ior + : params.ior; + } + closure = nullptr; + break; + } + case MX_GENERALIZED_SCHLICK_ID: { + const ClosureComponent* comp = closure->as_comp(); + const auto& params = *comp->as(); + if (!is_black(weight * comp->w * params.transmission_tint)) { + // TODO: properly track a medium stack here ... + float avg_F0 = clamp((params.f0.x + params.f0.y + params.f0.z) + / 3.0f, + 0.0f, 0.99f); + float sqrt_F0 = sqrtf(avg_F0); + float ior = (1 + sqrt_F0) / (1 - sqrt_F0); + result.refraction_ior = sg.backfacing ? 1.0f / ior : ior; + } + closure = nullptr; + break; + } + default: closure = nullptr; break; + } + if (closure == nullptr && stack_idx > 0) { + closure = ptr_stack[--stack_idx]; + weight = weight_stack[stack_idx]; } - break; - } } } // recursively walk through the closure tree, creating bsdfs as we go -void -process_bsdf_closure(const OSL::ShaderGlobals& sg, ShadingResult& result, +OSL_HOSTDEVICE void +process_bsdf_closure(const ShaderGlobalsType& sg, ShadingResult& result, const ClosureColor* closure, const Color3& w, bool light_only) { @@ -1709,242 +1609,605 @@ process_bsdf_closure(const OSL::ShaderGlobals& sg, ShadingResult& result, static const ustringhash uh_default("default"); if (!closure) return; - switch (closure->id) { - case ClosureColor::MUL: { - Color3 cw = w * closure->as_mul()->weight; - process_bsdf_closure(sg, result, closure->as_mul()->closure, cw, - light_only); - break; - } - case ClosureColor::ADD: { - process_bsdf_closure(sg, result, closure->as_add()->closureA, w, - light_only); - process_bsdf_closure(sg, result, closure->as_add()->closureB, w, - light_only); - break; - } - default: { - const ClosureComponent* comp = closure->as_comp(); - Color3 cw = w * comp->w; - if (comp->id == EMISSION_ID) - result.Le += cw; - else if (comp->id == MX_UNIFORM_EDF_ID) - result.Le += cw * comp->as()->emittance; - else if (!light_only) { - bool ok = false; - switch (comp->id) { - case DIFFUSE_ID: - ok = result.bsdf.add_bsdf>( - cw, *comp->as()); - break; - case OREN_NAYAR_ID: - ok = result.bsdf.add_bsdf( - cw, *comp->as()); - break; - case TRANSLUCENT_ID: - ok = result.bsdf.add_bsdf>( - cw, *comp->as()); - break; - case PHONG_ID: - ok = result.bsdf.add_bsdf(cw, *comp->as()); - break; - case WARD_ID: - ok = result.bsdf.add_bsdf(cw, *comp->as()); - break; - case MICROFACET_ID: { - const MicrofacetParams* mp = comp->as(); - if (mp->dist == uh_ggx) { - switch (mp->refract) { - case 0: - ok = result.bsdf.add_bsdf(cw, *mp); - break; - case 1: - ok = result.bsdf.add_bsdf(cw, *mp); - break; - case 2: - ok = result.bsdf.add_bsdf(cw, *mp); - break; + + // Non-recursive traversal stack + const int STACK_SIZE = 16; + int stack_idx = 0; + const ClosureColor* ptr_stack[STACK_SIZE]; + Color3 weight_stack[STACK_SIZE]; + Color3 weight = w; + + while (closure) { + switch (closure->id) { + case ClosureColor::MUL: { + weight *= closure->as_mul()->weight; + closure = closure->as_mul()->closure; + break; + } + case ClosureColor::ADD: { + ptr_stack[stack_idx] = closure->as_add()->closureB; + weight_stack[stack_idx++] = weight; + closure = closure->as_add()->closureA; + break; + } + default: { + const ClosureComponent* comp = closure->as_comp(); + Color3 cw = weight * comp->w; + closure = nullptr; + if (comp->id == EMISSION_ID) + result.Le += cw; + else if (comp->id == MX_UNIFORM_EDF_ID) + result.Le += cw * comp->as()->emittance; + else if (!light_only) { + bool ok = false; + switch (comp->id) { + case DIFFUSE_ID: + ok = result.bsdf.add_bsdf>( + cw, *comp->as()); + break; + case OREN_NAYAR_ID: + ok = result.bsdf.add_bsdf( + cw, *comp->as()); + break; + case TRANSLUCENT_ID: + ok = result.bsdf.add_bsdf>( + cw, *comp->as()); + break; + case PHONG_ID: + ok = result.bsdf.add_bsdf(cw, + *comp->as()); + break; + case WARD_ID: + ok = result.bsdf.add_bsdf(cw, + *comp->as()); + break; + case MICROFACET_ID: { + const MicrofacetParams* mp = comp->as(); + if (mp->dist == uh_ggx) { + switch (mp->refract) { + case 0: + ok = result.bsdf.add_bsdf(cw, + *mp); + break; + case 1: + ok = result.bsdf.add_bsdf(cw, + *mp); + break; + case 2: + ok = result.bsdf.add_bsdf(cw, + *mp); + break; + } + } else if (mp->dist == uh_beckmann + || mp->dist == uh_default) { + switch (mp->refract) { + case 0: + ok = result.bsdf.add_bsdf( + cw, *mp); + break; + case 1: + ok = result.bsdf.add_bsdf( + cw, *mp); + break; + case 2: + ok = result.bsdf.add_bsdf( + cw, *mp); + break; + } } - } else if (mp->dist == uh_beckmann || mp->dist == uh_default) { - switch (mp->refract) { - case 0: - ok = result.bsdf.add_bsdf(cw, - *mp); - break; - case 1: - ok = result.bsdf.add_bsdf(cw, - *mp); - break; - case 2: - ok = result.bsdf.add_bsdf(cw, - *mp); - break; + break; + } + case REFLECTION_ID: + case FRESNEL_REFLECTION_ID: + ok = result.bsdf.add_bsdf( + cw, *comp->as()); + break; + case REFRACTION_ID: + ok = result.bsdf.add_bsdf( + cw, *comp->as()); + break; + case TRANSPARENT_ID: + ok = result.bsdf.add_bsdf(cw); + break; + case MX_OREN_NAYAR_DIFFUSE_ID: { + const MxOrenNayarDiffuseParams* srcparams + = comp->as(); + if (srcparams->energy_compensation) { + // energy compensation handled by its own BSDF + ok = result.bsdf.add_bsdf( + cw, *srcparams); + } else { + // translate MaterialX parameters into existing closure + OrenNayarParams params = {}; + params.N = srcparams->N; + params.sigma = srcparams->roughness; + ok = result.bsdf.add_bsdf( + cw * srcparams->albedo, params); } + break; } - break; - } - case REFLECTION_ID: - case FRESNEL_REFLECTION_ID: - ok = result.bsdf.add_bsdf( - cw, *comp->as()); - break; - case REFRACTION_ID: - ok = result.bsdf.add_bsdf( - cw, *comp->as()); - break; - case TRANSPARENT_ID: - ok = result.bsdf.add_bsdf(cw); - break; - case MX_OREN_NAYAR_DIFFUSE_ID: { - const MxOrenNayarDiffuseParams* srcparams - = comp->as(); - if (srcparams->energy_compensation) { - // energy compensation handled by its own BSDF - ok = result.bsdf.add_bsdf( - cw, *srcparams); - } else { - // translate MaterialX parameters into existing closure - OrenNayarParams params = {}; - params.N = srcparams->N; - params.sigma = srcparams->roughness; - ok = result.bsdf.add_bsdf(cw * srcparams->albedo, - params); + case MX_BURLEY_DIFFUSE_ID: { + const MxBurleyDiffuseParams& params + = *comp->as(); + ok = result.bsdf.add_bsdf(cw, params); + break; } - break; + case MX_DIELECTRIC_ID: { + const MxDielectricParams& params + = *comp->as(); + if (is_black(params.transmission_tint)) + ok = result.bsdf.add_bsdf< + MxMicrofacet>(cw, params, + 1.0f); + else + ok = result.bsdf.add_bsdf>( + cw, params, result.refraction_ior); + break; + } + case MX_CONDUCTOR_ID: { + const MxConductorParams& params + = *comp->as(); + ok = result.bsdf.add_bsdf>( + cw, params, 1.0f); + break; + }; + case MX_GENERALIZED_SCHLICK_ID: { + const MxGeneralizedSchlickParams& params + = *comp->as(); + if (is_black(params.transmission_tint)) + ok = result.bsdf.add_bsdf< + MxMicrofacet>( + cw, params, 1.0f); + else + ok = result.bsdf.add_bsdf< + MxMicrofacet>( + cw, params, result.refraction_ior); + break; + }; + case MX_TRANSLUCENT_ID: { + const MxTranslucentParams* srcparams + = comp->as(); + DiffuseParams params = {}; + params.N = srcparams->N; + ok = result.bsdf.add_bsdf>(cw * srcparams->albedo, + params); + break; + } + case MX_TRANSPARENT_ID: { + ok = result.bsdf.add_bsdf(cw); + break; + } + case MX_SUBSURFACE_ID: { + // TODO: implement BSSRDF support? + const MxSubsurfaceParams* srcparams + = comp->as(); + DiffuseParams params = {}; + params.N = srcparams->N; + ok = result.bsdf.add_bsdf>(cw * srcparams->albedo, + params); + break; + } + case MX_SHEEN_ID: { + const MxSheenParams& params = *comp->as(); + if (params.mode == 1) + ok = result.bsdf.add_bsdf(cw, + params); + else + ok = result.bsdf.add_bsdf( + cw, params); // default to legacy closure + break; + } + case MX_LAYER_ID: { + const MxLayerParams* srcparams = comp->as(); + Color3 base_w + = weight + * (Color3(1, 1, 1) + - clamp(evaluate_layer_opacity(sg, srcparams->top), + 0.f, 1.f)); + closure = srcparams->top; + weight = cw; + if (!is_black(base_w)) { + ptr_stack[stack_idx] = srcparams->base; + weight_stack[stack_idx++] = base_w; + } + ok = true; + break; + } + case MX_ANISOTROPIC_VDF_ID: + case MX_MEDIUM_VDF_ID: { + // already processed by process_medium_closure + ok = true; + break; + } + } +#ifndef __CUDACC__ + OSL_ASSERT(ok && "Invalid closure invoked in surface shader"); +#else + // TODO: We should never get here, but we sometimes do, e.g. in + // the render-material-layer test. + if (false && !ok) + printf("Invalid closure invoked in surface shader\n"); +#endif } - case MX_BURLEY_DIFFUSE_ID: { - const MxBurleyDiffuseParams& params - = *comp->as(); - ok = result.bsdf.add_bsdf(cw, params); + break; + } + } + if (closure == nullptr && stack_idx > 0) { + closure = ptr_stack[--stack_idx]; + weight = weight_stack[stack_idx]; + } + } +} + +} // anonymous namespace + +OSL_NAMESPACE_ENTER + +OSL_HOSTDEVICE void +process_closure(const ShaderGlobalsType& sg, ShadingResult& result, + const ClosureColor* Ci, bool light_only) +{ + if (!light_only) + process_medium_closure(sg, result, Ci, Color3(1)); + process_bsdf_closure(sg, result, Ci, Color3(1), light_only); +} + +OSL_HOSTDEVICE Vec3 +process_background_closure(const ClosureColor* closure) +{ + if (!closure) + return Vec3(0, 0, 0); + + // Non-recursive traversal stack + const int STACK_SIZE = 16; + int stack_idx = 0; + const ClosureColor* ptr_stack[STACK_SIZE]; + Color3 weight_stack[STACK_SIZE]; + Color3 weight = Color3(1.0f); + + while (closure) { + switch (closure->id) { + case ClosureColor::MUL: { + weight *= closure->as_mul()->weight; + closure = closure->as_mul()->closure; + break; + } + case ClosureColor::ADD: { + ptr_stack[stack_idx] = closure->as_add()->closureB; + weight_stack[stack_idx++] = weight; + closure = closure->as_add()->closureA; + break; + } + case BACKGROUND_ID: { + weight *= closure->as_comp()->w; + closure = nullptr; + break; + } + } + if (closure == nullptr && stack_idx > 0) { + closure = ptr_stack[--stack_idx]; + weight = weight_stack[stack_idx]; + } + } + return weight; +} + + +typedef MxMicrofacet + MxConductor; +typedef MxMicrofacet + MxDielectric; +typedef MxMicrofacet + MxDielectricOpaque; +typedef MxMicrofacet + MxGeneralizedSchlick; +typedef MxMicrofacet + MxGeneralizedSchlickOpaque; + +OSL_HOSTDEVICE Color3 +CompositeBSDF::get_albedo(const BSDF* bsdf, const Vec3& wo) const +{ + static const ustringhash uh_ggx("ggx"); + static const ustringhash uh_beckmann("beckmann"); + static const ustringhash uh_default("default"); + + Color3 albedo(0); + switch (bsdf->id) { + case DIFFUSE_ID: + albedo = BSDF_CAST(Diffuse<0>, bsdf)->get_albedo(wo); + break; + case TRANSPARENT_ID: + case MX_TRANSPARENT_ID: + albedo = BSDF_CAST(Transparent, bsdf)->get_albedo(wo); + break; + case OREN_NAYAR_ID: + albedo = BSDF_CAST(OrenNayar, bsdf)->get_albedo(wo); + break; + case TRANSLUCENT_ID: + albedo = BSDF_CAST(Diffuse<1>, bsdf)->get_albedo(wo); + break; + case PHONG_ID: albedo = BSDF_CAST(Phong, bsdf)->get_albedo(wo); break; + case WARD_ID: albedo = BSDF_CAST(Ward, bsdf)->get_albedo(wo); break; + case REFLECTION_ID: + case FRESNEL_REFLECTION_ID: + albedo = BSDF_CAST(Reflection, bsdf)->get_albedo(wo); + break; + case REFRACTION_ID: + albedo = BSDF_CAST(Refraction, bsdf)->get_albedo(wo); + break; + case MICROFACET_ID: { + const int refract = ((MicrofacetBeckmannRefl*)bsdf)->refract; + const ustringhash dist = ((MicrofacetBeckmannRefl*)bsdf)->dist; + if (dist == uh_default || dist == uh_beckmann) { + switch (refract) { + case 0: + albedo = BSDF_CAST(MicrofacetBeckmannRefl, bsdf)->get_albedo(wo); break; - } - case MX_DIELECTRIC_ID: { - const MxDielectricParams& params - = *comp->as(); - if (is_black(params.transmission_tint)) - ok = result.bsdf.add_bsdf< - MxMicrofacet>( - cw, params, 1.0f); - else - ok = result.bsdf.add_bsdf< - MxMicrofacet>( - cw, params, result.refraction_ior); + case 1: + albedo = BSDF_CAST(MicrofacetBeckmannRefr, bsdf)->get_albedo(wo); + break; + case 2: + albedo = BSDF_CAST(MicrofacetBeckmannBoth, bsdf)->get_albedo(wo); break; } - case MX_CONDUCTOR_ID: { - const MxConductorParams& params = *comp->as(); - ok = result.bsdf.add_bsdf< - MxMicrofacet>(cw, params, - 1.0f); + } else if (dist == uh_ggx) { + switch (refract) { + case 0: + albedo = BSDF_CAST(MicrofacetGGXRefl, bsdf)->get_albedo(wo); break; - }; - case MX_GENERALIZED_SCHLICK_ID: { - const MxGeneralizedSchlickParams& params - = *comp->as(); - if (is_black(params.transmission_tint)) - ok = result.bsdf.add_bsdf>(cw, params, - 1.0f); - else - ok = result.bsdf.add_bsdf< - MxMicrofacet>( - cw, params, result.refraction_ior); + case 1: + albedo = BSDF_CAST(MicrofacetGGXRefr, bsdf)->get_albedo(wo); break; - }; - case MX_TRANSLUCENT_ID: { - const MxTranslucentParams* srcparams - = comp->as(); - DiffuseParams params = {}; - params.N = srcparams->N; - ok = result.bsdf.add_bsdf>(cw * srcparams->albedo, - params); + case 2: + albedo = BSDF_CAST(MicrofacetGGXBoth, bsdf)->get_albedo(wo); break; } - case MX_TRANSPARENT_ID: { - ok = result.bsdf.add_bsdf(cw); + } + break; + } + case MX_CONDUCTOR_ID: + albedo = BSDF_CAST(MxConductor, bsdf)->get_albedo(wo); + break; + case MX_DIELECTRIC_ID: + if (is_black(((MxDielectricOpaque*)bsdf)->transmission_tint)) + albedo = BSDF_CAST(MxDielectricOpaque, bsdf)->get_albedo(wo); + else + albedo = BSDF_CAST(MxDielectric, bsdf)->get_albedo(wo); + break; + case MX_OREN_NAYAR_DIFFUSE_ID: + albedo = BSDF_CAST(EnergyCompensatedOrenNayar, bsdf)->get_albedo(wo); + break; + case MX_BURLEY_DIFFUSE_ID: + albedo = BSDF_CAST(MxBurleyDiffuse, bsdf)->get_albedo(wo); + break; + case MX_SHEEN_ID: + if (BSDF_CAST(CharlieSheen, bsdf)->mode == 1) + albedo = BSDF_CAST(ZeltnerBurleySheen, bsdf)->get_albedo(wo); + else + albedo = BSDF_CAST(CharlieSheen, bsdf)->get_albedo(wo); + break; + case MX_GENERALIZED_SCHLICK_ID: { + const Color3& tint = ((MxGeneralizedSchlick*)bsdf)->transmission_tint; + if (is_black(tint)) + albedo = BSDF_CAST(MxGeneralizedSchlickOpaque, bsdf)->get_albedo(wo); + else + albedo = BSDF_CAST(MxGeneralizedSchlick, bsdf)->get_albedo(wo); + break; + } + default: break; + } + return albedo; +} + + +OSL_HOSTDEVICE BSDF::Sample +CompositeBSDF::sample(const BSDF* bsdf, const Vec3& wo, float rx, float ry, + float rz) const +{ + static const ustringhash uh_ggx("ggx"); + static const ustringhash uh_beckmann("beckmann"); + static const ustringhash uh_default("default"); + + BSDF::Sample sample = {}; + switch (bsdf->id) { + case DIFFUSE_ID: + sample = BSDF_CAST(Diffuse<0>, bsdf)->sample(wo, rx, ry, rz); + break; + case TRANSPARENT_ID: + case MX_TRANSPARENT_ID: + sample = BSDF_CAST(Transparent, bsdf)->sample(wo, rx, ry, rz); + break; + case OREN_NAYAR_ID: + sample = BSDF_CAST(OrenNayar, bsdf)->sample(wo, rx, ry, rz); + break; + case TRANSLUCENT_ID: + sample = BSDF_CAST(Diffuse<1>, bsdf)->sample(wo, rx, ry, rz); + break; + case PHONG_ID: + sample = BSDF_CAST(Phong, bsdf)->sample(wo, rx, ry, rz); + break; + case WARD_ID: sample = BSDF_CAST(Ward, bsdf)->sample(wo, rx, ry, rz); break; + case REFLECTION_ID: + case FRESNEL_REFLECTION_ID: + sample = BSDF_CAST(Reflection, bsdf)->sample(wo, rx, ry, rz); + break; + case REFRACTION_ID: + sample = BSDF_CAST(Refraction, bsdf)->sample(wo, rx, ry, rz); + break; + case MICROFACET_ID: { + const int refract = ((MicrofacetBeckmannRefl*)bsdf)->refract; + const ustringhash dist = ((MicrofacetBeckmannRefl*)bsdf)->dist; + if (dist == uh_default || dist == uh_beckmann) { + switch (refract) { + case 0: + sample = BSDF_CAST(MicrofacetBeckmannRefl, bsdf) + ->sample(wo, rx, ry, rz); break; - } - case MX_SUBSURFACE_ID: { - // TODO: implement BSSRDF support? - const MxSubsurfaceParams* srcparams - = comp->as(); - DiffuseParams params = {}; - params.N = srcparams->N; - ok = result.bsdf.add_bsdf>(cw * srcparams->albedo, - params); + case 1: + sample = BSDF_CAST(MicrofacetBeckmannRefr, bsdf) + ->sample(wo, rx, ry, rz); break; - } - case MX_SHEEN_ID: { - const MxSheenParams& params = *comp->as(); - if (params.mode == 1) - ok = result.bsdf.add_bsdf(cw, params); - else - ok = result.bsdf.add_bsdf( - cw, params); // default to legacy closure + case 2: + sample = BSDF_CAST(MicrofacetBeckmannBoth, bsdf) + ->sample(wo, rx, ry, rz); break; } - case MX_LAYER_ID: { - const MxLayerParams* srcparams = comp->as(); - Color3 base_w - = w - * (Color3(1, 1, 1) - - clamp(evaluate_layer_opacity(sg, srcparams->top), - 0.f, 1.f)); - process_bsdf_closure(sg, result, srcparams->top, w, light_only); - if (!is_black(base_w)) - process_bsdf_closure(sg, result, srcparams->base, base_w, - light_only); - ok = true; + } else if (dist == uh_ggx) { + switch (refract) { + case 0: + sample + = BSDF_CAST(MicrofacetGGXRefl, bsdf)->sample(wo, rx, ry, rz); break; - } - case MX_ANISOTROPIC_VDF_ID: - case MX_MEDIUM_VDF_ID: { - // already processed by process_medium_closure - ok = true; + case 1: + sample + = BSDF_CAST(MicrofacetGGXRefr, bsdf)->sample(wo, rx, ry, rz); + break; + case 2: + sample + = BSDF_CAST(MicrofacetGGXBoth, bsdf)->sample(wo, rx, ry, rz); break; } - } - OSL_ASSERT(ok && "Invalid closure invoked in surface shader"); } break; } + case MX_CONDUCTOR_ID: + sample = BSDF_CAST(MxConductor, bsdf)->sample(wo, rx, ry, rz); + break; + case MX_DIELECTRIC_ID: + if (is_black(((MxDielectricOpaque*)bsdf)->transmission_tint)) + sample = BSDF_CAST(MxDielectricOpaque, bsdf)->sample(wo, rx, ry, rz); + else + sample = BSDF_CAST(MxDielectric, bsdf)->sample(wo, rx, ry, rz); + break; + case MX_BURLEY_DIFFUSE_ID: + sample = BSDF_CAST(MxBurleyDiffuse, bsdf)->sample(wo, rx, ry, rz); + break; + case MX_OREN_NAYAR_DIFFUSE_ID: + sample = BSDF_CAST(EnergyCompensatedOrenNayar, bsdf) + ->sample(wo, rx, ry, rz); + break; + case MX_SHEEN_ID: + if (BSDF_CAST(CharlieSheen, bsdf)->mode == 1) + sample = BSDF_CAST(ZeltnerBurleySheen, bsdf)->sample(wo, rx, ry, rz); + else + sample = BSDF_CAST(CharlieSheen, bsdf)->sample(wo, rx, ry, rz); + break; + case MX_GENERALIZED_SCHLICK_ID: { + const Color3& tint = ((MxGeneralizedSchlick*)bsdf)->transmission_tint; + if (is_black(tint)) { + sample = BSDF_CAST(MxGeneralizedSchlickOpaque, bsdf) + ->sample(wo, rx, ry, rz); + } else { + sample + = BSDF_CAST(MxGeneralizedSchlick, bsdf)->sample(wo, rx, ry, rz); + } + break; + } + default: break; } + return sample; } -} // anonymous namespace -OSL_NAMESPACE_ENTER - -void -process_closure(const OSL::ShaderGlobals& sg, ShadingResult& result, - const ClosureColor* Ci, bool light_only) +OSL_HOSTDEVICE BSDF::Sample +CompositeBSDF::eval(const BSDF* bsdf, const Vec3& wo, const Vec3& wi) const { - if (!light_only) - process_medium_closure(sg, result, Ci, Color3(1)); - process_bsdf_closure(sg, result, Ci, Color3(1), light_only); -} + static const ustringhash uh_ggx("ggx"); + static const ustringhash uh_beckmann("beckmann"); + static const ustringhash uh_default("default"); -Vec3 -process_background_closure(const ClosureColor* closure) -{ - if (!closure) - return Vec3(0, 0, 0); - switch (closure->id) { - case ClosureColor::MUL: { - return closure->as_mul()->weight - * process_background_closure(closure->as_mul()->closure); - } - case ClosureColor::ADD: { - return process_background_closure(closure->as_add()->closureA) - + process_background_closure(closure->as_add()->closureB); + BSDF::Sample sample = {}; + switch (bsdf->id) { + case DIFFUSE_ID: sample = BSDF_CAST(Diffuse<0>, bsdf)->eval(wo, wi); break; + case TRANSPARENT_ID: + case MX_TRANSPARENT_ID: + sample = BSDF_CAST(Transparent, bsdf)->eval(wo, wi); + break; + case OREN_NAYAR_ID: + sample = BSDF_CAST(OrenNayar, bsdf)->eval(wo, wi); + break; + case TRANSLUCENT_ID: + sample = BSDF_CAST(Diffuse<1>, bsdf)->eval(wo, wi); + break; + case PHONG_ID: sample = BSDF_CAST(Phong, bsdf)->eval(wo, wi); break; + case WARD_ID: sample = BSDF_CAST(Ward, bsdf)->eval(wo, wi); break; + case REFLECTION_ID: + case FRESNEL_REFLECTION_ID: + sample = BSDF_CAST(Reflection, bsdf)->eval(wo, wi); + break; + case REFRACTION_ID: + sample = BSDF_CAST(Refraction, bsdf)->eval(wo, wi); + break; + case MICROFACET_ID: { + const int refract = ((MicrofacetBeckmannRefl*)bsdf)->refract; + const ustringhash dist = ((MicrofacetBeckmannRefl*)bsdf)->dist; + if (dist == uh_default || dist == uh_beckmann) { + switch (refract) { + case 0: + sample = BSDF_CAST(MicrofacetBeckmannRefl, bsdf)->eval(wo, wi); + break; + case 1: + sample = BSDF_CAST(MicrofacetBeckmannRefr, bsdf)->eval(wo, wi); + break; + case 2: + sample = BSDF_CAST(MicrofacetBeckmannBoth, bsdf)->eval(wo, wi); + break; + } + } else if (dist == uh_ggx) { + switch (refract) { + case 0: + sample = BSDF_CAST(MicrofacetGGXRefl, bsdf)->eval(wo, wi); + break; + case 1: + sample = BSDF_CAST(MicrofacetGGXRefr, bsdf)->eval(wo, wi); + break; + case 2: + sample = BSDF_CAST(MicrofacetGGXBoth, bsdf)->eval(wo, wi); + break; + } + } + break; } - case BACKGROUND_ID: { - return closure->as_comp()->w; + case MX_CONDUCTOR_ID: + sample = BSDF_CAST(MxConductor, bsdf)->eval(wo, wi); + break; + case MX_DIELECTRIC_ID: + if (is_black(((MxDielectricOpaque*)bsdf)->transmission_tint)) + sample = BSDF_CAST(MxDielectricOpaque, bsdf)->eval(wo, wi); + else + sample = BSDF_CAST(MxDielectric, bsdf)->eval(wo, wi); + break; + case MX_BURLEY_DIFFUSE_ID: + sample = BSDF_CAST(MxBurleyDiffuse, bsdf)->eval(wo, wi); + break; + case MX_OREN_NAYAR_DIFFUSE_ID: + sample = BSDF_CAST(EnergyCompensatedOrenNayar, bsdf)->eval(wo, wi); + break; + case MX_SHEEN_ID: + if (BSDF_CAST(CharlieSheen, bsdf)->mode == 1) + sample = BSDF_CAST(ZeltnerBurleySheen, bsdf)->eval(wo, wi); + else + sample = BSDF_CAST(CharlieSheen, bsdf)->eval(wo, wi); + break; + case MX_GENERALIZED_SCHLICK_ID: { + const Color3& tint = ((MxGeneralizedSchlick*)bsdf)->transmission_tint; + if (is_black(tint)) { + sample = BSDF_CAST(MxGeneralizedSchlickOpaque, bsdf)->eval(wo, wi); + } else { + sample = BSDF_CAST(MxGeneralizedSchlick, bsdf)->eval(wo, wi); + } + break; } + default: break; } - // should never happen - OSL_ASSERT(false && "Invalid closure invoked in background shader"); - return Vec3(0, 0, 0); + return sample; } diff --git a/src/testrender/shading.h b/src/testrender/shading.h index d7fdb190d..729cd47eb 100644 --- a/src/testrender/shading.h +++ b/src/testrender/shading.h @@ -6,24 +6,256 @@ #pragma once #include +#include #include #include #include +#include "optics.h" #include "sampling.h" OSL_NAMESPACE_ENTER + +enum ClosureIDs { + ADD = -2, + MUL = -1, + COMPONENT_BASE_ID = 0, + EMISSION_ID = 1, + BACKGROUND_ID, + DIFFUSE_ID, + OREN_NAYAR_ID, + TRANSLUCENT_ID, + PHONG_ID, + WARD_ID, + MICROFACET_ID, + REFLECTION_ID, + FRESNEL_REFLECTION_ID, + REFRACTION_ID, + TRANSPARENT_ID, + DEBUG_ID, + HOLDOUT_ID, + // See MATERIALX_CLOSURES in stdosl.h + MX_OREN_NAYAR_DIFFUSE_ID, + MX_BURLEY_DIFFUSE_ID, + MX_DIELECTRIC_ID, + MX_CONDUCTOR_ID, + MX_GENERALIZED_SCHLICK_ID, + MX_TRANSLUCENT_ID, + MX_TRANSPARENT_ID, + MX_SUBSURFACE_ID, + MX_SHEEN_ID, + MX_UNIFORM_EDF_ID, + MX_ANISOTROPIC_VDF_ID, + MX_MEDIUM_VDF_ID, + MX_LAYER_ID, + // TODO: adding vdfs would require extending testrender with volume support ... + EMPTY_ID +}; + + +namespace { // anonymous namespace + +// these structures hold the parameters of each closure type +// they will be contained inside ClosureComponent +struct EmptyParams {}; +struct DiffuseParams { + Vec3 N; +}; +struct OrenNayarParams { + Vec3 N; + float sigma; +}; +struct PhongParams { + Vec3 N; + float exponent; +}; +struct WardParams { + Vec3 N, T; + float ax, ay; +}; +struct ReflectionParams { + Vec3 N; + float eta; +}; +struct RefractionParams { + Vec3 N; + float eta; +}; +struct MicrofacetParams { + ustringhash dist; + Vec3 N, U; + float xalpha, yalpha, eta; + int refract; +}; + +// MATERIALX_CLOSURES + +struct MxOrenNayarDiffuseParams { + Vec3 N; + Color3 albedo; + float roughness; + // optional + ustringhash label; + int energy_compensation; +}; + +struct MxBurleyDiffuseParams { + Vec3 N; + Color3 albedo; + float roughness; + // optional + ustringhash label; +}; + +// common to all MaterialX microfacet closures +struct MxMicrofacetBaseParams { + Vec3 N, U; + float roughness_x; + float roughness_y; + ustringhash distribution; + // optional + ustringhash label; +}; + +struct MxDielectricParams : public MxMicrofacetBaseParams { + Color3 reflection_tint; + Color3 transmission_tint; + float ior; + // optional + float thinfilm_thickness; + float thinfilm_ior; + + OSL_HOSTDEVICE Color3 evalR(float cos_theta) const + { + return reflection_tint * fresnel_dielectric(cos_theta, ior); + } + + OSL_HOSTDEVICE Color3 evalT(float cos_theta) const + { + return transmission_tint * (1.0f - fresnel_dielectric(cos_theta, ior)); + } +}; + +struct MxConductorParams : public MxMicrofacetBaseParams { + Color3 ior; + Color3 extinction; + // optional + float thinfilm_thickness; + float thinfilm_ior; + + OSL_HOSTDEVICE Color3 evalR(float cos_theta) const + { + return fresnel_conductor(cos_theta, ior, extinction); + } + + OSL_HOSTDEVICE Color3 evalT(float cos_theta) const { return Color3(0.0f); } + + // Avoid function was declared but never referenced + // float get_ior() const + // { + // return 0; // no transmission possible + // } +}; + +struct MxGeneralizedSchlickParams : public MxMicrofacetBaseParams { + Color3 reflection_tint; + Color3 transmission_tint; + Color3 f0; + Color3 f90; + float exponent; + // optional + float thinfilm_thickness; + float thinfilm_ior; + + OSL_HOSTDEVICE Color3 evalR(float cos_theta) const + { + return reflection_tint + * fresnel_generalized_schlick(cos_theta, f0, f90, exponent); + } + + OSL_HOSTDEVICE Color3 evalT(float cos_theta) const + { + return transmission_tint + * (Color3(1.0f) + - fresnel_generalized_schlick(cos_theta, f0, f90, exponent)); + } +}; + +struct MxTranslucentParams { + Vec3 N; + Color3 albedo; + // optional + ustringhash label; +}; + +struct MxSubsurfaceParams { + Vec3 N; + Color3 albedo; + Color3 radius; + float anisotropy; + // optional + ustringhash label; +}; + +struct MxSheenParams { + Vec3 N; + Color3 albedo; + float roughness; + // optional + int mode; + ustringhash label; +}; + +struct MxUniformEdfParams { + Color3 emittance; + // optional + ustringhash label; +}; + +struct MxLayerParams { + OSL::ClosureColor* top; + OSL::ClosureColor* base; +}; + +struct MxAnisotropicVdfParams { + Color3 albedo; + Color3 extinction; + float anisotropy; + // optional + ustringhash label; +}; + +struct MxMediumVdfParams { + Color3 albedo; + float transmission_depth; + Color3 transmission_color; + float anisotropy; + float ior; + int priority; + // optional + ustringhash label; +}; + +} // anonymous namespace + + +// Cast a BSDF* to the specified sub-type +#define BSDF_CAST(BSDF_TYPE, bsdf) reinterpret_cast(bsdf) + /// Individual BSDF (diffuse, phong, refraction, etc ...) /// Actual implementations of this class are private struct BSDF { struct Sample { - Sample() : wi(0.0f), weight(0.0f), pdf(0.0f), roughness(0.0f) {} - Sample(const Sample& o) + OSL_HOSTDEVICE Sample() + : wi(0.0f), weight(0.0f), pdf(0.0f), roughness(0.0f) + { + } + OSL_HOSTDEVICE Sample(const Sample& o) : wi(o.wi), weight(o.weight), pdf(o.pdf), roughness(o.roughness) { } - Sample(Vec3 wi, Color3 w, float pdf, float r) + OSL_HOSTDEVICE Sample(Vec3 wi, Color3 w, float pdf, float r) : wi(wi), weight(w), pdf(pdf), roughness(r) { } @@ -32,48 +264,82 @@ struct BSDF { float pdf; float roughness; }; - BSDF() {} - virtual Color3 get_albedo(const Vec3& /*wo*/) const { return Color3(1); } - virtual Sample eval(const Vec3& wo, const Vec3& wi) const = 0; - virtual Sample sample(const Vec3& wo, float rx, float ry, float rz) const - = 0; + OSL_HOSTDEVICE BSDF(ClosureIDs id = EMPTY_ID) : id(id) {} + OSL_HOSTDEVICE Color3 get_albedo(const Vec3& /*wo*/) const + { + return Color3(1); + } + OSL_HOSTDEVICE Sample eval(const Vec3& wo, const Vec3& wi) const + { + return {}; + } + OSL_HOSTDEVICE Sample sample(const Vec3& wo, float rx, float ry, + float rz) const + { + return {}; + } + ClosureIDs id; +#ifdef __CUDACC__ + // TODO: This is a total hack to avoid a misaligned address error + // that sometimes occurs with the EnergyCompensatedOrenNayar BSDF. + // It's not clear what the issue is or why this fixes it, but that + // will take a bit of digging. + int pad; +#endif }; /// Represents a weighted sum of BSDFS /// NOTE: no need to inherit from BSDF here because we use a "flattened" representation and therefore never nest these /// struct CompositeBSDF { - CompositeBSDF() : num_bsdfs(0), num_bytes(0) {} + OSL_HOSTDEVICE CompositeBSDF() : num_bsdfs(0), num_bytes(0) {} - void prepare(const Vec3& wo, const Color3& path_weight, bool absorb) + OSL_HOSTDEVICE void prepare(const Vec3& wo, const Color3& path_weight, + bool absorb) { float total = 0; for (int i = 0; i < num_bsdfs; i++) { - pdfs[i] = weights[i].dot(path_weight * bsdfs[i]->get_albedo(wo)) + pdfs[i] = weights[i].dot(path_weight * get_albedo(bsdfs[i], wo)) / (path_weight.x + path_weight.y + path_weight.z); +#ifndef __CUDACC__ + // TODO: Figure out what to do with weights/albedos with negative + // components (e.g., as might happen when bipolar noise is + // used as a color). + + // The PDF is out-of-range in some test scenes on the CPU path, but + // these asserts are no-ops in release builds. The asserts are active + // on the CUDA path, so we need to skip them. assert(pdfs[i] >= 0); assert(pdfs[i] <= 1); +#endif total += pdfs[i]; } if ((!absorb && total > 0) || total > 1) { - for (int i = 0; i < num_bsdfs; i++) + for (int i = 0; i < num_bsdfs; i++) { +#ifndef __CUDACC__ pdfs[i] /= total; +#else + // TODO: This helps avoid NaNs, but it's not clear where the + // NaNs are coming from. + pdfs[i] = __fdiv_rz(pdfs[i], total); +#endif + } } } - Color3 get_albedo(const Vec3& wo) const + OSL_HOSTDEVICE Color3 get_albedo(const Vec3& wo) const { Color3 result(0, 0, 0); for (int i = 0; i < num_bsdfs; i++) - result += weights[i] * bsdfs[i]->get_albedo(wo); + result += weights[i] * get_albedo(bsdfs[i], wo); return result; } - BSDF::Sample eval(const Vec3& wo, const Vec3& wi) const + OSL_HOSTDEVICE BSDF::Sample eval(const Vec3& wo, const Vec3& wi) const { BSDF::Sample s = {}; for (int i = 0; i < num_bsdfs; i++) { - BSDF::Sample b = bsdfs[i]->eval(wo, wi); + BSDF::Sample b = eval(bsdfs[i], wo, wi); b.weight *= weights[i]; MIS::update_eval(&s.weight, &s.pdf, b.weight, b.pdf, pdfs[i]); s.roughness += b.roughness * pdfs[i]; @@ -81,14 +347,15 @@ struct CompositeBSDF { return s; } - BSDF::Sample sample(const Vec3& wo, float rx, float ry, float rz) const + OSL_HOSTDEVICE BSDF::Sample sample(const Vec3& wo, float rx, float ry, + float rz) const { float accum = 0; for (int i = 0; i < num_bsdfs; i++) { if (rx < (pdfs[i] + accum)) { rx = (rx - accum) / pdfs[i]; rx = std::min(rx, 0.99999994f); // keep result in [0,1) - BSDF::Sample s = bsdfs[i]->sample(wo, rx, ry, rz); + BSDF::Sample s = sample(bsdfs[i], wo, rx, ry, rz); s.weight *= weights[i] * (1 / pdfs[i]); s.pdf *= pdfs[i]; if (s.pdf == 0.0f) @@ -96,7 +363,7 @@ struct CompositeBSDF { // we sampled PDF i, now figure out how much the other bsdfs contribute to the chosen direction for (int j = 0; j < num_bsdfs; j++) { if (i != j) { - BSDF::Sample b = bsdfs[j]->eval(wo, s.wi); + BSDF::Sample b = eval(bsdfs[j], wo, s.wi); b.weight *= weights[j]; MIS::update_eval(&s.weight, &s.pdf, b.weight, b.pdf, pdfs[j]); @@ -110,7 +377,7 @@ struct CompositeBSDF { } template - bool add_bsdf(const Color3& w, BSDF_Args&&... args) + OSL_HOSTDEVICE bool add_bsdf(const Color3& w, BSDF_Args&&... args) { // make sure we have enough space if (num_bsdfs >= MaxEntries) @@ -127,8 +394,14 @@ struct CompositeBSDF { private: /// Never try to copy this struct because it would invalidate the bsdf pointers - CompositeBSDF(const CompositeBSDF& c); - CompositeBSDF& operator=(const CompositeBSDF& c); + OSL_HOSTDEVICE CompositeBSDF(const CompositeBSDF& c); + OSL_HOSTDEVICE CompositeBSDF& operator=(const CompositeBSDF& c); + + OSL_HOSTDEVICE Color3 get_albedo(const BSDF* bsdf, const Vec3& wo) const; + OSL_HOSTDEVICE BSDF::Sample eval(const BSDF* bsdf, const Vec3& wo, + const Vec3& wi) const; + OSL_HOSTDEVICE BSDF::Sample sample(const BSDF* bsdf, const Vec3& wo, + float rx, float ry, float rz) const; enum { MaxEntries = 8 }; enum { MaxSize = 256 * sizeof(float) }; @@ -153,10 +426,10 @@ struct ShadingResult { void register_closures(ShadingSystem* shadingsys); -void +OSL_HOSTDEVICE void process_closure(const OSL::ShaderGlobals& sg, ShadingResult& result, const ClosureColor* Ci, bool light_only); -Vec3 +OSL_HOSTDEVICE Vec3 process_background_closure(const ClosureColor* Ci); OSL_NAMESPACE_EXIT diff --git a/src/testrender/simpleraytracer.cpp b/src/testrender/simpleraytracer.cpp index f62196f93..7ad197547 100644 --- a/src/testrender/simpleraytracer.cpp +++ b/src/testrender/simpleraytracer.cpp @@ -2,20 +2,24 @@ // SPDX-License-Identifier: BSD-3-Clause // https://github.com/AcademySoftwareFoundation/OpenShadingLanguage -#include -#include -#include -#include +#ifndef __CUDACC__ +# include +# include +# include +# include -#ifdef USING_OIIO_PUGI +# ifdef USING_OIIO_PUGI namespace pugi = OIIO::pugi; +# endif + +# include +# include "raytracer.h" +# include "shading.h" +# include "simpleraytracer.h" #endif -#include -#include "raytracer.h" -#include "shading.h" -#include "simpleraytracer.h" +#include // Create ustrings for all strings used by the free function renderer services. // Required to allow the reverse mapping of hash->string to work when processing messages @@ -42,6 +46,15 @@ using namespace OSL; OSL_NAMESPACE_ENTER + +#ifndef __CUDACC__ +using ShaderGlobalsType = OSL::ShaderGlobals; +#else +using ShaderGlobalsType = OSL_CUDA::ShaderGlobals; +#endif + + +#ifndef __CUDACC__ static TypeDesc TypeFloatArray2(TypeDesc::FLOAT, 2); static TypeDesc TypeFloatArray4(TypeDesc::FLOAT, 4); static TypeDesc TypeIntArray2(TypeDesc::INT, 2); @@ -844,31 +857,40 @@ SimpleRaytracer::get_camera_screen_window(ShaderGlobals* /*sg*/, bool derivs, } return false; } +#endif // #ifndef __CUDACC__ -void -SimpleRaytracer::globals_from_hit(ShaderGlobals& sg, const Ray& r, +void OSL_HOSTDEVICE +SimpleRaytracer::globals_from_hit(ShaderGlobalsType& sg, const Ray& r, const Dual2& t, int id, float u, float v) { +#ifndef __CUDACC__ memset((char*)&sg, 0, sizeof(ShaderGlobals)); - Dual2 P = r.point(t); - // We are missing the projection onto the surface here - sg.P = P.val(); - sg.dPdx = P.dx(); - sg.dPdy = P.dy(); - sg.N = scene.normal(P, sg.Ng, id, u, v); - Dual2 uv = scene.uv(P, sg.N, sg.dPdu, sg.dPdv, id, u, v); - sg.u = uv.val().x; - sg.dudx = uv.dx().x; - sg.dudy = uv.dy().x; - sg.v = uv.val().y; - sg.dvdx = uv.dx().y; - sg.dvdy = uv.dy().y; +#endif + +#ifndef __CUDACC__ const int meshid = std::upper_bound(scene.last_index.begin(), scene.last_index.end(), id) - scene.last_index.begin(); +#else + const int meshid = m_meshids[id]; +#endif + + Dual2 P = r.point(t); + // We are missing the projection onto the surface here + sg.P = P.val(); + sg.dPdx = P.dx(); + sg.dPdy = P.dy(); + sg.N = scene.normal(P, sg.Ng, id, u, v); + Dual2 uv = scene.uv(P, sg.N, sg.dPdu, sg.dPdv, id, u, v); + sg.u = uv.val().x; + sg.dudx = uv.dx().x; + sg.dudy = uv.dy().x; + sg.v = uv.val().y; + sg.dvdx = uv.dx().y; + sg.dvdy = uv.dy().y; sg.surfacearea = m_mesh_surfacearea[meshid]; Dual2 direction = r.dual_direction(); sg.I = direction.val(); @@ -882,37 +904,55 @@ SimpleRaytracer::globals_from_hit(ShaderGlobals& sg, const Ray& r, sg.raytype = r.raytype; sg.flipHandedness = sg.dPdx.cross(sg.dPdy).dot(sg.N) < 0; +#ifndef __CUDACC__ // In our SimpleRaytracer, the "renderstate" itself just a pointer to // the ShaderGlobals. sg.renderstate = &sg; +#endif } -Vec3 + + +OSL_HOSTDEVICE Vec3 SimpleRaytracer::eval_background(const Dual2& dir, ShadingContext* ctx, int bounce) { - ShaderGlobals sg; + ShaderGlobalsType sg; memset((char*)&sg, 0, sizeof(ShaderGlobals)); sg.I = dir.val(); sg.dIdx = dir.dx(); sg.dIdy = dir.dy(); if (bounce >= 0) sg.raytype = bounce > 0 ? Ray::DIFFUSE : Ray::CAMERA; +#ifndef __CUDACC__ shadingsys->execute(*ctx, *m_shaders[backgroundShaderID], sg); - return process_background_closure(sg.Ci); +#else + alignas(8) char closure_pool[256]; + execute_shader(sg, render_params.bg_id, closure_pool); +#endif + return process_background_closure((const ClosureColor*)sg.Ci); } Color3 SimpleRaytracer::subpixel_radiance(float x, float y, Sampler& sampler, ShadingContext* ctx) { +#ifdef __CUDACC__ + // Scratch space for the output closures + alignas(8) char closure_pool[256]; + alignas(8) char light_closure_pool[256]; +#endif + constexpr float inf = std::numeric_limits::infinity(); Ray r = camera.get(x, y); Color3 path_weight(1, 1, 1); Color3 path_radiance(0, 0, 0); int prev_id = -1; float bsdf_pdf = inf; // camera ray has only one possible direction + for (int b = 0; b <= max_bounces; b++) { + ShaderGlobalsType sg; + // trace the ray against the scene Intersection hit = scene.intersect(r, inf, prev_id); if (hit.t == inf) { @@ -935,8 +975,8 @@ SimpleRaytracer::subpixel_radiance(float x, float y, Sampler& sampler, } // construct a shader globals for the hit point - ShaderGlobals sg; globals_from_hit(sg, r, hit.t, hit.id, hit.u, hit.v); + if (show_globals) { // visualize the main fields of the shader globals Vec3 v = sg.Ng; @@ -954,21 +994,34 @@ SimpleRaytracer::subpixel_radiance(float x, float y, Sampler& sampler, path_radiance += path_weight * c; break; } + const float radius = r.radius + r.spread * hit.t; - int shaderID = scene.shaderid(hit.id); + + int shaderID = scene.shaderid(hit.id); + +#ifndef __CUDACC__ if (shaderID < 0 || !m_shaders[shaderID]) break; // no shader attached? done // execute shader and process the resulting list of closures shadingsys->execute(*ctx, *m_shaders[shaderID], sg); +#else + if (shaderID < 0) + break; // no shader attached? done + execute_shader(sg, shaderID, closure_pool); +#endif ShadingResult result; bool last_bounce = b == max_bounces; - process_closure(sg, result, sg.Ci, last_bounce); + process_closure(sg, result, (const ClosureColor*)sg.Ci, last_bounce); + +#ifndef __CUDACC__ + const size_t lightprims_size = m_lightprims.size(); +#endif // add self-emission float k = 1; - if (m_shader_is_light[shaderID]) { - const float light_pick_pdf = 1.0f / m_lightprims.size(); + if (m_shader_is_light[shaderID] && lightprims_size > 0) { + const float light_pick_pdf = 1.0f / lightprims_size; // figure out the probability of reaching this point float light_pdf = light_pick_pdf * scene.shapepdf(hit.id, r.origin, sg.P); @@ -1008,6 +1061,7 @@ SimpleRaytracer::subpixel_radiance(float x, float y, Sampler& sampler, * MIS::power_heuristic(bg_pdf, b.pdf); if ((contrib.x + contrib.y + contrib.z) > 0) { + ShaderGlobalsType shadow_sg; Ray shadow_ray = Ray(sg.P, bg_dir.val(), radius, 0, Ray::SHADOW); Intersection shadow_hit = scene.intersect(shadow_ray, inf, @@ -1018,17 +1072,18 @@ SimpleRaytracer::subpixel_radiance(float x, float y, Sampler& sampler, } // trace a shadow ray to one of the light emitting primitives - if (!m_lightprims.empty()) { - const float light_pick_pdf = 1.0f / m_lightprims.size(); + if (lightprims_size > 0) { + const float light_pick_pdf = 1.0f / lightprims_size; // uniform probability for each light - float xl = xi * m_lightprims.size(); + float xl = xi * lightprims_size; int ls = floorf(xl); xl -= ls; uint32_t lid = m_lightprims[ls]; if (lid != hit.id) { int shaderID = scene.shaderid(lid); + // sample a random direction towards the object LightSample sample = scene.sample(lid, sg.P, xl, yi); BSDF::Sample b = result.bsdf.eval(-sg.I, sample.dir); @@ -1036,29 +1091,46 @@ SimpleRaytracer::subpixel_radiance(float x, float y, Sampler& sampler, * MIS::power_heuristic( light_pick_pdf * sample.pdf, b.pdf); if ((contrib.x + contrib.y + contrib.z) > 0) { + ShaderGlobalsType light_sg; Ray shadow_ray = Ray(sg.P, sample.dir, radius, 0, Ray::SHADOW); // trace a shadow ray and see if we actually hit the target // in this tiny renderer, tracing a ray is probably cheaper than evaluating the light shader Intersection shadow_hit = scene.intersect(shadow_ray, sample.dist, hit.id, lid); - if (shadow_hit.t == sample.dist) { + +#ifndef __CUDACC__ + const bool did_hit = shadow_hit.t == sample.dist; +#else + // The hit distance on the device is not as precise as on + // the CPU, so we need to allow a little wiggle room. An + // epsilon of 1e-3f empirically gives results that closely + // match the CPU for the test scenes, so that's what we're + // using. + const bool did_hit = fabsf(shadow_hit.t - sample.dist) + < 1e-3f; +#endif + if (did_hit) { // setup a shader global for the point on the light - ShaderGlobals light_sg; globals_from_hit(light_sg, shadow_ray, sample.dist, lid, sample.u, sample.v); +#ifndef __CUDACC__ // execute the light shader (for emissive closures only) shadingsys->execute(*ctx, *m_shaders[shaderID], light_sg); +#else + execute_shader(light_sg, shaderID, light_closure_pool); +#endif ShadingResult light_result; - process_closure(light_sg, light_result, light_sg.Ci, - true); + process_closure(light_sg, light_result, + (const ClosureColor*)light_sg.Ci, true); // accumulate contribution path_radiance += contrib * light_result.Le; } } } } + // trace indirect ray and continue BSDF::Sample p = result.bsdf.sample(-sg.I, xi, yi, zi); path_weight *= p.weight; @@ -1077,14 +1149,15 @@ SimpleRaytracer::subpixel_radiance(float x, float y, Sampler& sampler, return path_radiance; } -Color3 + +OSL_HOSTDEVICE Color3 SimpleRaytracer::antialias_pixel(int x, int y, ShadingContext* ctx) { Color3 result(0, 0, 0); for (int si = 0, n = aa * aa; si < n; si++) { Sampler sampler(x, y, si); // jitter pixel coordinate [0,1)^2 - Vec3 j = sampler.get(); + Vec3 j = no_jitter ? Vec3(0.5f, 0.5f, 0) : sampler.get(); // warp distribution to approximate a tent filter [-1,+1)^2 j.x *= 2; j.x = j.x < 1 ? sqrtf(j.x) - 1 : 1 - sqrtf(2 - j.x); @@ -1100,11 +1173,13 @@ SimpleRaytracer::antialias_pixel(int x, int y, ShadingContext* ctx) } +#ifndef __CUDACC__ void SimpleRaytracer::prepare_render() { // Retrieve and validate options aa = std::max(1, options.get_int("aa")); + no_jitter = options.get_int("no_jitter") != 0; max_bounces = options.get_int("max_bounces"); rr_depth = options.get_int("rr_depth"); show_albedo_scale = options.get_float("show_albedo_scale"); @@ -1132,7 +1207,25 @@ SimpleRaytracer::prepare_render() // build bvh and prepare triangles scene.prepare(errhandler()); + prepare_lights(); + +# if 0 + // dump scene to disk as obj for debugging purposes + // TODO: make this a feature? + FILE* fp = fopen("/tmp/test.obj", "w"); + for (Vec3 v : scene.verts) + fprintf(fp, "v %.9g %.9g %.9g\n", v.x, v.y, v.z); + for (TriangleIndices t : scene.triangles) + fprintf(fp, "f %d %d %d\n", 1 + t.a, 1 + t.b, 1 + t.c); + fclose(fp); +# endif +} + + +void +SimpleRaytracer::prepare_lights() +{ m_mesh_surfacearea.reserve(scene.last_index.size()); // measure the total surface area of each mesh @@ -1156,16 +1249,6 @@ SimpleRaytracer::prepare_render() if (!m_lightprims.empty()) errhandler().infofmt("Found {} triangles to be treated as lights", m_lightprims.size()); -#if 0 - // dump scene to disk as obj for debugging purposes - // TODO: make this a feature? - FILE* fp = fopen("/tmp/test.obj", "w"); - for (Vec3 v : scene.verts) - fprintf(fp, "v %.9g %.9g %.9g\n", v.x, v.y, v.z); - for (TriangleIndices t : scene.triangles) - fprintf(fp, "f %d %d %d\n", 1 + t.a, 1 + t.b, 1 + t.c); - fclose(fp); -#endif } @@ -1213,4 +1296,6 @@ SimpleRaytracer::clear() shaders().clear(); } +#endif // #ifndef __CUDACC__ + OSL_NAMESPACE_EXIT diff --git a/src/testrender/simpleraytracer.h b/src/testrender/simpleraytracer.h index 6dadf79cc..09b31380c 100644 --- a/src/testrender/simpleraytracer.h +++ b/src/testrender/simpleraytracer.h @@ -80,6 +80,7 @@ class SimpleRaytracer : public RendererServices { virtual void parse_scene_xml(const std::string& scenefile); virtual void prepare_render(); + virtual void prepare_lights(); virtual void warmup() {} virtual void render(int xres, int yres); virtual void clear(); @@ -93,6 +94,7 @@ class SimpleRaytracer : public RendererServices { OIIO::ErrorHandler& errhandler() const { return *m_errhandler; } const std::vector& shader_is_light() { return m_shader_is_light; } + const std::vector& lightprims() { return m_lightprims; } Camera camera; Scene scene; @@ -101,6 +103,9 @@ class SimpleRaytracer : public RendererServices { OIIO::ParamValueList options; OIIO::ImageBuf pixelbuf; + int getBackgroundShaderID() const { return backgroundShaderID; } + int getBackgroundResolution() const { return backgroundResolution; } + private: // Camera parameters Matrix44 m_world_to_camera; @@ -112,6 +117,7 @@ class SimpleRaytracer : public RendererServices { int backgroundShaderID = -1; int backgroundResolution = 1024; int aa = 1; + bool no_jitter = false; int max_bounces = 1000000; int rr_depth = 5; float show_albedo_scale = 0.0f; diff --git a/src/testrender/testrender.cpp b/src/testrender/testrender.cpp index 39f4c2e28..899e3fd2f 100644 --- a/src/testrender/testrender.cpp +++ b/src/testrender/testrender.cpp @@ -50,6 +50,7 @@ static std::string extraoptions; static std::string texoptions; static int xres = 640, yres = 480; static int aa = 1, max_bounces = 1000000, rr_depth = 5; +static bool no_jitter = false; static float show_albedo_scale = 0.0f; static int show_globals = 0; static int num_threads = 0; @@ -173,6 +174,8 @@ getargs(int argc, const char* argv[]) .hidden(); ap.arg("-aa %d:N", &aa) .help("Trace NxN rays per pixel"); + ap.arg("--no-jitter", &no_jitter) + .help("Disable AA pixel jitter"); ap.arg("-albedo %f:SCALE", &show_albedo_scale) .help("Visualize the albedo of each pixel instead of path tracing"); ap.arg("-normals") @@ -287,6 +290,7 @@ main(int argc, const char* argv[]) rend->attribute("max_bounces", max_bounces); rend->attribute("rr_depth", rr_depth); rend->attribute("aa", aa); + rend->attribute("no_jitter", (int)no_jitter); rend->attribute("show_albedo_scale", show_albedo_scale); rend->attribute("show_globals", show_globals); OIIO::attribute("threads", num_threads); diff --git a/src/testshade/CMakeLists.txt b/src/testshade/CMakeLists.txt index 17b3c3419..e8d72cfc0 100644 --- a/src/testshade/CMakeLists.txt +++ b/src/testshade/CMakeLists.txt @@ -16,7 +16,6 @@ if (OSL_USE_OPTIX) list (APPEND testshade_srcs optixgridrender.cpp) set ( testshade_cuda_srcs cuda/optix_grid_renderer.cu - ../testrender/cuda/wrapper.cu ) set (testshade_rend_lib_srcs @@ -25,7 +24,9 @@ if (OSL_USE_OPTIX) ) set ( testshade_cuda_headers - ../testrender/cuda/rend_lib.h ) + ../testrender/cuda/rend_lib.h + ../testrender/raytracer.h + ) # We need to make sure that the PTX files are regenerated whenever these # headers change. diff --git a/src/testshade/cuda/optix_grid_renderer.cu b/src/testshade/cuda/optix_grid_renderer.cu index 609e34057..8d569627a 100644 --- a/src/testshade/cuda/optix_grid_renderer.cu +++ b/src/testshade/cuda/optix_grid_renderer.cu @@ -29,7 +29,7 @@ OSL_NAMESPACE_EXIT extern "C" { -__device__ __constant__ RenderParams render_params; +__device__ __constant__ testshade::RenderParams render_params; } extern "C" __global__ void @@ -101,7 +101,7 @@ __raygen__() alignas(8) char closure_pool[256]; alignas(8) char params[256]; - ShaderGlobals sg; + OSL_CUDA::ShaderGlobals sg; // Setup the ShaderGlobals sg.I = make_float3(0, 0, 1); sg.N = make_float3(0, 0, 1); @@ -130,7 +130,7 @@ __raygen__() sg.backfacing = 0; // NB: These variables are not used in the current iteration of the sample - sg.raytype = CAMERA; + sg.raytype = OSL::Ray::CAMERA; sg.flipHandedness = 0; sg.shader2common = reinterpret_cast(render_params.shader2common); @@ -143,24 +143,30 @@ __raygen__() // Run the OSL group and init functions if (render_params.fused_callable) // call osl_init_func - optixDirectCall( - 0u, &sg /*shaderglobals_ptr*/, params /*groupdata_ptr*/, - nullptr /*userdata_base_ptr*/, nullptr /*output_base_ptr*/, - 0 /*shadeindex - unused*/, - sbtdata->data /*interactive_params_ptr*/); + optixDirectCall(0u, &sg /*shaderglobals_ptr*/, + params /*groupdata_ptr*/, + nullptr /*userdata_base_ptr*/, + nullptr /*output_base_ptr*/, + 0 /*shadeindex - unused*/, + sbtdata->data /*interactive_params_ptr*/); else { // call osl_init_func - optixDirectCall( - 0u, &sg /*shaderglobals_ptr*/, params /*groupdata_ptr*/, - nullptr /*userdata_base_ptr*/, nullptr /*output_base_ptr*/, - 0 /*shadeindex - unused*/, - sbtdata->data /*interactive_params_ptr*/); + optixDirectCall(0u, &sg /*shaderglobals_ptr*/, + params /*groupdata_ptr*/, + nullptr /*userdata_base_ptr*/, + nullptr /*output_base_ptr*/, + 0 /*shadeindex - unused*/, + sbtdata->data /*interactive_params_ptr*/); // call osl_group_func - optixDirectCall( - 1u, &sg /*shaderglobals_ptr*/, params /*groupdata_ptr*/, - nullptr /*userdata_base_ptr*/, nullptr /*output_base_ptr*/, - 0 /*shadeindex - unused*/, - sbtdata->data /*interactive_params_ptr*/); + optixDirectCall(1u, &sg /*shaderglobals_ptr*/, + params /*groupdata_ptr*/, + nullptr /*userdata_base_ptr*/, + nullptr /*output_base_ptr*/, + 0 /*shadeindex - unused*/, + sbtdata->data /*interactive_params_ptr*/); } float* f_output = (float*)params; @@ -171,8 +177,11 @@ __raygen__() // Because clang++ 9.0 seems to have trouble with some of the texturing "intrinsics" // let's do the texture look-ups in this file. extern "C" __device__ float4 -osl_tex2DLookup(void* handle, float s, float t) +osl_tex2DLookup(void* handle, float s, float t, float dsdx, float dtdx, + float dsdy, float dtdy) { + const float2 dx = { dsdx, dtdx }; + const float2 dy = { dsdy, dtdy }; cudaTextureObject_t texID = cudaTextureObject_t(handle); - return tex2D(texID, s, t); + return tex2DGrad(texID, s, t, dx, dy); } diff --git a/src/testshade/optixgridrender.cpp b/src/testshade/optixgridrender.cpp index 3adb1ff52..877a5691f 100644 --- a/src/testshade/optixgridrender.cpp +++ b/src/testshade/optixgridrender.cpp @@ -14,7 +14,7 @@ #include "render_params.h" #include -#include +#include #include #include #include @@ -33,6 +33,9 @@ const auto optixModuleCreateFn = optixModuleCreate; #endif +using namespace testshade; + + OSL_NAMESPACE_ENTER @@ -83,6 +86,11 @@ OSL_NAMESPACE_ENTER } +#define DEVICE_ALLOC(size) reinterpret_cast(device_alloc(size)) +#define COPY_TO_DEVICE(dst_device, src_host, size) \ + copy_to_device(reinterpret_cast(dst_device), src_host, size) + + static void context_log_cb(unsigned int level, const char* tag, const char* message, void* /*cbdata */) @@ -177,10 +185,12 @@ OptixGridRenderer::load_ptx_file(string_view filename) OptixGridRenderer::~OptixGridRenderer() { - for (void* p : m_ptrs_to_free) - cudaFree(p); if (m_optix_ctx) OPTIX_CHECK(optixDeviceContextDestroy(m_optix_ctx)); + for (CUdeviceptr ptr : m_ptrs_to_free) + cudaFree(reinterpret_cast(ptr)); + for (cudaArray_t arr : m_arrays_to_free) + cudaFreeArray(arr); } @@ -247,30 +257,24 @@ OptixGridRenderer::synch_attributes() const size_t podDataSize = cpuDataSize - sizeof(ustringhash) * numStrings; - CUDA_CHECK(cudaMalloc(reinterpret_cast(&d_color_system), - podDataSize + sizeof(uint64_t) * numStrings)); + d_color_system = DEVICE_ALLOC(podDataSize + + sizeof(uint64_t) * numStrings); CUDA_CHECK(cudaMemcpy(reinterpret_cast(d_color_system), colorSys, podDataSize, cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMalloc(reinterpret_cast(&d_osl_printf_buffer), - OSL_PRINTF_BUFFER_SIZE)); + d_osl_printf_buffer = DEVICE_ALLOC(OSL_PRINTF_BUFFER_SIZE); CUDA_CHECK(cudaMemset(reinterpret_cast(d_osl_printf_buffer), 0, OSL_PRINTF_BUFFER_SIZE)); // Transforms - CUDA_CHECK(cudaMalloc(reinterpret_cast(&d_object2common), - sizeof(OSL::Matrix44))); + d_object2common = DEVICE_ALLOC(sizeof(OSL::Matrix44)); CUDA_CHECK(cudaMemcpy(reinterpret_cast(d_object2common), &m_object2common, sizeof(OSL::Matrix44), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMalloc(reinterpret_cast(&d_shader2common), - sizeof(OSL::Matrix44))); + d_shader2common = DEVICE_ALLOC(sizeof(OSL::Matrix44)); CUDA_CHECK(cudaMemcpy(reinterpret_cast(d_shader2common), &m_shader2common, sizeof(OSL::Matrix44), cudaMemcpyHostToDevice)); - m_ptrs_to_free.push_back(reinterpret_cast(d_color_system)); - m_ptrs_to_free.push_back(reinterpret_cast(d_osl_printf_buffer)); - // then copy the device string to the end, first strings starting at dataPtr - (numStrings) // FIXME -- Should probably handle alignment better. const ustringhash* cpuStringHash @@ -725,26 +729,12 @@ OptixGridRenderer::make_optix_materials() setglobals_raygenRecord.data = nullptr; setglobals_missRecord.data = nullptr; - CUDA_CHECK(cudaMalloc(reinterpret_cast(&d_raygenRecord), - sizeof(GenericRecord))); - CUDA_CHECK(cudaMalloc(reinterpret_cast(&d_missRecord), - sizeof(GenericRecord))); - CUDA_CHECK(cudaMalloc(reinterpret_cast(&d_hitgroupRecord), - sizeof(GenericRecord))); - CUDA_CHECK(cudaMalloc(reinterpret_cast(&d_callablesRecord), - callables * sizeof(GenericRecord))); - CUDA_CHECK(cudaMalloc(reinterpret_cast(&d_setglobals_raygenRecord), - sizeof(GenericRecord))); - CUDA_CHECK(cudaMalloc(reinterpret_cast(&d_setglobals_missRecord), - sizeof(GenericRecord))); - - m_ptrs_to_free.push_back(reinterpret_cast(d_raygenRecord)); - m_ptrs_to_free.push_back(reinterpret_cast(d_missRecord)); - m_ptrs_to_free.push_back(reinterpret_cast(d_hitgroupRecord)); - m_ptrs_to_free.push_back(reinterpret_cast(d_callablesRecord)); - m_ptrs_to_free.push_back( - reinterpret_cast(d_setglobals_raygenRecord)); - m_ptrs_to_free.push_back(reinterpret_cast(d_setglobals_missRecord)); + d_raygenRecord = DEVICE_ALLOC(sizeof(GenericRecord)); + d_missRecord = DEVICE_ALLOC(sizeof(GenericRecord)); + d_hitgroupRecord = DEVICE_ALLOC(sizeof(GenericRecord)); + d_callablesRecord = DEVICE_ALLOC(callables * sizeof(GenericRecord)); + d_setglobals_raygenRecord = DEVICE_ALLOC(sizeof(GenericRecord)); + d_setglobals_missRecord = DEVICE_ALLOC(sizeof(GenericRecord)); CUDA_CHECK(cudaMemcpy(reinterpret_cast(d_raygenRecord), &raygenRecord, sizeof(GenericRecord), @@ -817,53 +807,82 @@ OptixGridRenderer::get_texture_handle(ustring filename, { auto itr = m_samplers.find(filename); if (itr == m_samplers.end()) { - // Open image + // Open image to check the number of mip levels OIIO::ImageBuf image; if (!image.init_spec(filename, 0, 0)) { errhandler().errorfmt("Could not load: {} (hash {})", filename, filename); return (TextureHandle*)nullptr; } - - OIIO::ROI roi = OIIO::get_roi_full(image.spec()); - int32_t width = roi.width(), height = roi.height(); - std::vector pixels(width * height * 4); - - for (int j = 0; j < height; j++) { - for (int i = 0; i < width; i++) { - image.getpixel(i, j, 0, &pixels[((j * width) + i) * 4 + 0]); - } - } - cudaResourceDesc res_desc = {}; + int32_t nmiplevels = std::max(image.nmiplevels(), 1); + int32_t img_width = image.xmax() + 1; + int32_t img_height = image.ymax() + 1; // hard-code textures to 4 channels - int32_t pitch = width * 4 * sizeof(float); cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindFloat); - cudaArray_t pixelArray; - CUDA_CHECK(cudaMallocArray(&pixelArray, &channel_desc, width, height)); + cudaMipmappedArray_t mipmapArray; + cudaExtent extent = make_cudaExtent(img_width, img_height, 0); + CUDA_CHECK(cudaMallocMipmappedArray(&mipmapArray, &channel_desc, extent, + nmiplevels)); + + // Copy the pixel data for each mip level + std::vector> level_pixels(nmiplevels); + for (int32_t level = 0; level < nmiplevels; ++level) { + image.reset(filename, 0, level); + OIIO::ROI roi = OIIO::get_roi_full(image.spec()); + if (!roi.defined()) { + errhandler().errorfmt( + "Could not load mip level {}: {} (hash {})", level, + filename, filename); + return (TextureHandle*)nullptr; + } - m_ptrs_to_free.push_back(reinterpret_cast(pixelArray)); + int32_t width = roi.width(), height = roi.height(); + level_pixels[level].resize(width * height * 4); + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + image.getpixel(i, j, 0, + &level_pixels[level][((j * width) + i) * 4]); + } + } - CUDA_CHECK(cudaMemcpy2DToArray(pixelArray, - /* offset */ 0, 0, pixels.data(), pitch, - pitch, height, cudaMemcpyHostToDevice)); + cudaArray_t miplevelArray; + CUDA_CHECK( + cudaGetMipmappedArrayLevel(&miplevelArray, mipmapArray, level)); - res_desc.resType = cudaResourceTypeArray; - res_desc.res.array.array = pixelArray; + // Copy the texel data into the miplevel array + int32_t pitch = width * 4 * sizeof(float); + CUDA_CHECK(cudaMemcpy2DToArray(miplevelArray, 0, 0, + level_pixels[level].data(), pitch, + pitch, height, + cudaMemcpyHostToDevice)); + } - cudaTextureDesc tex_desc = {}; - tex_desc.addressMode[0] = cudaAddressModeWrap; - tex_desc.addressMode[1] = cudaAddressModeWrap; - tex_desc.filterMode = cudaFilterModeLinear; - tex_desc.readMode - = cudaReadModeElementType; //cudaReadModeNormalizedFloat; + int32_t pitch = img_width * 4 * sizeof(float); + cudaArray_t pixelArray; + CUDA_CHECK( + cudaMallocArray(&pixelArray, &channel_desc, img_width, img_height)); + CUDA_CHECK(cudaMemcpy2DToArray(pixelArray, 0, 0, level_pixels[0].data(), + pitch, pitch, img_height, + cudaMemcpyHostToDevice)); + m_arrays_to_free.push_back(pixelArray); + + cudaResourceDesc res_desc = {}; + res_desc.resType = cudaResourceTypeMipmappedArray; + res_desc.res.mipmap.mipmap = mipmapArray; + + cudaTextureDesc tex_desc = {}; + tex_desc.addressMode[0] = cudaAddressModeWrap; + tex_desc.addressMode[1] = cudaAddressModeWrap; + tex_desc.filterMode = cudaFilterModeLinear; + tex_desc.readMode = cudaReadModeElementType; tex_desc.normalizedCoords = 1; tex_desc.maxAnisotropy = 1; - tex_desc.maxMipmapLevelClamp = 99; + tex_desc.maxMipmapLevelClamp = float(nmiplevels - 1); tex_desc.minMipmapLevelClamp = 0; - tex_desc.mipmapFilterMode = cudaFilterModePoint; + tex_desc.mipmapFilterMode = cudaFilterModeLinear; tex_desc.borderColor[0] = 1.0f; tex_desc.sRGB = 0; @@ -907,13 +926,8 @@ OptixGridRenderer::warmup() void OptixGridRenderer::render(int xres OSL_MAYBE_UNUSED, int yres OSL_MAYBE_UNUSED) { - CUDA_CHECK(cudaMalloc(reinterpret_cast(&d_output_buffer), - xres * yres * 4 * sizeof(float))); - CUDA_CHECK(cudaMalloc(reinterpret_cast(&d_launch_params), - sizeof(RenderParams))); - - m_ptrs_to_free.push_back(reinterpret_cast(d_output_buffer)); - m_ptrs_to_free.push_back(reinterpret_cast(d_launch_params)); + d_output_buffer = DEVICE_ALLOC(xres * yres * 4 * sizeof(float)); + d_launch_params = DEVICE_ALLOC(sizeof(RenderParams)); m_xres = xres; m_yres = yres; @@ -1119,19 +1133,15 @@ OptixGridRenderer::register_named_transforms() } // Push the names and transforms to the device - size_t sz = sizeof(uint64_t) * xform_name_buffer.size(); - CUDA_CHECK(cudaMalloc(reinterpret_cast(&d_xform_name_buffer), sz)); + size_t sz = sizeof(uint64_t) * xform_name_buffer.size(); + d_xform_name_buffer = DEVICE_ALLOC(sz); CUDA_CHECK(cudaMemcpy(reinterpret_cast(d_xform_name_buffer), xform_name_buffer.data(), sz, cudaMemcpyHostToDevice)); - m_ptrs_to_free.push_back(reinterpret_cast(d_xform_name_buffer)); - - sz = sizeof(OSL::Matrix44) * xform_buffer.size(); - CUDA_CHECK(cudaMalloc(reinterpret_cast(&d_xform_buffer), sz)); + sz = sizeof(OSL::Matrix44) * xform_buffer.size(); + d_xform_buffer = DEVICE_ALLOC(sz); CUDA_CHECK(cudaMemcpy(reinterpret_cast(d_xform_buffer), xform_buffer.data(), sz, cudaMemcpyHostToDevice)); - m_ptrs_to_free.push_back(reinterpret_cast(d_xform_buffer)); - m_num_named_xforms = xform_name_buffer.size(); } diff --git a/src/testshade/optixgridrender.h b/src/testshade/optixgridrender.h index 5f630f922..9cb2719fd 100644 --- a/src/testshade/optixgridrender.h +++ b/src/testshade/optixgridrender.h @@ -104,7 +104,8 @@ class OptixGridRenderer final : public SimpleRenderer { OSL::Matrix44 m_object2common; // "object" space to "common" space matrix // CUdeviceptrs that need to be freed after we are done - std::vector m_ptrs_to_free; + std::vector m_ptrs_to_free; + std::vector m_arrays_to_free; }; diff --git a/src/testshade/render_params.h b/src/testshade/render_params.h index c2e9c26c9..6670fe099 100644 --- a/src/testshade/render_params.h +++ b/src/testshade/render_params.h @@ -4,6 +4,8 @@ #pragma once +namespace testshade { + struct RenderParams { float invw; float invh; @@ -40,3 +42,5 @@ struct GenericRecord { // What follows should duplicate GenericData void* data; }; + +} // namespace testshade diff --git a/src/testshade/rs_simplerend.cpp b/src/testshade/rs_simplerend.cpp index 0cf9b8ca8..13c3ce45c 100644 --- a/src/testshade/rs_simplerend.cpp +++ b/src/testshade/rs_simplerend.cpp @@ -188,7 +188,8 @@ rs_transform_points(OSL::OpaqueExecContextPtr /*ec*/, OSL::ustringhash /*from*/, // doesn't know how to handle CUDA texture intrinsics. This function must be // defined in the CUDA source for testshade and testrender. extern "C" __device__ float4 -osl_tex2DLookup(void* handle, float s, float t); +osl_tex2DLookup(void* handle, float s, float t, float dsdx, float dtdx, + float dsdy, float dtdy); #endif OSL_RSOP OSL_HOSTDEVICE bool @@ -208,7 +209,8 @@ rs_texture(OSL::OpaqueExecContextPtr ec, OSL::ustringhash filename, #else if (!texture_handle) return false; - const float4 fromTexture = osl_tex2DLookup((void*)texture_handle, s, t); + const float4 fromTexture = osl_tex2DLookup(texture_handle, s, t, dsdx, dtdx, + dsdy, dtdy); *((float3*)result) = make_float3(fromTexture.x, fromTexture.y, fromTexture.z); return true; diff --git a/testsuite/render-background/OPTIX b/testsuite/render-background/OPTIX new file mode 100644 index 000000000..e69de29bb diff --git a/testsuite/render-background/OPTIX_OPTIMIZEONLY b/testsuite/render-background/OPTIX_OPTIMIZEONLY new file mode 100644 index 000000000..e69de29bb diff --git a/testsuite/render-background/ref/out-optix-alt.exr b/testsuite/render-background/ref/out-optix-alt.exr new file mode 100644 index 000000000..452ced070 Binary files /dev/null and b/testsuite/render-background/ref/out-optix-alt.exr differ diff --git a/testsuite/render-background/run.py b/testsuite/render-background/run.py index 0da23d621..b3122b811 100755 --- a/testsuite/render-background/run.py +++ b/testsuite/render-background/run.py @@ -7,6 +7,8 @@ failthresh = 0.01 failpercent = 1 hardfail = 0.11 +allowfailures = 5 +idiff_program = "idiff" outputs = [ "out.exr" ] command = testrender("-r 320 240 -aa 4 scene.xml out.exr") diff --git a/testsuite/render-bumptest/OPTIX b/testsuite/render-bumptest/OPTIX new file mode 100644 index 000000000..e69de29bb diff --git a/testsuite/render-bumptest/ref/out-optix-alt.exr b/testsuite/render-bumptest/ref/out-optix-alt.exr new file mode 100644 index 000000000..3371916bd Binary files /dev/null and b/testsuite/render-bumptest/ref/out-optix-alt.exr differ diff --git a/testsuite/render-bunny/OPTIX b/testsuite/render-bunny/OPTIX new file mode 100644 index 000000000..e69de29bb diff --git a/testsuite/render-bunny/ref/out-optix-alt.exr b/testsuite/render-bunny/ref/out-optix-alt.exr new file mode 100644 index 000000000..db5e84a67 Binary files /dev/null and b/testsuite/render-bunny/ref/out-optix-alt.exr differ diff --git a/testsuite/render-cornell/OPTIX b/testsuite/render-cornell/OPTIX new file mode 100644 index 000000000..e69de29bb diff --git a/testsuite/render-cornell/ref/out-optix-alt.exr b/testsuite/render-cornell/ref/out-optix-alt.exr new file mode 100644 index 000000000..d5a2355b2 Binary files /dev/null and b/testsuite/render-cornell/ref/out-optix-alt.exr differ diff --git a/testsuite/render-furnace-diffuse/OPTIX b/testsuite/render-furnace-diffuse/OPTIX new file mode 100644 index 000000000..e69de29bb diff --git a/testsuite/render-furnace-diffuse/ref/out-optix-alt.exr b/testsuite/render-furnace-diffuse/ref/out-optix-alt.exr new file mode 100644 index 000000000..dce1374be Binary files /dev/null and b/testsuite/render-furnace-diffuse/ref/out-optix-alt.exr differ diff --git a/testsuite/render-material-layer/OPTIX b/testsuite/render-material-layer/OPTIX new file mode 100644 index 000000000..e69de29bb diff --git a/testsuite/render-microfacet/OPTIX b/testsuite/render-microfacet/OPTIX new file mode 100644 index 000000000..e69de29bb diff --git a/testsuite/render-microfacet/OPTIX_OPTIMIZEONLY b/testsuite/render-microfacet/OPTIX_OPTIMIZEONLY new file mode 100644 index 000000000..e69de29bb diff --git a/testsuite/render-microfacet/ref/out-optix-alt.exr b/testsuite/render-microfacet/ref/out-optix-alt.exr new file mode 100644 index 000000000..d6bc304bf Binary files /dev/null and b/testsuite/render-microfacet/ref/out-optix-alt.exr differ diff --git a/testsuite/render-microfacet/run.py b/testsuite/render-microfacet/run.py index cf55f64ff..1dac3ca98 100755 --- a/testsuite/render-microfacet/run.py +++ b/testsuite/render-microfacet/run.py @@ -6,6 +6,8 @@ failthresh = 0.02 failpercent = 1 +allowfailures = 5 +idiff_program = "idiff" outputs = [ "out.exr" ] command = testrender("-r 320 240 -aa 8 scene.xml out.exr") diff --git a/testsuite/render-mx-burley-diffuse/OPTIX b/testsuite/render-mx-burley-diffuse/OPTIX new file mode 100644 index 000000000..e69de29bb diff --git a/testsuite/render-mx-burley-diffuse/ref/out-optix-alt.exr b/testsuite/render-mx-burley-diffuse/ref/out-optix-alt.exr new file mode 100644 index 000000000..5ef99dd1c Binary files /dev/null and b/testsuite/render-mx-burley-diffuse/ref/out-optix-alt.exr differ diff --git a/testsuite/render-mx-conductor/OPTIX b/testsuite/render-mx-conductor/OPTIX new file mode 100644 index 000000000..e69de29bb diff --git a/testsuite/render-mx-conductor/ref/out-optix-alt.exr b/testsuite/render-mx-conductor/ref/out-optix-alt.exr new file mode 100644 index 000000000..b97249a52 Binary files /dev/null and b/testsuite/render-mx-conductor/ref/out-optix-alt.exr differ diff --git a/testsuite/render-mx-conductor/run.py b/testsuite/render-mx-conductor/run.py index b01c78cc6..ba901ef83 100755 --- a/testsuite/render-mx-conductor/run.py +++ b/testsuite/render-mx-conductor/run.py @@ -7,6 +7,8 @@ failthresh = 0.01 failpercent = 1 hardfail = 0.025 +allowfailures = 5 +idiff_program = "idiff" outputs = [ "out.exr" ] command = testrender("-v -r 320 240 -aa 16 scene.xml out.exr") diff --git a/testsuite/render-mx-dielectric-glass/OPTIX b/testsuite/render-mx-dielectric-glass/OPTIX new file mode 100644 index 000000000..e69de29bb diff --git a/testsuite/render-mx-dielectric-glass/ref/out-optix-alt.exr b/testsuite/render-mx-dielectric-glass/ref/out-optix-alt.exr new file mode 100644 index 000000000..a123df055 Binary files /dev/null and b/testsuite/render-mx-dielectric-glass/ref/out-optix-alt.exr differ diff --git a/testsuite/render-mx-dielectric/OPTIX b/testsuite/render-mx-dielectric/OPTIX new file mode 100644 index 000000000..e69de29bb diff --git a/testsuite/render-mx-dielectric/ref/out-optix-alt.exr b/testsuite/render-mx-dielectric/ref/out-optix-alt.exr new file mode 100644 index 000000000..527bdf88d Binary files /dev/null and b/testsuite/render-mx-dielectric/ref/out-optix-alt.exr differ diff --git a/testsuite/render-mx-furnace-burley-diffuse/OPTIX b/testsuite/render-mx-furnace-burley-diffuse/OPTIX new file mode 100644 index 000000000..e69de29bb diff --git a/testsuite/render-mx-furnace-burley-diffuse/ref/out-optix-alt.exr b/testsuite/render-mx-furnace-burley-diffuse/ref/out-optix-alt.exr new file mode 100644 index 000000000..55a223787 Binary files /dev/null and b/testsuite/render-mx-furnace-burley-diffuse/ref/out-optix-alt.exr differ diff --git a/testsuite/render-mx-furnace-oren-nayar/OPTIX b/testsuite/render-mx-furnace-oren-nayar/OPTIX new file mode 100644 index 000000000..e69de29bb diff --git a/testsuite/render-mx-furnace-oren-nayar/ref/out-optix-alt.exr b/testsuite/render-mx-furnace-oren-nayar/ref/out-optix-alt.exr new file mode 100644 index 000000000..9e7a50566 Binary files /dev/null and b/testsuite/render-mx-furnace-oren-nayar/ref/out-optix-alt.exr differ diff --git a/testsuite/render-mx-furnace-sheen/OPTIX b/testsuite/render-mx-furnace-sheen/OPTIX new file mode 100644 index 000000000..e69de29bb diff --git a/testsuite/render-mx-furnace-sheen/ref/out-optix-alt.exr b/testsuite/render-mx-furnace-sheen/ref/out-optix-alt.exr new file mode 100644 index 000000000..610f308c6 Binary files /dev/null and b/testsuite/render-mx-furnace-sheen/ref/out-optix-alt.exr differ diff --git a/testsuite/render-mx-generalized-schlick-glass/OPTIX b/testsuite/render-mx-generalized-schlick-glass/OPTIX new file mode 100644 index 000000000..e69de29bb diff --git a/testsuite/render-mx-generalized-schlick-glass/ref/out-optix-alt.exr b/testsuite/render-mx-generalized-schlick-glass/ref/out-optix-alt.exr new file mode 100644 index 000000000..0cdbd7065 Binary files /dev/null and b/testsuite/render-mx-generalized-schlick-glass/ref/out-optix-alt.exr differ diff --git a/testsuite/render-mx-generalized-schlick/OPTIX b/testsuite/render-mx-generalized-schlick/OPTIX new file mode 100644 index 000000000..e69de29bb diff --git a/testsuite/render-mx-generalized-schlick/ref/out-optix-alt.exr b/testsuite/render-mx-generalized-schlick/ref/out-optix-alt.exr new file mode 100644 index 000000000..d9d54c51e Binary files /dev/null and b/testsuite/render-mx-generalized-schlick/ref/out-optix-alt.exr differ diff --git a/testsuite/render-mx-layer/OPTIX b/testsuite/render-mx-layer/OPTIX new file mode 100644 index 000000000..e69de29bb diff --git a/testsuite/render-mx-layer/ref/out-optix-alt.exr b/testsuite/render-mx-layer/ref/out-optix-alt.exr new file mode 100644 index 000000000..185e9c72d Binary files /dev/null and b/testsuite/render-mx-layer/ref/out-optix-alt.exr differ diff --git a/testsuite/render-mx-layer/run.py b/testsuite/render-mx-layer/run.py index 7688fd939..62f9b4aae 100755 --- a/testsuite/render-mx-layer/run.py +++ b/testsuite/render-mx-layer/run.py @@ -6,5 +6,8 @@ failthresh = 0.01 failpercent = 1 +allowfailures = 5 +idiff_program = "idiff" + outputs = [ "out.exr" ] command = testrender("-r 320 240 -aa 6 scene.xml out.exr") diff --git a/testsuite/render-mx-sheen/OPTIX b/testsuite/render-mx-sheen/OPTIX new file mode 100644 index 000000000..e69de29bb diff --git a/testsuite/render-mx-sheen/ref/out-optix-alt.exr b/testsuite/render-mx-sheen/ref/out-optix-alt.exr new file mode 100644 index 000000000..c1e337907 Binary files /dev/null and b/testsuite/render-mx-sheen/ref/out-optix-alt.exr differ diff --git a/testsuite/render-mx-sheen/run.py b/testsuite/render-mx-sheen/run.py index 7688fd939..62f9b4aae 100755 --- a/testsuite/render-mx-sheen/run.py +++ b/testsuite/render-mx-sheen/run.py @@ -6,5 +6,8 @@ failthresh = 0.01 failpercent = 1 +allowfailures = 5 +idiff_program = "idiff" + outputs = [ "out.exr" ] command = testrender("-r 320 240 -aa 6 scene.xml out.exr") diff --git a/testsuite/render-oren-nayar/OPTIX b/testsuite/render-oren-nayar/OPTIX new file mode 100644 index 000000000..e69de29bb diff --git a/testsuite/render-oren-nayar/OPTIX_CACHE/optix7cache.db b/testsuite/render-oren-nayar/OPTIX_CACHE/optix7cache.db new file mode 100644 index 000000000..feb40a1c6 Binary files /dev/null and b/testsuite/render-oren-nayar/OPTIX_CACHE/optix7cache.db differ diff --git a/testsuite/render-oren-nayar/ref/out-optix-alt.exr b/testsuite/render-oren-nayar/ref/out-optix-alt.exr new file mode 100644 index 000000000..ee707b950 Binary files /dev/null and b/testsuite/render-oren-nayar/ref/out-optix-alt.exr differ diff --git a/testsuite/render-raytypes/OPTIX b/testsuite/render-raytypes/OPTIX new file mode 100644 index 000000000..e69de29bb diff --git a/testsuite/render-raytypes/ref/out-optix-alt.exr b/testsuite/render-raytypes/ref/out-optix-alt.exr new file mode 100644 index 000000000..d9703c510 Binary files /dev/null and b/testsuite/render-raytypes/ref/out-optix-alt.exr differ diff --git a/testsuite/render-uv/OPTIX b/testsuite/render-uv/OPTIX new file mode 100644 index 000000000..e69de29bb diff --git a/testsuite/render-uv/OPTIX_OPTIMIZEONLY b/testsuite/render-uv/OPTIX_OPTIMIZEONLY new file mode 100644 index 000000000..e69de29bb diff --git a/testsuite/render-uv/ref/out-optix-alt.exr b/testsuite/render-uv/ref/out-optix-alt.exr new file mode 100644 index 000000000..90f5567ee Binary files /dev/null and b/testsuite/render-uv/ref/out-optix-alt.exr differ diff --git a/testsuite/render-veachmis/OPTIX b/testsuite/render-veachmis/OPTIX new file mode 100644 index 000000000..e69de29bb diff --git a/testsuite/render-veachmis/ref/out-optix-alt.exr b/testsuite/render-veachmis/ref/out-optix-alt.exr new file mode 100644 index 000000000..120ee76a7 Binary files /dev/null and b/testsuite/render-veachmis/ref/out-optix-alt.exr differ diff --git a/testsuite/render-ward/OPTIX b/testsuite/render-ward/OPTIX new file mode 100644 index 000000000..e69de29bb diff --git a/testsuite/render-ward/ref/out-optix-alt.exr b/testsuite/render-ward/ref/out-optix-alt.exr new file mode 100644 index 000000000..97a08c9d9 Binary files /dev/null and b/testsuite/render-ward/ref/out-optix-alt.exr differ diff --git a/testsuite/testoptix-noise/ref/out.exr b/testsuite/testoptix-noise/ref/out.exr index 193abb522..8de6deae5 100644 Binary files a/testsuite/testoptix-noise/ref/out.exr and b/testsuite/testoptix-noise/ref/out.exr differ diff --git a/testsuite/testoptix-noise/ref/out_02.exr b/testsuite/testoptix-noise/ref/out_02.exr index 185c6f046..b4070fd73 100644 Binary files a/testsuite/testoptix-noise/ref/out_02.exr and b/testsuite/testoptix-noise/ref/out_02.exr differ diff --git a/testsuite/testoptix-noise/run.py b/testsuite/testoptix-noise/run.py index 865b73183..3a787e3dd 100755 --- a/testsuite/testoptix-noise/run.py +++ b/testsuite/testoptix-noise/run.py @@ -7,5 +7,5 @@ failthresh = 0.03 # allow a little more LSB noise between platforms failpercent = .5 outputs = [ "out.exr", "out_02.exr" ] -command = testrender("-optix -res 320 240 scene.xml out.exr") -command += testrender("-optix -res 320 240 scene_02.xml out_02.exr") +command = testrender("-optix -res 320 240 -no-jitter -albedo 1.0 scene.xml out.exr") +command += testrender("-optix -res 320 240 -no-jitter -albedo 1.0 scene_02.xml out_02.exr") diff --git a/testsuite/testoptix-reparam/ref/out.exr b/testsuite/testoptix-reparam/ref/out.exr index fc1006e04..b8eb11c8e 100644 Binary files a/testsuite/testoptix-reparam/ref/out.exr and b/testsuite/testoptix-reparam/ref/out.exr differ diff --git a/testsuite/testoptix-reparam/run.py b/testsuite/testoptix-reparam/run.py index ef900eae8..b196355b0 100755 --- a/testsuite/testoptix-reparam/run.py +++ b/testsuite/testoptix-reparam/run.py @@ -7,4 +7,4 @@ failthresh = 0.03 # allow a little more LSB noise between platforms failpercent = .5 outputs = [ "out.exr" ] -command = testrender("-optix -res 320 240 scene.xml out.exr") +command = testrender("-optix -res 320 240 -no-jitter -albedo 1.0 scene.xml out.exr") diff --git a/testsuite/testoptix/ref/out.exr b/testsuite/testoptix/ref/out.exr index b6e346156..9f6bc6fa8 100644 Binary files a/testsuite/testoptix/ref/out.exr and b/testsuite/testoptix/ref/out.exr differ diff --git a/testsuite/testoptix/ref/test_microfacet_dist.exr b/testsuite/testoptix/ref/test_microfacet_dist.exr deleted file mode 100644 index 95655f1d8..000000000 Binary files a/testsuite/testoptix/ref/test_microfacet_dist.exr and /dev/null differ diff --git a/testsuite/testoptix/ref/test_spline.exr b/testsuite/testoptix/ref/test_spline.exr index 3b4c9a374..4405d8f4e 100644 Binary files a/testsuite/testoptix/ref/test_spline.exr and b/testsuite/testoptix/ref/test_spline.exr differ diff --git a/testsuite/testoptix/ref/test_texture.exr b/testsuite/testoptix/ref/test_texture.exr index 53a882084..22b17905b 100644 Binary files a/testsuite/testoptix/ref/test_texture.exr and b/testsuite/testoptix/ref/test_texture.exr differ diff --git a/testsuite/testoptix/run.py b/testsuite/testoptix/run.py index 833915c27..78148d793 100755 --- a/testsuite/testoptix/run.py +++ b/testsuite/testoptix/run.py @@ -6,15 +6,14 @@ failthresh = 0.03 # allow a little more LSB noise between platforms failpercent = .5 -outputs = [ "out.exr", "test_microfacet_dist.exr", "test_texture.exr", "test_spline.exr", "out.txt" ] -command = testrender("-optix -res 320 240 scene.xml out.exr") -command += testrender("-optix -res 320 240 test_microfacet_dist.xml test_microfacet_dist.exr") -command += testrender("-optix -res 1 1 test_print.xml dummy.exr") -command += testrender("-optix -res 1 1 test_compare.xml dummy.exr") -command += testrender("-optix -res 1 1 test_assign.xml dummy.exr") -command += testrender("-optix -res 1 1 test_assign_02.xml dummy.exr") -command += testrender("-optix -res 1 1 test_str_ops.xml dummy.exr") -command += testrender("-optix -res 1 1 test_userdata_string.xml dummy.exr") +outputs = [ "out.exr", "test_texture.exr", "test_spline.exr", "out.txt" ] +command = testrender("-optix -res 320 240 -no-jitter -albedo 1.0 scene.xml out.exr") +command += testrender("-optix -res 1 1 -no-jitter -albedo 1.0 test_print.xml dummy.exr") +command += testrender("-optix -res 1 1 -no-jitter -albedo 1.0 test_compare.xml dummy.exr") +command += testrender("-optix -res 1 1 -no-jitter -albedo 1.0 test_assign.xml dummy.exr") +command += testrender("-optix -res 1 1 -no-jitter -albedo 1.0 test_assign_02.xml dummy.exr") +command += testrender("-optix -res 1 1 -no-jitter -albedo 1.0 test_str_ops.xml dummy.exr") +command += testrender("-optix -res 1 1 -no-jitter -albedo 1.0 test_userdata_string.xml dummy.exr") command += testshade("-optix -res 256 256 test_spline -o Cout test_spline.exr") command += testshade("-optix -res 512 512 test_texture -o Cout test_texture.exr")