diff --git a/src/cmake/testing.cmake b/src/cmake/testing.cmake
index a2de5e315..fd30c4199 100644
--- a/src/cmake/testing.cmake
+++ b/src/cmake/testing.cmake
@@ -179,7 +179,8 @@ macro ( TESTSUITE )
             AND NOT EXISTS "${_testsrcdir}/NOOPTIX-FIXME"
             AND NOT EXISTS "${_testsrcdir}/BATCHED_REGRESSION")
             # Unoptimized
-            if (NOT EXISTS "${_testsrcdir}/OPTIMIZEONLY")
+            if (NOT EXISTS "${_testsrcdir}/OPTIMIZEONLY"
+                AND NOT EXISTS "${_testsrcdir}/OPTIX_OPTIMIZEONLY")
                 add_one_testsuite ("${_testname}.optix" "${_testsrcdir}"
                                    ENV TESTSHADE_OPT=0 TESTSHADE_OPTIX=1 )
             endif ()
diff --git a/src/include/OSL/platform.h b/src/include/OSL/platform.h
index 39fb4496c..33268e732 100644
--- a/src/include/OSL/platform.h
+++ b/src/include/OSL/platform.h
@@ -481,7 +481,11 @@
 /// to use regular assert() for this purpose if you need to eliminate the
 /// dependency on this header from a particular place (and don't mind that
 /// assert won't format identically on all platforms).
-#ifndef NDEBUG
+///
+/// These macros are no-ops when compiling for CUDA because they were found
+/// to cause strange issues in device code (e.g., function bodies being
+/// eliminated when OSL_DASSERT is used).
+#if !defined(NDEBUG) && !defined(__CUDACC__)
 #    define OSL_DASSERT OSL_ASSERT
 #    define OSL_DASSERT_MSG OSL_ASSERT_MSG
 #else
diff --git a/src/testrender/CMakeLists.txt b/src/testrender/CMakeLists.txt
index 590f26447..c4ab69e7a 100644
--- a/src/testrender/CMakeLists.txt
+++ b/src/testrender/CMakeLists.txt
@@ -16,7 +16,6 @@ if (OSL_USE_OPTIX)
     list (APPEND testrender_srcs optixraytracer.cpp)
     set (testrender_cuda_srcs
         cuda/optix_raytracer.cu
-        cuda/wrapper.cu
         )
 
     set (testrender_rend_lib_srcs
@@ -25,17 +24,22 @@ if (OSL_USE_OPTIX)
         )
 
     # We need to make sure that the PTX files are regenerated whenever these
-    # headers change.
+    # files change.
     set (testrender_cuda_headers
         cuda/rend_lib.h
-        render_params.h)
-
-    set ( extra_cuda_headers
-        render_params.h )
+        background.h
+        optics.h
+        render_params.h
+        raytracer.h
+        sampling.h
+        shading.h
+        shading.cpp
+        simpleraytracer.cpp
+        )
 
     # Generate PTX for all of the CUDA files
     foreach (cudasrc ${testrender_cuda_srcs})
-        NVCC_COMPILE ( ${cudasrc} ${extra_cuda_headers} ptx_generated "" )
+        NVCC_COMPILE ( ${cudasrc} "${testrender_cuda_headers}" ptx_generated "" )
         list (APPEND ptx_list ${ptx_generated})
     endforeach ()
 
@@ -55,7 +59,7 @@ if (OSL_USE_OPTIX)
     list (APPEND ptx_list ${rend_lib_ptx})
 
     add_custom_target (testrender_ptx ALL
-        DEPENDS ${ptx_list}
+        DEPENDS ${ptx_list} ${testrender_cuda_headers}
         SOURCES ${testrender_cuda_srcs} )
 
     # Install the PTX files in a fixed location so that they can be
diff --git a/src/testrender/background.h b/src/testrender/background.h
index ac196cf27..e6daa28af 100644
--- a/src/testrender/background.h
+++ b/src/testrender/background.h
@@ -10,17 +10,48 @@
 
 OSL_NAMESPACE_ENTER
 
+
+// std::upper_bound is not supported in device code, so define a version of it here.
+// Adapted from the LLVM Project, see https://llvm.org/LICENSE.txt for license information.
+template<typename T>
+inline OSL_HOSTDEVICE const T*
+upper_bound(const T* data, int count, const T value)
+{
+    const T* first = data;
+    const T value_ = value;
+    int len        = count;
+    while (len != 0) {
+        int l2     = len / 2;
+        const T* m = first;
+        m += l2;
+        if (value_ < *m)
+            len = l2;
+        else {
+            first = ++m;
+            len -= l2 + 1;
+        }
+    }
+    return first;
+}
+
+
 struct Background {
+    OSL_HOSTDEVICE
     Background() : values(0), rows(0), cols(0) {}
+
+    OSL_HOSTDEVICE
     ~Background()
     {
+#ifndef __CUDACC__
         delete[] values;
         delete[] rows;
         delete[] cols;
+#endif
     }
 
     template<typename F, typename T> void prepare(int resolution, F cb, T* data)
     {
+        // These values are set via set_variables() in CUDA
         res = resolution;
         if (res < 32)
             res = 32;  // validate
@@ -29,6 +60,7 @@ struct Background {
         values      = new Vec3[res * res];
         rows        = new float[res];
         cols        = new float[res * res];
+
         for (int y = 0, i = 0; y < res; y++) {
             for (int x = 0; x < res; x++, i++) {
                 values[i] = cb(map(x + 0.5f, y + 0.5f), data);
@@ -43,8 +75,9 @@ struct Background {
                     cols[i - res + x] /= cols[i - 1];
         }
         // normalize the pdf across all scanlines
-        for (int y = 0; y < res; y++)
+        for (int y = 0; y < res; y++) {
             rows[y] /= rows[res - 1];
+        }
 
         // both eval and sample below return a "weight" that is
         // value[i] / row*col_pdf, so might as well bake it into the table
@@ -65,6 +98,7 @@ struct Background {
 #endif
     }
 
+    OSL_HOSTDEVICE
     Vec3 eval(const Vec3& dir, float& pdf) const
     {
         // map from sphere to unit-square
@@ -90,6 +124,7 @@ struct Background {
         return values[i];
     }
 
+    OSL_HOSTDEVICE
     Vec3 sample(float rx, float ry, Dual2<Vec3>& dir, float& pdf) const
     {
         float row_pdf, col_pdf;
@@ -101,8 +136,98 @@ struct Background {
         return values[y * res + x];
     }
 
+#ifdef __CUDACC__
+    OSL_HOSTDEVICE
+    void set_variables(Vec3* values_in, float* rows_in, float* cols_in,
+                       int res_in)
+    {
+        values      = values_in;
+        rows        = rows_in;
+        cols        = cols_in;
+        res         = res_in;
+        invres      = __frcp_rn(res);
+        invjacobian = __fdiv_rn(res * res, float(4 * M_PI));
+        assert(res >= 32);
+    }
+
+    template<typename F>
+    OSL_HOSTDEVICE void prepare_cuda(int stride, int idx, F cb)
+    {
+        // N.B. This needs to run on a single-warp launch, since there is no
+        // synchronization across warps in OptiX.
+        prepare_cuda_01(stride, idx, cb);
+        if (idx == 0)
+            prepare_cuda_02();
+        prepare_cuda_03(stride, idx);
+    }
+
+    // Pre-compute the 'values' table in parallel
+    template<typename F>
+    OSL_HOSTDEVICE void prepare_cuda_01(int stride, int idx, F cb)
+    {
+        for (int y = 0; y < res; y++) {
+            const int row_start = y * res;
+            const int row_end   = row_start + res;
+            int i               = row_start + idx;
+            for (int x = idx; x < res; x += stride, i += stride) {
+                if (i >= row_end)
+                    continue;
+                values[i] = cb(map(x + 0.5f, y + 0.5f));
+            }
+        }
+    }
+
+    // Compute 'cols' and 'rows' using a single thread
+    OSL_HOSTDEVICE void prepare_cuda_02()
+    {
+        for (int y = 0, i = 0; y < res; y++) {
+            for (int x = 0; x < res; x++, i++) {
+                cols[i] = std::max(std::max(values[i].x, values[i].y),
+                                   values[i].z)
+                          + ((x > 0) ? cols[i - 1] : 0.0f);
+            }
+            rows[y] = cols[i - 1] + ((y > 0) ? rows[y - 1] : 0.0f);
+            // normalize the pdf for this scanline (if it was non-zero)
+            if (cols[i - 1] > 0) {
+                for (int x = 0; x < res; x++) {
+                    cols[i - res + x] = __fdiv_rn(cols[i - res + x],
+                                                  cols[i - 1]);
+                }
+            }
+        }
+    }
+
+    // Normalize the row PDFs and finalize the 'values' table
+    OSL_HOSTDEVICE void prepare_cuda_03(int stride, int idx)
+    {
+        // normalize the pdf across all scanlines
+        for (int y = idx; y < res; y += stride) {
+            rows[y] = __fdiv_rn(rows[y], rows[res - 1]);
+        }
+
+        // both eval and sample below return a "weight" that is
+        // value[i] / row*col_pdf, so might as well bake it into the table
+        for (int y = 0; y < res; y++) {
+            float row_pdf       = rows[y] - (y > 0 ? rows[y - 1] : 0.0f);
+            const int row_start = y * res;
+            const int row_end   = row_start + res;
+            int i               = row_start + idx;
+            for (int x = idx; x < res; x += stride, i += stride) {
+                if (i >= row_end)
+                    continue;
+                float col_pdf       = cols[i] - (x > 0 ? cols[i - 1] : 0.0f);
+                const float divisor = __fmul_rn(__fmul_rn(row_pdf, col_pdf),
+                                                invjacobian);
+                values[i].x         = __fdiv_rn(values[i].x, divisor);
+                values[i].y         = __fdiv_rn(values[i].y, divisor);
+                values[i].z         = __fdiv_rn(values[i].z, divisor);
+            }
+        }
+    }
+#endif
+
 private:
-    Dual2<Vec3> map(float x, float y) const
+    OSL_HOSTDEVICE Dual2<Vec3> map(float x, float y) const
     {
         // pixel coordinates of entry (x,y)
         Dual2<float> u     = Dual2<float>(x, 1, 0) * invres;
@@ -115,14 +240,16 @@ struct Background {
         return make_Vec3(sin_phi * ct, sin_phi * st, cos_phi);
     }
 
-    static float sample_cdf(const float* data, unsigned int n, float x,
-                            unsigned int* idx, float* pdf)
+    static OSL_HOSTDEVICE float sample_cdf(const float* data, unsigned int n,
+                                           float x, unsigned int* idx,
+                                           float* pdf)
     {
-        OSL_DASSERT(x >= 0);
-        OSL_DASSERT(x < 1);
-        *idx = std::upper_bound(data, data + n, x) - data;
+        OSL_DASSERT(x >= 0.0f);
+        OSL_DASSERT(x < 1.0f);
+        *idx = OSL::upper_bound(data, n, x) - data;
         OSL_DASSERT(*idx < n);
         OSL_DASSERT(x < data[*idx]);
+
         float scaled_sample;
         if (*idx == 0) {
             *pdf          = data[0];
@@ -137,12 +264,13 @@ struct Background {
         return std::min(scaled_sample, 0.99999994f);
     }
 
-    Vec3* values;  // actual map
-    float* rows;   // probability of choosing a given row 'y'
-    float* cols;  // probability of choosing a given column 'x', given that we've chosen row 'y'
-    int res;       // resolution in pixels of the precomputed table
-    float invres;  // 1 / resolution
-    float invjacobian;
+    Vec3* values = nullptr;  // actual map
+    float* rows  = nullptr;  // probability of choosing a given row 'y'
+    float* cols
+        = nullptr;  // probability of choosing a given column 'x', given that we've chosen row 'y'
+    int res           = -1;    // resolution in pixels of the precomputed table
+    float invres      = 0.0f;  // 1 / resolution
+    float invjacobian = 0.0f;
 };
 
 OSL_NAMESPACE_EXIT
diff --git a/src/testrender/cuda/optix_raytracer.cu b/src/testrender/cuda/optix_raytracer.cu
index 631055f48..513dca223 100644
--- a/src/testrender/cuda/optix_raytracer.cu
+++ b/src/testrender/cuda/optix_raytracer.cu
@@ -4,16 +4,29 @@
 
 
 #include <optix.h>
-
-#include "util.h"
-
 #include <optix_device.h>
 
 #include <OSL/hashes.h>
 
-#include "../render_params.h"
+#include "optix_raytracer.h"
 #include "rend_lib.h"
 
+#include "../background.h"
+#include "../bvh.h"
+#include "../raytracer.h"
+#include "../render_params.h"
+#include "../sampling.h"
+#include "../shading.cpp"
+
+#include <cstdint>
+
+
+// Conversion macros for casting between vector types
+#define F3_TO_V3(f3) (*reinterpret_cast<const OSL::Vec3*>(&f3))
+#define F3_TO_C3(f3) (*reinterpret_cast<const OSL::Color3*>(&f3))
+#define V3_TO_F3(v3) (*reinterpret_cast<const float3*>(&v3))
+#define C3_TO_F3(c3) (*reinterpret_cast<const float3*>(&c3))
+
 
 OSL_NAMESPACE_ENTER
 namespace pvt {
@@ -34,6 +47,171 @@ __device__ __constant__ RenderParams render_params;
 }
 
 
+static inline __device__ void
+execute_shader(ShaderGlobalsType& sg, const int shader_id, char* closure_pool)
+{
+    if (shader_id < 0) {
+        // TODO: should probably never get here ...
+        return;
+    }
+
+    // Pack the "closure pool" into one of the ShaderGlobals pointers
+    *(int*)&closure_pool[0] = 0;
+    sg.renderstate          = &closure_pool[0];
+
+    // Pack the pointers to the options structs in a faux "context",
+    // which is a rough stand-in for the host ShadingContext.
+    ShadingContextCUDA shading_context;
+    sg.context = &shading_context;
+
+    // Run the OSL callable
+    void* interactive_ptr = reinterpret_cast<void**>(
+        render_params.interactive_params)[shader_id];
+    const unsigned int shaderIdx = shader_id + 0u;
+    optixDirectCall<void, ShaderGlobalsType*, void*, void*, void*, int, void*>(
+        shaderIdx, &sg /*shaderglobals_ptr*/, nullptr /*groupdata_ptr*/,
+        nullptr /*userdata_base_ptr*/, nullptr /*output_base_ptr*/,
+        0 /*shadeindex - unused*/, interactive_ptr /*interactive_params_ptr*/
+    );
+}
+
+
+static inline __device__ void
+trace_ray(OptixTraversableHandle handle, Payload& payload, const float3& origin,
+          const float3& direction, const float tmin)
+{
+    uint32_t p0 = static_cast<uint32_t>(payload.hit_id);
+    uint32_t p1 = __float_as_uint(payload.hit_t);
+    uint32_t p2 = __float_as_uint(payload.hit_u);
+    uint32_t p3 = __float_as_uint(payload.hit_v);
+    optixTrace(handle,                         // handle
+               origin,                         // origin
+               direction,                      // direction
+               tmin,                           // tmin
+               1e13f,                          // tmax
+               0,                              // ray time
+               OptixVisibilityMask(1),         // visibility mask
+               OPTIX_RAY_FLAG_DISABLE_ANYHIT,  // ray flags
+               0,                              // SBT offset
+               1,                              // SBT stride
+               0,                              // miss SBT offset
+               p0, p1, p2, p3);
+    payload.hit_id = static_cast<int32_t>(p0);
+    payload.hit_t  = __uint_as_float(p1);
+    payload.hit_u  = __uint_as_float(p2);
+    payload.hit_v  = __uint_as_float(p3);
+};
+
+
+Intersection
+Scene::intersect(const Ray& r, const float tmax, const unsigned skipID1,
+                 const unsigned /*skipID2*/) const
+{
+    // Trace the ray against the scene. If the ID for the hit matches skipID1,
+    // "nudge" the ray forward by adjusting tmin to exclude the hit interval
+    // and try again.
+    const int num_attempts = 2;
+    float tmin             = 0.0f;
+    for (int attempt = 0; attempt < num_attempts; ++attempt) {
+        Payload payload;
+        payload.hit_id = ~0u;
+        trace_ray(handle, payload, V3_TO_F3(r.origin), V3_TO_F3(r.direction),
+                  tmin);
+        if (payload.hit_id == skipID1) {
+            tmin = __uint_as_float(__float_as_uint(payload.hit_t) + 1u);
+        } else if (payload.hit_id != ~0u) {
+            return { payload.hit_t, payload.hit_u, payload.hit_v,
+                     payload.hit_id };
+        }
+    }
+    return { std::numeric_limits<float>::infinity() };
+}
+
+
+static inline __device__ void
+setupRaytracer(SimpleRaytracer& raytracer, const bool bg_only)
+{
+    // Background
+    raytracer.background = {};
+    raytracer.background.set_variables((Vec3*)render_params.bg_values,
+                                       (float*)render_params.bg_rows,
+                                       (float*)render_params.bg_cols,
+                                       render_params.bg_res);
+
+    raytracer.backgroundResolution = render_params.bg_id >= 0
+                                         ? render_params.bg_res
+                                         : 0;
+    raytracer.backgroundShaderID   = render_params.bg_id;
+
+    if (bg_only)
+        return;
+
+    // Parameters
+    raytracer.aa                = render_params.aa;
+    raytracer.no_jitter         = render_params.no_jitter;
+    raytracer.max_bounces       = render_params.max_bounces;
+    raytracer.rr_depth          = 5;
+    raytracer.show_albedo_scale = render_params.show_albedo_scale;
+    raytracer.show_globals      = render_params.show_globals;
+
+    // Pointers
+    raytracer.lightprims_size = render_params.lightprims_size;
+    raytracer.m_lightprims    = reinterpret_cast<unsigned int*>(
+        render_params.lightprims);
+    raytracer.m_mesh_surfacearea = reinterpret_cast<const float*>(
+        render_params.surfacearea);
+    raytracer.m_meshids = reinterpret_cast<const int*>(render_params.mesh_ids);
+    raytracer.m_shader_is_light = reinterpret_cast<int*>(
+        render_params.shader_is_light);
+
+    // Scene
+    raytracer.scene       = {};
+    raytracer.scene.verts = reinterpret_cast<const OSL::Vec3*>(
+        render_params.verts);
+    raytracer.scene.normals = reinterpret_cast<const OSL::Vec3*>(
+        render_params.normals);
+    raytracer.scene.uvs = reinterpret_cast<const OSL::Vec2*>(render_params.uvs);
+    raytracer.scene.triangles = reinterpret_cast<const TriangleIndices*>(
+        render_params.triangles);
+    raytracer.scene.uv_triangles = reinterpret_cast<const TriangleIndices*>(
+        render_params.uv_indices);
+    raytracer.scene.n_triangles = reinterpret_cast<const TriangleIndices*>(
+        render_params.normal_indices);
+    raytracer.scene.shaderids = reinterpret_cast<int*>(
+        render_params.shader_ids);
+    raytracer.scene.handle = render_params.traversal_handle;
+
+    // Camera
+    const Vec3 eye          = F3_TO_V3(render_params.eye);
+    const Vec3 dir          = F3_TO_V3(render_params.dir);
+    const Vec3 up           = F3_TO_V3(render_params.up);
+    const float fov         = render_params.fov;
+    const uint3 launch_dims = optixGetLaunchDimensions();
+    raytracer.camera.resolution(launch_dims.x, launch_dims.y);
+    raytracer.camera.lookat(eye, dir, up, fov);
+    raytracer.camera.finalize();
+}
+
+//------------------------------------------------------------------------------
+
+// Because clang++ 9.0 seems to have trouble with some of the texturing "intrinsics"
+// let's do the texture look-ups in this file.
+extern "C" __device__ float4
+osl_tex2DLookup(void* handle, float s, float t, float dsdx, float dtdx,
+                float dsdy, float dtdy)
+{
+    const float2 dx           = { dsdx, dtdx };
+    const float2 dy           = { dsdy, dtdy };
+    cudaTextureObject_t texID = cudaTextureObject_t(handle);
+    return tex2DGrad<float4>(texID, s, t, dx, dy);
+}
+
+
+//
+// OptiX Programs
+//
+
+
 extern "C" __global__ void
 __miss__()
 {
@@ -51,12 +229,32 @@ __miss__()
 extern "C" __global__ void
 __raygen__setglobals()
 {
+    uint3 launch_dims  = optixGetLaunchDimensions();
+    uint3 launch_index = optixGetLaunchIndex();
+
     // Set global variables
-    OSL::pvt::osl_printf_buffer_start = render_params.osl_printf_buffer_start;
-    OSL::pvt::osl_printf_buffer_end   = render_params.osl_printf_buffer_end;
-    OSL::pvt::s_color_system          = render_params.color_system;
-    OSL::pvt::test_str_1              = render_params.test_str_1;
-    OSL::pvt::test_str_2              = render_params.test_str_2;
+    if (launch_index.x == 0 && launch_index.y == 0) {
+        OSL::pvt::osl_printf_buffer_start
+            = render_params.osl_printf_buffer_start;
+        OSL::pvt::osl_printf_buffer_end = render_params.osl_printf_buffer_end;
+        OSL::pvt::s_color_system        = render_params.color_system;
+        OSL::pvt::test_str_1            = render_params.test_str_1;
+        OSL::pvt::test_str_2            = render_params.test_str_2;
+    }
+
+    if (render_params.bg_id < 0)
+        return;
+
+    SimpleRaytracer raytracer;
+    setupRaytracer(raytracer, /*bg_only=*/true);
+
+    auto evaler = [&](const Dual2<Vec3>& dir) {
+        return raytracer.eval_background(dir, nullptr);
+    };
+
+    // Background::prepare_cuda must run on a single warp
+    assert(launch_index.x < 32 && launch_index.y == 0);
+    raytracer.background.prepare_cuda(launch_dims.x, launch_index.x, evaler);
 }
 
 
@@ -67,37 +265,50 @@ __miss__setglobals()
 
 
 extern "C" __global__ void
-__raygen__()
+__closesthit__deferred()
 {
-    uint3 launch_dims  = optixGetLaunchDimensions();
-    uint3 launch_index = optixGetLaunchIndex();
-    const float3 eye   = render_params.eye;
-    const float3 dir   = render_params.dir;
-    const float3 cx    = render_params.cx;
-    const float3 cy    = render_params.cy;
-    const float invw   = render_params.invw;
-    const float invh   = render_params.invh;
-
-    // Compute the pixel coordinates
-    const float2 d = make_float2(static_cast<float>(launch_index.x) + 0.5f,
-                                 static_cast<float>(launch_index.y) + 0.5f);
-
-    // Make the ray for the current pixel
-    RayGeometry r;
-    r.origin    = eye;
-    r.direction = normalize(cx * (d.x * invw - 0.5f) + cy * (0.5f - d.y * invh)
-                            + dir);
-    optixTrace(render_params.traversal_handle, r.origin, r.direction, 1e-3f,
-               1e13f, 0, OptixVisibilityMask(1), OPTIX_RAY_FLAG_DISABLE_ANYHIT,
-               0, 1, 0);
+    const unsigned int hit_idx = optixGetPrimitiveIndex();
+    const float3 ray_direction = optixGetWorldRayDirection();
+    const float3 ray_origin    = optixGetWorldRayOrigin();
+    const float hit_t          = optixGetRayTmax();
+    const float2 barycentrics  = optixGetTriangleBarycentrics();
+    const float b1             = barycentrics.x;
+    const float b2             = barycentrics.y;
+
+    Payload payload;
+    payload.hit_t  = hit_t;
+    payload.hit_u  = b1;
+    payload.hit_v  = b2;
+    payload.hit_id = hit_idx;
+    payload.set();
 }
 
 
-// Because clang++ 9.0 seems to have trouble with some of the texturing "intrinsics"
-// let's do the texture look-ups in this file.
-extern "C" __device__ float4
-osl_tex2DLookup(void* handle, float s, float t)
+extern "C" __global__ void
+__raygen__deferred()
 {
-    cudaTextureObject_t texID = cudaTextureObject_t(handle);
-    return tex2D<float4>(texID, s, t);
+    SimpleRaytracer raytracer;
+    setupRaytracer(raytracer, /*bg_only=*/false);
+
+    const uint3 launch_index = optixGetLaunchIndex();
+    Color3 result = raytracer.antialias_pixel(launch_index.x, launch_index.y,
+                                              nullptr);
+
+    // Write the output
+    {
+        uint3 launch_dims     = optixGetLaunchDimensions();
+        uint3 launch_index    = optixGetLaunchIndex();
+        float3* output_buffer = reinterpret_cast<float3*>(
+            render_params.output_buffer);
+        int pixel            = launch_index.y * launch_dims.x + launch_index.x;
+        output_buffer[pixel] = C3_TO_F3(result);
+    }
 }
+
+//------------------------------------------------------------------------------
+
+// We need to pull in the definition of SimpleRaytracer::subpixel_radiance(),
+// which is shared between the host and CUDA renderers.
+#include "../simpleraytracer.cpp"
+
+//------------------------------------------------------------------------------
diff --git a/src/testrender/cuda/optix_raytracer.h b/src/testrender/cuda/optix_raytracer.h
new file mode 100644
index 000000000..34a8eef90
--- /dev/null
+++ b/src/testrender/cuda/optix_raytracer.h
@@ -0,0 +1,74 @@
+#pragma once
+
+#include <OSL/oslconfig.h>
+#include <optix.h>
+
+#include "../background.h"
+#include "../raytracer.h"
+#include "../sampling.h"
+#include "rend_lib.h"
+
+#include <cstdint>
+
+#ifdef __CUDACC__
+
+struct Payload {
+    uint32_t hit_id;
+    float hit_t;
+    float hit_u;
+    float hit_v;
+
+    __forceinline__ __device__ void set()
+    {
+        optixSetPayload_0(hit_id);
+        optixSetPayload_1(__float_as_uint(hit_t));
+        optixSetPayload_2(__float_as_uint(hit_u));
+        optixSetPayload_3(__float_as_uint(hit_v));
+    }
+
+    __forceinline__ __device__ void get()
+    {
+        hit_id = static_cast<int32_t>(optixGetPayload_0());
+        hit_t  = __uint_as_float(optixGetPayload_1());
+        hit_u  = __uint_as_float(optixGetPayload_2());
+        hit_v  = __uint_as_float(optixGetPayload_3());
+    }
+};
+
+OSL_NAMESPACE_ENTER
+
+struct SimpleRaytracer {
+    using ShadingContext = ShadingContextCUDA;
+
+    Background background;
+    Camera camera;
+    Scene scene;
+    int aa                          = 1;
+    bool no_jitter                  = false;
+    int backgroundResolution        = 1024;
+    int backgroundShaderID          = -1;
+    int max_bounces                 = 1000000;
+    int rr_depth                    = 5;
+    float show_albedo_scale         = 0.0f;
+    int show_globals                = 0;
+    const int* m_shader_is_light    = nullptr;
+    const unsigned* m_lightprims    = nullptr;
+    size_t lightprims_size          = 0;
+    const int* m_shaderids          = nullptr;
+    const int* m_meshids            = nullptr;
+    const float* m_mesh_surfacearea = nullptr;
+
+    OSL_HOSTDEVICE void globals_from_hit(OSL_CUDA::ShaderGlobals& sg,
+                                         const Ray& r, const Dual2<float>& t,
+                                         int id, float u, float v);
+    OSL_HOSTDEVICE Vec3 eval_background(const Dual2<Vec3>& dir,
+                                        ShadingContext* ctx, int bounce = -1);
+    OSL_HOSTDEVICE Color3 subpixel_radiance(float x, float y, Sampler& sampler,
+                                            ShadingContext* ctx = nullptr);
+    OSL_HOSTDEVICE Color3 antialias_pixel(int x, int y,
+                                          ShadingContext* ctx = nullptr);
+};
+
+OSL_NAMESPACE_EXIT
+
+#endif  // #ifdef __CUDACC__
diff --git a/src/testrender/cuda/rend_lib.cu b/src/testrender/cuda/rend_lib.cu
index 72ee029f3..fb51605e9 100644
--- a/src/testrender/cuda/rend_lib.cu
+++ b/src/testrender/cuda/rend_lib.cu
@@ -58,7 +58,7 @@ closure_component_allot(void* pool, int id, size_t prim_size,
     ((OSL::ClosureComponent*)pool)->id = id;
     ((OSL::ClosureComponent*)pool)->w  = w;
 
-    size_t needed = (sizeof(OSL::ClosureComponent) - sizeof(void*) + prim_size
+    size_t needed = (sizeof(OSL::ClosureComponent) + prim_size
                      + (alignof(OSL::ClosureComponent) - 1))
                     & ~(alignof(OSL::ClosureComponent) - 1);
     char* char_ptr = (char*)pool;
@@ -120,7 +120,7 @@ closure_add_allot(void* pool, OSL::ClosureColor* a, OSL::ClosureColor* b)
 __device__ void*
 osl_allocate_closure_component(void* sg_, int id, int size)
 {
-    ShaderGlobals* sg_ptr = (ShaderGlobals*)sg_;
+    OSL_CUDA::ShaderGlobals* sg_ptr = (OSL_CUDA::ShaderGlobals*)sg_;
 
     OSL::Color3 w = OSL::Color3(1, 1, 1);
     // Fix up the alignment
@@ -141,7 +141,7 @@ __device__ void*
 osl_allocate_weighted_closure_component(void* sg_, int id, int size,
                                         const void* w)
 {
-    ShaderGlobals* sg_ptr = (ShaderGlobals*)sg_;
+    OSL_CUDA::ShaderGlobals* sg_ptr = (OSL_CUDA::ShaderGlobals*)sg_;
 
     const OSL::Color3* wc
         = (const OSL::Color3*)__builtin_assume_aligned(w, alignof(float));
@@ -166,7 +166,7 @@ osl_allocate_weighted_closure_component(void* sg_, int id, int size,
 __device__ void*
 osl_mul_closure_color(void* sg_, void* a, const void* w)
 {
-    ShaderGlobals* sg_ptr = (ShaderGlobals*)sg_;
+    OSL_CUDA::ShaderGlobals* sg_ptr = (OSL_CUDA::ShaderGlobals*)sg_;
     const OSL::Color3* wc
         = (const OSL::Color3*)__builtin_assume_aligned(w, alignof(float));
 
@@ -198,7 +198,7 @@ osl_mul_closure_float(void* sg_, void* a, float w)
 {
     a = __builtin_assume_aligned(a, alignof(float));
 
-    ShaderGlobals* sg_ptr = (ShaderGlobals*)sg_;
+    OSL_CUDA::ShaderGlobals* sg_ptr = (OSL_CUDA::ShaderGlobals*)sg_;
 
     if (a == NULL || w == 0.0f) {
         return NULL;
@@ -226,7 +226,7 @@ osl_add_closure_closure(void* sg_, void* a, void* b)
     a = __builtin_assume_aligned(a, alignof(float));
     b = __builtin_assume_aligned(b, alignof(float));
 
-    ShaderGlobals* sg_ptr = (ShaderGlobals*)sg_;
+    OSL_CUDA::ShaderGlobals* sg_ptr = (OSL_CUDA::ShaderGlobals*)sg_;
 
     if (a == NULL) {
         return b;
@@ -383,9 +383,9 @@ osl_range_check_err(int indexvalue, int length, OSL::ustringhash_pod symname,
 __device__ int
 osl_get_matrix(void* sg_, void* r, OSL::ustringhash_pod from_)
 {
-    r                     = __builtin_assume_aligned(r, alignof(float));
-    OSL::ustringhash from = OSL::ustringhash_from(from_);
-    ShaderGlobals* sg     = (ShaderGlobals*)sg_;
+    r                           = __builtin_assume_aligned(r, alignof(float));
+    OSL::ustringhash from       = OSL::ustringhash_from(from_);
+    OSL_CUDA::ShaderGlobals* sg = (OSL_CUDA::ShaderGlobals*)sg_;
     if (from == OSL::Hashes::common) {
         MAT(r).makeIdentity();
         return true;
@@ -428,9 +428,9 @@ osl_get_matrix(void* sg_, void* r, OSL::ustringhash_pod from_)
 __device__ int
 osl_get_inverse_matrix(void* sg_, void* r, OSL::ustringhash_pod to_)
 {
-    r                   = __builtin_assume_aligned(r, alignof(float));
-    OSL::ustringhash to = OSL::ustringhash_from(to_);
-    ShaderGlobals* sg   = (ShaderGlobals*)sg_;
+    r                           = __builtin_assume_aligned(r, alignof(float));
+    OSL::ustringhash to         = OSL::ustringhash_from(to_);
+    OSL_CUDA::ShaderGlobals* sg = (OSL_CUDA::ShaderGlobals*)sg_;
     if (to == OSL::Hashes::common) {
         MAT(r).makeIdentity();
         return true;
diff --git a/src/testrender/cuda/rend_lib.h b/src/testrender/cuda/rend_lib.h
index 1e0f9bcc8..d6aea2750 100644
--- a/src/testrender/cuda/rend_lib.h
+++ b/src/testrender/cuda/rend_lib.h
@@ -6,11 +6,11 @@
 
 #include <OSL/oslconfig.h>
 
-#include <Imath/half.h>
-
 #include <OSL/hashes.h>
 #include <OSL/oslexec.h>
 
+#include "../raytracer.h"
+
 
 OSL_NAMESPACE_ENTER
 
@@ -45,19 +45,20 @@ namespace {  // anonymous namespace
 struct ShadingContextCUDA {};
 
 
+namespace OSL_CUDA {
 struct ShaderGlobals {
-    float3 P, dPdx, dPdy;
-    float3 dPdz;
-    float3 I, dIdx, dIdy;
-    float3 N;
-    float3 Ng;
+    OSL::Vec3 P, dPdx, dPdy;
+    OSL::Vec3 dPdz;
+    OSL::Vec3 I, dIdx, dIdy;
+    OSL::Vec3 N;
+    OSL::Vec3 Ng;
     float u, dudx, dudy;
     float v, dvdx, dvdy;
-    float3 dPdu, dPdv;
+    OSL::Vec3 dPdu, dPdv;
     float time;
     float dtime;
-    float3 dPdtime;
-    float3 Ps, dPsdx, dPsdy;
+    OSL::Vec3 dPdtime;
+    OSL::Vec3 Ps, dPsdx, dPsdy;
     void* renderstate;
     void* tracedata;
     void* objdata;
@@ -73,88 +74,7 @@ struct ShaderGlobals {
     int raytype;
     int flipHandedness;
     int backfacing;
-    int shaderID;
-};
-
-
-enum RayType {
-    CAMERA       = 1,
-    SHADOW       = 2,
-    REFLECTION   = 4,
-    REFRACTION   = 8,
-    DIFFUSE      = 16,
-    GLOSSY       = 32,
-    SUBSURFACE   = 64,
-    DISPLACEMENT = 128
 };
-
-
-// Closures supported by the OSL sample renderer.  This list is mostly aspirational.
-enum ClosureIDs {
-    EMISSION_ID = 1,
-    BACKGROUND_ID,
-    DIFFUSE_ID,
-    OREN_NAYAR_ID,
-    TRANSLUCENT_ID,
-    PHONG_ID,
-    WARD_ID,
-    MICROFACET_ID,
-    REFLECTION_ID,
-    FRESNEL_REFLECTION_ID,
-    REFRACTION_ID,
-    TRANSPARENT_ID,
-    DEBUG_ID,
-    HOLDOUT_ID,
-};
-
-
-// ========================================
-//
-// Some helper vector functions
-//
-static __forceinline__ __device__ float3
-operator*(const float a, const float3& b)
-{
-    return make_float3(a * b.x, a * b.y, a * b.z);
-}
-
-static __forceinline__ __device__ float3
-operator*(const float3& a, const float b)
-{
-    return make_float3(a.x * b, a.y * b, a.z * b);
-}
-
-static __forceinline__ __device__ float3
-operator+(const float3& a, const float3& b)
-{
-    return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
-}
-
-static __forceinline__ __device__ float3
-operator-(const float3& a, const float3& b)
-{
-    return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
-}
-
-static __forceinline__ __device__ float3
-operator-(const float3& a)
-{
-    return make_float3(-a.x, -a.y, -a.z);
-}
-
-static __forceinline__ __device__ float
-dot(const float3& a, const float3& b)
-{
-    return a.x * b.x + a.y * b.y + a.z * b.z;
-}
-
-static __forceinline__ __device__ float3
-normalize(const float3& v)
-{
-    float invLen = 1.0f / sqrtf(dot(v, v));
-    return invLen * v;
-}
-//
-// ========================================
+}  // namespace OSL_CUDA
 
 }  // anonymous namespace
diff --git a/src/testrender/cuda/wrapper.cu b/src/testrender/cuda/wrapper.cu
deleted file mode 100644
index a2501ff97..000000000
--- a/src/testrender/cuda/wrapper.cu
+++ /dev/null
@@ -1,246 +0,0 @@
-// Copyright Contributors to the Open Shading Language project.
-// SPDX-License-Identifier: BSD-3-Clause
-// https://github.com/AcademySoftwareFoundation/OpenShadingLanguage
-
-#include <OSL/hashes.h>
-#include <optix.h>
-
-#include <cuda_runtime.h>
-#include <optix.h>
-
-#include <OSL/oslclosure.h>
-
-#include "rend_lib.h"
-#include "util.h"
-
-#include "../render_params.h"
-
-
-extern "C" {
-__device__ __constant__ RenderParams render_params;
-}
-
-
-extern "C" __global__ void
-__anyhit__any_hit_shadow()
-{
-    optixTerminateRay();
-}
-
-
-
-static __device__ void
-globals_from_hit(ShaderGlobals& sg)
-{
-    // Setup the ShaderGlobals
-    const int primID           = optixGetPrimitiveIndex();
-    const float3 ray_direction = optixGetWorldRayDirection();
-    const float3 ray_origin    = optixGetWorldRayOrigin();
-    const float t_hit          = optixGetRayTmax();
-    const int shader_id        = reinterpret_cast<int*>(
-        render_params.shader_ids)[primID];
-
-    const OSL::Vec3* verts = reinterpret_cast<const OSL::Vec3*>(
-        render_params.verts);
-    const OSL::Vec3* normals = reinterpret_cast<const OSL::Vec3*>(
-        render_params.normals);
-    const OSL::Vec2* uvs = reinterpret_cast<const OSL::Vec2*>(
-        render_params.uvs);
-    const int3* triangles = reinterpret_cast<const int3*>(
-        render_params.triangles);
-    const int3* n_triangles = reinterpret_cast<const int3*>(
-        render_params.normal_indices);
-    const int3* uv_triangles = reinterpret_cast<const int3*>(
-        render_params.uv_indices);
-    const int* mesh_ids = reinterpret_cast<const int*>(render_params.mesh_ids);
-    const float* surfacearea = reinterpret_cast<const float*>(
-        render_params.surfacearea);
-
-    // Calculate UV and its derivatives
-    const float2 barycentrics = optixGetTriangleBarycentrics();
-    const float b1            = barycentrics.x;
-    const float b2            = barycentrics.y;
-    const float b0            = 1.0f - (b1 + b2);
-
-    const OSL::Vec2 ta = uvs[uv_triangles[primID].x];
-    const OSL::Vec2 tb = uvs[uv_triangles[primID].y];
-    const OSL::Vec2 tc = uvs[uv_triangles[primID].z];
-    const OSL::Vec2 uv = b0 * ta + b1 * tb + b2 * tc;
-    const float u      = uv.x;
-    const float v      = uv.y;
-
-    const OSL::Vec3 va = verts[triangles[primID].x];
-    const OSL::Vec3 vb = verts[triangles[primID].y];
-    const OSL::Vec3 vc = verts[triangles[primID].z];
-
-    const OSL::Vec2 dt02 = ta - tc, dt12 = tb - tc;
-    const OSL::Vec3 dp02 = va - vc, dp12 = vb - vc;
-
-    OSL::Vec3 dPdu, dPdv;
-    const float det = dt02.x * dt12.y - dt02.y * dt12.x;
-    if (det != 0.0f) {
-        float invdet = 1.0f / det;
-        dPdu         = (dt12.y * dp02 - dt02.y * dp12) * invdet;
-        dPdv         = (-dt12.x * dp02 + dt02.x * dp12) * invdet;
-    }
-
-    // Calculate the normals
-    const OSL::Vec3 Ng = (va - vb).cross(va - vc).normalize();
-    OSL::Vec3 N;
-    if (n_triangles[primID].x < 0.0f) {
-        N = Ng;
-    } else {
-        const OSL::Vec3 na = normals[n_triangles[primID].x];
-        const OSL::Vec3 nb = normals[n_triangles[primID].y];
-        const OSL::Vec3 nc = normals[n_triangles[primID].z];
-        N                  = ((1 - u - v) * na + u * nb + v * nc).normalize();
-    }
-
-    sg.I = ray_direction;
-    sg.N = normalize(
-        optixTransformNormalFromObjectToWorldSpace(*(float3*)(&N)));
-    sg.Ng = normalize(
-        optixTransformNormalFromObjectToWorldSpace(*(float3*)(&Ng)));
-    sg.P           = ray_origin + t_hit * ray_direction;
-    sg.dPdu        = *(float3*)(&dPdu);
-    sg.dPdv        = *(float3*)(&dPdv);
-    sg.u           = u;
-    sg.v           = v;
-    sg.Ci          = NULL;
-    sg.surfacearea = surfacearea[mesh_ids[primID]];
-    sg.backfacing  = dot(sg.N, sg.I) > 0.0f;
-    sg.shaderID    = shader_id;
-
-    if (sg.backfacing) {
-        sg.N  = -sg.N;
-        sg.Ng = -sg.Ng;
-    }
-
-    // NB: These variables are not used in the current iteration of the sample
-    sg.raytype        = CAMERA;
-    sg.flipHandedness = 0;
-}
-
-
-
-static __device__ float3
-process_closure(const OSL::ClosureColor* closure_tree)
-{
-    OSL::Color3 result = OSL::Color3(0.0f);
-
-    if (!closure_tree) {
-        return make_float3(result.x, result.y, result.z);
-    }
-
-    // The depth of the closure tree must not exceed the stack size.
-    // A stack size of 8 is probably quite generous for relatively
-    // balanced trees.
-    const int STACK_SIZE = 8;
-
-    // Non-recursive traversal stack
-    int stack_idx = 0;
-    const OSL::ClosureColor* ptr_stack[STACK_SIZE];
-    OSL::Color3 weight_stack[STACK_SIZE];
-
-    // Shading accumulator
-    OSL::Color3 weight = OSL::Color3(1.0f);
-
-    const void* cur = closure_tree;
-    while (cur) {
-        switch (((OSL::ClosureColor*)cur)->id) {
-        case OSL::ClosureColor::ADD: {
-            ptr_stack[stack_idx]      = ((OSL::ClosureAdd*)cur)->closureB;
-            weight_stack[stack_idx++] = weight;
-            cur                       = ((OSL::ClosureAdd*)cur)->closureA;
-            break;
-        }
-
-        case OSL::ClosureColor::MUL: {
-            weight *= ((OSL::ClosureMul*)cur)->weight;
-            cur = ((OSL::ClosureMul*)cur)->closure;
-            break;
-        }
-
-        case EMISSION_ID: {
-            cur = NULL;
-            break;
-        }
-
-        case DIFFUSE_ID:
-        case OREN_NAYAR_ID:
-        case PHONG_ID:
-        case WARD_ID:
-        case REFLECTION_ID:
-        case REFRACTION_ID:
-        case FRESNEL_REFLECTION_ID: {
-            result += ((OSL::ClosureComponent*)cur)->w * weight;
-            cur = NULL;
-            break;
-        }
-
-        case MICROFACET_ID: {
-            const char* mem = (const char*)((OSL::ClosureComponent*)cur)->data();
-            OSL::ustringhash dist_uh = *(OSL::ustringhash*)&mem[0];
-
-            if (dist_uh == OSL::Hashes::default_)
-                return make_float3(0.0f, 1.0f, 1.0f);
-            else
-                return make_float3(1.0f, 0.0f, 1.0f);
-
-            break;
-        }
-
-        default: cur = NULL; break;
-        }
-
-        if (cur == NULL && stack_idx > 0) {
-            cur    = ptr_stack[--stack_idx];
-            weight = weight_stack[stack_idx];
-        }
-    }
-
-    return make_float3(result.x, result.y, result.z);
-}
-
-
-
-extern "C" __global__ void
-__closesthit__closest_hit_osl()
-{
-    // TODO: Fixed-sized allocations can easily be exceeded by arbitrary shader
-    //       networks, so there should be (at least) some mechanism to issue a
-    //       warning or error if the closure or param storage can possibly be
-    //       exceeded.
-    alignas(8) char closure_pool[256];
-
-    ShaderGlobals sg;
-    globals_from_hit(sg);
-
-    // Pack the "closure pool" into one of the ShaderGlobals pointers
-    *(int*)&closure_pool[0] = 0;
-    sg.renderstate          = &closure_pool[0];
-
-    // Pack the pointers to the options structs in a faux "context",
-    // which is a rough stand-in for the host ShadingContext.
-    ShadingContextCUDA shading_context;
-
-    sg.context = &shading_context;
-
-    // Run the OSL callable
-    void* interactive_ptr = reinterpret_cast<void**>(
-        render_params.interactive_params)[sg.shaderID];
-    const unsigned int shaderIdx = sg.shaderID + 0u;
-    optixDirectCall<void, ShaderGlobals*, void*, void*, void*, int, void*>(
-        shaderIdx, &sg /*shaderglobals_ptr*/, nullptr /*groupdata_ptr*/,
-        nullptr /*userdata_base_ptr*/, nullptr /*output_base_ptr*/,
-        0 /*shadeindex - unused*/, interactive_ptr /*interactive_params_ptr*/);
-
-    float3 result      = process_closure((OSL::ClosureColor*)sg.Ci);
-    uint3 launch_dims  = optixGetLaunchDimensions();
-    uint3 launch_index = optixGetLaunchIndex();
-
-    float3* output_buffer = reinterpret_cast<float3*>(
-        render_params.output_buffer);
-    int pixel            = launch_index.y * launch_dims.x + launch_index.x;
-    output_buffer[pixel] = make_float3(result.x, result.y, result.z);
-}
diff --git a/src/testrender/cuda/wrapper.h b/src/testrender/cuda/wrapper.h
deleted file mode 100644
index c939f6064..000000000
--- a/src/testrender/cuda/wrapper.h
+++ /dev/null
@@ -1,4 +0,0 @@
-#pragma once
-
-#define RAYTRACER_HIT_QUAD   0
-#define RAYTRACER_HIT_SPHERE 1
diff --git a/src/testrender/optics.h b/src/testrender/optics.h
index 5a709a065..a266eef2e 100644
--- a/src/testrender/optics.h
+++ b/src/testrender/optics.h
@@ -9,7 +9,7 @@
 
 OSL_NAMESPACE_ENTER
 
-inline float
+static inline OSL_HOSTDEVICE float
 fresnel_dielectric(float cosi, float eta)
 {
     // special case: ignore fresnel
@@ -30,7 +30,7 @@ fresnel_dielectric(float cosi, float eta)
     return 1.0f;  // TIR (no refracted component)
 }
 
-inline float
+static inline OSL_HOSTDEVICE float
 fresnel_refraction(const Vec3& I, const Vec3& N, float eta, Vec3& T)
 {
     // compute refracted direction and fresnel term
@@ -63,7 +63,7 @@ fresnel_refraction(const Vec3& I, const Vec3& N, float eta, Vec3& T)
     return 0;
 }
 
-Color3
+static inline OSL_HOSTDEVICE Color3
 fresnel_conductor(float cos_theta, Color3 n, Color3 k)
 {
     cos_theta       = OIIO::clamp(cos_theta, 0.0f, 1.0f);
@@ -89,7 +89,7 @@ fresnel_conductor(float cos_theta, Color3 n, Color3 k)
     return 0.5f * (rp + rs);
 }
 
-inline float
+static inline OSL_HOSTDEVICE float
 fresnel_schlick(float cos_theta, float F0, float F90)
 {
     float x  = OIIO::clamp(1.0f - cos_theta, 0.0f, 1.0f);
@@ -99,7 +99,7 @@ fresnel_schlick(float cos_theta, float F0, float F90)
     return OIIO::lerp(F0, F90, x5);
 }
 
-inline Color3
+static inline OSL_HOSTDEVICE Color3
 fresnel_generalized_schlick(float cos_theta, Color3 F0, Color3 F90,
                             float exponent)
 {
diff --git a/src/testrender/optixraytracer.cpp b/src/testrender/optixraytracer.cpp
index 4310bd05c..ab5aff9d1 100644
--- a/src/testrender/optixraytracer.cpp
+++ b/src/testrender/optixraytracer.cpp
@@ -14,6 +14,7 @@
 #include "render_params.h"
 
 #include <cuda.h>
+#include <cuda_runtime.h>
 #include <optix_function_table_definition.h>
 #include <optix_stack_size.h>
 #include <optix_stubs.h>
@@ -82,6 +83,12 @@ OSL_NAMESPACE_ENTER
         }                                                                  \
     }
 
+
+#define DEVICE_ALLOC(size) reinterpret_cast<CUdeviceptr>(device_alloc(size))
+#define COPY_TO_DEVICE(dst_device, src_host, size) \
+    copy_to_device(reinterpret_cast<void*>(dst_device), src_host, size)
+
+
 static void
 context_log_cb(unsigned int level, const char* tag, const char* message,
                void* /*cbdata */)
@@ -115,8 +122,10 @@ OptixRaytracer::~OptixRaytracer()
 {
     if (m_optix_ctx)
         OPTIX_CHECK(optixDeviceContextDestroy(m_optix_ctx));
-    for (CUdeviceptr ptr : device_ptrs)
+    for (CUdeviceptr ptr : m_ptrs_to_free)
         cudaFree(reinterpret_cast<void*>(ptr));
+    for (cudaArray_t arr : m_arrays_to_free)
+        cudaFreeArray(arr);
 }
 
 
@@ -130,6 +139,7 @@ OptixRaytracer::device_alloc(size_t size)
         errhandler().errorfmt("cudaMalloc({}) failed with error: {}\n", size,
                               cudaGetErrorString(res));
     }
+    m_ptrs_to_free.push_back(reinterpret_cast<CUdeviceptr>(ptr));
     return ptr;
 }
 
@@ -234,18 +244,13 @@ OptixRaytracer::synch_attributes()
         const size_t podDataSize = cpuDataSize
                                    - sizeof(ustringhash) * numStrings;
 
-        CUDA_CHECK(
-            cudaMalloc(reinterpret_cast<void**>(&d_color_system),
-                       podDataSize + sizeof(ustringhash_pod) * numStrings));
-        CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_color_system), colorSys,
-                              podDataSize, cudaMemcpyHostToDevice));
-        device_ptrs.push_back(d_color_system);
+        d_color_system = DEVICE_ALLOC(podDataSize
+                                      + sizeof(ustringhash_pod) * numStrings);
+        COPY_TO_DEVICE(d_color_system, colorSys, podDataSize);
 
-        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_osl_printf_buffer),
-                              OSL_PRINTF_BUFFER_SIZE));
+        d_osl_printf_buffer = DEVICE_ALLOC(OSL_PRINTF_BUFFER_SIZE);
         CUDA_CHECK(cudaMemset(reinterpret_cast<void*>(d_osl_printf_buffer), 0,
                               OSL_PRINTF_BUFFER_SIZE));
-        device_ptrs.push_back(d_osl_printf_buffer);
 
         // then copy the device string to the end, first strings starting at dataPtr - (numStrings)
         // FIXME -- Should probably handle alignment better.
@@ -257,8 +262,7 @@ OptixRaytracer::synch_attributes()
         for (const ustringhash* end = cpuStringHash + numStrings;
              cpuStringHash < end; ++cpuStringHash) {
             ustringhash_pod devStr = cpuStringHash->hash();
-            CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(gpuStrings), &devStr,
-                                  sizeof(devStr), cudaMemcpyHostToDevice));
+            COPY_TO_DEVICE(gpuStrings, &devStr, sizeof(devStr));
             gpuStrings += sizeof(ustringhash_pod);
         }
     }
@@ -337,8 +341,8 @@ OptixRaytracer::create_modules()
     m_pipeline_compile_options.traversableGraphFlags
         = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY;
     m_pipeline_compile_options.usesMotionBlur     = false;
-    m_pipeline_compile_options.numPayloadValues   = 0;
-    m_pipeline_compile_options.numAttributeValues = 2;
+    m_pipeline_compile_options.numPayloadValues   = 4;
+    m_pipeline_compile_options.numAttributeValues = 3;
     m_pipeline_compile_options.exceptionFlags
         = OPTIX_EXCEPTION_FLAG_STACK_OVERFLOW;
     m_pipeline_compile_options.pipelineLaunchParamsVariableName
@@ -360,8 +364,6 @@ OptixRaytracer::create_modules()
 
     load_optix_module("optix_raytracer.ptx", &m_module_compile_options,
                       &m_pipeline_compile_options, &m_program_module);
-    load_optix_module("wrapper.ptx", &m_module_compile_options,
-                      &m_pipeline_compile_options, &m_wrapper_module);
     load_optix_module("rend_lib_testrender.ptx", &m_module_compile_options,
                       &m_pipeline_compile_options, &m_rend_lib_module);
 
@@ -402,7 +404,7 @@ OptixRaytracer::create_programs()
     OptixProgramGroupDesc raygen_desc    = {};
     raygen_desc.kind                     = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
     raygen_desc.raygen.module            = m_program_module;
-    raygen_desc.raygen.entryFunctionName = "__raygen__";
+    raygen_desc.raygen.entryFunctionName = "__raygen__deferred";
     create_optix_pg(&raygen_desc, 1, &m_program_options, &m_raygen_group);
 
     // Set Globals Raygen group
@@ -439,9 +441,8 @@ OptixRaytracer::create_programs()
     // Hitgroup -- triangles
     OptixProgramGroupDesc tri_hitgroup_desc = {};
     tri_hitgroup_desc.kind                  = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
-    tri_hitgroup_desc.hitgroup.moduleCH     = m_wrapper_module;
-    tri_hitgroup_desc.hitgroup.entryFunctionNameCH
-        = "__closesthit__closest_hit_osl";
+    tri_hitgroup_desc.hitgroup.moduleCH     = m_program_module;
+    tri_hitgroup_desc.hitgroup.entryFunctionNameCH = "__closesthit__deferred";
     create_optix_pg(&tri_hitgroup_desc, 1, &m_program_options,
                     &m_closesthit_group);
 
@@ -525,7 +526,7 @@ OptixRaytracer::create_shaders()
 
         // Create Programs from the init and group_entry functions,
         // and set the OSL functions as Callable Programs so that they
-        // can be executed by the closest hit program in the wrapper
+        // can be executed by the closest hit program.
         sizeof_msg_log = sizeof(msg_log);
         OPTIX_CHECK_MSG(optixModuleCreateFn(m_optix_ctx,
                                             &m_module_compile_options,
@@ -556,13 +557,10 @@ OptixRaytracer::create_shaders()
     }
 
     // Upload per-material interactive buffer table
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_interactive_params),
-                          sizeof(void*) * material_interactive_params.size()));
-    CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_interactive_params),
-                          material_interactive_params.data(),
-                          sizeof(void*) * material_interactive_params.size(),
-                          cudaMemcpyHostToDevice));
-    device_ptrs.push_back(d_interactive_params);
+    d_interactive_params = DEVICE_ALLOC(sizeof(void*)
+                                        * material_interactive_params.size());
+    COPY_TO_DEVICE(d_interactive_params, material_interactive_params.data(),
+                   sizeof(void*) * material_interactive_params.size());
 }
 
 
@@ -650,12 +648,8 @@ OptixRaytracer::create_sbt()
         GenericRecord raygen_record;
         CUdeviceptr d_raygen_record;
         OPTIX_CHECK(optixSbtRecordPackHeader(m_raygen_group, &raygen_record));
-        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_raygen_record),
-                              sizeof(GenericRecord)));
-        CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_raygen_record),
-                              &raygen_record, sizeof(GenericRecord),
-                              cudaMemcpyHostToDevice));
-        device_ptrs.push_back(d_raygen_record);
+        d_raygen_record = DEVICE_ALLOC(sizeof(GenericRecord));
+        COPY_TO_DEVICE(d_raygen_record, &raygen_record, sizeof(GenericRecord));
 
         m_optix_sbt.raygenRecord = d_raygen_record;
     }
@@ -665,12 +659,8 @@ OptixRaytracer::create_sbt()
         GenericRecord miss_record;
         CUdeviceptr d_miss_record;
         OPTIX_CHECK(optixSbtRecordPackHeader(m_miss_group, &miss_record));
-        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_miss_record),
-                              sizeof(GenericRecord)));
-        CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_miss_record),
-                              &miss_record, sizeof(GenericRecord),
-                              cudaMemcpyHostToDevice));
-        device_ptrs.push_back(d_miss_record);
+        d_miss_record = DEVICE_ALLOC(sizeof(GenericRecord));
+        COPY_TO_DEVICE(d_miss_record, &miss_record, sizeof(GenericRecord));
 
         m_optix_sbt.missRecordBase          = d_miss_record;
         m_optix_sbt.missRecordStrideInBytes = sizeof(GenericRecord);
@@ -685,13 +675,9 @@ OptixRaytracer::create_sbt()
         OPTIX_CHECK(
             optixSbtRecordPackHeader(m_closesthit_group, &hitgroup_records[0]));
 
-        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_hitgroup_records),
-                              nhitgroups * sizeof(GenericRecord)));
-        CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_hitgroup_records),
-                              &hitgroup_records[0],
-                              nhitgroups * sizeof(GenericRecord),
-                              cudaMemcpyHostToDevice));
-        device_ptrs.push_back(d_hitgroup_records);
+        d_hitgroup_records = DEVICE_ALLOC(nhitgroups * sizeof(GenericRecord));
+        COPY_TO_DEVICE(d_hitgroup_records, &hitgroup_records[0],
+                       nhitgroups * sizeof(GenericRecord));
 
         m_optix_sbt.hitgroupRecordBase          = d_hitgroup_records;
         m_optix_sbt.hitgroupRecordStrideInBytes = sizeof(GenericRecord);
@@ -709,17 +695,18 @@ OptixRaytracer::create_sbt()
                                                  &callable_records[idx]));
         }
 
-        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_callable_records),
-                              (nshaders) * sizeof(GenericRecord)));
-        CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_callable_records),
-                              callable_records.data(),
-                              (nshaders) * sizeof(GenericRecord),
-                              cudaMemcpyHostToDevice));
-        device_ptrs.push_back(d_callable_records);
+        d_callable_records = DEVICE_ALLOC((nshaders) * sizeof(GenericRecord));
+        COPY_TO_DEVICE(d_callable_records, callable_records.data(),
+                       (nshaders) * sizeof(GenericRecord));
 
         m_optix_sbt.callablesRecordBase          = d_callable_records;
         m_optix_sbt.callablesRecordStrideInBytes = sizeof(GenericRecord);
         m_optix_sbt.callablesRecordCount         = nshaders;
+
+        m_setglobals_optix_sbt.callablesRecordBase = d_callable_records;
+        m_setglobals_optix_sbt.callablesRecordStrideInBytes = sizeof(
+            GenericRecord);
+        m_setglobals_optix_sbt.callablesRecordCount = nshaders;
     }
 
     // SetGlobals raygen
@@ -728,13 +715,9 @@ OptixRaytracer::create_sbt()
         CUdeviceptr d_setglobals_raygen_record;
         OPTIX_CHECK(
             optixSbtRecordPackHeader(m_setglobals_raygen_group, &record));
-        CUDA_CHECK(
-            cudaMalloc(reinterpret_cast<void**>(&d_setglobals_raygen_record),
-                       sizeof(GenericRecord)));
-        CUDA_CHECK(
-            cudaMemcpy(reinterpret_cast<void*>(d_setglobals_raygen_record),
-                       &record, sizeof(GenericRecord), cudaMemcpyHostToDevice));
-        device_ptrs.push_back(d_setglobals_raygen_record);
+        d_setglobals_raygen_record = DEVICE_ALLOC(sizeof(GenericRecord));
+        COPY_TO_DEVICE(d_setglobals_raygen_record, &record,
+                       sizeof(GenericRecord));
 
         m_setglobals_optix_sbt.raygenRecord = d_setglobals_raygen_record;
     }
@@ -744,13 +727,9 @@ OptixRaytracer::create_sbt()
         GenericRecord record;
         CUdeviceptr d_setglobals_miss_record;
         OPTIX_CHECK(optixSbtRecordPackHeader(m_setglobals_miss_group, &record));
-        CUDA_CHECK(
-            cudaMalloc(reinterpret_cast<void**>(&d_setglobals_miss_record),
-                       sizeof(GenericRecord)));
-        CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_setglobals_miss_record),
-                              &record, sizeof(GenericRecord),
-                              cudaMemcpyHostToDevice));
-        device_ptrs.push_back(d_setglobals_miss_record);
+        d_setglobals_miss_record = DEVICE_ALLOC(sizeof(GenericRecord));
+        COPY_TO_DEVICE(d_setglobals_miss_record, &record,
+                       sizeof(GenericRecord));
 
         m_setglobals_optix_sbt.missRecordBase = d_setglobals_miss_record;
         m_setglobals_optix_sbt.missRecordStrideInBytes = sizeof(GenericRecord);
@@ -772,7 +751,6 @@ OptixRaytracer::cleanup_programs()
     m_shader_modules.clear();
 
     optixModuleDestroy(m_program_module);
-    optixModuleDestroy(m_wrapper_module);
     optixModuleDestroy(m_rend_lib_module);
     optixModuleDestroy(m_shadeops_module);
 }
@@ -791,20 +769,12 @@ OptixRaytracer::build_accel()
     accel_options.operation              = OPTIX_BUILD_OPERATION_BUILD;
 
     const size_t vertices_size = sizeof(Vec3) * scene.verts.size();
-    CUDA_CHECK(
-        cudaMalloc(reinterpret_cast<void**>(&d_vertices), vertices_size));
-    CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_vertices),
-                          scene.verts.data(), vertices_size,
-                          cudaMemcpyHostToDevice));
-    device_ptrs.push_back(d_vertices);
+    d_vertices                 = DEVICE_ALLOC(vertices_size);
+    COPY_TO_DEVICE(d_vertices, scene.verts.data(), vertices_size);
 
     const size_t indices_size = scene.triangles.size() * sizeof(int32_t) * 3;
-    CUDA_CHECK(
-        cudaMalloc(reinterpret_cast<void**>(&d_vert_indices), indices_size));
-    CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_vert_indices),
-                          scene.triangles.data(), indices_size,
-                          cudaMemcpyHostToDevice));
-    device_ptrs.push_back(d_vert_indices);
+    d_vert_indices            = DEVICE_ALLOC(indices_size);
+    COPY_TO_DEVICE(d_vert_indices, scene.triangles.data(), indices_size);
 
     const uint32_t triangle_input_flags[1] = { OPTIX_GEOMETRY_FLAG_NONE };
     OptixBuildInput triangle_input         = {};
@@ -830,9 +800,7 @@ OptixRaytracer::build_accel()
     CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_temp_buffer),
                           gas_buffer_sizes.tempSizeInBytes));
 
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_accel_output_buffer),
-                          gas_buffer_sizes.outputSizeInBytes));
-    device_ptrs.push_back(d_accel_output_buffer);
+    d_accel_output_buffer = DEVICE_ALLOC(gas_buffer_sizes.outputSizeInBytes);
 
     OPTIX_CHECK(optixAccelBuild(
         m_optix_ctx, 0, &accel_options, &triangle_input, 1, d_temp_buffer,
@@ -844,65 +812,64 @@ OptixRaytracer::build_accel()
 
 
 
+void
+OptixRaytracer::prepare_background()
+{
+    if (getBackgroundShaderID() >= 0) {
+        const int bg_res = std::max<int>(32, getBackgroundResolution());
+        d_bg_values      = DEVICE_ALLOC(3 * sizeof(float) * bg_res * bg_res);
+        d_bg_rows        = DEVICE_ALLOC(sizeof(float) * bg_res);
+        d_bg_cols        = DEVICE_ALLOC(sizeof(float) * bg_res * bg_res);
+    }
+}
+
+
+
 void
 OptixRaytracer::upload_mesh_data()
 {
     // Upload the extra geometry data to the device
     const size_t uvs_size = sizeof(Vec2) * scene.uvs.size();
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_uvs), uvs_size));
-    CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_uvs), scene.uvs.data(),
-                          uvs_size, cudaMemcpyHostToDevice));
-    device_ptrs.push_back(d_uvs);
+    d_uvs                 = DEVICE_ALLOC(uvs_size);
+    COPY_TO_DEVICE(d_uvs, scene.uvs.data(), uvs_size);
 
     const size_t uv_indices_size = scene.uv_triangles.size() * sizeof(int32_t)
                                    * 3;
-    CUDA_CHECK(
-        cudaMalloc(reinterpret_cast<void**>(&d_uv_indices), uv_indices_size));
-    CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_uv_indices),
-                          scene.uv_triangles.data(), uv_indices_size,
-                          cudaMemcpyHostToDevice));
-    device_ptrs.push_back(d_uv_indices);
+    d_uv_indices = DEVICE_ALLOC(uv_indices_size);
+    COPY_TO_DEVICE(d_uv_indices, scene.uv_triangles.data(), uv_indices_size);
 
     const size_t normals_size = sizeof(Vec3) * scene.normals.size();
     if (normals_size > 0) {
-        CUDA_CHECK(
-            cudaMalloc(reinterpret_cast<void**>(&d_normals), normals_size));
-        CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_normals),
-                              scene.normals.data(), normals_size,
-                              cudaMemcpyHostToDevice));
-        device_ptrs.push_back(d_normals);
+        d_normals = DEVICE_ALLOC(normals_size);
+        COPY_TO_DEVICE(d_normals, scene.normals.data(), normals_size);
     }
 
     const size_t normal_indices_size = scene.n_triangles.size()
                                        * sizeof(int32_t) * 3;
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_normal_indices),
-                          normal_indices_size));
-    CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_normal_indices),
-                          scene.n_triangles.data(), normal_indices_size,
-                          cudaMemcpyHostToDevice));
-    device_ptrs.push_back(d_normal_indices);
+    d_normal_indices = DEVICE_ALLOC(normal_indices_size);
+    COPY_TO_DEVICE(d_normal_indices, scene.n_triangles.data(),
+                   normal_indices_size);
 
     const size_t shader_ids_size = scene.shaderids.size() * sizeof(int);
-    CUDA_CHECK(
-        cudaMalloc(reinterpret_cast<void**>(&d_shader_ids), shader_ids_size));
-    CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_shader_ids),
-                          scene.shaderids.data(), shader_ids_size,
-                          cudaMemcpyHostToDevice));
-    device_ptrs.push_back(d_shader_ids);
+    d_shader_ids                 = DEVICE_ALLOC(shader_ids_size);
+    COPY_TO_DEVICE(d_shader_ids, scene.shaderids.data(), shader_ids_size);
 
     // TODO: These could be packed, but for now just use ints instead of bools
     std::vector<int32_t> shader_is_light;
     for (const bool& is_light : OptixRaytracer::shader_is_light())
         shader_is_light.push_back(is_light);
-
     const size_t shader_is_light_size = shader_is_light.size()
                                         * sizeof(int32_t);
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_shader_is_light),
-                          shader_is_light_size));
-    CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_shader_is_light),
-                          shader_is_light.data(), shader_is_light_size,
-                          cudaMemcpyHostToDevice));
-    device_ptrs.push_back(d_shader_is_light);
+    d_shader_is_light = DEVICE_ALLOC(shader_is_light_size);
+    COPY_TO_DEVICE(d_shader_is_light, shader_is_light.data(),
+                   shader_is_light_size);
+
+
+    const size_t lightprims_size = OptixRaytracer::lightprims().size()
+                                   * sizeof(uint32_t);
+    d_lightprims = DEVICE_ALLOC(lightprims_size);
+    COPY_TO_DEVICE(d_lightprims, OptixRaytracer::lightprims().data(),
+                   lightprims_size);
 
     // Copy the mesh ID for each triangle to the device
     std::vector<int> mesh_ids;
@@ -913,11 +880,8 @@ OptixRaytracer::upload_mesh_data()
         mesh_ids.push_back(meshid);
     }
     const size_t mesh_ids_size = mesh_ids.size() * sizeof(int32_t);
-    CUDA_CHECK(
-        cudaMalloc(reinterpret_cast<void**>(&d_mesh_ids), mesh_ids_size));
-    CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_mesh_ids), mesh_ids.data(),
-                          mesh_ids_size, cudaMemcpyHostToDevice));
-    device_ptrs.push_back(d_mesh_ids);
+    d_mesh_ids                 = DEVICE_ALLOC(mesh_ids_size);
+    COPY_TO_DEVICE(d_mesh_ids, mesh_ids.data(), mesh_ids_size);
 
     // Copy the mesh surface areas to the device
     std::vector<float> mesh_surfacearea;
@@ -936,12 +900,9 @@ OptixRaytracer::upload_mesh_data()
 
     const size_t mesh_surfacearea_size = mesh_surfacearea.size()
                                          * sizeof(float);
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_surfacearea),
-                          mesh_surfacearea_size));
-    CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_surfacearea),
-                          mesh_surfacearea.data(), mesh_surfacearea_size,
-                          cudaMemcpyHostToDevice));
-    device_ptrs.push_back(d_surfacearea);
+    d_surfacearea = DEVICE_ALLOC(mesh_surfacearea_size);
+    COPY_TO_DEVICE(d_surfacearea, mesh_surfacearea.data(),
+                   mesh_surfacearea_size);
 }
 
 
@@ -962,54 +923,86 @@ OptixRaytracer::good(TextureHandle* handle OSL_MAYBE_UNUSED)
 RendererServices::TextureHandle*
 OptixRaytracer::get_texture_handle(ustring filename,
                                    ShadingContext* /*shading_context*/,
-                                   const TextureOpt* options)
+                                   const TextureOpt* /*options*/)
 {
     auto itr = m_samplers.find(filename);
     if (itr == m_samplers.end()) {
-        // Open image
+        // Open image to check the number of mip levels
         OIIO::ImageBuf image;
         if (!image.init_spec(filename, 0, 0)) {
             errhandler().errorfmt("Could not load: {} (hash {})", filename,
                                   filename);
             return (TextureHandle*)nullptr;
         }
-
-        OIIO::ROI roi = OIIO::get_roi_full(image.spec());
-        int32_t width = roi.width(), height = roi.height();
-        std::vector<float> pixels(width * height * 4);
-
-        for (int j = 0; j < height; j++) {
-            for (int i = 0; i < width; i++) {
-                image.getpixel(i, j, 0, &pixels[((j * width) + i) * 4 + 0]);
-            }
-        }
-        cudaResourceDesc res_desc = {};
+        int32_t nmiplevels = std::max(image.nmiplevels(), 1);
+        int32_t img_width  = image.xmax() + 1;
+        int32_t img_height = image.ymax() + 1;
 
         // hard-code textures to 4 channels
-        int32_t pitch = width * 4 * sizeof(float);
         cudaChannelFormatDesc channel_desc
             = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindFloat);
 
-        // TODO: Free this memory
+        cudaMipmappedArray_t mipmapArray;
+        cudaExtent extent = make_cudaExtent(img_width, img_height, 0);
+        CUDA_CHECK(cudaMallocMipmappedArray(&mipmapArray, &channel_desc, extent,
+                                            nmiplevels));
+
+        // Copy the pixel data for each mip level
+        std::vector<std::vector<float>> level_pixels(nmiplevels);
+        for (int32_t level = 0; level < nmiplevels; ++level) {
+            image.reset(filename, 0, level);
+            OIIO::ROI roi = OIIO::get_roi_full(image.spec());
+            if (!roi.defined()) {
+                errhandler().errorfmt(
+                    "Could not load mip level {}: {} (hash {})", level,
+                    filename, filename);
+                return (TextureHandle*)nullptr;
+            }
+
+            int32_t width = roi.width(), height = roi.height();
+            level_pixels[level].resize(width * height * 4);
+            for (int j = 0; j < height; j++) {
+                for (int i = 0; i < width; i++) {
+                    image.getpixel(i, j, 0,
+                                   &level_pixels[level][((j * width) + i) * 4]);
+                }
+            }
+
+            cudaArray_t miplevelArray;
+            CUDA_CHECK(
+                cudaGetMipmappedArrayLevel(&miplevelArray, mipmapArray, level));
+
+            // Copy the texel data into the miplevel array
+            int32_t pitch = width * 4 * sizeof(float);
+            CUDA_CHECK(cudaMemcpy2DToArray(miplevelArray, 0, 0,
+                                           level_pixels[level].data(), pitch,
+                                           pitch, height,
+                                           cudaMemcpyHostToDevice));
+        }
+
+        int32_t pitch = img_width * 4 * sizeof(float);
         cudaArray_t pixelArray;
-        CUDA_CHECK(cudaMallocArray(&pixelArray, &channel_desc, width, height));
-        CUDA_CHECK(cudaMemcpy2DToArray(pixelArray, 0, 0, pixels.data(), pitch,
-                                       pitch, height, cudaMemcpyHostToDevice));
-
-        res_desc.resType         = cudaResourceTypeArray;
-        res_desc.res.array.array = pixelArray;
-
-        cudaTextureDesc tex_desc = {};
-        tex_desc.addressMode[0]  = cudaAddressModeWrap;
-        tex_desc.addressMode[1]  = cudaAddressModeWrap;
-        tex_desc.filterMode      = cudaFilterModeLinear;
-        tex_desc.readMode
-            = cudaReadModeElementType;  //cudaReadModeNormalizedFloat;
+        CUDA_CHECK(
+            cudaMallocArray(&pixelArray, &channel_desc, img_width, img_height));
+        CUDA_CHECK(cudaMemcpy2DToArray(pixelArray, 0, 0, level_pixels[0].data(),
+                                       pitch, pitch, img_height,
+                                       cudaMemcpyHostToDevice));
+        m_arrays_to_free.push_back(pixelArray);
+
+        cudaResourceDesc res_desc  = {};
+        res_desc.resType           = cudaResourceTypeMipmappedArray;
+        res_desc.res.mipmap.mipmap = mipmapArray;
+
+        cudaTextureDesc tex_desc     = {};
+        tex_desc.addressMode[0]      = cudaAddressModeWrap;
+        tex_desc.addressMode[1]      = cudaAddressModeWrap;
+        tex_desc.filterMode          = cudaFilterModeLinear;
+        tex_desc.readMode            = cudaReadModeElementType;
         tex_desc.normalizedCoords    = 1;
         tex_desc.maxAnisotropy       = 1;
-        tex_desc.maxMipmapLevelClamp = 99;
+        tex_desc.maxMipmapLevelClamp = float(nmiplevels - 1);
         tex_desc.minMipmapLevelClamp = 0;
-        tex_desc.mipmapFilterMode    = cudaFilterModePoint;
+        tex_desc.mipmapFilterMode    = cudaFilterModeLinear;
         tex_desc.borderColor[0]      = 1.0f;
         tex_desc.sRGB                = 0;
 
@@ -1034,8 +1027,10 @@ OptixRaytracer::prepare_render()
 
     // Set up the OptiX scene graph
     build_accel();
+    prepare_lights();
     upload_mesh_data();
     make_optix_materials();
+    prepare_background();
 }
 
 
@@ -1054,16 +1049,18 @@ OptixRaytracer::warmup()
 void
 OptixRaytracer::render(int xres OSL_MAYBE_UNUSED, int yres OSL_MAYBE_UNUSED)
 {
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_output_buffer),
-                          xres * yres * 4 * sizeof(float)));
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_launch_params),
-                          sizeof(RenderParams)));
-    device_ptrs.push_back(d_output_buffer);
-    device_ptrs.push_back(d_launch_params);
+    d_output_buffer = DEVICE_ALLOC(xres * yres * 4 * sizeof(float));
+    d_launch_params = DEVICE_ALLOC(sizeof(RenderParams));
 
     m_xres = xres;
     m_yres = yres;
 
+    const int aa                  = std::max(1, options.get_int("aa"));
+    const int max_bounces         = options.get_int("max_bounces");
+    const bool no_jitter          = options.get_int("no_jitter");
+    const float show_albedo_scale = options.get_float("show_albedo_scale");
+    const int show_globals        = options.get_int("show_globals");
+
     RenderParams params;
     params.eye.x                   = camera.eye.x;
     params.eye.y                   = camera.eye.y;
@@ -1071,14 +1068,15 @@ OptixRaytracer::render(int xres OSL_MAYBE_UNUSED, int yres OSL_MAYBE_UNUSED)
     params.dir.x                   = camera.dir.x;
     params.dir.y                   = camera.dir.y;
     params.dir.z                   = camera.dir.z;
-    params.cx.x                    = camera.cx.x;
-    params.cx.y                    = camera.cx.y;
-    params.cx.z                    = camera.cx.z;
-    params.cy.x                    = camera.cy.x;
-    params.cy.y                    = camera.cy.y;
-    params.cy.z                    = camera.cy.z;
-    params.invw                    = 1.0f / m_xres;
-    params.invh                    = 1.0f / m_yres;
+    params.up.x                    = camera.up.x;
+    params.up.y                    = camera.up.y;
+    params.up.z                    = camera.up.z;
+    params.fov                     = camera.fov;
+    params.aa                      = aa;
+    params.max_bounces             = max_bounces;
+    params.show_albedo_scale       = show_albedo_scale;
+    params.show_globals            = show_globals;
+    params.no_jitter               = no_jitter;
     params.interactive_params      = d_interactive_params;
     params.output_buffer           = d_output_buffer;
     params.traversal_handle        = m_travHandle;
@@ -1098,16 +1096,24 @@ OptixRaytracer::render(int xres OSL_MAYBE_UNUSED, int yres OSL_MAYBE_UNUSED)
     params.normal_indices  = d_normal_indices;
     params.shader_ids      = d_shader_ids;
     params.shader_is_light = d_shader_is_light;
+    params.lightprims      = d_lightprims;
+    params.lightprims_size = OptixRaytracer::lightprims().size();
     params.mesh_ids        = d_mesh_ids;
     params.surfacearea     = d_surfacearea;
 
-    CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_launch_params), &params,
-                          sizeof(RenderParams), cudaMemcpyHostToDevice));
+    // For the background shader
+    params.bg_res    = std::max<int>(32, getBackgroundResolution());
+    params.bg_id     = getBackgroundShaderID();
+    params.bg_values = d_bg_values;
+    params.bg_rows   = d_bg_rows;
+    params.bg_cols   = d_bg_cols;
+
+    COPY_TO_DEVICE(d_launch_params, &params, sizeof(RenderParams));
 
     // Set up global variables
     OPTIX_CHECK(optixLaunch(m_optix_pipeline, m_cuda_stream, d_launch_params,
-                            sizeof(RenderParams), &m_setglobals_optix_sbt, 1, 1,
-                            1));
+                            sizeof(RenderParams), &m_setglobals_optix_sbt, 32,
+                            1, 1));
     CUDA_SYNC_CHECK();
 
     // Launch real render
diff --git a/src/testrender/optixraytracer.h b/src/testrender/optixraytracer.h
index 69d0c6c5b..b15a3147d 100644
--- a/src/testrender/optixraytracer.h
+++ b/src/testrender/optixraytracer.h
@@ -45,6 +45,7 @@ class OptixRaytracer final : public SimpleRaytracer {
     void cleanup_programs();
     void build_accel();
     void upload_mesh_data();
+    void prepare_background();
     void prepare_render() override;
     void warmup() override;
     void render(int xres, int yres) override;
@@ -87,7 +88,6 @@ class OptixRaytracer final : public SimpleRaytracer {
     OptixPipelineLinkOptions m_pipeline_link_options       = {};
     OptixProgramGroupOptions m_program_options             = {};
     OptixModule m_program_module                           = {};
-    OptixModule m_wrapper_module                           = {};
     OptixModule m_rend_lib_module                          = {};
     OptixModule m_shadeops_module                          = {};
     OptixProgramGroup m_raygen_group                       = {};
@@ -115,7 +115,11 @@ class OptixRaytracer final : public SimpleRaytracer {
     CUdeviceptr d_shader_is_light     = 0;
     CUdeviceptr d_mesh_ids            = 0;
     CUdeviceptr d_surfacearea         = 0;
+    CUdeviceptr d_lightprims          = 0;
     CUdeviceptr d_interactive_params  = 0;
+    CUdeviceptr d_bg_values           = 0;
+    CUdeviceptr d_bg_rows             = 0;
+    CUdeviceptr d_bg_cols             = 0;
     CUdeviceptr d_osl_printf_buffer   = 0;
     CUdeviceptr d_color_system        = 0;
 
@@ -137,7 +141,9 @@ class OptixRaytracer final : public SimpleRaytracer {
     std::string m_materials_ptx;
     std::unordered_map<ustringhash, optix::TextureSampler> m_samplers;
 
-    std::vector<CUdeviceptr> device_ptrs;
+    // CUdeviceptrs that need to be freed after we are done
+    std::vector<CUdeviceptr> m_ptrs_to_free;
+    std::vector<cudaArray_t> m_arrays_to_free;
 };
 
 
diff --git a/src/testrender/raytracer.h b/src/testrender/raytracer.h
index cf640a8cb..ebf6c94c4 100644
--- a/src/testrender/raytracer.h
+++ b/src/testrender/raytracer.h
@@ -21,6 +21,10 @@
 #    include <vector_functions.h>  // from CUDA
 #endif
 
+#ifdef __CUDACC__
+#    include "cuda/rend_lib.h"
+#endif
+
 // The primitives don't included the intersection routines, etc., from the
 // versions in testrender, since those operations are performed on the GPU.
 //
@@ -33,7 +37,7 @@ class OptixRenderer;  // FIXME -- should not be here
 
 
 // build two vectors orthogonal to the first, assumes n is normalized
-inline void
+inline OSL_HOSTDEVICE void
 ortho(const Vec3& n, Vec3& x, Vec3& y)
 {
     x = (fabsf(n.x) > .01f ? Vec3(n.z, 0, -n.x) : Vec3(0, -n.z, n.y))
@@ -55,6 +59,7 @@ struct Ray {
         DISPLACEMENT = 128
     };
 
+    OSL_HOSTDEVICE
     Ray(const Vec3& o, const Vec3& d, float radius, float spread,
         RayType raytype)
         : origin(o)
@@ -65,7 +70,10 @@ struct Ray {
     {
     }
 
+    OSL_HOSTDEVICE
     Vec3 point(float t) const { return origin + direction * t; }
+
+    OSL_HOSTDEVICE
     Dual2<Vec3> dual_direction() const
     {
         Dual2<Vec3> v;
@@ -76,6 +84,7 @@ struct Ray {
         return v;
     }
 
+    OSL_HOSTDEVICE
     Dual2<Vec3> point(Dual2<float> t) const
     {
         const float r = radius + spread * t.val();
@@ -95,9 +104,10 @@ struct Ray {
 
 
 struct Camera {
-    Camera() {}
+    OSL_HOSTDEVICE Camera() {}
 
     // Set where the camera sits and looks at.
+    OSL_HOSTDEVICE
     void lookat(const Vec3& eye, const Vec3& dir, const Vec3& up, float fov)
     {
         this->eye = eye;
@@ -108,6 +118,7 @@ struct Camera {
     }
 
     // Set resolution
+    OSL_HOSTDEVICE
     void resolution(int w, int h)
     {
         xres = w;
@@ -118,6 +129,7 @@ struct Camera {
     }
 
     // Compute all derived values based on camera parameters.
+    OSL_HOSTDEVICE
     void finalize()
     {
         float k    = OIIO::fast_tan(fov * float(M_PI / 360));
@@ -127,10 +139,21 @@ struct Camera {
     }
 
     // Get a ray for the given screen coordinates.
+    OSL_HOSTDEVICE
     Ray get(float x, float y) const
     {
+        // TODO: On CUDA devices, the normalize() operation can result in vector
+        // components with magnitudes slightly greater than 1.0, which can cause
+        // downstream computations to blow up and produce NaNs. Normalizing the
+        // vector again avoids this issue.
         const Vec3 v = (cx * (x * invw - 0.5f) + cy * (0.5f - y * invh) + dir)
+#ifndef __CUDACC__
                            .normalize();
+#else
+                           .normalize()
+                           .normalized();
+#endif
+
         const float cos_a = dir.dot(v);
         const float spread
             = sqrtf(invw * invh * cx.length() * cy.length() * cos_a) * cos_a;
@@ -164,6 +187,7 @@ struct LightSample {
 using ShaderMap = std::unordered_map<std::string, int>;
 
 struct Scene {
+#ifndef __CUDACC__
     void add_sphere(const Vec3& c, float r, int shaderID, int resolution);
 
     void add_quad(const Vec3& p, const Vec3& ex, const Vec3& ey, int shaderID,
@@ -176,11 +200,16 @@ struct Scene {
     int num_prims() const { return triangles.size(); }
 
     void prepare(OIIO::ErrorHandler& errhandler);
+#endif
 
+    // NB: OptiX needs to populate the ShaderGlobals in the closest-hit program,
+    //     so we need to pass along a pointer to the struct.
+    OSL_HOSTDEVICE
     Intersection intersect(const Ray& r, const float tmax,
                            const unsigned skipID1,
                            const unsigned skipID2 = ~0u) const;
 
+    OSL_HOSTDEVICE
     LightSample sample(int primID, const Vec3& x, float xi, float yi) const
     {
         // A Low-Distortion Map Between Triangle and Square
@@ -205,6 +234,7 @@ struct Scene {
         return { dir, sqrtf(d2), pdf, xi, yi };
     }
 
+    OSL_HOSTDEVICE
     float shapepdf(int primID, const Vec3& x, const Vec3& p) const
     {
         const Vec3 va = verts[triangles[primID].a];
@@ -219,6 +249,7 @@ struct Scene {
         return d2 / (0.5f * fabsf(dir.dot(n)));
     }
 
+    OSL_HOSTDEVICE
     float primitivearea(int primID) const
     {
         const Vec3 va = verts[triangles[primID].a];
@@ -227,6 +258,7 @@ struct Scene {
         return 0.5f * (va - vb).cross(va - vc).length();
     }
 
+    OSL_HOSTDEVICE
     Vec3 normal(const Dual2<Vec3>& p, Vec3& Ng, int primID, float u,
                 float v) const
     {
@@ -246,6 +278,7 @@ struct Scene {
         return ((1 - u - v) * na + u * nb + v * nc).normalize();
     }
 
+    OSL_HOSTDEVICE
     Dual2<Vec2> uv(const Dual2<Vec3>& p, const Vec3& n, Vec3& dPdu, Vec3& dPdv,
                    int primID, float u, float v) const
     {
@@ -273,8 +306,9 @@ struct Scene {
         return Dual2<Vec2>((1 - u - v) * ta + u * tb + v * tc);
     }
 
-    int shaderid(int primID) const { return shaderids[primID]; }
+    OSL_HOSTDEVICE int shaderid(int primID) const { return shaderids[primID]; }
 
+#ifndef __CUDACC__
     // basic triangle data
     std::vector<Vec3> verts;
     std::vector<Vec3> normals;
@@ -287,6 +321,16 @@ struct Scene {
         last_index;  // one entry per mesh, stores the last triangle index (+1) -- also is the start triangle of the next mesh
     // acceleration structure (built over triangles)
     std::unique_ptr<BVH> bvh;
+#else
+    const Vec3* verts;
+    const Vec3* normals;
+    const Vec2* uvs;
+    const TriangleIndices* triangles;
+    const TriangleIndices* uv_triangles;
+    const TriangleIndices* n_triangles;
+    const int* shaderids;
+    OptixTraversableHandle handle;
+#endif
 };
 
 OSL_NAMESPACE_EXIT
diff --git a/src/testrender/render_params.h b/src/testrender/render_params.h
index 5d89c83f9..b13059a6c 100644
--- a/src/testrender/render_params.h
+++ b/src/testrender/render_params.h
@@ -15,11 +15,13 @@ struct RenderParams {
 
     float3 eye;
     float3 dir;
-    float3 cx;
-    float3 cy;
-
-    float invw;
-    float invh;
+    float3 up;
+    float fov;
+    int aa;
+    int max_bounces;
+    float show_albedo_scale;
+    bool no_jitter;
+    int show_globals;
 
     CUdeviceptr traversal_handle;
     CUdeviceptr output_buffer;
@@ -50,6 +52,15 @@ struct RenderParams {
     CUdeviceptr shader_is_light;
     CUdeviceptr mesh_ids;
     CUdeviceptr surfacearea;
+    CUdeviceptr lightprims;
+    size_t lightprims_size;
+
+    // for the background
+    int bg_res;
+    int bg_id;
+    CUdeviceptr bg_values;
+    CUdeviceptr bg_rows;
+    CUdeviceptr bg_cols;
 };
 
 
diff --git a/src/testrender/sampling.h b/src/testrender/sampling.h
index e69f3f5f1..a8484617f 100644
--- a/src/testrender/sampling.h
+++ b/src/testrender/sampling.h
@@ -14,7 +14,7 @@ OSL_NAMESPACE_ENTER
 
 struct TangentFrame {
     // build frame from unit normal
-    static TangentFrame from_normal(const Vec3& n)
+    static OSL_HOSTDEVICE TangentFrame from_normal(const Vec3& n)
     {
         // https://graphics.pixar.com/library/OrthonormalB/paper.pdf
         const float sign = copysignf(1.0f, n.z);
@@ -27,7 +27,8 @@ struct TangentFrame {
 
     // build frame from unit normal and unit tangent
     // fallsback to an arbitrary basis if the tangent is 0 or colinear with n
-    static TangentFrame from_normal_and_tangent(const Vec3& n, const Vec3& t)
+    static OSL_HOSTDEVICE TangentFrame from_normal_and_tangent(const Vec3& n,
+                                                               const Vec3& t)
     {
         Vec3 x      = t - n * dot(n, t);
         float xlen2 = dot(x, x);
@@ -41,18 +42,24 @@ struct TangentFrame {
     }
 
     // transform vector
-    Vec3 get(float x, float y, float z) const { return x * u + y * v + z * w; }
+    Vec3 OSL_HOSTDEVICE get(float x, float y, float z) const
+    {
+        return x * u + y * v + z * w;
+    }
 
     // untransform vector
-    float getx(const Vec3& a) const { return a.dot(u); }
-    float gety(const Vec3& a) const { return a.dot(v); }
-    float getz(const Vec3& a) const { return a.dot(w); }
+    float OSL_HOSTDEVICE getx(const Vec3& a) const { return a.dot(u); }
+    float OSL_HOSTDEVICE gety(const Vec3& a) const { return a.dot(v); }
+    float OSL_HOSTDEVICE getz(const Vec3& a) const { return a.dot(w); }
 
-    Vec3 tolocal(const Vec3& a) const
+    Vec3 OSL_HOSTDEVICE tolocal(const Vec3& a) const
     {
         return Vec3(a.dot(u), a.dot(v), a.dot(w));
     }
-    Vec3 toworld(const Vec3& a) const { return get(a.x, a.y, a.z); }
+    Vec3 OSL_HOSTDEVICE toworld(const Vec3& a) const
+    {
+        return get(a.x, a.y, a.z);
+    }
 
     Vec3 u, v, w;
 };
@@ -60,7 +67,7 @@ struct TangentFrame {
 struct Sampling {
     /// Warp the unit disk onto the unit sphere
     /// http://psgraphics.blogspot.com/2011/01/improved-code-for-concentric-map.html
-    static void to_unit_disk(float& x, float& y)
+    static OSL_HOSTDEVICE void to_unit_disk(float& x, float& y)
     {
         const float PI_OVER_4 = float(M_PI_4);
         const float PI_OVER_2 = float(M_PI_2);
@@ -82,8 +89,9 @@ struct Sampling {
         y *= r;
     }
 
-    static void sample_cosine_hemisphere(const Vec3& N, float rndx, float rndy,
-                                         Vec3& out, float& pdf)
+    static OSL_HOSTDEVICE void sample_cosine_hemisphere(const Vec3& N,
+                                                        float rndx, float rndy,
+                                                        Vec3& out, float& pdf)
     {
         to_unit_disk(rndx, rndy);
         float cos_theta = sqrtf(std::max(1 - rndx * rndx - rndy * rndy, 0.0f));
@@ -91,8 +99,9 @@ struct Sampling {
         pdf = cos_theta * float(M_1_PI);
     }
 
-    static void sample_uniform_hemisphere(const Vec3& N, float rndx, float rndy,
-                                          Vec3& out, float& pdf)
+    static OSL_HOSTDEVICE void sample_uniform_hemisphere(const Vec3& N,
+                                                         float rndx, float rndy,
+                                                         Vec3& out, float& pdf)
     {
         float phi       = float(2 * M_PI) * rndx;
         float cos_theta = rndy;
@@ -118,7 +127,8 @@ struct MIS {
     // Centralizing the handling of the pdfs this way ensures that all numerical
     // cases can be enumerated and handled robustly without arbitrary epsilons.
     template<MISMode mode>
-    static inline float power_heuristic(float sampled_pdf, float other_pdf)
+    static inline OSL_HOSTDEVICE float power_heuristic(float sampled_pdf,
+                                                       float other_pdf)
     {
         // NOTE: inf is ok!
         assert(sampled_pdf >= 0);
@@ -159,9 +169,16 @@ struct MIS {
     // such as a BRDF mixture. This updates a (weight, pdf) pair with a new one
     // to represent the sum of both. b is the probability of choosing the provided
     // weight. A running sum should be started with a weight and pdf of 0.
-    static inline void update_eval(Color3* w, float* pdf, Color3 ow, float opdf,
-                                   float b)
+    static inline OSL_HOSTDEVICE void
+    update_eval(Color3* w, float* pdf, Color3 ow, float opdf, float b)
     {
+#ifdef __CUDACC__
+        // Check for those pesky NaNs
+        assert(*pdf == *pdf);
+        assert(b == b);
+        assert(opdf == opdf);
+#endif
+
         // NOTE: inf is ok!
         assert(*pdf >= 0);
         assert(opdf >= 0);
@@ -193,6 +210,7 @@ struct MIS {
 // "Practical Hash-based Owen Scrambling" - Brent Burley - JCGT 2020
 //    https://jcgt.org/published/0009/04/01/
 struct Sampler {
+    OSL_HOSTDEVICE
     Sampler(int px, int py, int si)
         : seed(((px & 2047) << 22) | ((py & 2047) << 11))
         , index(reversebits(si))
@@ -200,6 +218,7 @@ struct Sampler {
         assert(si < (1 << 24));
     }
 
+    OSL_HOSTDEVICE
     Vec3 get()
     {
         static const uint32_t zmatrix[24] = {
@@ -234,7 +253,7 @@ struct Sampler {
 private:
     uint32_t seed, index;
 
-    static uint32_t hash(uint32_t s)
+    static OSL_HOSTDEVICE uint32_t hash(uint32_t s)
     {
         // https://github.com/skeeto/hash-prospector
         s ^= s >> 16;
@@ -245,7 +264,7 @@ struct Sampler {
         return s;
     }
 
-    static uint32_t reversebits(uint32_t x)
+    static OSL_HOSTDEVICE uint32_t reversebits(uint32_t x)
     {
 #if defined(__clang__)
         return __builtin_bitreverse32(x);
@@ -259,7 +278,7 @@ struct Sampler {
 #endif
     }
 
-    static uint32_t owen_scramble(uint32_t p, uint32_t s)
+    static OSL_HOSTDEVICE uint32_t owen_scramble(uint32_t p, uint32_t s)
     {
         // https://psychopath.io/post/2021_01_30_building_a_better_lk_hash
         // assumes reversed input
diff --git a/src/testrender/shading.cpp b/src/testrender/shading.cpp
index 78bd2004e..641e5abea 100644
--- a/src/testrender/shading.cpp
+++ b/src/testrender/shading.cpp
@@ -10,243 +10,39 @@
 
 using namespace OSL;
 
+
+#ifndef __CUDACC__
+using ShaderGlobalsType = OSL::ShaderGlobals;
+#else
+using ShaderGlobalsType = OSL_CUDA::ShaderGlobals;
+#endif
+
+
 namespace {  // anonymous namespace
 
 using OIIO::clamp;
+using OSL::dot;
 
-Color3
+OSL_HOSTDEVICE Color3
 clamp(const Color3& c, float min, float max)
 {
     return Color3(clamp(c.x, min, max), clamp(c.y, min, max),
                   clamp(c.z, min, max));
 }
 
-bool
+OSL_HOSTDEVICE bool
 is_black(const Color3& c)
 {
     return c.x == 0 && c.y == 0 && c.z == 0;
 }
 
-
-
-// unique identifier for each closure supported by testrender
-enum ClosureIDs {
-    EMISSION_ID = 1,
-    BACKGROUND_ID,
-    DIFFUSE_ID,
-    OREN_NAYAR_ID,
-    TRANSLUCENT_ID,
-    PHONG_ID,
-    WARD_ID,
-    MICROFACET_ID,
-    REFLECTION_ID,
-    FRESNEL_REFLECTION_ID,
-    REFRACTION_ID,
-    TRANSPARENT_ID,
-    // See MATERIALX_CLOSURES in stdosl.h
-    MX_OREN_NAYAR_DIFFUSE_ID,
-    MX_BURLEY_DIFFUSE_ID,
-    MX_DIELECTRIC_ID,
-    MX_CONDUCTOR_ID,
-    MX_GENERALIZED_SCHLICK_ID,
-    MX_TRANSLUCENT_ID,
-    MX_TRANSPARENT_ID,
-    MX_SUBSURFACE_ID,
-    MX_SHEEN_ID,
-    MX_UNIFORM_EDF_ID,
-    MX_ANISOTROPIC_VDF_ID,
-    MX_MEDIUM_VDF_ID,
-    MX_LAYER_ID,
-    // TODO: adding vdfs would require extending testrender with volume support ...
-};
-
-// these structures hold the parameters of each closure type
-// they will be contained inside ClosureComponent
-struct EmptyParams {};
-struct DiffuseParams {
-    Vec3 N;
-};
-struct OrenNayarParams {
-    Vec3 N;
-    float sigma;
-};
-struct PhongParams {
-    Vec3 N;
-    float exponent;
-};
-struct WardParams {
-    Vec3 N, T;
-    float ax, ay;
-};
-struct ReflectionParams {
-    Vec3 N;
-    float eta;
-};
-struct RefractionParams {
-    Vec3 N;
-    float eta;
-};
-struct MicrofacetParams {
-    ustringhash dist;
-    Vec3 N, U;
-    float xalpha, yalpha, eta;
-    int refract;
-};
-
-// MATERIALX_CLOSURES
-
-struct MxOrenNayarDiffuseParams {
-    Vec3 N;
-    Color3 albedo;
-    float roughness;
-    // optional
-    ustringhash label;
-    int energy_compensation;
-};
-
-struct MxBurleyDiffuseParams {
-    Vec3 N;
-    Color3 albedo;
-    float roughness;
-    // optional
-    ustringhash label;
-};
-
-// common to all MaterialX microfacet closures
-struct MxMicrofacetBaseParams {
-    Vec3 N, U;
-    float roughness_x;
-    float roughness_y;
-    ustringhash distribution;
-    // optional
-    ustringhash label;
-};
-
-struct MxDielectricParams : public MxMicrofacetBaseParams {
-    Color3 reflection_tint;
-    Color3 transmission_tint;
-    float ior;
-    // optional
-    float thinfilm_thickness;
-    float thinfilm_ior;
-
-    Color3 evalR(float cos_theta) const
-    {
-        return reflection_tint * fresnel_dielectric(cos_theta, ior);
-    }
-
-    Color3 evalT(float cos_theta) const
-    {
-        return transmission_tint * (1.0f - fresnel_dielectric(cos_theta, ior));
-    }
-};
-
-struct MxConductorParams : public MxMicrofacetBaseParams {
-    Color3 ior;
-    Color3 extinction;
-    // optional
-    float thinfilm_thickness;
-    float thinfilm_ior;
-
-    Color3 evalR(float cos_theta) const
-    {
-        return fresnel_conductor(cos_theta, ior, extinction);
-    }
-
-    Color3 evalT(float cos_theta) const { return Color3(0.0f); }
-
-    // Avoid function was declared but never referenced
-    // float get_ior() const
-    // {
-    //     return 0;  // no transmission possible
-    // }
-};
-
-struct MxGeneralizedSchlickParams : public MxMicrofacetBaseParams {
-    Color3 reflection_tint;
-    Color3 transmission_tint;
-    Color3 f0;
-    Color3 f90;
-    float exponent;
-    // optional
-    float thinfilm_thickness;
-    float thinfilm_ior;
-
-    Color3 evalR(float cos_theta) const
-    {
-        return reflection_tint
-               * fresnel_generalized_schlick(cos_theta, f0, f90, exponent);
-    }
-
-    Color3 evalT(float cos_theta) const
-    {
-        return transmission_tint
-               * (Color3(1.0f)
-                  - fresnel_generalized_schlick(cos_theta, f0, f90, exponent));
-    }
-};
-
-struct MxTranslucentParams {
-    Vec3 N;
-    Color3 albedo;
-    // optional
-    ustringhash label;
-};
-
-struct MxSubsurfaceParams {
-    Vec3 N;
-    Color3 albedo;
-    Color3 radius;
-    float anisotropy;
-    // optional
-    ustringhash label;
-};
-
-struct MxSheenParams {
-    Vec3 N;
-    Color3 albedo;
-    float roughness;
-    // optional
-    int mode;
-    ustringhash label;
-};
-
-struct MxUniformEdfParams {
-    Color3 emittance;
-    // optional
-    ustringhash label;
-};
-
-struct MxLayerParams {
-    OSL::ClosureColor* top;
-    OSL::ClosureColor* base;
-};
-
-struct MxAnisotropicVdfParams {
-    Color3 albedo;
-    Color3 extinction;
-    float anisotropy;
-    // optional
-    ustringhash label;
-};
-
-struct MxMediumVdfParams {
-    Color3 albedo;
-    float transmission_depth;
-    Color3 transmission_color;
-    float anisotropy;
-    float ior;
-    int priority;
-    // optional
-    ustringhash label;
-};
-
 }  // anonymous namespace
 
 
 OSL_NAMESPACE_ENTER
 
 
+#ifndef __CUDACC__
 void
 register_closures(OSL::ShadingSystem* shadingsys)
 {
@@ -433,24 +229,26 @@ register_closures(OSL::ShadingSystem* shadingsys)
     for (const BuiltinClosures& b : builtins)
         shadingsys->register_closure(b.name, b.id, b.params, nullptr, nullptr);
 }
+#endif  // ifndef __CUDACC__
 
 OSL_NAMESPACE_EXIT
 
 namespace {  // anonymous namespace
 
 template<int trans> struct Diffuse final : public BSDF, DiffuseParams {
-    Diffuse(const DiffuseParams& params) : BSDF(), DiffuseParams(params)
+    OSL_HOSTDEVICE Diffuse(const DiffuseParams& params)
+        : BSDF(DIFFUSE_ID), DiffuseParams(params)
     {
         if (trans)
             N = -N;
     }
-    Sample eval(const Vec3& /*wo*/, const OSL::Vec3& wi) const override
+    OSL_HOSTDEVICE Sample eval(const Vec3& /*wo*/, const OSL::Vec3& wi) const
     {
         const float pdf = std::max(N.dot(wi), 0.0f) * float(M_1_PI);
         return { wi, Color3(1.0f), pdf, 1.0f };
     }
-    Sample sample(const Vec3& /*wo*/, float rx, float ry,
-                  float /*rz*/) const override
+    OSL_HOSTDEVICE Sample sample(const Vec3& /*wo*/, float rx, float ry,
+                                 float /*rz*/) const
     {
         Vec3 out_dir;
         float pdf;
@@ -460,10 +258,11 @@ template<int trans> struct Diffuse final : public BSDF, DiffuseParams {
 };
 
 struct OrenNayar final : public BSDF, OrenNayarParams {
-    OrenNayar(const OrenNayarParams& params) : BSDF(), OrenNayarParams(params)
+    OSL_HOSTDEVICE OrenNayar(const OrenNayarParams& params)
+        : BSDF(OREN_NAYAR_ID), OrenNayarParams(params)
     {
     }
-    Sample eval(const Vec3& wo, const OSL::Vec3& wi) const override
+    OSL_HOSTDEVICE Sample eval(const Vec3& wo, const OSL::Vec3& wi) const
     {
         float NL = N.dot(wi);
         float NV = N.dot(wo);
@@ -485,8 +284,8 @@ struct OrenNayar final : public BSDF, OrenNayarParams {
         }
         return {};
     }
-    Sample sample(const Vec3& wo, float rx, float ry,
-                  float /*rz*/) const override
+    OSL_HOSTDEVICE Sample sample(const Vec3& wo, float rx, float ry,
+                                 float /*rz*/) const
     {
         Vec3 out_dir;
         float pdf;
@@ -496,11 +295,12 @@ struct OrenNayar final : public BSDF, OrenNayarParams {
 };
 
 struct EnergyCompensatedOrenNayar : public BSDF, MxOrenNayarDiffuseParams {
+    OSL_HOSTDEVICE
     EnergyCompensatedOrenNayar(const MxOrenNayarDiffuseParams& params)
-        : BSDF(), MxOrenNayarDiffuseParams(params)
+        : BSDF(MX_OREN_NAYAR_DIFFUSE_ID), MxOrenNayarDiffuseParams(params)
     {
     }
-    Sample eval(const Vec3& wo, const OSL::Vec3& wi) const override
+    OSL_HOSTDEVICE Sample eval(const Vec3& wo, const OSL::Vec3& wi) const
     {
         float NL = N.dot(wi);
         float NV = N.dot(wo);
@@ -537,8 +337,8 @@ struct EnergyCompensatedOrenNayar : public BSDF, MxOrenNayarDiffuseParams {
         return {};
     }
 
-    Sample sample(const Vec3& wo, float rx, float ry,
-                  float /*rz*/) const override
+    OSL_HOSTDEVICE Sample sample(const Vec3& wo, float rx, float ry,
+                                 float /*rz*/) const
     {
         Vec3 out_dir;
         float pdf;
@@ -551,7 +351,7 @@ struct EnergyCompensatedOrenNayar : public BSDF, MxOrenNayarDiffuseParams {
     static constexpr float constant2_FON = float(2.0 / 3.0
                                                  - 28.0 / (15.0 * M_PI));
 
-    float E_FON_analytic(float mu) const
+    OSL_HOSTDEVICE float E_FON_analytic(float mu) const
     {
         const float sigma = roughness;
         float AF          = 1.0f
@@ -567,8 +367,11 @@ struct EnergyCompensatedOrenNayar : public BSDF, MxOrenNayarDiffuseParams {
 };
 
 struct Phong final : public BSDF, PhongParams {
-    Phong(const PhongParams& params) : BSDF(), PhongParams(params) {}
-    Sample eval(const Vec3& wo, const Vec3& wi) const override
+    OSL_HOSTDEVICE Phong(const PhongParams& params)
+        : BSDF(PHONG_ID), PhongParams(params)
+    {
+    }
+    OSL_HOSTDEVICE Sample eval(const Vec3& wo, const Vec3& wi) const
     {
         float cosNI = N.dot(wi);
         float cosNO = N.dot(wo);
@@ -585,8 +388,8 @@ struct Phong final : public BSDF, PhongParams {
         }
         return {};
     }
-    Sample sample(const Vec3& wo, float rx, float ry,
-                  float /*rz*/) const override
+    OSL_HOSTDEVICE Sample sample(const Vec3& wo, float rx, float ry,
+                                 float /*rz*/) const
     {
         float cosNO = N.dot(wo);
         if (cosNO > 0) {
@@ -607,8 +410,11 @@ struct Phong final : public BSDF, PhongParams {
 };
 
 struct Ward final : public BSDF, WardParams {
-    Ward(const WardParams& params) : BSDF(), WardParams(params) {}
-    Sample eval(const Vec3& wo, const OSL::Vec3& wi) const override
+    OSL_HOSTDEVICE Ward(const WardParams& params)
+        : BSDF(WARD_ID), WardParams(params)
+    {
+    }
+    OSL_HOSTDEVICE Sample eval(const Vec3& wo, const OSL::Vec3& wi) const
     {
         float cosNO = N.dot(wo);
         float cosNI = N.dot(wi);
@@ -632,8 +438,8 @@ struct Ward final : public BSDF, WardParams {
         }
         return {};
     }
-    Sample sample(const Vec3& wo, float rx, float ry,
-                  float /*rz*/) const override
+    OSL_HOSTDEVICE Sample sample(const Vec3& wo, float rx, float ry,
+                                 float /*rz*/) const
     {
         float cosNO = N.dot(wo);
         if (cosNO > 0) {
@@ -710,17 +516,18 @@ struct Ward final : public BSDF, WardParams {
  * is sufficient).
  */
 struct GGXDist {
-    static float F(const float tan_m2)
+    static OSL_HOSTDEVICE float F(const float tan_m2)
     {
         return 1 / (float(M_PI) * (1 + tan_m2) * (1 + tan_m2));
     }
 
-    static float Lambda(const float a2)
+    static OSL_HOSTDEVICE float Lambda(const float a2)
     {
         return 0.5f * (-1.0f + sqrtf(1.0f + 1.0f / a2));
     }
 
-    static Vec2 sampleSlope(float cos_theta, float randu, float randv)
+    static OSL_HOSTDEVICE Vec2 sampleSlope(float cos_theta, float randu,
+                                           float randv)
     {
         // GGX
         Vec2 slope;
@@ -747,12 +554,12 @@ struct GGXDist {
 };
 
 struct BeckmannDist {
-    static float F(const float tan_m2)
+    static OSL_HOSTDEVICE float F(const float tan_m2)
     {
         return float(1 / M_PI) * OIIO::fast_exp(-tan_m2);
     }
 
-    static float Lambda(const float a2)
+    static OSL_HOSTDEVICE float Lambda(const float a2)
     {
         const float a = sqrtf(a2);
         return a < 1.6f ? (1.0f - 1.259f * a + 0.396f * a2)
@@ -760,7 +567,8 @@ struct BeckmannDist {
                         : 0.0f;
     }
 
-    static Vec2 sampleSlope(float cos_theta, float randu, float randv)
+    static OSL_HOSTDEVICE Vec2 sampleSlope(float cos_theta, float randu,
+                                           float randv)
     {
         const float SQRT_PI_INV = 1 / sqrtf(float(M_PI));
         float ct                = cos_theta < 1e-6f ? 1e-6f : cos_theta;
@@ -808,13 +616,13 @@ struct BeckmannDist {
 
 template<typename Distribution, int Refract>
 struct Microfacet final : public BSDF, MicrofacetParams {
-    Microfacet(const MicrofacetParams& params)
-        : BSDF()
+    OSL_HOSTDEVICE Microfacet(const MicrofacetParams& params)
+        : BSDF(MICROFACET_ID)
         , MicrofacetParams(params)
         , tf(TangentFrame::from_normal_and_tangent(N, U))
     {
     }
-    Color3 get_albedo(const Vec3& wo) const override
+    OSL_HOSTDEVICE Color3 get_albedo(const Vec3& wo) const
     {
         if (Refract == 2)
             return Color3(1.0f);
@@ -823,7 +631,7 @@ struct Microfacet final : public BSDF, MicrofacetParams {
         float fr = fresnel_dielectric(N.dot(wo), eta);
         return Color3(Refract ? 1 - fr : fr);
     }
-    Sample eval(const Vec3& wo, const OSL::Vec3& wi) const override
+    OSL_HOSTDEVICE Sample eval(const Vec3& wo, const OSL::Vec3& wi) const
     {
         const Vec3 wo_l = tf.tolocal(wo);
         const Vec3 wi_l = tf.tolocal(wi);
@@ -890,7 +698,8 @@ struct Microfacet final : public BSDF, MicrofacetParams {
         return {};
     }
 
-    Sample sample(const Vec3& wo, float rx, float ry, float rz) const override
+    OSL_HOSTDEVICE Sample sample(const Vec3& wo, float rx, float ry,
+                                 float rz) const
     {
         const Vec3 wo_l   = tf.tolocal(wo);
         const float cosNO = wo_l.z;
@@ -948,9 +757,9 @@ struct Microfacet final : public BSDF, MicrofacetParams {
     }
 
 private:
-    static float SQR(float x) { return x * x; }
+    static OSL_HOSTDEVICE float SQR(float x) { return x * x; }
 
-    float evalLambda(const Vec3 w) const
+    OSL_HOSTDEVICE float evalLambda(const Vec3 w) const
     {
         float cosTheta2 = SQR(w.z);
         /* Have these two multiplied by sinTheta^2 for convenience */
@@ -959,15 +768,18 @@ struct Microfacet final : public BSDF, MicrofacetParams {
         return Distribution::Lambda(cosTheta2 / (cosPhi2st2 + sinPhi2st2));
     }
 
-    static float evalG2(float Lambda_i, float Lambda_o)
+    static OSL_HOSTDEVICE float evalG2(float Lambda_i, float Lambda_o)
     {
         // correlated masking-shadowing
         return 1 / (Lambda_i + Lambda_o + 1);
     }
 
-    static float evalG1(float Lambda_v) { return 1 / (Lambda_v + 1); }
+    static OSL_HOSTDEVICE float evalG1(float Lambda_v)
+    {
+        return 1 / (Lambda_v + 1);
+    }
 
-    float evalD(const Vec3 Hr) const
+    OSL_HOSTDEVICE float evalD(const Vec3 Hr) const
     {
         float cosThetaM = Hr.z;
         if (cosThetaM > 0) {
@@ -979,12 +791,20 @@ struct Microfacet final : public BSDF, MicrofacetParams {
 
             float tanThetaM2 = (cosPhi2st2 + sinPhi2st2) / cosThetaM2;
 
-            return Distribution::F(tanThetaM2) / (xalpha * yalpha * cosThetaM4);
+            const float val = Distribution::F(tanThetaM2)
+                              / (xalpha * yalpha * cosThetaM4);
+#ifndef __CUDACC__
+            return val;
+#else
+            // Filter out NaNs that can be produced when cosThetaM is very small.
+            return (val == val) ? val : 0;
+#endif
         }
         return 0;
     }
 
-    Vec3 sampleMicronormal(const Vec3 wo, float randu, float randv) const
+    OSL_HOSTDEVICE Vec3 sampleMicronormal(const Vec3 wo, float randu,
+                                          float randv) const
     {
         /* Project wo and stretch by alpha values */
         Vec3 swo = wo;
@@ -992,6 +812,10 @@ struct Microfacet final : public BSDF, MicrofacetParams {
         swo.y *= yalpha;
         swo = swo.normalize();
 
+#ifdef __CUDACC__
+        swo = swo.normalize();
+#endif
+
         // figure out angles for the incoming vector
         float cos_theta = std::max(swo.z, 0.0f);
         float cos_phi   = 1;
@@ -1029,11 +853,12 @@ typedef Microfacet<BeckmannDist, 2> MicrofacetBeckmannBoth;
 
 
 // We use the CRTP to inherit the parameters because each MaterialX closure uses a different set of parameters
-template<typename MxMicrofacetParams, typename Distribution,
+template<typename MxMicrofacetParams, typename Distribution, ClosureIDs ID,
          bool EnableTransmissionLobe>
 struct MxMicrofacet final : public BSDF, MxMicrofacetParams {
-    MxMicrofacet(const MxMicrofacetParams& params, float refraction_ior)
-        : BSDF()
+    OSL_HOSTDEVICE MxMicrofacet(const MxMicrofacetParams& params,
+                                float refraction_ior)
+        : BSDF(ID)
         , MxMicrofacetParams(params)
         , tf(TangentFrame::from_normal_and_tangent(MxMicrofacetParams::N,
                                                    MxMicrofacetParams::U))
@@ -1041,7 +866,7 @@ struct MxMicrofacet final : public BSDF, MxMicrofacetParams {
     {
     }
 
-    float get_fresnel_angle(float cos_theta) const
+    OSL_HOSTDEVICE float get_fresnel_angle(float cos_theta) const
     {
         if (EnableTransmissionLobe && refraction_ior < 1) {
             // handle TIR if we are on the backside
@@ -1055,7 +880,7 @@ struct MxMicrofacet final : public BSDF, MxMicrofacetParams {
         return cos_theta;
     }
 
-    Color3 get_albedo(const Vec3& wo) const override
+    OSL_HOSTDEVICE Color3 get_albedo(const Vec3& wo) const
     {
         // if transmission is enabled, punt on
         if (EnableTransmissionLobe)
@@ -1067,7 +892,7 @@ struct MxMicrofacet final : public BSDF, MxMicrofacetParams {
             get_fresnel_angle(MxMicrofacetParams::N.dot(wo)));
     }
 
-    Sample eval(const Vec3& wo, const OSL::Vec3& wi) const override
+    OSL_HOSTDEVICE Sample eval(const Vec3& wo, const OSL::Vec3& wi) const
     {
         const Vec3 wo_l = tf.tolocal(wo);
         const Vec3 wi_l = tf.tolocal(wi);
@@ -1146,7 +971,8 @@ struct MxMicrofacet final : public BSDF, MxMicrofacetParams {
     }
 
 
-    Sample sample(const Vec3& wo, float rx, float ry, float rz) const override
+    OSL_HOSTDEVICE Sample sample(const Vec3& wo, float rx, float ry,
+                                 float rz) const
     {
         const Vec3 wo_l   = tf.tolocal(wo);
         const float cosNO = wo_l.z;
@@ -1221,9 +1047,9 @@ struct MxMicrofacet final : public BSDF, MxMicrofacetParams {
     }
 
 private:
-    static float SQR(float x) { return x * x; }
+    static OSL_HOSTDEVICE float SQR(float x) { return x * x; }
 
-    float evalLambda(const Vec3 w) const
+    OSL_HOSTDEVICE float evalLambda(const Vec3 w) const
     {
         float cosTheta2 = SQR(w.z);
         /* Have these two multiplied by sinTheta^2 for convenience */
@@ -1232,15 +1058,18 @@ struct MxMicrofacet final : public BSDF, MxMicrofacetParams {
         return Distribution::Lambda(cosTheta2 / (cosPhi2st2 + sinPhi2st2));
     }
 
-    static float evalG2(float Lambda_i, float Lambda_o)
+    static OSL_HOSTDEVICE float evalG2(float Lambda_i, float Lambda_o)
     {
         // correlated masking-shadowing
         return 1 / (Lambda_i + Lambda_o + 1);
     }
 
-    static float evalG1(float Lambda_v) { return 1 / (Lambda_v + 1); }
+    static OSL_HOSTDEVICE float evalG1(float Lambda_v)
+    {
+        return 1 / (Lambda_v + 1);
+    }
 
-    float evalD(const Vec3 Hr) const
+    OSL_HOSTDEVICE float evalD(const Vec3 Hr) const
     {
         float cosThetaM = Hr.z;
         if (cosThetaM > 0) {
@@ -1252,14 +1081,22 @@ struct MxMicrofacet final : public BSDF, MxMicrofacetParams {
 
             float tanThetaM2 = (cosPhi2st2 + sinPhi2st2) / cosThetaM2;
 
-            return Distribution::F(tanThetaM2)
-                   / (MxMicrofacetParams::roughness_x
-                      * MxMicrofacetParams::roughness_y * cosThetaM4);
+            const float val = Distribution::F(tanThetaM2)
+                              / (MxMicrofacetParams::roughness_x
+                                 * MxMicrofacetParams::roughness_y
+                                 * cosThetaM4);
+#ifndef __CUDACC__
+            return val;
+#else
+            // Filter out NaNs that can be produced when cosThetaM is very small.
+            return (val == val) ? val : 0.0;
+#endif
         }
         return 0;
     }
 
-    Vec3 sampleMicronormal(const Vec3 wo, float randu, float randv) const
+    OSL_HOSTDEVICE Vec3 sampleMicronormal(const Vec3 wo, float randu,
+                                          float randv) const
     {
         /* Project wo and stretch by alpha values */
         Vec3 swo = wo;
@@ -1297,23 +1134,24 @@ struct MxMicrofacet final : public BSDF, MxMicrofacetParams {
 };
 
 struct Reflection final : public BSDF, ReflectionParams {
-    Reflection(const ReflectionParams& params)
-        : BSDF(), ReflectionParams(params)
+    OSL_HOSTDEVICE Reflection(const ReflectionParams& params)
+        : BSDF(REFLECTION_ID), ReflectionParams(params)
     {
     }
-    Color3 get_albedo(const Vec3& wo) const override
+    OSL_HOSTDEVICE Color3 get_albedo(const Vec3& wo) const
     {
         float cosNO = N.dot(wo);
         if (cosNO > 0)
             return Color3(fresnel_dielectric(cosNO, eta));
         return Color3(1);
     }
-    Sample eval(const Vec3& /*wo*/, const OSL::Vec3& /*wi*/) const override
+    OSL_HOSTDEVICE Sample eval(const Vec3& /*wo*/,
+                               const OSL::Vec3& /*wi*/) const
     {
         return {};
     }
-    Sample sample(const Vec3& wo, float /*rx*/, float /*ry*/,
-                  float /*rz*/) const override
+    OSL_HOSTDEVICE Sample sample(const Vec3& wo, float /*rx*/, float /*ry*/,
+                                 float /*rz*/) const
     {
         // only one direction is possible
         float cosNO = dot(N, wo);
@@ -1327,21 +1165,22 @@ struct Reflection final : public BSDF, ReflectionParams {
 };
 
 struct Refraction final : public BSDF, RefractionParams {
-    Refraction(const RefractionParams& params)
-        : BSDF(), RefractionParams(params)
+    OSL_HOSTDEVICE Refraction(const RefractionParams& params)
+        : BSDF(REFRACTION_ID), RefractionParams(params)
     {
     }
-    Color3 get_albedo(const Vec3& wo) const override
+    OSL_HOSTDEVICE Color3 get_albedo(const Vec3& wo) const
     {
         float cosNO = N.dot(wo);
         return Color3(1 - fresnel_dielectric(cosNO, eta));
     }
-    Sample eval(const Vec3& /*wo*/, const OSL::Vec3& /*wi*/) const override
+    OSL_HOSTDEVICE Sample eval(const Vec3& /*wo*/,
+                               const OSL::Vec3& /*wi*/) const
     {
         return {};
     }
-    Sample sample(const Vec3& wo, float /*rx*/, float /*ry*/,
-                  float /*rz*/) const override
+    OSL_HOSTDEVICE Sample sample(const Vec3& wo, float /*rx*/, float /*ry*/,
+                                 float /*rz*/) const
     {
         float pdf = std::numeric_limits<float>::infinity();
         Vec3 wi;
@@ -1351,13 +1190,13 @@ struct Refraction final : public BSDF, RefractionParams {
 };
 
 struct Transparent final : public BSDF {
-    Transparent() : BSDF() {}
-    Sample eval(const Vec3& /*wo*/, const Vec3& /*wi*/) const override
+    OSL_HOSTDEVICE Transparent() : BSDF(TRANSPARENT_ID) {}
+    OSL_HOSTDEVICE Sample eval(const Vec3& /*wo*/, const Vec3& /*wi*/) const
     {
         return {};
     }
-    Sample sample(const Vec3& wo, float /*rx*/, float /*ry*/,
-                  float /*rz*/) const override
+    OSL_HOSTDEVICE Sample sample(const Vec3& wo, float /*rx*/, float /*ry*/,
+                                 float /*rz*/) const
     {
         Vec3 wi   = -wo;
         float pdf = std::numeric_limits<float>::infinity();
@@ -1366,14 +1205,14 @@ struct Transparent final : public BSDF {
 };
 
 struct MxBurleyDiffuse final : public BSDF, MxBurleyDiffuseParams {
-    MxBurleyDiffuse(const MxBurleyDiffuseParams& params)
-        : BSDF(), MxBurleyDiffuseParams(params)
+    OSL_HOSTDEVICE MxBurleyDiffuse(const MxBurleyDiffuseParams& params)
+        : BSDF(MX_BURLEY_DIFFUSE_ID), MxBurleyDiffuseParams(params)
     {
     }
 
-    Color3 get_albedo(const Vec3& wo) const override { return albedo; }
+    OSL_HOSTDEVICE Color3 get_albedo(const Vec3& wo) const { return albedo; }
 
-    Sample eval(const Vec3& wo, const Vec3& wi) const override
+    OSL_HOSTDEVICE Sample eval(const Vec3& wo, const Vec3& wi) const
     {
         const Vec3 L = wi, V = wo;
         const Vec3 H = (L + V).normalize();
@@ -1387,7 +1226,8 @@ struct MxBurleyDiffuse final : public BSDF, MxBurleyDiffuseParams {
         return { wi, albedo * refL * refV, pdf, 1.0f };
     }
 
-    Sample sample(const Vec3& wo, float rx, float ry, float rz) const override
+    OSL_HOSTDEVICE Sample sample(const Vec3& wo, float rx, float ry,
+                                 float rz) const
     {
         Vec3 out_dir;
         float pdf;
@@ -1401,9 +1241,12 @@ struct MxBurleyDiffuse final : public BSDF, MxBurleyDiffuseParams {
 // To simplify the implementation, the simpler shadowing/masking visibility term below is used:
 // https://dassaultsystemes-technology.github.io/EnterprisePBRShadingModel/spec-2022x.md.html#components/sheen
 struct CharlieSheen final : public BSDF, MxSheenParams {
-    CharlieSheen(const MxSheenParams& params) : BSDF(), MxSheenParams(params) {}
+    OSL_HOSTDEVICE CharlieSheen(const MxSheenParams& params)
+        : BSDF(MX_SHEEN_ID), MxSheenParams(params)
+    {
+    }
 
-    Color3 get_albedo(const Vec3& wo) const override
+    OSL_HOSTDEVICE Color3 get_albedo(const Vec3& wo) const
     {
         const float NdotV = clamp(N.dot(wo), 0.0f, 1.0f);
         // Rational fit from the Material X project
@@ -1417,7 +1260,7 @@ struct CharlieSheen final : public BSDF, MxSheenParams {
         return clamp(albedo * (r.x / r.y), 0.0f, 1.0f);
     }
 
-    Sample eval(const Vec3& wo, const Vec3& wi) const override
+    OSL_HOSTDEVICE Sample eval(const Vec3& wo, const Vec3& wi) const
     {
         const Vec3 L = wi, V = wo;
         const Vec3 H       = (L + V).normalize();
@@ -1437,7 +1280,8 @@ struct CharlieSheen final : public BSDF, MxSheenParams {
                  pdf, 1.0f };
     }
 
-    Sample sample(const Vec3& wo, float rx, float ry, float rz) const override
+    OSL_HOSTDEVICE Sample sample(const Vec3& wo, float rx, float ry,
+                                 float rz) const
     {
         Vec3 out_dir;
         float pdf;
@@ -1451,20 +1295,20 @@ struct CharlieSheen final : public BSDF, MxSheenParams {
 // Tizian Zeltner, Brent Burley, Matt Jen-Yuan Chiang - Siggraph 2022
 // https://tizianzeltner.com/projects/Zeltner2022Practical/
 struct ZeltnerBurleySheen final : public BSDF, MxSheenParams {
-    ZeltnerBurleySheen(const MxSheenParams& params)
-        : BSDF(), MxSheenParams(params)
+    OSL_HOSTDEVICE ZeltnerBurleySheen(const MxSheenParams& params)
+        : BSDF(MX_SHEEN_ID), MxSheenParams(params)
     {
     }
 
 #define USE_LTC_SAMPLING 1
 
-    Color3 get_albedo(const Vec3& wo) const override
+    OSL_HOSTDEVICE Color3 get_albedo(const Vec3& wo) const
     {
         const float NdotV = clamp(N.dot(wo), 1e-5f, 1.0f);
         return Color3(fetch_ltc(NdotV).z);
     }
 
-    Sample eval(const Vec3& wo, const Vec3& wi) const override
+    OSL_HOSTDEVICE Sample eval(const Vec3& wo, const Vec3& wi) const
     {
         const Vec3 L = wi, V = wo;
         const float NdotV = clamp(N.dot(V), 0.0f, 1.0f);
@@ -1492,7 +1336,8 @@ struct ZeltnerBurleySheen final : public BSDF, MxSheenParams {
 #endif
     }
 
-    Sample sample(const Vec3& wo, float rx, float ry, float rz) const override
+    OSL_HOSTDEVICE Sample sample(const Vec3& wo, float rx, float ry,
+                                 float rz) const
     {
 #if USE_LTC_SAMPLING == 1
         const Vec3 V      = wo;
@@ -1523,7 +1368,7 @@ struct ZeltnerBurleySheen final : public BSDF, MxSheenParams {
     }
 
 private:
-    Vec3 fetch_ltc(float NdotV) const
+    OSL_HOSTDEVICE Vec3 fetch_ltc(float NdotV) const
     {
         // To avoid look-up tables, we use a fit of the LTC coefficients derived by Stephen Hill
         // for the implementation in MaterialX:
@@ -1554,153 +1399,208 @@ struct ZeltnerBurleySheen final : public BSDF, MxSheenParams {
 };
 
 
-Color3
-evaluate_layer_opacity(const OSL::ShaderGlobals& sg,
-                       const ClosureColor* closure)
+OSL_HOSTDEVICE Color3
+evaluate_layer_opacity(const ShaderGlobalsType& sg, const ClosureColor* closure)
 {
     // Null closure, the layer is fully transparent
     if (closure == nullptr)
         return Color3(0);
 
-    switch (closure->id) {
-    case ClosureColor::MUL:
-        return closure->as_mul()->weight
-               * evaluate_layer_opacity(sg, closure->as_mul()->closure);
-    case ClosureColor::ADD:
-        return evaluate_layer_opacity(sg, closure->as_add()->closureA)
-               + evaluate_layer_opacity(sg, closure->as_add()->closureB);
-    default: {
-        const ClosureComponent* comp = closure->as_comp();
-        Color3 w                     = comp->w;
-        switch (comp->id) {
-        case MX_LAYER_ID: {
-            const MxLayerParams* srcparams = comp->as<MxLayerParams>();
-            return w
-                   * (evaluate_layer_opacity(sg, srcparams->top)
-                      + evaluate_layer_opacity(sg, srcparams->base));
-        }
-        case REFLECTION_ID:
-        case FRESNEL_REFLECTION_ID: {
-            Reflection bsdf(*comp->as<ReflectionParams>());
-            return w * bsdf.get_albedo(-sg.I);
-        }
-        case MX_DIELECTRIC_ID: {
-            const MxDielectricParams& params = *comp->as<MxDielectricParams>();
-            // Transmissive dielectrics are opaque
-            if (!is_black(params.transmission_tint))
-                return Color3(1);
-            MxMicrofacet<MxDielectricParams, GGXDist, false> mf(params, 1.0f);
-            return w * mf.get_albedo(-sg.I);
-        }
-        case MX_GENERALIZED_SCHLICK_ID: {
-            const MxGeneralizedSchlickParams& params
-                = *comp->as<MxGeneralizedSchlickParams>();
-            // Transmissive dielectrics are opaque
-            if (!is_black(params.transmission_tint))
-                return Color3(1);
-            MxMicrofacet<MxGeneralizedSchlickParams, GGXDist, false> mf(params,
-                                                                        1.0f);
-            return w * mf.get_albedo(-sg.I);
+    // Non-recursive traversal stack
+    const int STACK_SIZE = 16;
+    int stack_idx        = 0;
+    const ClosureColor* ptr_stack[STACK_SIZE];
+    Color3 weight_stack[STACK_SIZE];
+    Color3 weight = Color3(1.0f);
+
+    while (closure) {
+        switch (closure->id) {
+        case ClosureColor::MUL:
+            weight *= closure->as_mul()->weight;
+            closure = closure->as_mul()->closure;
+            break;
+        case ClosureColor::ADD:
+            ptr_stack[stack_idx]      = closure->as_add()->closureB;
+            weight_stack[stack_idx++] = weight;
+            closure                   = closure->as_add()->closureA;
+            break;
+        default: {
+            const ClosureComponent* comp = closure->as_comp();
+            Color3 w                     = comp->w;
+            switch (comp->id) {
+            case MX_LAYER_ID: {
+                const MxLayerParams* srcparams = comp->as<MxLayerParams>();
+                closure                        = srcparams->top;
+                ptr_stack[stack_idx]           = srcparams->base;
+                weight_stack[stack_idx++]      = weight * w;
+                break;
+            }
+            case REFLECTION_ID:
+            case FRESNEL_REFLECTION_ID: {
+                Reflection bsdf(*comp->as<ReflectionParams>());
+                weight *= w * bsdf.get_albedo(-sg.I);
+                closure = nullptr;
+                break;
+            }
+            case MX_DIELECTRIC_ID: {
+                const MxDielectricParams& params
+                    = *comp->as<MxDielectricParams>();
+                // Transmissive dielectrics are opaque
+                if (!is_black(params.transmission_tint)) {
+                    closure = nullptr;
+                    break;
+                }
+                MxMicrofacet<MxDielectricParams, GGXDist, MX_DIELECTRIC_ID, false>
+                    mf(params, 1.0f);
+                weight *= w * mf.get_albedo(-sg.I);
+                closure = nullptr;
+                break;
+            }
+            case MX_GENERALIZED_SCHLICK_ID: {
+                const MxGeneralizedSchlickParams& params
+                    = *comp->as<MxGeneralizedSchlickParams>();
+                // Transmissive dielectrics are opaque
+                if (!is_black(params.transmission_tint)) {
+                    closure = nullptr;
+                    break;
+                }
+                MxMicrofacet<MxGeneralizedSchlickParams, GGXDist,
+                             MX_GENERALIZED_SCHLICK_ID, false>
+                    mf(params, 1.0f);
+                weight *= w * mf.get_albedo(-sg.I);
+                closure = nullptr;
+                break;
+            }
+            case MX_SHEEN_ID: {
+                const MxSheenParams& params = *comp->as<MxSheenParams>();
+                if (params.mode == 1) {
+                    weight *= w * ZeltnerBurleySheen(params).get_albedo(-sg.I);
+                } else {
+                    // otherwise, default to old sheen model
+                    weight *= w * CharlieSheen(params).get_albedo(-sg.I);
+                }
+                closure = nullptr;
+                break;
+            }
+            default:  // Assume unhandled BSDFs are opaque
+                closure = nullptr;
+                break;
+            }
         }
-        case MX_SHEEN_ID: {
-            const MxSheenParams& params = *comp->as<MxSheenParams>();
-            if (params.mode == 1)
-                return w * ZeltnerBurleySheen(params).get_albedo(-sg.I);
-            // otherwise, default to old sheen model
-            return w * CharlieSheen(params).get_albedo(-sg.I);
         }
-        default:  // Assume unhandled BSDFs are opaque
-            return Color3(1);
+        if (closure == nullptr && stack_idx > 0) {
+            closure = ptr_stack[--stack_idx];
+            weight  = weight_stack[stack_idx];
         }
     }
-    }
-    OSL_ASSERT(false && "Layer opacity evaluation failed");
-    return Color3(0);
+    return weight;
 }
 
-void
-process_medium_closure(const OSL::ShaderGlobals& sg, ShadingResult& result,
+OSL_HOSTDEVICE void
+process_medium_closure(const ShaderGlobalsType& sg, ShadingResult& result,
                        const ClosureColor* closure, const Color3& w)
 {
     if (!closure)
         return;
-    switch (closure->id) {
-    case ClosureColor::MUL: {
-        process_medium_closure(sg, result, closure->as_mul()->closure,
-                               w * closure->as_mul()->weight);
-        break;
-    }
-    case ClosureColor::ADD: {
-        process_medium_closure(sg, result, closure->as_add()->closureA, w);
-        process_medium_closure(sg, result, closure->as_add()->closureB, w);
-        break;
-    }
-    case MX_LAYER_ID: {
-        const ClosureComponent* comp = closure->as_comp();
-        const MxLayerParams* params  = comp->as<MxLayerParams>();
-        Color3 base_w
-            = w
-              * (Color3(1)
-                 - clamp(evaluate_layer_opacity(sg, params->top), 0.f, 1.f));
-        process_medium_closure(sg, result, params->top, w);
-        process_medium_closure(sg, result, params->base, base_w);
-        break;
-    }
-    case MX_ANISOTROPIC_VDF_ID: {
-        const ClosureComponent* comp = closure->as_comp();
-        Color3 cw                    = w * comp->w;
-        const auto& params           = *comp->as<MxAnisotropicVdfParams>();
-        result.sigma_t               = cw * params.extinction;
-        result.sigma_s               = params.albedo * result.sigma_t;
-        result.medium_g              = params.anisotropy;
-        break;
-    }
-    case MX_MEDIUM_VDF_ID: {
-        const ClosureComponent* comp = closure->as_comp();
-        Color3 cw                    = w * comp->w;
-        const auto& params           = *comp->as<MxMediumVdfParams>();
-        result.sigma_t = { -OIIO::fast_log(params.transmission_color.x),
-                           -OIIO::fast_log(params.transmission_color.y),
-                           -OIIO::fast_log(params.transmission_color.z) };
-        // NOTE: closure weight scales the extinction parameter
-        result.sigma_t *= cw / params.transmission_depth;
-        result.sigma_s  = params.albedo * result.sigma_t;
-        result.medium_g = params.anisotropy;
-        // TODO: properly track a medium stack here ...
-        result.refraction_ior = sg.backfacing ? 1.0f / params.ior : params.ior;
-        result.priority       = params.priority;
-        break;
-    }
-    case MX_DIELECTRIC_ID: {
-        const ClosureComponent* comp = closure->as_comp();
-        const auto& params           = *comp->as<MxDielectricParams>();
-        if (!is_black(w * comp->w * params.transmission_tint)) {
+
+    // Non-recursive traversal stack
+    const int STACK_SIZE = 16;
+    int stack_idx        = 0;
+    const ClosureColor* ptr_stack[STACK_SIZE];
+    Color3 weight_stack[STACK_SIZE];
+    Color3 weight = w;
+
+    while (closure) {
+        switch (closure->id) {
+        case ClosureColor::MUL: {
+            weight *= closure->as_mul()->weight;
+            closure = closure->as_mul()->closure;
+            break;
+        }
+        case ClosureColor::ADD: {
+            weight_stack[stack_idx] = weight;
+            ptr_stack[stack_idx++]  = closure->as_add()->closureB;
+            closure                 = closure->as_add()->closureA;
+            break;
+        }
+        case MX_LAYER_ID: {
+            const ClosureComponent* comp = closure->as_comp();
+            const MxLayerParams* params  = comp->as<MxLayerParams>();
+            Color3 base_w                = weight
+                            * (Color3(1)
+                               - clamp(evaluate_layer_opacity(sg, params->top),
+                                       0.f, 1.f));
+            closure                   = params->top;
+            ptr_stack[stack_idx]      = params->base;
+            weight_stack[stack_idx++] = weight * base_w;
+            break;
+        }
+        case MX_ANISOTROPIC_VDF_ID: {
+            const ClosureComponent* comp = closure->as_comp();
+            Color3 cw                    = weight * comp->w;
+            const auto& params           = *comp->as<MxAnisotropicVdfParams>();
+            result.sigma_t               = cw * params.extinction;
+            result.sigma_s               = params.albedo * result.sigma_t;
+            result.medium_g              = params.anisotropy;
+            closure                      = nullptr;
+            break;
+        }
+        case MX_MEDIUM_VDF_ID: {
+            const ClosureComponent* comp = closure->as_comp();
+            Color3 cw                    = weight * comp->w;
+            const auto& params           = *comp->as<MxMediumVdfParams>();
+            result.sigma_t = { -OIIO::fast_log(params.transmission_color.x),
+                               -OIIO::fast_log(params.transmission_color.y),
+                               -OIIO::fast_log(params.transmission_color.z) };
+            // NOTE: closure weight scales the extinction parameter
+            result.sigma_t *= cw / params.transmission_depth;
+            result.sigma_s  = params.albedo * result.sigma_t;
+            result.medium_g = params.anisotropy;
             // TODO: properly track a medium stack here ...
             result.refraction_ior = sg.backfacing ? 1.0f / params.ior
                                                   : params.ior;
+            result.priority       = params.priority;
+            closure               = nullptr;
+            break;
         }
-        break;
-    }
-    case MX_GENERALIZED_SCHLICK_ID: {
-        const ClosureComponent* comp = closure->as_comp();
-        const auto& params           = *comp->as<MxGeneralizedSchlickParams>();
-        if (!is_black(w * comp->w * params.transmission_tint)) {
-            // TODO: properly track a medium stack here ...
-            float avg_F0  = clamp((params.f0.x + params.f0.y + params.f0.z)
-                                      / 3.0f,
-                                  0.0f, 0.99f);
-            float sqrt_F0 = sqrtf(avg_F0);
-            float ior     = (1 + sqrt_F0) / (1 - sqrt_F0);
-            result.refraction_ior = sg.backfacing ? 1.0f / ior : ior;
+        case MX_DIELECTRIC_ID: {
+            const ClosureComponent* comp = closure->as_comp();
+            const auto& params           = *comp->as<MxDielectricParams>();
+            if (!is_black(weight * comp->w * params.transmission_tint)) {
+                // TODO: properly track a medium stack here ...
+                result.refraction_ior = sg.backfacing ? 1.0f / params.ior
+                                                      : params.ior;
+            }
+            closure = nullptr;
+            break;
+        }
+        case MX_GENERALIZED_SCHLICK_ID: {
+            const ClosureComponent* comp = closure->as_comp();
+            const auto& params = *comp->as<MxGeneralizedSchlickParams>();
+            if (!is_black(weight * comp->w * params.transmission_tint)) {
+                // TODO: properly track a medium stack here ...
+                float avg_F0  = clamp((params.f0.x + params.f0.y + params.f0.z)
+                                          / 3.0f,
+                                      0.0f, 0.99f);
+                float sqrt_F0 = sqrtf(avg_F0);
+                float ior     = (1 + sqrt_F0) / (1 - sqrt_F0);
+                result.refraction_ior = sg.backfacing ? 1.0f / ior : ior;
+            }
+            closure = nullptr;
+            break;
+        }
+        default: closure = nullptr; break;
+        }
+        if (closure == nullptr && stack_idx > 0) {
+            closure = ptr_stack[--stack_idx];
+            weight  = weight_stack[stack_idx];
         }
-        break;
-    }
     }
 }
 
 // recursively walk through the closure tree, creating bsdfs as we go
-void
-process_bsdf_closure(const OSL::ShaderGlobals& sg, ShadingResult& result,
+OSL_HOSTDEVICE void
+process_bsdf_closure(const ShaderGlobalsType& sg, ShadingResult& result,
                      const ClosureColor* closure, const Color3& w,
                      bool light_only)
 {
@@ -1709,242 +1609,605 @@ process_bsdf_closure(const OSL::ShaderGlobals& sg, ShadingResult& result,
     static const ustringhash uh_default("default");
     if (!closure)
         return;
-    switch (closure->id) {
-    case ClosureColor::MUL: {
-        Color3 cw = w * closure->as_mul()->weight;
-        process_bsdf_closure(sg, result, closure->as_mul()->closure, cw,
-                             light_only);
-        break;
-    }
-    case ClosureColor::ADD: {
-        process_bsdf_closure(sg, result, closure->as_add()->closureA, w,
-                             light_only);
-        process_bsdf_closure(sg, result, closure->as_add()->closureB, w,
-                             light_only);
-        break;
-    }
-    default: {
-        const ClosureComponent* comp = closure->as_comp();
-        Color3 cw                    = w * comp->w;
-        if (comp->id == EMISSION_ID)
-            result.Le += cw;
-        else if (comp->id == MX_UNIFORM_EDF_ID)
-            result.Le += cw * comp->as<MxUniformEdfParams>()->emittance;
-        else if (!light_only) {
-            bool ok = false;
-            switch (comp->id) {
-            case DIFFUSE_ID:
-                ok = result.bsdf.add_bsdf<Diffuse<0>>(
-                    cw, *comp->as<DiffuseParams>());
-                break;
-            case OREN_NAYAR_ID:
-                ok = result.bsdf.add_bsdf<OrenNayar>(
-                    cw, *comp->as<OrenNayarParams>());
-                break;
-            case TRANSLUCENT_ID:
-                ok = result.bsdf.add_bsdf<Diffuse<1>>(
-                    cw, *comp->as<DiffuseParams>());
-                break;
-            case PHONG_ID:
-                ok = result.bsdf.add_bsdf<Phong>(cw, *comp->as<PhongParams>());
-                break;
-            case WARD_ID:
-                ok = result.bsdf.add_bsdf<Ward>(cw, *comp->as<WardParams>());
-                break;
-            case MICROFACET_ID: {
-                const MicrofacetParams* mp = comp->as<MicrofacetParams>();
-                if (mp->dist == uh_ggx) {
-                    switch (mp->refract) {
-                    case 0:
-                        ok = result.bsdf.add_bsdf<MicrofacetGGXRefl>(cw, *mp);
-                        break;
-                    case 1:
-                        ok = result.bsdf.add_bsdf<MicrofacetGGXRefr>(cw, *mp);
-                        break;
-                    case 2:
-                        ok = result.bsdf.add_bsdf<MicrofacetGGXBoth>(cw, *mp);
-                        break;
+
+    // Non-recursive traversal stack
+    const int STACK_SIZE = 16;
+    int stack_idx        = 0;
+    const ClosureColor* ptr_stack[STACK_SIZE];
+    Color3 weight_stack[STACK_SIZE];
+    Color3 weight = w;
+
+    while (closure) {
+        switch (closure->id) {
+        case ClosureColor::MUL: {
+            weight *= closure->as_mul()->weight;
+            closure = closure->as_mul()->closure;
+            break;
+        }
+        case ClosureColor::ADD: {
+            ptr_stack[stack_idx]      = closure->as_add()->closureB;
+            weight_stack[stack_idx++] = weight;
+            closure                   = closure->as_add()->closureA;
+            break;
+        }
+        default: {
+            const ClosureComponent* comp = closure->as_comp();
+            Color3 cw                    = weight * comp->w;
+            closure                      = nullptr;
+            if (comp->id == EMISSION_ID)
+                result.Le += cw;
+            else if (comp->id == MX_UNIFORM_EDF_ID)
+                result.Le += cw * comp->as<MxUniformEdfParams>()->emittance;
+            else if (!light_only) {
+                bool ok = false;
+                switch (comp->id) {
+                case DIFFUSE_ID:
+                    ok = result.bsdf.add_bsdf<Diffuse<0>>(
+                        cw, *comp->as<DiffuseParams>());
+                    break;
+                case OREN_NAYAR_ID:
+                    ok = result.bsdf.add_bsdf<OrenNayar>(
+                        cw, *comp->as<OrenNayarParams>());
+                    break;
+                case TRANSLUCENT_ID:
+                    ok = result.bsdf.add_bsdf<Diffuse<1>>(
+                        cw, *comp->as<DiffuseParams>());
+                    break;
+                case PHONG_ID:
+                    ok = result.bsdf.add_bsdf<Phong>(cw,
+                                                     *comp->as<PhongParams>());
+                    break;
+                case WARD_ID:
+                    ok = result.bsdf.add_bsdf<Ward>(cw,
+                                                    *comp->as<WardParams>());
+                    break;
+                case MICROFACET_ID: {
+                    const MicrofacetParams* mp = comp->as<MicrofacetParams>();
+                    if (mp->dist == uh_ggx) {
+                        switch (mp->refract) {
+                        case 0:
+                            ok = result.bsdf.add_bsdf<MicrofacetGGXRefl>(cw,
+                                                                         *mp);
+                            break;
+                        case 1:
+                            ok = result.bsdf.add_bsdf<MicrofacetGGXRefr>(cw,
+                                                                         *mp);
+                            break;
+                        case 2:
+                            ok = result.bsdf.add_bsdf<MicrofacetGGXBoth>(cw,
+                                                                         *mp);
+                            break;
+                        }
+                    } else if (mp->dist == uh_beckmann
+                               || mp->dist == uh_default) {
+                        switch (mp->refract) {
+                        case 0:
+                            ok = result.bsdf.add_bsdf<MicrofacetBeckmannRefl>(
+                                cw, *mp);
+                            break;
+                        case 1:
+                            ok = result.bsdf.add_bsdf<MicrofacetBeckmannRefr>(
+                                cw, *mp);
+                            break;
+                        case 2:
+                            ok = result.bsdf.add_bsdf<MicrofacetBeckmannBoth>(
+                                cw, *mp);
+                            break;
+                        }
                     }
-                } else if (mp->dist == uh_beckmann || mp->dist == uh_default) {
-                    switch (mp->refract) {
-                    case 0:
-                        ok = result.bsdf.add_bsdf<MicrofacetBeckmannRefl>(cw,
-                                                                          *mp);
-                        break;
-                    case 1:
-                        ok = result.bsdf.add_bsdf<MicrofacetBeckmannRefr>(cw,
-                                                                          *mp);
-                        break;
-                    case 2:
-                        ok = result.bsdf.add_bsdf<MicrofacetBeckmannBoth>(cw,
-                                                                          *mp);
-                        break;
+                    break;
+                }
+                case REFLECTION_ID:
+                case FRESNEL_REFLECTION_ID:
+                    ok = result.bsdf.add_bsdf<Reflection>(
+                        cw, *comp->as<ReflectionParams>());
+                    break;
+                case REFRACTION_ID:
+                    ok = result.bsdf.add_bsdf<Refraction>(
+                        cw, *comp->as<RefractionParams>());
+                    break;
+                case TRANSPARENT_ID:
+                    ok = result.bsdf.add_bsdf<Transparent>(cw);
+                    break;
+                case MX_OREN_NAYAR_DIFFUSE_ID: {
+                    const MxOrenNayarDiffuseParams* srcparams
+                        = comp->as<MxOrenNayarDiffuseParams>();
+                    if (srcparams->energy_compensation) {
+                        // energy compensation handled by its own BSDF
+                        ok = result.bsdf.add_bsdf<EnergyCompensatedOrenNayar>(
+                            cw, *srcparams);
+                    } else {
+                        // translate MaterialX parameters into existing closure
+                        OrenNayarParams params = {};
+                        params.N               = srcparams->N;
+                        params.sigma           = srcparams->roughness;
+                        ok = result.bsdf.add_bsdf<OrenNayar>(
+                            cw * srcparams->albedo, params);
                     }
+                    break;
                 }
-                break;
-            }
-            case REFLECTION_ID:
-            case FRESNEL_REFLECTION_ID:
-                ok = result.bsdf.add_bsdf<Reflection>(
-                    cw, *comp->as<ReflectionParams>());
-                break;
-            case REFRACTION_ID:
-                ok = result.bsdf.add_bsdf<Refraction>(
-                    cw, *comp->as<RefractionParams>());
-                break;
-            case TRANSPARENT_ID:
-                ok = result.bsdf.add_bsdf<Transparent>(cw);
-                break;
-            case MX_OREN_NAYAR_DIFFUSE_ID: {
-                const MxOrenNayarDiffuseParams* srcparams
-                    = comp->as<MxOrenNayarDiffuseParams>();
-                if (srcparams->energy_compensation) {
-                    // energy compensation handled by its own BSDF
-                    ok = result.bsdf.add_bsdf<EnergyCompensatedOrenNayar>(
-                        cw, *srcparams);
-                } else {
-                    // translate MaterialX parameters into existing closure
-                    OrenNayarParams params = {};
-                    params.N               = srcparams->N;
-                    params.sigma           = srcparams->roughness;
-                    ok = result.bsdf.add_bsdf<OrenNayar>(cw * srcparams->albedo,
-                                                         params);
+                case MX_BURLEY_DIFFUSE_ID: {
+                    const MxBurleyDiffuseParams& params
+                        = *comp->as<MxBurleyDiffuseParams>();
+                    ok = result.bsdf.add_bsdf<MxBurleyDiffuse>(cw, params);
+                    break;
                 }
-                break;
+                case MX_DIELECTRIC_ID: {
+                    const MxDielectricParams& params
+                        = *comp->as<MxDielectricParams>();
+                    if (is_black(params.transmission_tint))
+                        ok = result.bsdf.add_bsdf<
+                            MxMicrofacet<MxDielectricParams, GGXDist,
+                                         MX_DIELECTRIC_ID, false>>(cw, params,
+                                                                   1.0f);
+                    else
+                        ok = result.bsdf.add_bsdf<MxMicrofacet<
+                            MxDielectricParams, GGXDist, MX_DIELECTRIC_ID, true>>(
+                            cw, params, result.refraction_ior);
+                    break;
+                }
+                case MX_CONDUCTOR_ID: {
+                    const MxConductorParams& params
+                        = *comp->as<MxConductorParams>();
+                    ok = result.bsdf.add_bsdf<MxMicrofacet<
+                        MxConductorParams, GGXDist, MX_CONDUCTOR_ID, false>>(
+                        cw, params, 1.0f);
+                    break;
+                };
+                case MX_GENERALIZED_SCHLICK_ID: {
+                    const MxGeneralizedSchlickParams& params
+                        = *comp->as<MxGeneralizedSchlickParams>();
+                    if (is_black(params.transmission_tint))
+                        ok = result.bsdf.add_bsdf<
+                            MxMicrofacet<MxGeneralizedSchlickParams, GGXDist,
+                                         MX_GENERALIZED_SCHLICK_ID, false>>(
+                            cw, params, 1.0f);
+                    else
+                        ok = result.bsdf.add_bsdf<
+                            MxMicrofacet<MxGeneralizedSchlickParams, GGXDist,
+                                         MX_GENERALIZED_SCHLICK_ID, true>>(
+                            cw, params, result.refraction_ior);
+                    break;
+                };
+                case MX_TRANSLUCENT_ID: {
+                    const MxTranslucentParams* srcparams
+                        = comp->as<MxTranslucentParams>();
+                    DiffuseParams params = {};
+                    params.N             = srcparams->N;
+                    ok = result.bsdf.add_bsdf<Diffuse<1>>(cw * srcparams->albedo,
+                                                          params);
+                    break;
+                }
+                case MX_TRANSPARENT_ID: {
+                    ok = result.bsdf.add_bsdf<Transparent>(cw);
+                    break;
+                }
+                case MX_SUBSURFACE_ID: {
+                    // TODO: implement BSSRDF support?
+                    const MxSubsurfaceParams* srcparams
+                        = comp->as<MxSubsurfaceParams>();
+                    DiffuseParams params = {};
+                    params.N             = srcparams->N;
+                    ok = result.bsdf.add_bsdf<Diffuse<0>>(cw * srcparams->albedo,
+                                                          params);
+                    break;
+                }
+                case MX_SHEEN_ID: {
+                    const MxSheenParams& params = *comp->as<MxSheenParams>();
+                    if (params.mode == 1)
+                        ok = result.bsdf.add_bsdf<ZeltnerBurleySheen>(cw,
+                                                                      params);
+                    else
+                        ok = result.bsdf.add_bsdf<CharlieSheen>(
+                            cw, params);  // default to legacy closure
+                    break;
+                }
+                case MX_LAYER_ID: {
+                    const MxLayerParams* srcparams = comp->as<MxLayerParams>();
+                    Color3 base_w
+                        = weight
+                          * (Color3(1, 1, 1)
+                             - clamp(evaluate_layer_opacity(sg, srcparams->top),
+                                     0.f, 1.f));
+                    closure = srcparams->top;
+                    weight  = cw;
+                    if (!is_black(base_w)) {
+                        ptr_stack[stack_idx]      = srcparams->base;
+                        weight_stack[stack_idx++] = base_w;
+                    }
+                    ok = true;
+                    break;
+                }
+                case MX_ANISOTROPIC_VDF_ID:
+                case MX_MEDIUM_VDF_ID: {
+                    // already processed by process_medium_closure
+                    ok = true;
+                    break;
+                }
+                }
+#ifndef __CUDACC__
+                OSL_ASSERT(ok && "Invalid closure invoked in surface shader");
+#else
+                // TODO: We should never get here, but we sometimes do, e.g. in
+                // the render-material-layer test.
+                if (false && !ok)
+                    printf("Invalid closure invoked in surface shader\n");
+#endif
             }
-            case MX_BURLEY_DIFFUSE_ID: {
-                const MxBurleyDiffuseParams& params
-                    = *comp->as<MxBurleyDiffuseParams>();
-                ok = result.bsdf.add_bsdf<MxBurleyDiffuse>(cw, params);
+            break;
+        }
+        }
+        if (closure == nullptr && stack_idx > 0) {
+            closure = ptr_stack[--stack_idx];
+            weight  = weight_stack[stack_idx];
+        }
+    }
+}
+
+}  // anonymous namespace
+
+OSL_NAMESPACE_ENTER
+
+OSL_HOSTDEVICE void
+process_closure(const ShaderGlobalsType& sg, ShadingResult& result,
+                const ClosureColor* Ci, bool light_only)
+{
+    if (!light_only)
+        process_medium_closure(sg, result, Ci, Color3(1));
+    process_bsdf_closure(sg, result, Ci, Color3(1), light_only);
+}
+
+OSL_HOSTDEVICE Vec3
+process_background_closure(const ClosureColor* closure)
+{
+    if (!closure)
+        return Vec3(0, 0, 0);
+
+    // Non-recursive traversal stack
+    const int STACK_SIZE = 16;
+    int stack_idx        = 0;
+    const ClosureColor* ptr_stack[STACK_SIZE];
+    Color3 weight_stack[STACK_SIZE];
+    Color3 weight = Color3(1.0f);
+
+    while (closure) {
+        switch (closure->id) {
+        case ClosureColor::MUL: {
+            weight *= closure->as_mul()->weight;
+            closure = closure->as_mul()->closure;
+            break;
+        }
+        case ClosureColor::ADD: {
+            ptr_stack[stack_idx]      = closure->as_add()->closureB;
+            weight_stack[stack_idx++] = weight;
+            closure                   = closure->as_add()->closureA;
+            break;
+        }
+        case BACKGROUND_ID: {
+            weight *= closure->as_comp()->w;
+            closure = nullptr;
+            break;
+        }
+        }
+        if (closure == nullptr && stack_idx > 0) {
+            closure = ptr_stack[--stack_idx];
+            weight  = weight_stack[stack_idx];
+        }
+    }
+    return weight;
+}
+
+
+typedef MxMicrofacet<MxConductorParams, GGXDist, MX_CONDUCTOR_ID, false>
+    MxConductor;
+typedef MxMicrofacet<MxDielectricParams, GGXDist, MX_DIELECTRIC_ID, true>
+    MxDielectric;
+typedef MxMicrofacet<MxDielectricParams, GGXDist, MX_DIELECTRIC_ID, false>
+    MxDielectricOpaque;
+typedef MxMicrofacet<MxGeneralizedSchlickParams, GGXDist,
+                     MX_GENERALIZED_SCHLICK_ID, true>
+    MxGeneralizedSchlick;
+typedef MxMicrofacet<MxGeneralizedSchlickParams, GGXDist,
+                     MX_GENERALIZED_SCHLICK_ID, false>
+    MxGeneralizedSchlickOpaque;
+
+OSL_HOSTDEVICE Color3
+CompositeBSDF::get_albedo(const BSDF* bsdf, const Vec3& wo) const
+{
+    static const ustringhash uh_ggx("ggx");
+    static const ustringhash uh_beckmann("beckmann");
+    static const ustringhash uh_default("default");
+
+    Color3 albedo(0);
+    switch (bsdf->id) {
+    case DIFFUSE_ID:
+        albedo = BSDF_CAST(Diffuse<0>, bsdf)->get_albedo(wo);
+        break;
+    case TRANSPARENT_ID:
+    case MX_TRANSPARENT_ID:
+        albedo = BSDF_CAST(Transparent, bsdf)->get_albedo(wo);
+        break;
+    case OREN_NAYAR_ID:
+        albedo = BSDF_CAST(OrenNayar, bsdf)->get_albedo(wo);
+        break;
+    case TRANSLUCENT_ID:
+        albedo = BSDF_CAST(Diffuse<1>, bsdf)->get_albedo(wo);
+        break;
+    case PHONG_ID: albedo = BSDF_CAST(Phong, bsdf)->get_albedo(wo); break;
+    case WARD_ID: albedo = BSDF_CAST(Ward, bsdf)->get_albedo(wo); break;
+    case REFLECTION_ID:
+    case FRESNEL_REFLECTION_ID:
+        albedo = BSDF_CAST(Reflection, bsdf)->get_albedo(wo);
+        break;
+    case REFRACTION_ID:
+        albedo = BSDF_CAST(Refraction, bsdf)->get_albedo(wo);
+        break;
+    case MICROFACET_ID: {
+        const int refract      = ((MicrofacetBeckmannRefl*)bsdf)->refract;
+        const ustringhash dist = ((MicrofacetBeckmannRefl*)bsdf)->dist;
+        if (dist == uh_default || dist == uh_beckmann) {
+            switch (refract) {
+            case 0:
+                albedo = BSDF_CAST(MicrofacetBeckmannRefl, bsdf)->get_albedo(wo);
                 break;
-            }
-            case MX_DIELECTRIC_ID: {
-                const MxDielectricParams& params
-                    = *comp->as<MxDielectricParams>();
-                if (is_black(params.transmission_tint))
-                    ok = result.bsdf.add_bsdf<
-                        MxMicrofacet<MxDielectricParams, GGXDist, false>>(
-                        cw, params, 1.0f);
-                else
-                    ok = result.bsdf.add_bsdf<
-                        MxMicrofacet<MxDielectricParams, GGXDist, true>>(
-                        cw, params, result.refraction_ior);
+            case 1:
+                albedo = BSDF_CAST(MicrofacetBeckmannRefr, bsdf)->get_albedo(wo);
+                break;
+            case 2:
+                albedo = BSDF_CAST(MicrofacetBeckmannBoth, bsdf)->get_albedo(wo);
                 break;
             }
-            case MX_CONDUCTOR_ID: {
-                const MxConductorParams& params = *comp->as<MxConductorParams>();
-                ok = result.bsdf.add_bsdf<
-                    MxMicrofacet<MxConductorParams, GGXDist, false>>(cw, params,
-                                                                     1.0f);
+        } else if (dist == uh_ggx) {
+            switch (refract) {
+            case 0:
+                albedo = BSDF_CAST(MicrofacetGGXRefl, bsdf)->get_albedo(wo);
                 break;
-            };
-            case MX_GENERALIZED_SCHLICK_ID: {
-                const MxGeneralizedSchlickParams& params
-                    = *comp->as<MxGeneralizedSchlickParams>();
-                if (is_black(params.transmission_tint))
-                    ok = result.bsdf.add_bsdf<MxMicrofacet<
-                        MxGeneralizedSchlickParams, GGXDist, false>>(cw, params,
-                                                                     1.0f);
-                else
-                    ok = result.bsdf.add_bsdf<
-                        MxMicrofacet<MxGeneralizedSchlickParams, GGXDist, true>>(
-                        cw, params, result.refraction_ior);
+            case 1:
+                albedo = BSDF_CAST(MicrofacetGGXRefr, bsdf)->get_albedo(wo);
                 break;
-            };
-            case MX_TRANSLUCENT_ID: {
-                const MxTranslucentParams* srcparams
-                    = comp->as<MxTranslucentParams>();
-                DiffuseParams params = {};
-                params.N             = srcparams->N;
-                ok = result.bsdf.add_bsdf<Diffuse<1>>(cw * srcparams->albedo,
-                                                      params);
+            case 2:
+                albedo = BSDF_CAST(MicrofacetGGXBoth, bsdf)->get_albedo(wo);
                 break;
             }
-            case MX_TRANSPARENT_ID: {
-                ok = result.bsdf.add_bsdf<Transparent>(cw);
+        }
+        break;
+    }
+    case MX_CONDUCTOR_ID:
+        albedo = BSDF_CAST(MxConductor, bsdf)->get_albedo(wo);
+        break;
+    case MX_DIELECTRIC_ID:
+        if (is_black(((MxDielectricOpaque*)bsdf)->transmission_tint))
+            albedo = BSDF_CAST(MxDielectricOpaque, bsdf)->get_albedo(wo);
+        else
+            albedo = BSDF_CAST(MxDielectric, bsdf)->get_albedo(wo);
+        break;
+    case MX_OREN_NAYAR_DIFFUSE_ID:
+        albedo = BSDF_CAST(EnergyCompensatedOrenNayar, bsdf)->get_albedo(wo);
+        break;
+    case MX_BURLEY_DIFFUSE_ID:
+        albedo = BSDF_CAST(MxBurleyDiffuse, bsdf)->get_albedo(wo);
+        break;
+    case MX_SHEEN_ID:
+        if (BSDF_CAST(CharlieSheen, bsdf)->mode == 1)
+            albedo = BSDF_CAST(ZeltnerBurleySheen, bsdf)->get_albedo(wo);
+        else
+            albedo = BSDF_CAST(CharlieSheen, bsdf)->get_albedo(wo);
+        break;
+    case MX_GENERALIZED_SCHLICK_ID: {
+        const Color3& tint = ((MxGeneralizedSchlick*)bsdf)->transmission_tint;
+        if (is_black(tint))
+            albedo = BSDF_CAST(MxGeneralizedSchlickOpaque, bsdf)->get_albedo(wo);
+        else
+            albedo = BSDF_CAST(MxGeneralizedSchlick, bsdf)->get_albedo(wo);
+        break;
+    }
+    default: break;
+    }
+    return albedo;
+}
+
+
+OSL_HOSTDEVICE BSDF::Sample
+CompositeBSDF::sample(const BSDF* bsdf, const Vec3& wo, float rx, float ry,
+                      float rz) const
+{
+    static const ustringhash uh_ggx("ggx");
+    static const ustringhash uh_beckmann("beckmann");
+    static const ustringhash uh_default("default");
+
+    BSDF::Sample sample = {};
+    switch (bsdf->id) {
+    case DIFFUSE_ID:
+        sample = BSDF_CAST(Diffuse<0>, bsdf)->sample(wo, rx, ry, rz);
+        break;
+    case TRANSPARENT_ID:
+    case MX_TRANSPARENT_ID:
+        sample = BSDF_CAST(Transparent, bsdf)->sample(wo, rx, ry, rz);
+        break;
+    case OREN_NAYAR_ID:
+        sample = BSDF_CAST(OrenNayar, bsdf)->sample(wo, rx, ry, rz);
+        break;
+    case TRANSLUCENT_ID:
+        sample = BSDF_CAST(Diffuse<1>, bsdf)->sample(wo, rx, ry, rz);
+        break;
+    case PHONG_ID:
+        sample = BSDF_CAST(Phong, bsdf)->sample(wo, rx, ry, rz);
+        break;
+    case WARD_ID: sample = BSDF_CAST(Ward, bsdf)->sample(wo, rx, ry, rz); break;
+    case REFLECTION_ID:
+    case FRESNEL_REFLECTION_ID:
+        sample = BSDF_CAST(Reflection, bsdf)->sample(wo, rx, ry, rz);
+        break;
+    case REFRACTION_ID:
+        sample = BSDF_CAST(Refraction, bsdf)->sample(wo, rx, ry, rz);
+        break;
+    case MICROFACET_ID: {
+        const int refract      = ((MicrofacetBeckmannRefl*)bsdf)->refract;
+        const ustringhash dist = ((MicrofacetBeckmannRefl*)bsdf)->dist;
+        if (dist == uh_default || dist == uh_beckmann) {
+            switch (refract) {
+            case 0:
+                sample = BSDF_CAST(MicrofacetBeckmannRefl, bsdf)
+                             ->sample(wo, rx, ry, rz);
                 break;
-            }
-            case MX_SUBSURFACE_ID: {
-                // TODO: implement BSSRDF support?
-                const MxSubsurfaceParams* srcparams
-                    = comp->as<MxSubsurfaceParams>();
-                DiffuseParams params = {};
-                params.N             = srcparams->N;
-                ok = result.bsdf.add_bsdf<Diffuse<0>>(cw * srcparams->albedo,
-                                                      params);
+            case 1:
+                sample = BSDF_CAST(MicrofacetBeckmannRefr, bsdf)
+                             ->sample(wo, rx, ry, rz);
                 break;
-            }
-            case MX_SHEEN_ID: {
-                const MxSheenParams& params = *comp->as<MxSheenParams>();
-                if (params.mode == 1)
-                    ok = result.bsdf.add_bsdf<ZeltnerBurleySheen>(cw, params);
-                else
-                    ok = result.bsdf.add_bsdf<CharlieSheen>(
-                        cw, params);  // default to legacy closure
+            case 2:
+                sample = BSDF_CAST(MicrofacetBeckmannBoth, bsdf)
+                             ->sample(wo, rx, ry, rz);
                 break;
             }
-            case MX_LAYER_ID: {
-                const MxLayerParams* srcparams = comp->as<MxLayerParams>();
-                Color3 base_w
-                    = w
-                      * (Color3(1, 1, 1)
-                         - clamp(evaluate_layer_opacity(sg, srcparams->top),
-                                 0.f, 1.f));
-                process_bsdf_closure(sg, result, srcparams->top, w, light_only);
-                if (!is_black(base_w))
-                    process_bsdf_closure(sg, result, srcparams->base, base_w,
-                                         light_only);
-                ok = true;
+        } else if (dist == uh_ggx) {
+            switch (refract) {
+            case 0:
+                sample
+                    = BSDF_CAST(MicrofacetGGXRefl, bsdf)->sample(wo, rx, ry, rz);
                 break;
-            }
-            case MX_ANISOTROPIC_VDF_ID:
-            case MX_MEDIUM_VDF_ID: {
-                // already processed by process_medium_closure
-                ok = true;
+            case 1:
+                sample
+                    = BSDF_CAST(MicrofacetGGXRefr, bsdf)->sample(wo, rx, ry, rz);
+                break;
+            case 2:
+                sample
+                    = BSDF_CAST(MicrofacetGGXBoth, bsdf)->sample(wo, rx, ry, rz);
                 break;
             }
-            }
-            OSL_ASSERT(ok && "Invalid closure invoked in surface shader");
         }
         break;
     }
+    case MX_CONDUCTOR_ID:
+        sample = BSDF_CAST(MxConductor, bsdf)->sample(wo, rx, ry, rz);
+        break;
+    case MX_DIELECTRIC_ID:
+        if (is_black(((MxDielectricOpaque*)bsdf)->transmission_tint))
+            sample = BSDF_CAST(MxDielectricOpaque, bsdf)->sample(wo, rx, ry, rz);
+        else
+            sample = BSDF_CAST(MxDielectric, bsdf)->sample(wo, rx, ry, rz);
+        break;
+    case MX_BURLEY_DIFFUSE_ID:
+        sample = BSDF_CAST(MxBurleyDiffuse, bsdf)->sample(wo, rx, ry, rz);
+        break;
+    case MX_OREN_NAYAR_DIFFUSE_ID:
+        sample = BSDF_CAST(EnergyCompensatedOrenNayar, bsdf)
+                     ->sample(wo, rx, ry, rz);
+        break;
+    case MX_SHEEN_ID:
+        if (BSDF_CAST(CharlieSheen, bsdf)->mode == 1)
+            sample = BSDF_CAST(ZeltnerBurleySheen, bsdf)->sample(wo, rx, ry, rz);
+        else
+            sample = BSDF_CAST(CharlieSheen, bsdf)->sample(wo, rx, ry, rz);
+        break;
+    case MX_GENERALIZED_SCHLICK_ID: {
+        const Color3& tint = ((MxGeneralizedSchlick*)bsdf)->transmission_tint;
+        if (is_black(tint)) {
+            sample = BSDF_CAST(MxGeneralizedSchlickOpaque, bsdf)
+                         ->sample(wo, rx, ry, rz);
+        } else {
+            sample
+                = BSDF_CAST(MxGeneralizedSchlick, bsdf)->sample(wo, rx, ry, rz);
+        }
+        break;
+    }
+    default: break;
     }
+    return sample;
 }
 
-}  // anonymous namespace
 
-OSL_NAMESPACE_ENTER
-
-void
-process_closure(const OSL::ShaderGlobals& sg, ShadingResult& result,
-                const ClosureColor* Ci, bool light_only)
+OSL_HOSTDEVICE BSDF::Sample
+CompositeBSDF::eval(const BSDF* bsdf, const Vec3& wo, const Vec3& wi) const
 {
-    if (!light_only)
-        process_medium_closure(sg, result, Ci, Color3(1));
-    process_bsdf_closure(sg, result, Ci, Color3(1), light_only);
-}
+    static const ustringhash uh_ggx("ggx");
+    static const ustringhash uh_beckmann("beckmann");
+    static const ustringhash uh_default("default");
 
-Vec3
-process_background_closure(const ClosureColor* closure)
-{
-    if (!closure)
-        return Vec3(0, 0, 0);
-    switch (closure->id) {
-    case ClosureColor::MUL: {
-        return closure->as_mul()->weight
-               * process_background_closure(closure->as_mul()->closure);
-    }
-    case ClosureColor::ADD: {
-        return process_background_closure(closure->as_add()->closureA)
-               + process_background_closure(closure->as_add()->closureB);
+    BSDF::Sample sample = {};
+    switch (bsdf->id) {
+    case DIFFUSE_ID: sample = BSDF_CAST(Diffuse<0>, bsdf)->eval(wo, wi); break;
+    case TRANSPARENT_ID:
+    case MX_TRANSPARENT_ID:
+        sample = BSDF_CAST(Transparent, bsdf)->eval(wo, wi);
+        break;
+    case OREN_NAYAR_ID:
+        sample = BSDF_CAST(OrenNayar, bsdf)->eval(wo, wi);
+        break;
+    case TRANSLUCENT_ID:
+        sample = BSDF_CAST(Diffuse<1>, bsdf)->eval(wo, wi);
+        break;
+    case PHONG_ID: sample = BSDF_CAST(Phong, bsdf)->eval(wo, wi); break;
+    case WARD_ID: sample = BSDF_CAST(Ward, bsdf)->eval(wo, wi); break;
+    case REFLECTION_ID:
+    case FRESNEL_REFLECTION_ID:
+        sample = BSDF_CAST(Reflection, bsdf)->eval(wo, wi);
+        break;
+    case REFRACTION_ID:
+        sample = BSDF_CAST(Refraction, bsdf)->eval(wo, wi);
+        break;
+    case MICROFACET_ID: {
+        const int refract      = ((MicrofacetBeckmannRefl*)bsdf)->refract;
+        const ustringhash dist = ((MicrofacetBeckmannRefl*)bsdf)->dist;
+        if (dist == uh_default || dist == uh_beckmann) {
+            switch (refract) {
+            case 0:
+                sample = BSDF_CAST(MicrofacetBeckmannRefl, bsdf)->eval(wo, wi);
+                break;
+            case 1:
+                sample = BSDF_CAST(MicrofacetBeckmannRefr, bsdf)->eval(wo, wi);
+                break;
+            case 2:
+                sample = BSDF_CAST(MicrofacetBeckmannBoth, bsdf)->eval(wo, wi);
+                break;
+            }
+        } else if (dist == uh_ggx) {
+            switch (refract) {
+            case 0:
+                sample = BSDF_CAST(MicrofacetGGXRefl, bsdf)->eval(wo, wi);
+                break;
+            case 1:
+                sample = BSDF_CAST(MicrofacetGGXRefr, bsdf)->eval(wo, wi);
+                break;
+            case 2:
+                sample = BSDF_CAST(MicrofacetGGXBoth, bsdf)->eval(wo, wi);
+                break;
+            }
+        }
+        break;
     }
-    case BACKGROUND_ID: {
-        return closure->as_comp()->w;
+    case MX_CONDUCTOR_ID:
+        sample = BSDF_CAST(MxConductor, bsdf)->eval(wo, wi);
+        break;
+    case MX_DIELECTRIC_ID:
+        if (is_black(((MxDielectricOpaque*)bsdf)->transmission_tint))
+            sample = BSDF_CAST(MxDielectricOpaque, bsdf)->eval(wo, wi);
+        else
+            sample = BSDF_CAST(MxDielectric, bsdf)->eval(wo, wi);
+        break;
+    case MX_BURLEY_DIFFUSE_ID:
+        sample = BSDF_CAST(MxBurleyDiffuse, bsdf)->eval(wo, wi);
+        break;
+    case MX_OREN_NAYAR_DIFFUSE_ID:
+        sample = BSDF_CAST(EnergyCompensatedOrenNayar, bsdf)->eval(wo, wi);
+        break;
+    case MX_SHEEN_ID:
+        if (BSDF_CAST(CharlieSheen, bsdf)->mode == 1)
+            sample = BSDF_CAST(ZeltnerBurleySheen, bsdf)->eval(wo, wi);
+        else
+            sample = BSDF_CAST(CharlieSheen, bsdf)->eval(wo, wi);
+        break;
+    case MX_GENERALIZED_SCHLICK_ID: {
+        const Color3& tint = ((MxGeneralizedSchlick*)bsdf)->transmission_tint;
+        if (is_black(tint)) {
+            sample = BSDF_CAST(MxGeneralizedSchlickOpaque, bsdf)->eval(wo, wi);
+        } else {
+            sample = BSDF_CAST(MxGeneralizedSchlick, bsdf)->eval(wo, wi);
+        }
+        break;
     }
+    default: break;
     }
-    // should never happen
-    OSL_ASSERT(false && "Invalid closure invoked in background shader");
-    return Vec3(0, 0, 0);
+    return sample;
 }
 
 
diff --git a/src/testrender/shading.h b/src/testrender/shading.h
index d7fdb190d..729cd47eb 100644
--- a/src/testrender/shading.h
+++ b/src/testrender/shading.h
@@ -6,24 +6,256 @@
 #pragma once
 
 #include <OSL/dual_vec.h>
+#include <OSL/hashes.h>
 #include <OSL/oslclosure.h>
 #include <OSL/oslconfig.h>
 #include <OSL/oslexec.h>
+#include "optics.h"
 #include "sampling.h"
 
 
 OSL_NAMESPACE_ENTER
 
+
+enum ClosureIDs {
+    ADD               = -2,
+    MUL               = -1,
+    COMPONENT_BASE_ID = 0,
+    EMISSION_ID       = 1,
+    BACKGROUND_ID,
+    DIFFUSE_ID,
+    OREN_NAYAR_ID,
+    TRANSLUCENT_ID,
+    PHONG_ID,
+    WARD_ID,
+    MICROFACET_ID,
+    REFLECTION_ID,
+    FRESNEL_REFLECTION_ID,
+    REFRACTION_ID,
+    TRANSPARENT_ID,
+    DEBUG_ID,
+    HOLDOUT_ID,
+    // See MATERIALX_CLOSURES in stdosl.h
+    MX_OREN_NAYAR_DIFFUSE_ID,
+    MX_BURLEY_DIFFUSE_ID,
+    MX_DIELECTRIC_ID,
+    MX_CONDUCTOR_ID,
+    MX_GENERALIZED_SCHLICK_ID,
+    MX_TRANSLUCENT_ID,
+    MX_TRANSPARENT_ID,
+    MX_SUBSURFACE_ID,
+    MX_SHEEN_ID,
+    MX_UNIFORM_EDF_ID,
+    MX_ANISOTROPIC_VDF_ID,
+    MX_MEDIUM_VDF_ID,
+    MX_LAYER_ID,
+    // TODO: adding vdfs would require extending testrender with volume support ...
+    EMPTY_ID
+};
+
+
+namespace {  // anonymous namespace
+
+// these structures hold the parameters of each closure type
+// they will be contained inside ClosureComponent
+struct EmptyParams {};
+struct DiffuseParams {
+    Vec3 N;
+};
+struct OrenNayarParams {
+    Vec3 N;
+    float sigma;
+};
+struct PhongParams {
+    Vec3 N;
+    float exponent;
+};
+struct WardParams {
+    Vec3 N, T;
+    float ax, ay;
+};
+struct ReflectionParams {
+    Vec3 N;
+    float eta;
+};
+struct RefractionParams {
+    Vec3 N;
+    float eta;
+};
+struct MicrofacetParams {
+    ustringhash dist;
+    Vec3 N, U;
+    float xalpha, yalpha, eta;
+    int refract;
+};
+
+// MATERIALX_CLOSURES
+
+struct MxOrenNayarDiffuseParams {
+    Vec3 N;
+    Color3 albedo;
+    float roughness;
+    // optional
+    ustringhash label;
+    int energy_compensation;
+};
+
+struct MxBurleyDiffuseParams {
+    Vec3 N;
+    Color3 albedo;
+    float roughness;
+    // optional
+    ustringhash label;
+};
+
+// common to all MaterialX microfacet closures
+struct MxMicrofacetBaseParams {
+    Vec3 N, U;
+    float roughness_x;
+    float roughness_y;
+    ustringhash distribution;
+    // optional
+    ustringhash label;
+};
+
+struct MxDielectricParams : public MxMicrofacetBaseParams {
+    Color3 reflection_tint;
+    Color3 transmission_tint;
+    float ior;
+    // optional
+    float thinfilm_thickness;
+    float thinfilm_ior;
+
+    OSL_HOSTDEVICE Color3 evalR(float cos_theta) const
+    {
+        return reflection_tint * fresnel_dielectric(cos_theta, ior);
+    }
+
+    OSL_HOSTDEVICE Color3 evalT(float cos_theta) const
+    {
+        return transmission_tint * (1.0f - fresnel_dielectric(cos_theta, ior));
+    }
+};
+
+struct MxConductorParams : public MxMicrofacetBaseParams {
+    Color3 ior;
+    Color3 extinction;
+    // optional
+    float thinfilm_thickness;
+    float thinfilm_ior;
+
+    OSL_HOSTDEVICE Color3 evalR(float cos_theta) const
+    {
+        return fresnel_conductor(cos_theta, ior, extinction);
+    }
+
+    OSL_HOSTDEVICE Color3 evalT(float cos_theta) const { return Color3(0.0f); }
+
+    // Avoid function was declared but never referenced
+    // float get_ior() const
+    // {
+    //     return 0;  // no transmission possible
+    // }
+};
+
+struct MxGeneralizedSchlickParams : public MxMicrofacetBaseParams {
+    Color3 reflection_tint;
+    Color3 transmission_tint;
+    Color3 f0;
+    Color3 f90;
+    float exponent;
+    // optional
+    float thinfilm_thickness;
+    float thinfilm_ior;
+
+    OSL_HOSTDEVICE Color3 evalR(float cos_theta) const
+    {
+        return reflection_tint
+               * fresnel_generalized_schlick(cos_theta, f0, f90, exponent);
+    }
+
+    OSL_HOSTDEVICE Color3 evalT(float cos_theta) const
+    {
+        return transmission_tint
+               * (Color3(1.0f)
+                  - fresnel_generalized_schlick(cos_theta, f0, f90, exponent));
+    }
+};
+
+struct MxTranslucentParams {
+    Vec3 N;
+    Color3 albedo;
+    // optional
+    ustringhash label;
+};
+
+struct MxSubsurfaceParams {
+    Vec3 N;
+    Color3 albedo;
+    Color3 radius;
+    float anisotropy;
+    // optional
+    ustringhash label;
+};
+
+struct MxSheenParams {
+    Vec3 N;
+    Color3 albedo;
+    float roughness;
+    // optional
+    int mode;
+    ustringhash label;
+};
+
+struct MxUniformEdfParams {
+    Color3 emittance;
+    // optional
+    ustringhash label;
+};
+
+struct MxLayerParams {
+    OSL::ClosureColor* top;
+    OSL::ClosureColor* base;
+};
+
+struct MxAnisotropicVdfParams {
+    Color3 albedo;
+    Color3 extinction;
+    float anisotropy;
+    // optional
+    ustringhash label;
+};
+
+struct MxMediumVdfParams {
+    Color3 albedo;
+    float transmission_depth;
+    Color3 transmission_color;
+    float anisotropy;
+    float ior;
+    int priority;
+    // optional
+    ustringhash label;
+};
+
+}  // anonymous namespace
+
+
+// Cast a BSDF* to the specified sub-type
+#define BSDF_CAST(BSDF_TYPE, bsdf) reinterpret_cast<const BSDF_TYPE*>(bsdf)
+
 /// Individual BSDF (diffuse, phong, refraction, etc ...)
 /// Actual implementations of this class are private
 struct BSDF {
     struct Sample {
-        Sample() : wi(0.0f), weight(0.0f), pdf(0.0f), roughness(0.0f) {}
-        Sample(const Sample& o)
+        OSL_HOSTDEVICE Sample()
+            : wi(0.0f), weight(0.0f), pdf(0.0f), roughness(0.0f)
+        {
+        }
+        OSL_HOSTDEVICE Sample(const Sample& o)
             : wi(o.wi), weight(o.weight), pdf(o.pdf), roughness(o.roughness)
         {
         }
-        Sample(Vec3 wi, Color3 w, float pdf, float r)
+        OSL_HOSTDEVICE Sample(Vec3 wi, Color3 w, float pdf, float r)
             : wi(wi), weight(w), pdf(pdf), roughness(r)
         {
         }
@@ -32,48 +264,82 @@ struct BSDF {
         float pdf;
         float roughness;
     };
-    BSDF() {}
-    virtual Color3 get_albedo(const Vec3& /*wo*/) const { return Color3(1); }
-    virtual Sample eval(const Vec3& wo, const Vec3& wi) const = 0;
-    virtual Sample sample(const Vec3& wo, float rx, float ry, float rz) const
-        = 0;
+    OSL_HOSTDEVICE BSDF(ClosureIDs id = EMPTY_ID) : id(id) {}
+    OSL_HOSTDEVICE Color3 get_albedo(const Vec3& /*wo*/) const
+    {
+        return Color3(1);
+    }
+    OSL_HOSTDEVICE Sample eval(const Vec3& wo, const Vec3& wi) const
+    {
+        return {};
+    }
+    OSL_HOSTDEVICE Sample sample(const Vec3& wo, float rx, float ry,
+                                 float rz) const
+    {
+        return {};
+    }
+    ClosureIDs id;
+#ifdef __CUDACC__
+    // TODO: This is a total hack to avoid a misaligned address error
+    // that sometimes occurs with the EnergyCompensatedOrenNayar BSDF.
+    // It's not clear what the issue is or why this fixes it, but that
+    // will take a bit of digging.
+    int pad;
+#endif
 };
 
 /// Represents a weighted sum of BSDFS
 /// NOTE: no need to inherit from BSDF here because we use a "flattened" representation and therefore never nest these
 ///
 struct CompositeBSDF {
-    CompositeBSDF() : num_bsdfs(0), num_bytes(0) {}
+    OSL_HOSTDEVICE CompositeBSDF() : num_bsdfs(0), num_bytes(0) {}
 
-    void prepare(const Vec3& wo, const Color3& path_weight, bool absorb)
+    OSL_HOSTDEVICE void prepare(const Vec3& wo, const Color3& path_weight,
+                                bool absorb)
     {
         float total = 0;
         for (int i = 0; i < num_bsdfs; i++) {
-            pdfs[i] = weights[i].dot(path_weight * bsdfs[i]->get_albedo(wo))
+            pdfs[i] = weights[i].dot(path_weight * get_albedo(bsdfs[i], wo))
                       / (path_weight.x + path_weight.y + path_weight.z);
+#ifndef __CUDACC__
+            // TODO: Figure out what to do with weights/albedos with negative
+            //       components (e.g., as might happen when bipolar noise is
+            //       used as a color).
+
+            // The PDF is out-of-range in some test scenes on the CPU path, but
+            // these asserts are no-ops in release builds. The asserts are active
+            // on the CUDA path, so we need to skip them.
             assert(pdfs[i] >= 0);
             assert(pdfs[i] <= 1);
+#endif
             total += pdfs[i];
         }
         if ((!absorb && total > 0) || total > 1) {
-            for (int i = 0; i < num_bsdfs; i++)
+            for (int i = 0; i < num_bsdfs; i++) {
+#ifndef __CUDACC__
                 pdfs[i] /= total;
+#else
+                // TODO: This helps avoid NaNs, but it's not clear where the
+                // NaNs are coming from.
+                pdfs[i] = __fdiv_rz(pdfs[i], total);
+#endif
+            }
         }
     }
 
-    Color3 get_albedo(const Vec3& wo) const
+    OSL_HOSTDEVICE Color3 get_albedo(const Vec3& wo) const
     {
         Color3 result(0, 0, 0);
         for (int i = 0; i < num_bsdfs; i++)
-            result += weights[i] * bsdfs[i]->get_albedo(wo);
+            result += weights[i] * get_albedo(bsdfs[i], wo);
         return result;
     }
 
-    BSDF::Sample eval(const Vec3& wo, const Vec3& wi) const
+    OSL_HOSTDEVICE BSDF::Sample eval(const Vec3& wo, const Vec3& wi) const
     {
         BSDF::Sample s = {};
         for (int i = 0; i < num_bsdfs; i++) {
-            BSDF::Sample b = bsdfs[i]->eval(wo, wi);
+            BSDF::Sample b = eval(bsdfs[i], wo, wi);
             b.weight *= weights[i];
             MIS::update_eval(&s.weight, &s.pdf, b.weight, b.pdf, pdfs[i]);
             s.roughness += b.roughness * pdfs[i];
@@ -81,14 +347,15 @@ struct CompositeBSDF {
         return s;
     }
 
-    BSDF::Sample sample(const Vec3& wo, float rx, float ry, float rz) const
+    OSL_HOSTDEVICE BSDF::Sample sample(const Vec3& wo, float rx, float ry,
+                                       float rz) const
     {
         float accum = 0;
         for (int i = 0; i < num_bsdfs; i++) {
             if (rx < (pdfs[i] + accum)) {
                 rx = (rx - accum) / pdfs[i];
                 rx = std::min(rx, 0.99999994f);  // keep result in [0,1)
-                BSDF::Sample s = bsdfs[i]->sample(wo, rx, ry, rz);
+                BSDF::Sample s = sample(bsdfs[i], wo, rx, ry, rz);
                 s.weight *= weights[i] * (1 / pdfs[i]);
                 s.pdf *= pdfs[i];
                 if (s.pdf == 0.0f)
@@ -96,7 +363,7 @@ struct CompositeBSDF {
                 // we sampled PDF i, now figure out how much the other bsdfs contribute to the chosen direction
                 for (int j = 0; j < num_bsdfs; j++) {
                     if (i != j) {
-                        BSDF::Sample b = bsdfs[j]->eval(wo, s.wi);
+                        BSDF::Sample b = eval(bsdfs[j], wo, s.wi);
                         b.weight *= weights[j];
                         MIS::update_eval(&s.weight, &s.pdf, b.weight, b.pdf,
                                          pdfs[j]);
@@ -110,7 +377,7 @@ struct CompositeBSDF {
     }
 
     template<typename BSDF_Type, typename... BSDF_Args>
-    bool add_bsdf(const Color3& w, BSDF_Args&&... args)
+    OSL_HOSTDEVICE bool add_bsdf(const Color3& w, BSDF_Args&&... args)
     {
         // make sure we have enough space
         if (num_bsdfs >= MaxEntries)
@@ -127,8 +394,14 @@ struct CompositeBSDF {
 
 private:
     /// Never try to copy this struct because it would invalidate the bsdf pointers
-    CompositeBSDF(const CompositeBSDF& c);
-    CompositeBSDF& operator=(const CompositeBSDF& c);
+    OSL_HOSTDEVICE CompositeBSDF(const CompositeBSDF& c);
+    OSL_HOSTDEVICE CompositeBSDF& operator=(const CompositeBSDF& c);
+
+    OSL_HOSTDEVICE Color3 get_albedo(const BSDF* bsdf, const Vec3& wo) const;
+    OSL_HOSTDEVICE BSDF::Sample eval(const BSDF* bsdf, const Vec3& wo,
+                                     const Vec3& wi) const;
+    OSL_HOSTDEVICE BSDF::Sample sample(const BSDF* bsdf, const Vec3& wo,
+                                       float rx, float ry, float rz) const;
 
     enum { MaxEntries = 8 };
     enum { MaxSize = 256 * sizeof(float) };
@@ -153,10 +426,10 @@ struct ShadingResult {
 
 void
 register_closures(ShadingSystem* shadingsys);
-void
+OSL_HOSTDEVICE void
 process_closure(const OSL::ShaderGlobals& sg, ShadingResult& result,
                 const ClosureColor* Ci, bool light_only);
-Vec3
+OSL_HOSTDEVICE Vec3
 process_background_closure(const ClosureColor* Ci);
 
 OSL_NAMESPACE_EXIT
diff --git a/src/testrender/simpleraytracer.cpp b/src/testrender/simpleraytracer.cpp
index f62196f93..7ad197547 100644
--- a/src/testrender/simpleraytracer.cpp
+++ b/src/testrender/simpleraytracer.cpp
@@ -2,20 +2,24 @@
 // SPDX-License-Identifier: BSD-3-Clause
 // https://github.com/AcademySoftwareFoundation/OpenShadingLanguage
 
-#include <OpenImageIO/filesystem.h>
-#include <OpenImageIO/parallel.h>
-#include <OpenImageIO/timer.h>
 
-#include <pugixml.hpp>
+#ifndef __CUDACC__
+#    include <OpenImageIO/filesystem.h>
+#    include <OpenImageIO/parallel.h>
+#    include <OpenImageIO/timer.h>
+#    include <pugixml.hpp>
 
-#ifdef USING_OIIO_PUGI
+#    ifdef USING_OIIO_PUGI
 namespace pugi = OIIO::pugi;
+#    endif
+
+#    include <OSL/hashes.h>
+#    include "raytracer.h"
+#    include "shading.h"
+#    include "simpleraytracer.h"
 #endif
 
-#include <OSL/hashes.h>
-#include "raytracer.h"
-#include "shading.h"
-#include "simpleraytracer.h"
+#include <cmath>
 
 // Create ustrings for all strings used by the free function renderer services.
 // Required to allow the reverse mapping of hash->string to work when processing messages
@@ -42,6 +46,15 @@ using namespace OSL;
 
 OSL_NAMESPACE_ENTER
 
+
+#ifndef __CUDACC__
+using ShaderGlobalsType = OSL::ShaderGlobals;
+#else
+using ShaderGlobalsType = OSL_CUDA::ShaderGlobals;
+#endif
+
+
+#ifndef __CUDACC__
 static TypeDesc TypeFloatArray2(TypeDesc::FLOAT, 2);
 static TypeDesc TypeFloatArray4(TypeDesc::FLOAT, 4);
 static TypeDesc TypeIntArray2(TypeDesc::INT, 2);
@@ -844,31 +857,40 @@ SimpleRaytracer::get_camera_screen_window(ShaderGlobals* /*sg*/, bool derivs,
     }
     return false;
 }
+#endif  // #ifndef __CUDACC__
 
 
 
-void
-SimpleRaytracer::globals_from_hit(ShaderGlobals& sg, const Ray& r,
+void OSL_HOSTDEVICE
+SimpleRaytracer::globals_from_hit(ShaderGlobalsType& sg, const Ray& r,
                                   const Dual2<float>& t, int id, float u,
                                   float v)
 {
+#ifndef __CUDACC__
     memset((char*)&sg, 0, sizeof(ShaderGlobals));
-    Dual2<Vec3> P = r.point(t);
-    // We are missing the projection onto the surface here
-    sg.P             = P.val();
-    sg.dPdx          = P.dx();
-    sg.dPdy          = P.dy();
-    sg.N             = scene.normal(P, sg.Ng, id, u, v);
-    Dual2<Vec2> uv   = scene.uv(P, sg.N, sg.dPdu, sg.dPdv, id, u, v);
-    sg.u             = uv.val().x;
-    sg.dudx          = uv.dx().x;
-    sg.dudy          = uv.dy().x;
-    sg.v             = uv.val().y;
-    sg.dvdx          = uv.dx().y;
-    sg.dvdy          = uv.dy().y;
+#endif
+
+#ifndef __CUDACC__
     const int meshid = std::upper_bound(scene.last_index.begin(),
                                         scene.last_index.end(), id)
                        - scene.last_index.begin();
+#else
+    const int meshid = m_meshids[id];
+#endif
+
+    Dual2<Vec3> P = r.point(t);
+    // We are missing the projection onto the surface here
+    sg.P                  = P.val();
+    sg.dPdx               = P.dx();
+    sg.dPdy               = P.dy();
+    sg.N                  = scene.normal(P, sg.Ng, id, u, v);
+    Dual2<Vec2> uv        = scene.uv(P, sg.N, sg.dPdu, sg.dPdv, id, u, v);
+    sg.u                  = uv.val().x;
+    sg.dudx               = uv.dx().x;
+    sg.dudy               = uv.dy().x;
+    sg.v                  = uv.val().y;
+    sg.dvdx               = uv.dx().y;
+    sg.dvdy               = uv.dy().y;
     sg.surfacearea        = m_mesh_surfacearea[meshid];
     Dual2<Vec3> direction = r.dual_direction();
     sg.I                  = direction.val();
@@ -882,37 +904,55 @@ SimpleRaytracer::globals_from_hit(ShaderGlobals& sg, const Ray& r,
     sg.raytype        = r.raytype;
     sg.flipHandedness = sg.dPdx.cross(sg.dPdy).dot(sg.N) < 0;
 
+#ifndef __CUDACC__
     // In our SimpleRaytracer, the "renderstate" itself just a pointer to
     // the ShaderGlobals.
     sg.renderstate = &sg;
+#endif
 }
 
-Vec3
+
+
+OSL_HOSTDEVICE Vec3
 SimpleRaytracer::eval_background(const Dual2<Vec3>& dir, ShadingContext* ctx,
                                  int bounce)
 {
-    ShaderGlobals sg;
+    ShaderGlobalsType sg;
     memset((char*)&sg, 0, sizeof(ShaderGlobals));
     sg.I    = dir.val();
     sg.dIdx = dir.dx();
     sg.dIdy = dir.dy();
     if (bounce >= 0)
         sg.raytype = bounce > 0 ? Ray::DIFFUSE : Ray::CAMERA;
+#ifndef __CUDACC__
     shadingsys->execute(*ctx, *m_shaders[backgroundShaderID], sg);
-    return process_background_closure(sg.Ci);
+#else
+    alignas(8) char closure_pool[256];
+    execute_shader(sg, render_params.bg_id, closure_pool);
+#endif
+    return process_background_closure((const ClosureColor*)sg.Ci);
 }
 
 Color3
 SimpleRaytracer::subpixel_radiance(float x, float y, Sampler& sampler,
                                    ShadingContext* ctx)
 {
+#ifdef __CUDACC__
+    // Scratch space for the output closures
+    alignas(8) char closure_pool[256];
+    alignas(8) char light_closure_pool[256];
+#endif
+
     constexpr float inf = std::numeric_limits<float>::infinity();
     Ray r               = camera.get(x, y);
     Color3 path_weight(1, 1, 1);
     Color3 path_radiance(0, 0, 0);
     int prev_id    = -1;
     float bsdf_pdf = inf;  // camera ray has only one possible direction
+
     for (int b = 0; b <= max_bounces; b++) {
+        ShaderGlobalsType sg;
+
         // trace the ray against the scene
         Intersection hit = scene.intersect(r, inf, prev_id);
         if (hit.t == inf) {
@@ -935,8 +975,8 @@ SimpleRaytracer::subpixel_radiance(float x, float y, Sampler& sampler,
         }
 
         // construct a shader globals for the hit point
-        ShaderGlobals sg;
         globals_from_hit(sg, r, hit.t, hit.id, hit.u, hit.v);
+
         if (show_globals) {
             // visualize the main fields of the shader globals
             Vec3 v = sg.Ng;
@@ -954,21 +994,34 @@ SimpleRaytracer::subpixel_radiance(float x, float y, Sampler& sampler,
             path_radiance += path_weight * c;
             break;
         }
+
         const float radius = r.radius + r.spread * hit.t;
-        int shaderID       = scene.shaderid(hit.id);
+
+        int shaderID = scene.shaderid(hit.id);
+
+#ifndef __CUDACC__
         if (shaderID < 0 || !m_shaders[shaderID])
             break;  // no shader attached? done
 
         // execute shader and process the resulting list of closures
         shadingsys->execute(*ctx, *m_shaders[shaderID], sg);
+#else
+        if (shaderID < 0)
+            break;  // no shader attached? done
+        execute_shader(sg, shaderID, closure_pool);
+#endif
         ShadingResult result;
         bool last_bounce = b == max_bounces;
-        process_closure(sg, result, sg.Ci, last_bounce);
+        process_closure(sg, result, (const ClosureColor*)sg.Ci, last_bounce);
+
+#ifndef __CUDACC__
+        const size_t lightprims_size = m_lightprims.size();
+#endif
 
         // add self-emission
         float k = 1;
-        if (m_shader_is_light[shaderID]) {
-            const float light_pick_pdf = 1.0f / m_lightprims.size();
+        if (m_shader_is_light[shaderID] && lightprims_size > 0) {
+            const float light_pick_pdf = 1.0f / lightprims_size;
             // figure out the probability of reaching this point
             float light_pdf = light_pick_pdf
                               * scene.shapepdf(hit.id, r.origin, sg.P);
@@ -1008,6 +1061,7 @@ SimpleRaytracer::subpixel_radiance(float x, float y, Sampler& sampler,
                              * MIS::power_heuristic<MIS::WEIGHT_WEIGHT>(bg_pdf,
                                                                         b.pdf);
             if ((contrib.x + contrib.y + contrib.z) > 0) {
+                ShaderGlobalsType shadow_sg;
                 Ray shadow_ray          = Ray(sg.P, bg_dir.val(), radius, 0,
                                               Ray::SHADOW);
                 Intersection shadow_hit = scene.intersect(shadow_ray, inf,
@@ -1018,17 +1072,18 @@ SimpleRaytracer::subpixel_radiance(float x, float y, Sampler& sampler,
         }
 
         // trace a shadow ray to one of the light emitting primitives
-        if (!m_lightprims.empty()) {
-            const float light_pick_pdf = 1.0f / m_lightprims.size();
+        if (lightprims_size > 0) {
+            const float light_pick_pdf = 1.0f / lightprims_size;
 
             // uniform probability for each light
-            float xl = xi * m_lightprims.size();
+            float xl = xi * lightprims_size;
             int ls   = floorf(xl);
             xl -= ls;
 
             uint32_t lid = m_lightprims[ls];
             if (lid != hit.id) {
                 int shaderID = scene.shaderid(lid);
+
                 // sample a random direction towards the object
                 LightSample sample = scene.sample(lid, sg.P, xl, yi);
                 BSDF::Sample b     = result.bsdf.eval(-sg.I, sample.dir);
@@ -1036,29 +1091,46 @@ SimpleRaytracer::subpixel_radiance(float x, float y, Sampler& sampler,
                                  * MIS::power_heuristic<MIS::EVAL_WEIGHT>(
                                      light_pick_pdf * sample.pdf, b.pdf);
                 if ((contrib.x + contrib.y + contrib.z) > 0) {
+                    ShaderGlobalsType light_sg;
                     Ray shadow_ray = Ray(sg.P, sample.dir, radius, 0,
                                          Ray::SHADOW);
                     // trace a shadow ray and see if we actually hit the target
                     // in this tiny renderer, tracing a ray is probably cheaper than evaluating the light shader
                     Intersection shadow_hit
                         = scene.intersect(shadow_ray, sample.dist, hit.id, lid);
-                    if (shadow_hit.t == sample.dist) {
+
+#ifndef __CUDACC__
+                    const bool did_hit = shadow_hit.t == sample.dist;
+#else
+                    // The hit distance on the device is not as precise as on
+                    // the CPU, so we need to allow a little wiggle room. An
+                    // epsilon of 1e-3f empirically gives results that closely
+                    // match the CPU for the test scenes, so that's what we're
+                    // using.
+                    const bool did_hit = fabsf(shadow_hit.t - sample.dist)
+                                         < 1e-3f;
+#endif
+                    if (did_hit) {
                         // setup a shader global for the point on the light
-                        ShaderGlobals light_sg;
                         globals_from_hit(light_sg, shadow_ray, sample.dist, lid,
                                          sample.u, sample.v);
+#ifndef __CUDACC__
                         // execute the light shader (for emissive closures only)
                         shadingsys->execute(*ctx, *m_shaders[shaderID],
                                             light_sg);
+#else
+                        execute_shader(light_sg, shaderID, light_closure_pool);
+#endif
                         ShadingResult light_result;
-                        process_closure(light_sg, light_result, light_sg.Ci,
-                                        true);
+                        process_closure(light_sg, light_result,
+                                        (const ClosureColor*)light_sg.Ci, true);
                         // accumulate contribution
                         path_radiance += contrib * light_result.Le;
                     }
                 }
             }
         }
+
         // trace indirect ray and continue
         BSDF::Sample p = result.bsdf.sample(-sg.I, xi, yi, zi);
         path_weight *= p.weight;
@@ -1077,14 +1149,15 @@ SimpleRaytracer::subpixel_radiance(float x, float y, Sampler& sampler,
     return path_radiance;
 }
 
-Color3
+
+OSL_HOSTDEVICE Color3
 SimpleRaytracer::antialias_pixel(int x, int y, ShadingContext* ctx)
 {
     Color3 result(0, 0, 0);
     for (int si = 0, n = aa * aa; si < n; si++) {
         Sampler sampler(x, y, si);
         // jitter pixel coordinate [0,1)^2
-        Vec3 j = sampler.get();
+        Vec3 j = no_jitter ? Vec3(0.5f, 0.5f, 0) : sampler.get();
         // warp distribution to approximate a tent filter [-1,+1)^2
         j.x *= 2;
         j.x = j.x < 1 ? sqrtf(j.x) - 1 : 1 - sqrtf(2 - j.x);
@@ -1100,11 +1173,13 @@ SimpleRaytracer::antialias_pixel(int x, int y, ShadingContext* ctx)
 }
 
 
+#ifndef __CUDACC__
 void
 SimpleRaytracer::prepare_render()
 {
     // Retrieve and validate options
     aa                = std::max(1, options.get_int("aa"));
+    no_jitter         = options.get_int("no_jitter") != 0;
     max_bounces       = options.get_int("max_bounces");
     rr_depth          = options.get_int("rr_depth");
     show_albedo_scale = options.get_float("show_albedo_scale");
@@ -1132,7 +1207,25 @@ SimpleRaytracer::prepare_render()
 
     // build bvh and prepare triangles
     scene.prepare(errhandler());
+    prepare_lights();
+
+#    if 0
+    // dump scene to disk as obj for debugging purposes
+    // TODO: make this a feature?
+    FILE* fp = fopen("/tmp/test.obj", "w");
+    for (Vec3 v : scene.verts)
+        fprintf(fp, "v %.9g %.9g %.9g\n", v.x, v.y, v.z);
+    for (TriangleIndices t : scene.triangles)
+        fprintf(fp, "f %d %d %d\n", 1 + t.a, 1 + t.b, 1 + t.c);
+    fclose(fp);
+#    endif
+}
+
 
+
+void
+SimpleRaytracer::prepare_lights()
+{
     m_mesh_surfacearea.reserve(scene.last_index.size());
 
     // measure the total surface area of each mesh
@@ -1156,16 +1249,6 @@ SimpleRaytracer::prepare_render()
     if (!m_lightprims.empty())
         errhandler().infofmt("Found {} triangles to be treated as lights",
                              m_lightprims.size());
-#if 0
-    // dump scene to disk as obj for debugging purposes
-    // TODO: make this a feature?
-    FILE* fp = fopen("/tmp/test.obj", "w");
-    for (Vec3 v : scene.verts)
-        fprintf(fp, "v %.9g %.9g %.9g\n", v.x, v.y, v.z);
-    for (TriangleIndices t : scene.triangles)
-        fprintf(fp, "f %d %d %d\n", 1 + t.a, 1 + t.b, 1 + t.c);
-    fclose(fp);
-#endif
 }
 
 
@@ -1213,4 +1296,6 @@ SimpleRaytracer::clear()
     shaders().clear();
 }
 
+#endif  // #ifndef __CUDACC__
+
 OSL_NAMESPACE_EXIT
diff --git a/src/testrender/simpleraytracer.h b/src/testrender/simpleraytracer.h
index 6dadf79cc..09b31380c 100644
--- a/src/testrender/simpleraytracer.h
+++ b/src/testrender/simpleraytracer.h
@@ -80,6 +80,7 @@ class SimpleRaytracer : public RendererServices {
 
     virtual void parse_scene_xml(const std::string& scenefile);
     virtual void prepare_render();
+    virtual void prepare_lights();
     virtual void warmup() {}
     virtual void render(int xres, int yres);
     virtual void clear();
@@ -93,6 +94,7 @@ class SimpleRaytracer : public RendererServices {
     OIIO::ErrorHandler& errhandler() const { return *m_errhandler; }
 
     const std::vector<bool>& shader_is_light() { return m_shader_is_light; }
+    const std::vector<unsigned>& lightprims() { return m_lightprims; }
 
     Camera camera;
     Scene scene;
@@ -101,6 +103,9 @@ class SimpleRaytracer : public RendererServices {
     OIIO::ParamValueList options;
     OIIO::ImageBuf pixelbuf;
 
+    int getBackgroundShaderID() const { return backgroundShaderID; }
+    int getBackgroundResolution() const { return backgroundResolution; }
+
 private:
     // Camera parameters
     Matrix44 m_world_to_camera;
@@ -112,6 +117,7 @@ class SimpleRaytracer : public RendererServices {
     int backgroundShaderID   = -1;
     int backgroundResolution = 1024;
     int aa                   = 1;
+    bool no_jitter           = false;
     int max_bounces          = 1000000;
     int rr_depth             = 5;
     float show_albedo_scale  = 0.0f;
diff --git a/src/testrender/testrender.cpp b/src/testrender/testrender.cpp
index 39f4c2e28..899e3fd2f 100644
--- a/src/testrender/testrender.cpp
+++ b/src/testrender/testrender.cpp
@@ -50,6 +50,7 @@ static std::string extraoptions;
 static std::string texoptions;
 static int xres = 640, yres = 480;
 static int aa = 1, max_bounces = 1000000, rr_depth = 5;
+static bool no_jitter          = false;
 static float show_albedo_scale = 0.0f;
 static int show_globals        = 0;
 static int num_threads         = 0;
@@ -173,6 +174,8 @@ getargs(int argc, const char* argv[])
       .hidden();
     ap.arg("-aa %d:N", &aa)
       .help("Trace NxN rays per pixel");
+    ap.arg("--no-jitter", &no_jitter)
+      .help("Disable AA pixel jitter");
     ap.arg("-albedo %f:SCALE", &show_albedo_scale)
       .help("Visualize the albedo of each pixel instead of path tracing");
     ap.arg("-normals")
@@ -287,6 +290,7 @@ main(int argc, const char* argv[])
     rend->attribute("max_bounces", max_bounces);
     rend->attribute("rr_depth", rr_depth);
     rend->attribute("aa", aa);
+    rend->attribute("no_jitter", (int)no_jitter);
     rend->attribute("show_albedo_scale", show_albedo_scale);
     rend->attribute("show_globals", show_globals);
     OIIO::attribute("threads", num_threads);
diff --git a/src/testshade/CMakeLists.txt b/src/testshade/CMakeLists.txt
index 17b3c3419..e8d72cfc0 100644
--- a/src/testshade/CMakeLists.txt
+++ b/src/testshade/CMakeLists.txt
@@ -16,7 +16,6 @@ if (OSL_USE_OPTIX)
     list (APPEND testshade_srcs optixgridrender.cpp)
     set ( testshade_cuda_srcs
         cuda/optix_grid_renderer.cu
-        ../testrender/cuda/wrapper.cu
         )
 
     set (testshade_rend_lib_srcs
@@ -25,7 +24,9 @@ if (OSL_USE_OPTIX)
         )
 
     set ( testshade_cuda_headers
-        ../testrender/cuda/rend_lib.h )
+        ../testrender/cuda/rend_lib.h
+        ../testrender/raytracer.h
+        )
 
     # We need to make sure that the PTX files are regenerated whenever these
     # headers change.
diff --git a/src/testshade/cuda/optix_grid_renderer.cu b/src/testshade/cuda/optix_grid_renderer.cu
index 609e34057..8d569627a 100644
--- a/src/testshade/cuda/optix_grid_renderer.cu
+++ b/src/testshade/cuda/optix_grid_renderer.cu
@@ -29,7 +29,7 @@ OSL_NAMESPACE_EXIT
 
 
 extern "C" {
-__device__ __constant__ RenderParams render_params;
+__device__ __constant__ testshade::RenderParams render_params;
 }
 
 extern "C" __global__ void
@@ -101,7 +101,7 @@ __raygen__()
     alignas(8) char closure_pool[256];
     alignas(8) char params[256];
 
-    ShaderGlobals sg;
+    OSL_CUDA::ShaderGlobals sg;
     // Setup the ShaderGlobals
     sg.I  = make_float3(0, 0, 1);
     sg.N  = make_float3(0, 0, 1);
@@ -130,7 +130,7 @@ __raygen__()
     sg.backfacing  = 0;
 
     // NB: These variables are not used in the current iteration of the sample
-    sg.raytype        = CAMERA;
+    sg.raytype        = OSL::Ray::CAMERA;
     sg.flipHandedness = 0;
 
     sg.shader2common = reinterpret_cast<void*>(render_params.shader2common);
@@ -143,24 +143,30 @@ __raygen__()
     // Run the OSL group and init functions
     if (render_params.fused_callable)
         // call osl_init_func
-        optixDirectCall<void, ShaderGlobals*, void*, void*, void*, int, void*>(
-            0u, &sg /*shaderglobals_ptr*/, params /*groupdata_ptr*/,
-            nullptr /*userdata_base_ptr*/, nullptr /*output_base_ptr*/,
-            0 /*shadeindex - unused*/,
-            sbtdata->data /*interactive_params_ptr*/);
+        optixDirectCall<void, OSL_CUDA::ShaderGlobals*, void*, void*, void*,
+                        int, void*>(0u, &sg /*shaderglobals_ptr*/,
+                                    params /*groupdata_ptr*/,
+                                    nullptr /*userdata_base_ptr*/,
+                                    nullptr /*output_base_ptr*/,
+                                    0 /*shadeindex - unused*/,
+                                    sbtdata->data /*interactive_params_ptr*/);
     else {
         // call osl_init_func
-        optixDirectCall<void, ShaderGlobals*, void*, void*, void*, int, void*>(
-            0u, &sg /*shaderglobals_ptr*/, params /*groupdata_ptr*/,
-            nullptr /*userdata_base_ptr*/, nullptr /*output_base_ptr*/,
-            0 /*shadeindex - unused*/,
-            sbtdata->data /*interactive_params_ptr*/);
+        optixDirectCall<void, OSL_CUDA::ShaderGlobals*, void*, void*, void*,
+                        int, void*>(0u, &sg /*shaderglobals_ptr*/,
+                                    params /*groupdata_ptr*/,
+                                    nullptr /*userdata_base_ptr*/,
+                                    nullptr /*output_base_ptr*/,
+                                    0 /*shadeindex - unused*/,
+                                    sbtdata->data /*interactive_params_ptr*/);
         // call osl_group_func
-        optixDirectCall<void, ShaderGlobals*, void*, void*, void*, int, void*>(
-            1u, &sg /*shaderglobals_ptr*/, params /*groupdata_ptr*/,
-            nullptr /*userdata_base_ptr*/, nullptr /*output_base_ptr*/,
-            0 /*shadeindex - unused*/,
-            sbtdata->data /*interactive_params_ptr*/);
+        optixDirectCall<void, OSL_CUDA::ShaderGlobals*, void*, void*, void*,
+                        int, void*>(1u, &sg /*shaderglobals_ptr*/,
+                                    params /*groupdata_ptr*/,
+                                    nullptr /*userdata_base_ptr*/,
+                                    nullptr /*output_base_ptr*/,
+                                    0 /*shadeindex - unused*/,
+                                    sbtdata->data /*interactive_params_ptr*/);
     }
 
     float* f_output      = (float*)params;
@@ -171,8 +177,11 @@ __raygen__()
 // Because clang++ 9.0 seems to have trouble with some of the texturing "intrinsics"
 // let's do the texture look-ups in this file.
 extern "C" __device__ float4
-osl_tex2DLookup(void* handle, float s, float t)
+osl_tex2DLookup(void* handle, float s, float t, float dsdx, float dtdx,
+                float dsdy, float dtdy)
 {
+    const float2 dx           = { dsdx, dtdx };
+    const float2 dy           = { dsdy, dtdy };
     cudaTextureObject_t texID = cudaTextureObject_t(handle);
-    return tex2D<float4>(texID, s, t);
+    return tex2DGrad<float4>(texID, s, t, dx, dy);
 }
diff --git a/src/testshade/optixgridrender.cpp b/src/testshade/optixgridrender.cpp
index 3adb1ff52..877a5691f 100644
--- a/src/testshade/optixgridrender.cpp
+++ b/src/testshade/optixgridrender.cpp
@@ -14,7 +14,7 @@
 #include "render_params.h"
 
 #include <cuda.h>
-#include <nvrtc.h>
+#include <cuda_runtime.h>
 #include <optix_function_table_definition.h>
 #include <optix_stack_size.h>
 #include <optix_stubs.h>
@@ -33,6 +33,9 @@ const auto optixModuleCreateFn = optixModuleCreate;
 #endif
 
 
+using namespace testshade;
+
+
 OSL_NAMESPACE_ENTER
 
 
@@ -83,6 +86,11 @@ OSL_NAMESPACE_ENTER
     }
 
 
+#define DEVICE_ALLOC(size) reinterpret_cast<CUdeviceptr>(device_alloc(size))
+#define COPY_TO_DEVICE(dst_device, src_host, size) \
+    copy_to_device(reinterpret_cast<void*>(dst_device), src_host, size)
+
+
 static void
 context_log_cb(unsigned int level, const char* tag, const char* message,
                void* /*cbdata */)
@@ -177,10 +185,12 @@ OptixGridRenderer::load_ptx_file(string_view filename)
 
 OptixGridRenderer::~OptixGridRenderer()
 {
-    for (void* p : m_ptrs_to_free)
-        cudaFree(p);
     if (m_optix_ctx)
         OPTIX_CHECK(optixDeviceContextDestroy(m_optix_ctx));
+    for (CUdeviceptr ptr : m_ptrs_to_free)
+        cudaFree(reinterpret_cast<void*>(ptr));
+    for (cudaArray_t arr : m_arrays_to_free)
+        cudaFreeArray(arr);
 }
 
 
@@ -247,30 +257,24 @@ OptixGridRenderer::synch_attributes()
         const size_t podDataSize = cpuDataSize
                                    - sizeof(ustringhash) * numStrings;
 
-        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_color_system),
-                              podDataSize + sizeof(uint64_t) * numStrings));
+        d_color_system = DEVICE_ALLOC(podDataSize
+                                      + sizeof(uint64_t) * numStrings);
         CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_color_system), colorSys,
                               podDataSize, cudaMemcpyHostToDevice));
-        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_osl_printf_buffer),
-                              OSL_PRINTF_BUFFER_SIZE));
+        d_osl_printf_buffer = DEVICE_ALLOC(OSL_PRINTF_BUFFER_SIZE);
         CUDA_CHECK(cudaMemset(reinterpret_cast<void*>(d_osl_printf_buffer), 0,
                               OSL_PRINTF_BUFFER_SIZE));
 
         // Transforms
-        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_object2common),
-                              sizeof(OSL::Matrix44)));
+        d_object2common = DEVICE_ALLOC(sizeof(OSL::Matrix44));
         CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_object2common),
                               &m_object2common, sizeof(OSL::Matrix44),
                               cudaMemcpyHostToDevice));
-        CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_shader2common),
-                              sizeof(OSL::Matrix44)));
+        d_shader2common = DEVICE_ALLOC(sizeof(OSL::Matrix44));
         CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_shader2common),
                               &m_shader2common, sizeof(OSL::Matrix44),
                               cudaMemcpyHostToDevice));
 
-        m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_color_system));
-        m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_osl_printf_buffer));
-
         // then copy the device string to the end, first strings starting at dataPtr - (numStrings)
         // FIXME -- Should probably handle alignment better.
         const ustringhash* cpuStringHash
@@ -725,26 +729,12 @@ OptixGridRenderer::make_optix_materials()
     setglobals_raygenRecord.data = nullptr;
     setglobals_missRecord.data   = nullptr;
 
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_raygenRecord),
-                          sizeof(GenericRecord)));
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_missRecord),
-                          sizeof(GenericRecord)));
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_hitgroupRecord),
-                          sizeof(GenericRecord)));
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_callablesRecord),
-                          callables * sizeof(GenericRecord)));
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_setglobals_raygenRecord),
-                          sizeof(GenericRecord)));
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_setglobals_missRecord),
-                          sizeof(GenericRecord)));
-
-    m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_raygenRecord));
-    m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_missRecord));
-    m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_hitgroupRecord));
-    m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_callablesRecord));
-    m_ptrs_to_free.push_back(
-        reinterpret_cast<void*>(d_setglobals_raygenRecord));
-    m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_setglobals_missRecord));
+    d_raygenRecord            = DEVICE_ALLOC(sizeof(GenericRecord));
+    d_missRecord              = DEVICE_ALLOC(sizeof(GenericRecord));
+    d_hitgroupRecord          = DEVICE_ALLOC(sizeof(GenericRecord));
+    d_callablesRecord         = DEVICE_ALLOC(callables * sizeof(GenericRecord));
+    d_setglobals_raygenRecord = DEVICE_ALLOC(sizeof(GenericRecord));
+    d_setglobals_missRecord   = DEVICE_ALLOC(sizeof(GenericRecord));
 
     CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_raygenRecord),
                           &raygenRecord, sizeof(GenericRecord),
@@ -817,53 +807,82 @@ OptixGridRenderer::get_texture_handle(ustring filename,
 {
     auto itr = m_samplers.find(filename);
     if (itr == m_samplers.end()) {
-        // Open image
+        // Open image to check the number of mip levels
         OIIO::ImageBuf image;
         if (!image.init_spec(filename, 0, 0)) {
             errhandler().errorfmt("Could not load: {} (hash {})", filename,
                                   filename);
             return (TextureHandle*)nullptr;
         }
-
-        OIIO::ROI roi = OIIO::get_roi_full(image.spec());
-        int32_t width = roi.width(), height = roi.height();
-        std::vector<float> pixels(width * height * 4);
-
-        for (int j = 0; j < height; j++) {
-            for (int i = 0; i < width; i++) {
-                image.getpixel(i, j, 0, &pixels[((j * width) + i) * 4 + 0]);
-            }
-        }
-        cudaResourceDesc res_desc = {};
+        int32_t nmiplevels = std::max(image.nmiplevels(), 1);
+        int32_t img_width  = image.xmax() + 1;
+        int32_t img_height = image.ymax() + 1;
 
         // hard-code textures to 4 channels
-        int32_t pitch = width * 4 * sizeof(float);
         cudaChannelFormatDesc channel_desc
             = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindFloat);
 
-        cudaArray_t pixelArray;
-        CUDA_CHECK(cudaMallocArray(&pixelArray, &channel_desc, width, height));
+        cudaMipmappedArray_t mipmapArray;
+        cudaExtent extent = make_cudaExtent(img_width, img_height, 0);
+        CUDA_CHECK(cudaMallocMipmappedArray(&mipmapArray, &channel_desc, extent,
+                                            nmiplevels));
+
+        // Copy the pixel data for each mip level
+        std::vector<std::vector<float>> level_pixels(nmiplevels);
+        for (int32_t level = 0; level < nmiplevels; ++level) {
+            image.reset(filename, 0, level);
+            OIIO::ROI roi = OIIO::get_roi_full(image.spec());
+            if (!roi.defined()) {
+                errhandler().errorfmt(
+                    "Could not load mip level {}: {} (hash {})", level,
+                    filename, filename);
+                return (TextureHandle*)nullptr;
+            }
 
-        m_ptrs_to_free.push_back(reinterpret_cast<void*>(pixelArray));
+            int32_t width = roi.width(), height = roi.height();
+            level_pixels[level].resize(width * height * 4);
+            for (int j = 0; j < height; j++) {
+                for (int i = 0; i < width; i++) {
+                    image.getpixel(i, j, 0,
+                                   &level_pixels[level][((j * width) + i) * 4]);
+                }
+            }
 
-        CUDA_CHECK(cudaMemcpy2DToArray(pixelArray,
-                                       /* offset */ 0, 0, pixels.data(), pitch,
-                                       pitch, height, cudaMemcpyHostToDevice));
+            cudaArray_t miplevelArray;
+            CUDA_CHECK(
+                cudaGetMipmappedArrayLevel(&miplevelArray, mipmapArray, level));
 
-        res_desc.resType         = cudaResourceTypeArray;
-        res_desc.res.array.array = pixelArray;
+            // Copy the texel data into the miplevel array
+            int32_t pitch = width * 4 * sizeof(float);
+            CUDA_CHECK(cudaMemcpy2DToArray(miplevelArray, 0, 0,
+                                           level_pixels[level].data(), pitch,
+                                           pitch, height,
+                                           cudaMemcpyHostToDevice));
+        }
 
-        cudaTextureDesc tex_desc = {};
-        tex_desc.addressMode[0]  = cudaAddressModeWrap;
-        tex_desc.addressMode[1]  = cudaAddressModeWrap;
-        tex_desc.filterMode      = cudaFilterModeLinear;
-        tex_desc.readMode
-            = cudaReadModeElementType;  //cudaReadModeNormalizedFloat;
+        int32_t pitch = img_width * 4 * sizeof(float);
+        cudaArray_t pixelArray;
+        CUDA_CHECK(
+            cudaMallocArray(&pixelArray, &channel_desc, img_width, img_height));
+        CUDA_CHECK(cudaMemcpy2DToArray(pixelArray, 0, 0, level_pixels[0].data(),
+                                       pitch, pitch, img_height,
+                                       cudaMemcpyHostToDevice));
+        m_arrays_to_free.push_back(pixelArray);
+
+        cudaResourceDesc res_desc  = {};
+        res_desc.resType           = cudaResourceTypeMipmappedArray;
+        res_desc.res.mipmap.mipmap = mipmapArray;
+
+        cudaTextureDesc tex_desc     = {};
+        tex_desc.addressMode[0]      = cudaAddressModeWrap;
+        tex_desc.addressMode[1]      = cudaAddressModeWrap;
+        tex_desc.filterMode          = cudaFilterModeLinear;
+        tex_desc.readMode            = cudaReadModeElementType;
         tex_desc.normalizedCoords    = 1;
         tex_desc.maxAnisotropy       = 1;
-        tex_desc.maxMipmapLevelClamp = 99;
+        tex_desc.maxMipmapLevelClamp = float(nmiplevels - 1);
         tex_desc.minMipmapLevelClamp = 0;
-        tex_desc.mipmapFilterMode    = cudaFilterModePoint;
+        tex_desc.mipmapFilterMode    = cudaFilterModeLinear;
         tex_desc.borderColor[0]      = 1.0f;
         tex_desc.sRGB                = 0;
 
@@ -907,13 +926,8 @@ OptixGridRenderer::warmup()
 void
 OptixGridRenderer::render(int xres OSL_MAYBE_UNUSED, int yres OSL_MAYBE_UNUSED)
 {
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_output_buffer),
-                          xres * yres * 4 * sizeof(float)));
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_launch_params),
-                          sizeof(RenderParams)));
-
-    m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_output_buffer));
-    m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_launch_params));
+    d_output_buffer = DEVICE_ALLOC(xres * yres * 4 * sizeof(float));
+    d_launch_params = DEVICE_ALLOC(sizeof(RenderParams));
 
     m_xres = xres;
     m_yres = yres;
@@ -1119,19 +1133,15 @@ OptixGridRenderer::register_named_transforms()
     }
 
     // Push the names and transforms to the device
-    size_t sz = sizeof(uint64_t) * xform_name_buffer.size();
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_xform_name_buffer), sz));
+    size_t sz           = sizeof(uint64_t) * xform_name_buffer.size();
+    d_xform_name_buffer = DEVICE_ALLOC(sz);
     CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_xform_name_buffer),
                           xform_name_buffer.data(), sz,
                           cudaMemcpyHostToDevice));
-    m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_xform_name_buffer));
-
-    sz = sizeof(OSL::Matrix44) * xform_buffer.size();
-    CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_xform_buffer), sz));
+    sz             = sizeof(OSL::Matrix44) * xform_buffer.size();
+    d_xform_buffer = DEVICE_ALLOC(sz);
     CUDA_CHECK(cudaMemcpy(reinterpret_cast<void*>(d_xform_buffer),
                           xform_buffer.data(), sz, cudaMemcpyHostToDevice));
-    m_ptrs_to_free.push_back(reinterpret_cast<void*>(d_xform_buffer));
-
     m_num_named_xforms = xform_name_buffer.size();
 }
 
diff --git a/src/testshade/optixgridrender.h b/src/testshade/optixgridrender.h
index 5f630f922..9cb2719fd 100644
--- a/src/testshade/optixgridrender.h
+++ b/src/testshade/optixgridrender.h
@@ -104,7 +104,8 @@ class OptixGridRenderer final : public SimpleRenderer {
     OSL::Matrix44 m_object2common;  // "object" space to "common" space matrix
 
     // CUdeviceptrs that need to be freed after we are done
-    std::vector<void*> m_ptrs_to_free;
+    std::vector<CUdeviceptr> m_ptrs_to_free;
+    std::vector<cudaArray_t> m_arrays_to_free;
 };
 
 
diff --git a/src/testshade/render_params.h b/src/testshade/render_params.h
index c2e9c26c9..6670fe099 100644
--- a/src/testshade/render_params.h
+++ b/src/testshade/render_params.h
@@ -4,6 +4,8 @@
 
 #pragma once
 
+namespace testshade {
+
 struct RenderParams {
     float invw;
     float invh;
@@ -40,3 +42,5 @@ struct GenericRecord {
     // What follows should duplicate GenericData
     void* data;
 };
+
+}  // namespace testshade
diff --git a/src/testshade/rs_simplerend.cpp b/src/testshade/rs_simplerend.cpp
index 0cf9b8ca8..13c3ce45c 100644
--- a/src/testshade/rs_simplerend.cpp
+++ b/src/testshade/rs_simplerend.cpp
@@ -188,7 +188,8 @@ rs_transform_points(OSL::OpaqueExecContextPtr /*ec*/, OSL::ustringhash /*from*/,
 // doesn't know how to handle CUDA texture intrinsics. This function must be
 // defined in the CUDA source for testshade and testrender.
 extern "C" __device__ float4
-osl_tex2DLookup(void* handle, float s, float t);
+osl_tex2DLookup(void* handle, float s, float t, float dsdx, float dtdx,
+                float dsdy, float dtdy);
 #endif
 
 OSL_RSOP OSL_HOSTDEVICE bool
@@ -208,7 +209,8 @@ rs_texture(OSL::OpaqueExecContextPtr ec, OSL::ustringhash filename,
 #else
     if (!texture_handle)
         return false;
-    const float4 fromTexture = osl_tex2DLookup((void*)texture_handle, s, t);
+    const float4 fromTexture = osl_tex2DLookup(texture_handle, s, t, dsdx, dtdx,
+                                               dsdy, dtdy);
     *((float3*)result)       = make_float3(fromTexture.x, fromTexture.y,
                                            fromTexture.z);
     return true;
diff --git a/testsuite/render-background/OPTIX b/testsuite/render-background/OPTIX
new file mode 100644
index 000000000..e69de29bb
diff --git a/testsuite/render-background/OPTIX_OPTIMIZEONLY b/testsuite/render-background/OPTIX_OPTIMIZEONLY
new file mode 100644
index 000000000..e69de29bb
diff --git a/testsuite/render-background/ref/out-optix-alt.exr b/testsuite/render-background/ref/out-optix-alt.exr
new file mode 100644
index 000000000..452ced070
Binary files /dev/null and b/testsuite/render-background/ref/out-optix-alt.exr differ
diff --git a/testsuite/render-background/run.py b/testsuite/render-background/run.py
index 0da23d621..b3122b811 100755
--- a/testsuite/render-background/run.py
+++ b/testsuite/render-background/run.py
@@ -7,6 +7,8 @@
 failthresh = 0.01
 failpercent = 1
 hardfail = 0.11
+allowfailures = 5
+idiff_program = "idiff"
 
 outputs = [ "out.exr" ]
 command = testrender("-r 320 240 -aa 4 scene.xml out.exr")
diff --git a/testsuite/render-bumptest/OPTIX b/testsuite/render-bumptest/OPTIX
new file mode 100644
index 000000000..e69de29bb
diff --git a/testsuite/render-bumptest/ref/out-optix-alt.exr b/testsuite/render-bumptest/ref/out-optix-alt.exr
new file mode 100644
index 000000000..3371916bd
Binary files /dev/null and b/testsuite/render-bumptest/ref/out-optix-alt.exr differ
diff --git a/testsuite/render-bunny/OPTIX b/testsuite/render-bunny/OPTIX
new file mode 100644
index 000000000..e69de29bb
diff --git a/testsuite/render-bunny/ref/out-optix-alt.exr b/testsuite/render-bunny/ref/out-optix-alt.exr
new file mode 100644
index 000000000..db5e84a67
Binary files /dev/null and b/testsuite/render-bunny/ref/out-optix-alt.exr differ
diff --git a/testsuite/render-cornell/OPTIX b/testsuite/render-cornell/OPTIX
new file mode 100644
index 000000000..e69de29bb
diff --git a/testsuite/render-cornell/ref/out-optix-alt.exr b/testsuite/render-cornell/ref/out-optix-alt.exr
new file mode 100644
index 000000000..d5a2355b2
Binary files /dev/null and b/testsuite/render-cornell/ref/out-optix-alt.exr differ
diff --git a/testsuite/render-furnace-diffuse/OPTIX b/testsuite/render-furnace-diffuse/OPTIX
new file mode 100644
index 000000000..e69de29bb
diff --git a/testsuite/render-furnace-diffuse/ref/out-optix-alt.exr b/testsuite/render-furnace-diffuse/ref/out-optix-alt.exr
new file mode 100644
index 000000000..dce1374be
Binary files /dev/null and b/testsuite/render-furnace-diffuse/ref/out-optix-alt.exr differ
diff --git a/testsuite/render-material-layer/OPTIX b/testsuite/render-material-layer/OPTIX
new file mode 100644
index 000000000..e69de29bb
diff --git a/testsuite/render-microfacet/OPTIX b/testsuite/render-microfacet/OPTIX
new file mode 100644
index 000000000..e69de29bb
diff --git a/testsuite/render-microfacet/OPTIX_OPTIMIZEONLY b/testsuite/render-microfacet/OPTIX_OPTIMIZEONLY
new file mode 100644
index 000000000..e69de29bb
diff --git a/testsuite/render-microfacet/ref/out-optix-alt.exr b/testsuite/render-microfacet/ref/out-optix-alt.exr
new file mode 100644
index 000000000..d6bc304bf
Binary files /dev/null and b/testsuite/render-microfacet/ref/out-optix-alt.exr differ
diff --git a/testsuite/render-microfacet/run.py b/testsuite/render-microfacet/run.py
index cf55f64ff..1dac3ca98 100755
--- a/testsuite/render-microfacet/run.py
+++ b/testsuite/render-microfacet/run.py
@@ -6,6 +6,8 @@
 
 failthresh = 0.02
 failpercent = 1
+allowfailures = 5
+idiff_program = "idiff"
 
 outputs = [ "out.exr" ]
 command = testrender("-r 320 240 -aa 8 scene.xml out.exr")
diff --git a/testsuite/render-mx-burley-diffuse/OPTIX b/testsuite/render-mx-burley-diffuse/OPTIX
new file mode 100644
index 000000000..e69de29bb
diff --git a/testsuite/render-mx-burley-diffuse/ref/out-optix-alt.exr b/testsuite/render-mx-burley-diffuse/ref/out-optix-alt.exr
new file mode 100644
index 000000000..5ef99dd1c
Binary files /dev/null and b/testsuite/render-mx-burley-diffuse/ref/out-optix-alt.exr differ
diff --git a/testsuite/render-mx-conductor/OPTIX b/testsuite/render-mx-conductor/OPTIX
new file mode 100644
index 000000000..e69de29bb
diff --git a/testsuite/render-mx-conductor/ref/out-optix-alt.exr b/testsuite/render-mx-conductor/ref/out-optix-alt.exr
new file mode 100644
index 000000000..b97249a52
Binary files /dev/null and b/testsuite/render-mx-conductor/ref/out-optix-alt.exr differ
diff --git a/testsuite/render-mx-conductor/run.py b/testsuite/render-mx-conductor/run.py
index b01c78cc6..ba901ef83 100755
--- a/testsuite/render-mx-conductor/run.py
+++ b/testsuite/render-mx-conductor/run.py
@@ -7,6 +7,8 @@
 failthresh = 0.01
 failpercent = 1
 hardfail = 0.025
+allowfailures = 5
+idiff_program = "idiff"
 
 outputs = [ "out.exr" ]
 command = testrender("-v -r 320 240 -aa 16 scene.xml out.exr")
diff --git a/testsuite/render-mx-dielectric-glass/OPTIX b/testsuite/render-mx-dielectric-glass/OPTIX
new file mode 100644
index 000000000..e69de29bb
diff --git a/testsuite/render-mx-dielectric-glass/ref/out-optix-alt.exr b/testsuite/render-mx-dielectric-glass/ref/out-optix-alt.exr
new file mode 100644
index 000000000..a123df055
Binary files /dev/null and b/testsuite/render-mx-dielectric-glass/ref/out-optix-alt.exr differ
diff --git a/testsuite/render-mx-dielectric/OPTIX b/testsuite/render-mx-dielectric/OPTIX
new file mode 100644
index 000000000..e69de29bb
diff --git a/testsuite/render-mx-dielectric/ref/out-optix-alt.exr b/testsuite/render-mx-dielectric/ref/out-optix-alt.exr
new file mode 100644
index 000000000..527bdf88d
Binary files /dev/null and b/testsuite/render-mx-dielectric/ref/out-optix-alt.exr differ
diff --git a/testsuite/render-mx-furnace-burley-diffuse/OPTIX b/testsuite/render-mx-furnace-burley-diffuse/OPTIX
new file mode 100644
index 000000000..e69de29bb
diff --git a/testsuite/render-mx-furnace-burley-diffuse/ref/out-optix-alt.exr b/testsuite/render-mx-furnace-burley-diffuse/ref/out-optix-alt.exr
new file mode 100644
index 000000000..55a223787
Binary files /dev/null and b/testsuite/render-mx-furnace-burley-diffuse/ref/out-optix-alt.exr differ
diff --git a/testsuite/render-mx-furnace-oren-nayar/OPTIX b/testsuite/render-mx-furnace-oren-nayar/OPTIX
new file mode 100644
index 000000000..e69de29bb
diff --git a/testsuite/render-mx-furnace-oren-nayar/ref/out-optix-alt.exr b/testsuite/render-mx-furnace-oren-nayar/ref/out-optix-alt.exr
new file mode 100644
index 000000000..9e7a50566
Binary files /dev/null and b/testsuite/render-mx-furnace-oren-nayar/ref/out-optix-alt.exr differ
diff --git a/testsuite/render-mx-furnace-sheen/OPTIX b/testsuite/render-mx-furnace-sheen/OPTIX
new file mode 100644
index 000000000..e69de29bb
diff --git a/testsuite/render-mx-furnace-sheen/ref/out-optix-alt.exr b/testsuite/render-mx-furnace-sheen/ref/out-optix-alt.exr
new file mode 100644
index 000000000..610f308c6
Binary files /dev/null and b/testsuite/render-mx-furnace-sheen/ref/out-optix-alt.exr differ
diff --git a/testsuite/render-mx-generalized-schlick-glass/OPTIX b/testsuite/render-mx-generalized-schlick-glass/OPTIX
new file mode 100644
index 000000000..e69de29bb
diff --git a/testsuite/render-mx-generalized-schlick-glass/ref/out-optix-alt.exr b/testsuite/render-mx-generalized-schlick-glass/ref/out-optix-alt.exr
new file mode 100644
index 000000000..0cdbd7065
Binary files /dev/null and b/testsuite/render-mx-generalized-schlick-glass/ref/out-optix-alt.exr differ
diff --git a/testsuite/render-mx-generalized-schlick/OPTIX b/testsuite/render-mx-generalized-schlick/OPTIX
new file mode 100644
index 000000000..e69de29bb
diff --git a/testsuite/render-mx-generalized-schlick/ref/out-optix-alt.exr b/testsuite/render-mx-generalized-schlick/ref/out-optix-alt.exr
new file mode 100644
index 000000000..d9d54c51e
Binary files /dev/null and b/testsuite/render-mx-generalized-schlick/ref/out-optix-alt.exr differ
diff --git a/testsuite/render-mx-layer/OPTIX b/testsuite/render-mx-layer/OPTIX
new file mode 100644
index 000000000..e69de29bb
diff --git a/testsuite/render-mx-layer/ref/out-optix-alt.exr b/testsuite/render-mx-layer/ref/out-optix-alt.exr
new file mode 100644
index 000000000..185e9c72d
Binary files /dev/null and b/testsuite/render-mx-layer/ref/out-optix-alt.exr differ
diff --git a/testsuite/render-mx-layer/run.py b/testsuite/render-mx-layer/run.py
index 7688fd939..62f9b4aae 100755
--- a/testsuite/render-mx-layer/run.py
+++ b/testsuite/render-mx-layer/run.py
@@ -6,5 +6,8 @@
 
 failthresh = 0.01
 failpercent = 1
+allowfailures = 5
+idiff_program = "idiff"
+
 outputs = [ "out.exr" ]
 command = testrender("-r 320 240 -aa 6 scene.xml out.exr")
diff --git a/testsuite/render-mx-sheen/OPTIX b/testsuite/render-mx-sheen/OPTIX
new file mode 100644
index 000000000..e69de29bb
diff --git a/testsuite/render-mx-sheen/ref/out-optix-alt.exr b/testsuite/render-mx-sheen/ref/out-optix-alt.exr
new file mode 100644
index 000000000..c1e337907
Binary files /dev/null and b/testsuite/render-mx-sheen/ref/out-optix-alt.exr differ
diff --git a/testsuite/render-mx-sheen/run.py b/testsuite/render-mx-sheen/run.py
index 7688fd939..62f9b4aae 100755
--- a/testsuite/render-mx-sheen/run.py
+++ b/testsuite/render-mx-sheen/run.py
@@ -6,5 +6,8 @@
 
 failthresh = 0.01
 failpercent = 1
+allowfailures = 5
+idiff_program = "idiff"
+
 outputs = [ "out.exr" ]
 command = testrender("-r 320 240 -aa 6 scene.xml out.exr")
diff --git a/testsuite/render-oren-nayar/OPTIX b/testsuite/render-oren-nayar/OPTIX
new file mode 100644
index 000000000..e69de29bb
diff --git a/testsuite/render-oren-nayar/OPTIX_CACHE/optix7cache.db b/testsuite/render-oren-nayar/OPTIX_CACHE/optix7cache.db
new file mode 100644
index 000000000..feb40a1c6
Binary files /dev/null and b/testsuite/render-oren-nayar/OPTIX_CACHE/optix7cache.db differ
diff --git a/testsuite/render-oren-nayar/ref/out-optix-alt.exr b/testsuite/render-oren-nayar/ref/out-optix-alt.exr
new file mode 100644
index 000000000..ee707b950
Binary files /dev/null and b/testsuite/render-oren-nayar/ref/out-optix-alt.exr differ
diff --git a/testsuite/render-raytypes/OPTIX b/testsuite/render-raytypes/OPTIX
new file mode 100644
index 000000000..e69de29bb
diff --git a/testsuite/render-raytypes/ref/out-optix-alt.exr b/testsuite/render-raytypes/ref/out-optix-alt.exr
new file mode 100644
index 000000000..d9703c510
Binary files /dev/null and b/testsuite/render-raytypes/ref/out-optix-alt.exr differ
diff --git a/testsuite/render-uv/OPTIX b/testsuite/render-uv/OPTIX
new file mode 100644
index 000000000..e69de29bb
diff --git a/testsuite/render-uv/OPTIX_OPTIMIZEONLY b/testsuite/render-uv/OPTIX_OPTIMIZEONLY
new file mode 100644
index 000000000..e69de29bb
diff --git a/testsuite/render-uv/ref/out-optix-alt.exr b/testsuite/render-uv/ref/out-optix-alt.exr
new file mode 100644
index 000000000..90f5567ee
Binary files /dev/null and b/testsuite/render-uv/ref/out-optix-alt.exr differ
diff --git a/testsuite/render-veachmis/OPTIX b/testsuite/render-veachmis/OPTIX
new file mode 100644
index 000000000..e69de29bb
diff --git a/testsuite/render-veachmis/ref/out-optix-alt.exr b/testsuite/render-veachmis/ref/out-optix-alt.exr
new file mode 100644
index 000000000..120ee76a7
Binary files /dev/null and b/testsuite/render-veachmis/ref/out-optix-alt.exr differ
diff --git a/testsuite/render-ward/OPTIX b/testsuite/render-ward/OPTIX
new file mode 100644
index 000000000..e69de29bb
diff --git a/testsuite/render-ward/ref/out-optix-alt.exr b/testsuite/render-ward/ref/out-optix-alt.exr
new file mode 100644
index 000000000..97a08c9d9
Binary files /dev/null and b/testsuite/render-ward/ref/out-optix-alt.exr differ
diff --git a/testsuite/testoptix-noise/ref/out.exr b/testsuite/testoptix-noise/ref/out.exr
index 193abb522..8de6deae5 100644
Binary files a/testsuite/testoptix-noise/ref/out.exr and b/testsuite/testoptix-noise/ref/out.exr differ
diff --git a/testsuite/testoptix-noise/ref/out_02.exr b/testsuite/testoptix-noise/ref/out_02.exr
index 185c6f046..b4070fd73 100644
Binary files a/testsuite/testoptix-noise/ref/out_02.exr and b/testsuite/testoptix-noise/ref/out_02.exr differ
diff --git a/testsuite/testoptix-noise/run.py b/testsuite/testoptix-noise/run.py
index 865b73183..3a787e3dd 100755
--- a/testsuite/testoptix-noise/run.py
+++ b/testsuite/testoptix-noise/run.py
@@ -7,5 +7,5 @@
 failthresh = 0.03   # allow a little more LSB noise between platforms
 failpercent = .5
 outputs  = [ "out.exr", "out_02.exr" ]
-command  = testrender("-optix -res 320 240 scene.xml out.exr")
-command += testrender("-optix -res 320 240 scene_02.xml out_02.exr")
+command  = testrender("-optix -res 320 240 -no-jitter -albedo 1.0 scene.xml out.exr")
+command += testrender("-optix -res 320 240 -no-jitter -albedo 1.0 scene_02.xml out_02.exr")
diff --git a/testsuite/testoptix-reparam/ref/out.exr b/testsuite/testoptix-reparam/ref/out.exr
index fc1006e04..b8eb11c8e 100644
Binary files a/testsuite/testoptix-reparam/ref/out.exr and b/testsuite/testoptix-reparam/ref/out.exr differ
diff --git a/testsuite/testoptix-reparam/run.py b/testsuite/testoptix-reparam/run.py
index ef900eae8..b196355b0 100755
--- a/testsuite/testoptix-reparam/run.py
+++ b/testsuite/testoptix-reparam/run.py
@@ -7,4 +7,4 @@
 failthresh = 0.03   # allow a little more LSB noise between platforms
 failpercent = .5
 outputs  = [ "out.exr" ]
-command  = testrender("-optix -res 320 240 scene.xml out.exr")
+command  = testrender("-optix -res 320 240 -no-jitter -albedo 1.0 scene.xml out.exr")
diff --git a/testsuite/testoptix/ref/out.exr b/testsuite/testoptix/ref/out.exr
index b6e346156..9f6bc6fa8 100644
Binary files a/testsuite/testoptix/ref/out.exr and b/testsuite/testoptix/ref/out.exr differ
diff --git a/testsuite/testoptix/ref/test_microfacet_dist.exr b/testsuite/testoptix/ref/test_microfacet_dist.exr
deleted file mode 100644
index 95655f1d8..000000000
Binary files a/testsuite/testoptix/ref/test_microfacet_dist.exr and /dev/null differ
diff --git a/testsuite/testoptix/ref/test_spline.exr b/testsuite/testoptix/ref/test_spline.exr
index 3b4c9a374..4405d8f4e 100644
Binary files a/testsuite/testoptix/ref/test_spline.exr and b/testsuite/testoptix/ref/test_spline.exr differ
diff --git a/testsuite/testoptix/ref/test_texture.exr b/testsuite/testoptix/ref/test_texture.exr
index 53a882084..22b17905b 100644
Binary files a/testsuite/testoptix/ref/test_texture.exr and b/testsuite/testoptix/ref/test_texture.exr differ
diff --git a/testsuite/testoptix/run.py b/testsuite/testoptix/run.py
index 833915c27..78148d793 100755
--- a/testsuite/testoptix/run.py
+++ b/testsuite/testoptix/run.py
@@ -6,15 +6,14 @@
 
 failthresh = 0.03   # allow a little more LSB noise between platforms
 failpercent = .5
-outputs  = [ "out.exr", "test_microfacet_dist.exr", "test_texture.exr", "test_spline.exr", "out.txt" ]
-command  = testrender("-optix -res 320 240 scene.xml out.exr")
-command += testrender("-optix -res 320 240 test_microfacet_dist.xml test_microfacet_dist.exr")
-command += testrender("-optix -res 1 1 test_print.xml dummy.exr")
-command += testrender("-optix -res 1 1 test_compare.xml dummy.exr")
-command += testrender("-optix -res 1 1 test_assign.xml dummy.exr")
-command += testrender("-optix -res 1 1 test_assign_02.xml dummy.exr")
-command += testrender("-optix -res 1 1 test_str_ops.xml dummy.exr")
-command += testrender("-optix -res 1 1 test_userdata_string.xml dummy.exr")
+outputs  = [ "out.exr", "test_texture.exr", "test_spline.exr", "out.txt" ]
+command  = testrender("-optix -res 320 240 -no-jitter -albedo 1.0 scene.xml out.exr")
+command += testrender("-optix -res 1 1 -no-jitter -albedo 1.0 test_print.xml dummy.exr")
+command += testrender("-optix -res 1 1 -no-jitter -albedo 1.0 test_compare.xml dummy.exr")
+command += testrender("-optix -res 1 1 -no-jitter -albedo 1.0 test_assign.xml dummy.exr")
+command += testrender("-optix -res 1 1 -no-jitter -albedo 1.0 test_assign_02.xml dummy.exr")
+command += testrender("-optix -res 1 1 -no-jitter -albedo 1.0 test_str_ops.xml dummy.exr")
+command += testrender("-optix -res 1 1 -no-jitter -albedo 1.0 test_userdata_string.xml dummy.exr")
 
 command += testshade("-optix -res 256 256 test_spline -o Cout test_spline.exr")
 command += testshade("-optix -res 512 512 test_texture -o Cout test_texture.exr")