chellmuth · chellmuth · Feb 4, 2025 · chellmuth · Feb 4, 2025 · chellmuth
diff --git a/src/include/OSL/rendererservices.h b/src/include/OSL/rendererservices.h
@@ -595,6 +595,30 @@ class OSLEXECPUBLIC RendererServices {
         }
     };
 
+    /// Return true if the renderer provides an OptiX cache through the
+    /// optix_cache_insert() and optix_cache_get() hooks. The shading system
+    /// checks this before calling either hook. The default is false.
+    virtual bool optix_cache_enabled() const { return false; }
+
+    /// Store `value` in the renderer's cache under `key`. The shading system
+    /// calls this after compiling a shader group for OptiX, passing the
+    /// group's cache key and its PTX prefixed by a comment line that holds
+    /// the group data size. The default implementation does nothing.
+    virtual void optix_cache_insert(const std::string& key,
+                                    const std::string& value) const
+    {
+    }
+
+    /// Look up `key` in the renderer's cache. A true return tells the
+    /// shading system that `value` now holds the cached string, which it
+    /// then uses instead of running the JIT. The default implementation
+    /// returns false.
+    virtual bool optix_cache_get(const std::string& key,
+                                 std::string& value) const
+    {
+        return false;
+    }
+
     /// A renderer may choose to support batched execution by providing pointers
     /// to objects satisfying the BatchedRendererServices<WidthOf<#>> interface
     /// for specific batch sizes.

diff --git a/src/liboslexec/instance.cpp b/src/liboslexec/instance.cpp
@@ -848,13 +848,46 @@ ShaderGroup::setup_interactive_arena(cspan<uint8_t> paramblock)
 
 
 
+/// Build the key under which this group's compiled PTX is looked up and
+/// stored in the renderer's OptiX cache, save it in m_optix_cache_key, and
+/// return it. The key is "cache-osl-<name>-<hash>": <name> is the group name
+/// with '/' and ':' replaced by '_', <hash> is Strutil::strhash() of the
+/// group's serialized text.
+/// NOTE(review): calls serialize_internal(), which does not lock m_mutex --
+/// presumably the caller already holds it; confirm.
+std::string
+ShaderGroup::generate_optix_cache_key()
+{
+    const uint64_t ir_key = Strutil::strhash(serialize_internal());
+
+    std::string safegroup;
+    safegroup = Strutil::replace(name(), "/", "_", true);
+    safegroup = Strutil::replace(safegroup, ":", "_", true);
+
+    std::string cache_key = fmtformat("cache-osl-{}-{}", safegroup, ir_key);
+
+    // Keep a copy so optix_cache_key() can return it later.
+    m_optix_cache_key = cache_key;
+    return m_optix_cache_key;
+}
+
+/// Return the serialized form of the group. Holds m_mutex while
+/// serialize_internal() does the actual work.
 std::string
 ShaderGroup::serialize() const
+{
+    lock_guard lock(m_mutex);
+    return serialize_internal();
+}
+
+/// Implementation of serialize() that does not acquire m_mutex itself.
+std::string
+ShaderGroup::serialize_internal() const
 {
     std::ostringstream out;
     out.imbue(std::locale::classic());  // force C locale
     out.precision(9);
-    lock_guard lock(m_mutex);
+
     for (int i = 0, nl = nlayers(); i < nl; ++i) {
         const ShaderInstance* inst = m_layers[i].get();
 

diff --git a/src/liboslexec/llvm_instance.cpp b/src/liboslexec/llvm_instance.cpp
@@ -2150,6 +2150,7 @@ BackendLLVM::run()
         ll.prune_and_internalize_module(external_functions);
     }
 
+
     // Debug code to dump the pre-optimized bitcode to a file
     if (llvm_debug() >= 2 || shadingsys().llvm_output_bitcode()) {
         // Make a safe group name that doesn't have "/" in it! Also beware
@@ -2281,6 +2282,15 @@ BackendLLVM::run()
                 group().llvm_compiled_layer(nlayers - 1));
     }
 
+    // Publish the group's compiled PTX to the renderer's OptiX cache,
+    // wrapped with the group data size, under the group's stored cache key.
+    if (use_optix() && renderer()->optix_cache_enabled()) {
+        std::string cache_key = group().optix_cache_key();
+        renderer()->optix_cache_insert(
+            cache_key, optix_cache_wrap(group().m_llvm_ptx_compiled_version,
+                                        group().llvm_groupdata_size()));
+    }
+
     // We are destroying the entire module below,
     // no reason to bother destroying individual functions
 #if 0

diff --git a/src/liboslexec/oslexec.cpp b/src/liboslexec/oslexec.cpp
@@ -50,6 +50,53 @@ shadertype_from_name(string_view name)
     return ShaderType::Unknown;
 }
 
+/// Pack PTX text and its group data size into one string for the OptiX
+/// cache. The result is "// <groupdata_size>\n" followed by `ptx`.
+std::string
+optix_cache_wrap(const std::string& ptx, size_t groupdata_size)
+{
+    // Cache string is the ptx file with groupdata size on top as a comment.
+    // This way the cache string is a valid ptx program, which can be useful
+    // for debugging.
+    return fmtformat("// {}\n{}", groupdata_size, ptx);
+}
+
+/// Inverse of optix_cache_wrap(): split `cache_value` into the group data
+/// size from its "// <size>" first line and the PTX text that follows.
+///
+/// If `cache_value` is not of that form (no newline, missing "// " prefix,
+/// no digits or a non-digit in the size, or a size that overflows size_t),
+/// `ptx` and `groupdata_size` are left unmodified and nothing is thrown.
+void
+optix_cache_unwrap(const std::string& cache_value, std::string& ptx,
+                   size_t& groupdata_size)
+{
+    static const char prefix[] = "// ";
+    const size_t prefix_len    = sizeof(prefix) - 1;
+
+    // The header must be "// " plus at least one digit, ended by '\n'.
+    const size_t header_end = cache_value.find('\n');
+    if (header_end == std::string::npos || header_end <= prefix_len
+        || cache_value.compare(0, prefix_len, prefix) != 0)
+        return;
+
+    // Parse the digits by hand: unlike std::stoll this cannot throw, and it
+    // rejects signs, whitespace and trailing junk.
+    const size_t max_size = static_cast<size_t>(-1);
+    size_t parsed_size    = 0;
+    for (size_t i = prefix_len; i < header_end; ++i) {
+        const char c = cache_value[i];
+        if (c < '0' || c > '9')
+            return;
+        const size_t digit = static_cast<size_t>(c - '0');
+        if (parsed_size > (max_size - digit) / 10)
+            return;  // would overflow size_t
+        parsed_size = parsed_size * 10 + digit;
+    }
+
+    groupdata_size = parsed_size;
+    ptx            = cache_value.substr(header_end + 1);
+}
 
 };  // namespace pvt
 OSL_NAMESPACE_EXIT
diff --git a/src/liboslexec/oslexec_pvt.h b/src/liboslexec/oslexec_pvt.h
@@ -78,6 +78,16 @@ struct PerThreadInfo {
 
 namespace pvt {
 
+/// Split a cache string produced by optix_cache_wrap() back into the PTX
+/// text and the group data size.
+void
+optix_cache_unwrap(const std::string& cache_value, std::string& ptx,
+                   size_t& groupdata_size);
+/// Combine PTX text and a group data size into a single cache string: the
+/// size is written as a "// <size>" comment line ahead of the PTX.
+std::string
+optix_cache_wrap(const std::string& ptx, size_t groupdata_size);
+
 // forward definitions
 class ShadingSystemImpl;
 class ShaderInstance;
@@ -1829,6 +1835,11 @@ class ShaderGroup {
     void name(ustring name) { m_name = name; }
     ustring name() const { return m_name; }
 
+    /// Compute the group's OptiX cache key, store it, and return it.
+    std::string generate_optix_cache_key();
+    /// Return the key stored by the most recent generate_optix_cache_key().
+    std::string optix_cache_key() const { return m_optix_cache_key; }
+
     std::string serialize() const;
 
     void lock() const { m_mutex.lock(); }
@@ -1965,6 +1974,8 @@ class ShaderGroup {
     }
 
 private:
+    std::string serialize_internal() const;  // serialize(), but takes no lock
+
     // Put all the things that are read-only (after optimization) and
     // needed on every shade execution at the front of the struct, as much
     // together on one cache line as possible.
@@ -2016,6 +2027,8 @@ class ShaderGroup {
     atomic_ll m_executions { 0 };  ///< Number of times the group executed
     atomic_ll m_stat_total_shading_time_ticks { 0 };  // Shading time (ticks)
 
+    std::string m_optix_cache_key;  ///< Set by generate_optix_cache_key()
+
     // PTX assembly for compiled ShaderGroup
     std::string m_llvm_ptx_compiled_version;
 

diff --git a/src/liboslexec/shadingsys.cpp b/src/liboslexec/shadingsys.cpp
@@ -3774,6 +3774,7 @@ ShadingSystemImpl::optimize_group(ShaderGroup& group, ShadingContext* ctx,
         ctx           = get_context(thread_info);
         ctx_allocated = true;
     }
+
     if (!group.optimized()) {
         RuntimeOptimizer rop(*this, group, ctx);
         rop.run();
@@ -3823,34 +3824,57 @@ ShadingSystemImpl::optimize_group(ShaderGroup& group, ShadingContext* ctx,
     }
 
     if (need_jit) {
-        BackendLLVM lljitter(*this, group, ctx);
-        lljitter.run();
-
-        // NOTE: it is now possible to optimize and not JIT
-        // which would leave the cleanup to happen
-        // when the ShadingSystem is destroyed
-
-        // Only cleanup when are not batching or if
-        // the batch jit has already happened,
-        // as it requires the ops so we can't delete them yet!
-        if (((renderer()->batched(WidthOf<16>()) == nullptr)
-             && (renderer()->batched(WidthOf<8>()) == nullptr)
-             && (renderer()->batched(WidthOf<4>()) == nullptr))
-            || group.batch_jitted()) {
-            group_post_jit_cleanup(group);
+        // Consult the renderer's OptiX cache first. On a hit, the PTX and
+        // group data size come from the cached entry and the JIT is skipped.
+        bool cached = false;
+        if (use_optix() && renderer()->optix_cache_enabled()) {
+            std::string cache_key = group.generate_optix_cache_key();
+
+            std::string cache_value;
+            if (renderer()->optix_cache_get(cache_key, cache_value)) {
+                cached = true;
+                // NOTE(review): optix_cache_unwrap() leaves both outputs
+                // untouched if the entry contains no newline, but `cached`
+                // is already true at this point -- confirm that is intended.
+                optix_cache_unwrap(cache_value,
+                                   group.m_llvm_ptx_compiled_version,
+                                   group.m_llvm_groupdata_size);
+            }
         }
 
-        group.m_jitted = true;
-        spin_lock stat_lock(m_stat_mutex);
-        m_stat_opt_locking_time += locking_time;
-        m_stat_optimization_time += timer();
-        m_stat_total_llvm_time += lljitter.m_stat_total_llvm_time;
-        m_stat_llvm_setup_time += lljitter.m_stat_llvm_setup_time;
-        m_stat_llvm_irgen_time += lljitter.m_stat_llvm_irgen_time;
-        m_stat_llvm_opt_time += lljitter.m_stat_llvm_opt_time;
-        m_stat_llvm_jit_time += lljitter.m_stat_llvm_jit_time;
-        m_stat_max_llvm_local_mem = std::max(m_stat_max_llvm_local_mem,
-                                             lljitter.m_llvm_local_mem);
+        // NOTE(review): everything below is skipped on a cache hit, so
+        // group.m_jitted is not set and the timing stats are not updated
+        // on that path -- confirm this is intended.
+        if (!cached) {
+            BackendLLVM lljitter(*this, group, ctx);
+            lljitter.run();
+
+            // NOTE: it is now possible to optimize and not JIT
+            // which would leave the cleanup to happen
+            // when the ShadingSystem is destroyed
+
+            // Only cleanup when are not batching or if
+            // the batch jit has already happened,
+            // as it requires the ops so we can't delete them yet!
+            if (((renderer()->batched(WidthOf<16>()) == nullptr)
+                 && (renderer()->batched(WidthOf<8>()) == nullptr)
+                 && (renderer()->batched(WidthOf<4>()) == nullptr))
+                || group.batch_jitted()) {
+                group_post_jit_cleanup(group);
+            }
+
+            group.m_jitted = true;
+            spin_lock stat_lock(m_stat_mutex);
+            m_stat_opt_locking_time += locking_time;
+            m_stat_optimization_time += timer();
+            m_stat_total_llvm_time += lljitter.m_stat_total_llvm_time;
+            m_stat_llvm_setup_time += lljitter.m_stat_llvm_setup_time;
+            m_stat_llvm_irgen_time += lljitter.m_stat_llvm_irgen_time;
+            m_stat_llvm_opt_time += lljitter.m_stat_llvm_opt_time;
+            m_stat_llvm_jit_time += lljitter.m_stat_llvm_jit_time;
+            m_stat_max_llvm_local_mem = std::max(m_stat_max_llvm_local_mem,
+                                                 lljitter.m_llvm_local_mem);
+        }
     }
 
     if (ctx_allocated) {