diff --git a/src/include/OSL/rendererservices.h b/src/include/OSL/rendererservices.h
index 62a6b6179..0083219ac 100644
--- a/src/include/OSL/rendererservices.h
+++ b/src/include/OSL/rendererservices.h
@@ -595,6 +595,26 @@ class OSLEXECPUBLIC RendererServices {
         }
     };
 
+    /// Return true if the renderer provides a cache for compiled OptiX
+    /// PTX. The default implementation reports that it does not.
+    virtual bool optix_cache_enabled() const { return false; }
+
+    /// Store `value` in the renderer's OptiX cache under `key`. The
+    /// default implementation does nothing.
+    virtual void optix_cache_insert(const std::string& key,
+                                    const std::string& value) const
+    {
+    }
+
+    /// Look up `key` in the renderer's OptiX cache, placing the result in
+    /// `value` and returning true on a hit. The default implementation
+    /// always reports a miss and leaves `value` untouched.
+    virtual bool optix_cache_get(const std::string& key,
+                                 std::string& value) const
+    {
+        return false;
+    }
+
     /// A renderer may choose to support batched execution by providing pointers
     /// to objects satisfying the BatchedRendererServices> interface
     /// for specific batch sizes.
diff --git a/src/liboslexec/instance.cpp b/src/liboslexec/instance.cpp
index 46a4a6129..5b00e4833 100644
--- a/src/liboslexec/instance.cpp
+++ b/src/liboslexec/instance.cpp
@@ -848,13 +848,38 @@ ShaderGroup::setup_interactive_arena(cspan paramblock)
 
 
 
+// Build the key under which this group's OptiX PTX is cached, remember it
+// in m_optix_cache_key, and return it. The key combines the group name
+// (with '/' and ':' replaced by '_') and a hash of the serialized group.
+// Uses serialize_internal(), which does not take m_mutex itself.
+std::string
+ShaderGroup::generate_optix_cache_key()
+{
+    const uint64_t ir_key = Strutil::strhash(serialize_internal());
+
+    std::string safegroup = Strutil::replace(name(), "/", "_", true);
+    safegroup             = Strutil::replace(safegroup, ":", "_", true);
+
+    m_optix_cache_key = fmtformat("cache-osl-{}-{}", safegroup, ir_key);
+    return m_optix_cache_key;
+}
+
 std::string
 ShaderGroup::serialize() const
+{
+    lock_guard lock(m_mutex);
+    return serialize_internal();
+}
+
+// Serialization body shared by serialize() and generate_optix_cache_key().
+// Does not lock m_mutex; serialize() takes the lock before calling this.
+std::string
+ShaderGroup::serialize_internal() const
 {
     std::ostringstream out;
     out.imbue(std::locale::classic());  // force C locale
     out.precision(9);
-    lock_guard lock(m_mutex);
+
     for (int i = 0, nl = nlayers(); i < nl; ++i) {
         const ShaderInstance* inst = m_layers[i].get();
 
diff --git a/src/liboslexec/llvm_instance.cpp b/src/liboslexec/llvm_instance.cpp
index 368ed4c38..9059deab7 100644
--- a/src/liboslexec/llvm_instance.cpp
+++ b/src/liboslexec/llvm_instance.cpp
@@ -2150,6 +2150,7 @@ BackendLLVM::run()
         ll.prune_and_internalize_module(external_functions);
     }
 
+
     // Debug code to dump the pre-optimized bitcode to a file
     if (llvm_debug() >= 2 || shadingsys().llvm_output_bitcode()) {
         // Make a safe group name that doesn't have "/" in it! Also beware
@@ -2281,6 +2282,15 @@ BackendLLVM::run()
             group().llvm_compiled_layer(nlayers - 1));
     }
 
+    if (use_optix() && renderer()->optix_cache_enabled()) {
+        // Publish the freshly generated PTX (prefixed with the groupdata
+        // size) to the renderer's cache under the group's stored cache key.
+        const std::string cache_key = group().optix_cache_key();
+        renderer()->optix_cache_insert(
+            cache_key, optix_cache_wrap(group().m_llvm_ptx_compiled_version,
+                                        group().llvm_groupdata_size()));
+    }
+
     // We are destroying the entire module below,
     // no reason to bother destroying individual functions
 #if 0
diff --git a/src/liboslexec/oslexec.cpp b/src/liboslexec/oslexec.cpp
index 9cef337b2..de5c09832 100644
--- a/src/liboslexec/oslexec.cpp
+++ b/src/liboslexec/oslexec.cpp
@@ -50,6 +50,52 @@ shadertype_from_name(string_view name)
     return ShaderType::Unknown;
 }
 
+// Build the string stored in the renderer's OptiX cache for a compiled
+// group: the PTX preceded by a one-line comment holding groupdata_size.
+std::string
+optix_cache_wrap(const std::string& ptx, size_t groupdata_size)
+{
+    // Cache string is the ptx file with groupdata size on top as a comment.
+    // This way the cache string is a valid ptx program, which can be useful
+    // for debugging.
+    return fmtformat("// {}\n{}", groupdata_size, ptx);
+}
+
+// Inverse of optix_cache_wrap(): split cache_value into groupdata_size and
+// ptx. Returns true on success. If cache_value does not have the form
+// "// <decimal digits>\n<ptx>" (or the number does not fit in a size_t),
+// returns false and leaves ptx and groupdata_size untouched.
+bool
+optix_cache_unwrap(const std::string& cache_value, std::string& ptx,
+                   size_t& groupdata_size)
+{
+    const std::string prefix("// ");
+    const size_t header_end = cache_value.find('\n');
+    if (header_end == std::string::npos
+        || cache_value.compare(0, prefix.size(), prefix) != 0)
+        return false;
+
+    // Parse the digits by hand rather than with std::stoll(), which throws
+    // on malformed or out-of-range input.
+    size_t parsed_size = 0;
+    bool have_digit    = false;
+    for (size_t i = prefix.size(); i < header_end; ++i) {
+        const char c = cache_value[i];
+        if (c < '0' || c > '9')
+            return false;
+        const size_t digit = size_t(c - '0');
+        if (parsed_size > (~size_t(0) - digit) / 10)
+            return false;  // would overflow size_t
+        parsed_size = parsed_size * 10 + digit;
+        have_digit  = true;
+    }
+    if (!have_digit)
+        return false;
+
+    groupdata_size = parsed_size;
+    ptx            = cache_value.substr(header_end + 1);
+    return true;
+}
 
 };  // namespace pvt
 OSL_NAMESPACE_EXIT
diff --git a/src/liboslexec/oslexec_pvt.h b/src/liboslexec/oslexec_pvt.h
index 74efc5d3a..3cc3de9c8 100644
--- a/src/liboslexec/oslexec_pvt.h
+++ b/src/liboslexec/oslexec_pvt.h
@@ -78,6 +78,18 @@ struct PerThreadInfo {
 
 namespace pvt {
 
+/// Build the renderer cache entry for a compiled group: the PTX preceded
+/// by a comment line holding groupdata_size.
+std::string
+optix_cache_wrap(const std::string& ptx, size_t groupdata_size);
+
+/// Inverse of optix_cache_wrap(). Returns true and fills in ptx and
+/// groupdata_size on success; returns false and leaves both untouched if
+/// cache_value is malformed.
+bool
+optix_cache_unwrap(const std::string& cache_value, std::string& ptx,
+                   size_t& groupdata_size);
+
 // forward definitions
 class ShadingSystemImpl;
 class ShaderInstance;
@@ -1829,6 +1841,12 @@ class ShaderGroup {
     void name(ustring name) { m_name = name; }
     ustring name() const { return m_name; }
 
+    /// Compute the OptiX cache key for this group, store it in
+    /// m_optix_cache_key, and return it.
+    std::string generate_optix_cache_key();
+    /// Return the key stored by generate_optix_cache_key().
+    std::string optix_cache_key() const { return m_optix_cache_key; }
+
     std::string serialize() const;
 
     void lock() const { m_mutex.lock(); }
@@ -1965,6 +1983,9 @@ class ShaderGroup {
     }
 
 private:
+    /// Same as serialize(), but does not lock m_mutex.
+    std::string serialize_internal() const;
+
     // Put all the things that are read-only (after optimization) and
     // needed on every shade execution at the front of the struct, as much
     // together on one cache line as possible.
@@ -2016,6 +2037,10 @@ class ShaderGroup {
     atomic_ll m_executions { 0 };  ///< Number of times the group executed
     atomic_ll m_stat_total_shading_time_ticks { 0 };  // Shading time (ticks)
 
+    // Key under which this group's PTX is stored in the renderer's OptiX
+    // cache; set by generate_optix_cache_key().
+    std::string m_optix_cache_key;
+
     // PTX assembly for compiled ShaderGroup
     std::string m_llvm_ptx_compiled_version;
 
diff --git a/src/liboslexec/shadingsys.cpp b/src/liboslexec/shadingsys.cpp
index 0a96599fa..ba1f41a6a 100644
--- a/src/liboslexec/shadingsys.cpp
+++ b/src/liboslexec/shadingsys.cpp
@@ -3774,6 +3774,7 @@ ShadingSystemImpl::optimize_group(ShaderGroup& group, ShadingContext* ctx,
         ctx           = get_context(thread_info);
         ctx_allocated = true;
     }
+
     if (!group.optimized()) {
         RuntimeOptimizer rop(*this, group, ctx);
         rop.run();
@@ -3823,34 +3824,51 @@ ShadingSystemImpl::optimize_group(ShaderGroup& group, ShadingContext* ctx,
     }
 
     if (need_jit) {
-        BackendLLVM lljitter(*this, group, ctx);
-        lljitter.run();
-
-        // NOTE: it is now possible to optimize and not JIT
-        // which would leave the cleanup to happen
-        // when the ShadingSystem is destroyed
-
-        // Only cleanup when are not batching or if
-        // the batch jit has already happened,
-        // as it requires the ops so we can't delete them yet!
-        if (((renderer()->batched(WidthOf<16>()) == nullptr)
-             && (renderer()->batched(WidthOf<8>()) == nullptr)
-             && (renderer()->batched(WidthOf<4>()) == nullptr))
-            || group.batch_jitted()) {
-            group_post_jit_cleanup(group);
+        bool cached = false;
+        if (use_optix() && renderer()->optix_cache_enabled()) {
+            const std::string cache_key = group.generate_optix_cache_key();
+
+            // A malformed entry is treated as a miss: optix_cache_unwrap()
+            // returns false without touching the group, and we JIT below.
+            std::string cache_value;
+            cached = renderer()->optix_cache_get(cache_key, cache_value)
+                     && optix_cache_unwrap(cache_value,
+                                           group.m_llvm_ptx_compiled_version,
+                                           group.m_llvm_groupdata_size);
         }
 
-        group.m_jitted = true;
-        spin_lock stat_lock(m_stat_mutex);
-        m_stat_opt_locking_time += locking_time;
-        m_stat_optimization_time += timer();
-        m_stat_total_llvm_time += lljitter.m_stat_total_llvm_time;
-        m_stat_llvm_setup_time += lljitter.m_stat_llvm_setup_time;
-        m_stat_llvm_irgen_time += lljitter.m_stat_llvm_irgen_time;
-        m_stat_llvm_opt_time += lljitter.m_stat_llvm_opt_time;
-        m_stat_llvm_jit_time += lljitter.m_stat_llvm_jit_time;
-        m_stat_max_llvm_local_mem = std::max(m_stat_max_llvm_local_mem,
-                                             lljitter.m_llvm_local_mem);
+        // NOTE(review): on a cache hit nothing below runs, so m_jitted stays
+        // false and no timing stats are recorded -- confirm this is intended.
+        if (!cached) {
+            BackendLLVM lljitter(*this, group, ctx);
+            lljitter.run();
+
+            // NOTE: it is now possible to optimize and not JIT
+            // which would leave the cleanup to happen
+            // when the ShadingSystem is destroyed
+
+            // Only cleanup when are not batching or if
+            // the batch jit has already happened,
+            // as it requires the ops so we can't delete them yet!
+            if (((renderer()->batched(WidthOf<16>()) == nullptr)
+                 && (renderer()->batched(WidthOf<8>()) == nullptr)
+                 && (renderer()->batched(WidthOf<4>()) == nullptr))
+                || group.batch_jitted()) {
+                group_post_jit_cleanup(group);
+            }
+
+            group.m_jitted = true;
+            spin_lock stat_lock(m_stat_mutex);
+            m_stat_opt_locking_time += locking_time;
+            m_stat_optimization_time += timer();
+            m_stat_total_llvm_time += lljitter.m_stat_total_llvm_time;
+            m_stat_llvm_setup_time += lljitter.m_stat_llvm_setup_time;
+            m_stat_llvm_irgen_time += lljitter.m_stat_llvm_irgen_time;
+            m_stat_llvm_opt_time += lljitter.m_stat_llvm_opt_time;
+            m_stat_llvm_jit_time += lljitter.m_stat_llvm_jit_time;
+            m_stat_max_llvm_local_mem = std::max(m_stat_max_llvm_local_mem,
+                                                 lljitter.m_llvm_local_mem);
+        }
     }
 
     if (ctx_allocated) {