@@ -655,9 +655,9 @@ void NvExecutionProvider::PerThreadContext::ResetTensorRTContext(std::string fus
   }
 }
 
-bool NvExecutionProvider::PerThreadContext::UpdateTensorRTContext(std::string fused_node, std::unique_ptr<nvinfer1::IExecutionContext> context) {
+bool NvExecutionProvider::PerThreadContext::UpdateTensorRTContext(std::string fused_node, tensorrt_ptr::unique_pointer_exec_ctx context) {
   if (!context) {
-    context = std::make_unique<nvinfer1::IExecutionContext>();
+    context = tensorrt_ptr::unique_pointer_exec_ctx();
   }
   trt_context_map_[fused_node] = std::move(context);
 
@@ -758,11 +758,11 @@ bool NvExecutionProvider::PerThreadContext::IsTensorRTContextInMap(std::string f
 nvinfer1::IExecutionContext& NvExecutionProvider::PerThreadContext::GetTensorRTContext(std::string fused_node) {
   auto it = trt_context_map_.find(fused_node);
   if (it != trt_context_map_.end()) {
-    return *(it->second);  // dereference shared pointer
+    return *(it->second.get());  // dereference unique pointer
   }
-  auto context = std::make_unique<nvinfer1::IExecutionContext>();
+  auto context = tensorrt_ptr::unique_pointer_exec_ctx();
   trt_context_map_[fused_node] = std::move(context);
-  return *(trt_context_map_[fused_node]);  // dereference shared pointer
+  return *(trt_context_map_[fused_node].get());  // dereference unique pointer
 }
 
 void NvExecutionProvider::ReleasePerThreadContext() const {
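Note on the new smart-pointer type: `tensorrt_ptr::unique_pointer_exec_ctx` is defined in an EP header that is not part of this diff. From the way it is constructed further down (a raw `IExecutionContext*` plus an `IExecutionContextDeleter` carrying a cache file name and the runtime cache), it is presumably a `std::unique_ptr` with a stateful custom deleter. A minimal sketch under that assumption:

```cpp
// Sketch only; the real definitions live in the EP's headers. The names match the
// diff, the member layout and the alias shape are assumptions.
#include <memory>
#include <string>
#include "NvInferRuntime.h"  // standard TensorRT header name; may differ for TensorRT RTX

namespace tensorrt_ptr {

struct IExecutionContextDeleter {
  IExecutionContextDeleter() = default;
  IExecutionContextDeleter(std::string runtime_cache_file,
                           std::unique_ptr<nvinfer1::IRuntimeCache> runtime_cache)
      : runtime_cache_file_(std::move(runtime_cache_file)),
        runtime_cache_(std::move(runtime_cache)) {}

  // Destroys the context; the presumed cache write-back is sketched after the
  // createExecutionContext hunk further down.
  void operator()(nvinfer1::IExecutionContext* ctx) const;

  std::string runtime_cache_file_;
  std::unique_ptr<nvinfer1::IRuntimeCache> runtime_cache_;
};

// Execution-context unique_ptr whose deleter owns the runtime cache and its file name.
using unique_pointer_exec_ctx =
    std::unique_ptr<nvinfer1::IExecutionContext, IExecutionContextDeleter>;

}  // namespace tensorrt_ptr
```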
@@ -871,6 +871,20 @@ NvExecutionProvider::NvExecutionProvider(const NvExecutionProviderInfo& info)
   max_shared_mem_size_ = info.max_shared_mem_size;
   dump_subgraphs_ = info.dump_subgraphs;
   weight_stripped_engine_enable_ = info.weight_stripped_engine_enable;
+  // make runtime cache path absolute and create directory if it doesn't exist
+  if (!info.runtime_cache_path.empty()) {
+    std::filesystem::path p(info.runtime_cache_path);
+    std::filesystem::path abs_path = std::filesystem::absolute(p);
+    const auto& env = GetDefaultEnv();
+    auto status = env.CreateFolder(abs_path.string());
+    if (!status.IsOK()) {
+      LOGS_DEFAULT(WARNING) << "[NvTensorRTRTX EP] The runtime cache directory could not be created at: " << abs_path
+                            << ". Runtime cache is disabled.";
+    } else {
+      runtime_cache_ = abs_path;
+    }
+  }
+
   onnx_model_folder_path_ = info.onnx_model_folder_path;
   onnx_model_bytestream_ = info.onnx_bytestream;
   onnx_model_bytestream_size_ = info.onnx_bytestream_size;
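From the application side, the new cache directory is configured through a provider option. A minimal usage sketch: the provider registration name `"NvTensorRtRtx"` is an assumption, the option key `nv_runtime_cache_path` matches the name printed in the logging hunk below, and a relative path is fine because the constructor above normalizes it to an absolute path and creates the directory.

```cpp
#include <onnxruntime_cxx_api.h>

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "nv_runtime_cache_demo");
  Ort::SessionOptions session_options;

  // Assumed provider name and option key; see lead-in above.
  session_options.AppendExecutionProvider("NvTensorRtRtx",
                                          {{"nv_runtime_cache_path", "./trt_rtx_runtime_cache"}});

  Ort::Session session(env, ORT_TSTR("model.onnx"), session_options);
  return 0;
}
```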
@@ -1054,7 +1068,8 @@ NvExecutionProvider::NvExecutionProvider(const NvExecutionProviderInfo& info)
                         << ", nv_onnx_model_bytestream_size_: " << onnx_model_bytestream_size_
                         << ", nv_onnx_external_bytestream_size_: " << onnx_external_data_bytestream_size_
                         << ", nv_use_external_data_initializer_: " << use_external_data_initializer_
-                        << ", nv_op_types_to_exclude: " << op_types_to_exclude_;
+                        << ", nv_op_types_to_exclude: " << op_types_to_exclude_
+                        << ", nv_runtime_cache_path: " << runtime_cache_;
 }
 
 Status NvExecutionProvider::Sync() const {
@@ -2637,8 +2652,10 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr
   //
   // Otherwise engine will be handled at inference time.
   std::unique_ptr<nvinfer1::ICudaEngine> trt_engine;
-  std::unique_ptr<nvinfer1::IExecutionContext> trt_context;
+  tensorrt_ptr::unique_pointer_exec_ctx trt_context;
+  std::unique_ptr<nvinfer1::IRuntimeCache> trt_runtime_cache;
   std::unique_ptr<nvinfer1::IRuntimeConfig> trt_runtime_config;
+  std::string runtime_cache_file = "";
 
   // Generate file name for dumping ep context model
   if (dump_ep_context_model_ && ctx_model_path_.empty()) {
@@ -2667,6 +2684,18 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr
     trt_runtime_config->setDynamicShapesKernelSpecializationStrategy(nvinfer1::DynamicShapesKernelSpecializationStrategy::kEAGER);
   }
   trt_runtime_config->setExecutionContextAllocationStrategy(nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED);
+  if (!runtime_cache_.empty()) {
+    runtime_cache_file = (runtime_cache_ / fused_node.Name()).string();
+    trt_runtime_cache = std::unique_ptr<nvinfer1::IRuntimeCache>(trt_runtime_config->createRuntimeCache());
+    auto cache_data = file_utils::ReadFile(runtime_cache_file);
+    if (!trt_runtime_cache->deserialize(cache_data.data(), cache_data.size())) {
+      trt_runtime_cache = std::unique_ptr<nvinfer1::IRuntimeCache>(trt_runtime_config->createRuntimeCache());
+      LOGS_DEFAULT(INFO) << "TensorRT RTX failed to deserialize the runtime cache, will overwrite with new one" << std::endl;
+    }
+    if (!trt_runtime_config->setRuntimeCache(*trt_runtime_cache)) {
+      LOGS_DEFAULT(INFO) << "TensorRT RTX failed to set the runtime cache" << std::endl;
+    }
+  }
 
   if (detailed_build_log_) {
     auto engine_build_stop = std::chrono::steady_clock::now();
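`file_utils::ReadFile` is an EP-internal helper that is not part of this diff. The deserialize-or-recreate pattern above relies on it returning an empty buffer when the cache file does not exist yet (the expected case on a first run), so the INFO message is benign there. A sketch of a helper with that assumed behavior:

```cpp
#include <fstream>
#include <string>
#include <vector>

namespace file_utils {
// Assumed behavior: return the whole file as bytes, or an empty vector if the
// file cannot be opened (e.g. first run, before any runtime cache was written).
inline std::vector<char> ReadFile(const std::string& path) {
  std::ifstream file(path, std::ios::binary | std::ios::ate);
  if (!file) {
    return {};
  }
  const std::streamsize size = file.tellg();
  file.seekg(0, std::ios::beg);
  std::vector<char> buffer(static_cast<size_t>(size));
  file.read(buffer.data(), size);
  return buffer;
}
}  // namespace file_utils
```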
@@ -2727,7 +2756,9 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr
   // Build context
   // Note: Creating an execution context from an engine is thread safe per TRT doc
   // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading
-  trt_context = std::unique_ptr<nvinfer1::IExecutionContext>(trt_engine->createExecutionContext(trt_runtime_config.get()));
+  trt_context = tensorrt_ptr::unique_pointer_exec_ctx(
+      trt_engine->createExecutionContext(trt_runtime_config.get()),
+      tensorrt_ptr::IExecutionContextDeleter(runtime_cache_file, std::move(trt_runtime_cache)));
   if (!trt_context) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
                            "NvTensorRTRTX EP could not build execution context for fused node: " + fused_node.Name());
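The deleter is handed both the cache file name and ownership of the runtime cache, which implies the cache is flushed back to disk when the execution context is destroyed, so kernels specialized during this session can be reused by the next one. A sketch of that write-back, completing the deleter declared in the earlier sketch; the assumption that `IRuntimeCache::serialize()` returns an `nvinfer1::IHostMemory*` follows the usual TensorRT serialize convention and is not confirmed by this diff.

```cpp
#include <fstream>
#include <memory>

// Sketch of the presumed write-back performed by the custom deleter.
void tensorrt_ptr::IExecutionContextDeleter::operator()(nvinfer1::IExecutionContext* ctx) const {
  if (runtime_cache_ && !runtime_cache_file_.empty()) {
    // Assumption: serialize() hands back host memory owned by the caller.
    auto serialized = std::unique_ptr<nvinfer1::IHostMemory>(runtime_cache_->serialize());
    if (serialized && serialized->data() != nullptr) {
      std::ofstream out(runtime_cache_file_, std::ios::binary);
      out.write(static_cast<const char*>(serialized->data()),
                static_cast<std::streamsize>(serialized->size()));
    }
  }
  delete ctx;  // release the context regardless of whether the cache was written
}
```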
@@ -3008,7 +3039,7 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(const Gra
                                                                   std::unordered_map<std::string, size_t>& output_map,
                                                                   std::vector<NodeComputeInfo>& node_compute_funcs) {
   std::unique_ptr<nvinfer1::ICudaEngine> trt_engine;
-  std::unique_ptr<nvinfer1::IExecutionContext> trt_context;
+  tensorrt_ptr::unique_pointer_exec_ctx trt_context;
   std::unordered_map<std::string, size_t> input_indexes;   // TRT engine input name -> ORT kernel context input index
   std::unordered_map<std::string, size_t> output_indexes;  // TRT engine output name -> ORT kernel context output index
   std::unordered_map<std::string, size_t> output_types;    // TRT engine output name -> ORT output tensor type
@@ -3030,11 +3061,33 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(const Gra
     return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage());
   }
 
+  std::unique_ptr<nvinfer1::IRuntimeCache> trt_runtime_cache;
+  auto trt_runtime_config = std::unique_ptr<nvinfer1::IRuntimeConfig>(trt_engine->createRuntimeConfig());
+  if (trt_runtime_config && cuda_graph_enable_) {
+    trt_runtime_config->setDynamicShapesKernelSpecializationStrategy(nvinfer1::DynamicShapesKernelSpecializationStrategy::kEAGER);
+  }
+  trt_runtime_config->setExecutionContextAllocationStrategy(nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED);
+  std::string runtime_cache_file = "";
+  if (!runtime_cache_.empty()) {
+    runtime_cache_file = (runtime_cache_ / graph_body_viewer.GetNode(node_idx)->Name()).string();
+    trt_runtime_cache = std::unique_ptr<nvinfer1::IRuntimeCache>(trt_runtime_config->createRuntimeCache());
+    auto cache_data = file_utils::ReadFile(runtime_cache_file);
+    if (!trt_runtime_cache->deserialize(cache_data.data(), cache_data.size())) {
+      trt_runtime_cache = std::unique_ptr<nvinfer1::IRuntimeCache>(trt_runtime_config->createRuntimeCache());
+      LOGS_DEFAULT(INFO) << "TensorRT RTX failed to deserialize the runtime cache, will overwrite with new one" << std::endl;
+    }
+    if (!trt_runtime_config->setRuntimeCache(*trt_runtime_cache)) {
+      LOGS_DEFAULT(INFO) << "TensorRT RTX failed to set the runtime cache" << std::endl;
+    }
+  }
+
   // Build context
   //
   // Note: Creating an execution context from an engine is thread safe per TRT doc
   // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading
-  trt_context = std::unique_ptr<nvinfer1::IExecutionContext>(trt_engine->createExecutionContext(nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED));
+  trt_context = tensorrt_ptr::unique_pointer_exec_ctx(
+      trt_engine->createExecutionContext(trt_runtime_config.get()),
+      tensorrt_ptr::IExecutionContextDeleter(runtime_cache_file, std::move(trt_runtime_cache)));
   if (!trt_context) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
                            "NvTensorRTRTX EP could not build execution context for fused node: " + fused_node.Name());