Commit 731ed72

fix a bunch of compile errors

1 parent 938a3fe commit 731ed72

7 files changed: +45 -62 lines

plugin_execution_providers/tensorrt/cuda_allocator.h

Lines changed: 4 additions & 2 deletions

@@ -13,12 +13,14 @@ constexpr const char* CUDA_PINNED_ALLOCATOR = "CudaPinned";
 using DeviceId = int16_t;
 
 struct CUDAAllocator : OrtAllocator {
-  CUDAAllocator(DeviceId device_id, const char* name = CUDA_ALLOCATOR) {
+  CUDAAllocator(const OrtMemoryInfo* mem_info, const char* name = CUDA_ALLOCATOR) {
     OrtAllocator::version = ORT_API_VERSION;
     OrtAllocator::Alloc = [](OrtAllocator* this_, size_t size) { return static_cast<CUDAAllocator*>(this_)->Alloc(size); };
     OrtAllocator::Free = [](OrtAllocator* this_, void* p) { static_cast<CUDAAllocator*>(this_)->Free(p); };
     OrtAllocator::Info = [](const OrtAllocator* this_) { return static_cast<const CUDAAllocator*>(this_)->Info(); };
 
+    mem_info_ = mem_info;
+
     device_id_ = device_id;
 
     const OrtApi* api = OrtGetApiBase()->GetApi(ORT_API_VERSION);
@@ -44,7 +46,7 @@ struct CUDAAllocator : OrtAllocator {
   void SetDevice(bool throw_when_fail) const;
 
   DeviceId device_id_;
-  OrtMemoryInfo* mem_info_ = nullptr;
+  const OrtMemoryInfo* mem_info_ = nullptr;
 };
 
 struct CUDAPinnedAllocator : OrtAllocator {
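
Note: the constructor above uses the usual C-style vtable wiring for a plugin allocator: capture-free lambdas decay to plain function pointers and forward into member functions, and the `OrtMemoryInfo` is now borrowed (const, owned by the factory) rather than created per-allocator. A minimal self-contained sketch of the same pattern, with a hypothetical `MyAllocator` and a malloc-backed `Alloc` standing in for the real CUDA calls:

#include <cstdlib>
#include "onnxruntime_c_api.h"

struct MyAllocator : OrtAllocator {
  explicit MyAllocator(const OrtMemoryInfo* mem_info) : mem_info_(mem_info) {
    OrtAllocator::version = ORT_API_VERSION;
    // Capture-free lambdas decay to the C function pointers OrtAllocator expects.
    OrtAllocator::Alloc = [](OrtAllocator* this_, size_t size) {
      return static_cast<MyAllocator*>(this_)->Alloc(size);
    };
    OrtAllocator::Free = [](OrtAllocator* this_, void* p) {
      static_cast<MyAllocator*>(this_)->Free(p);
    };
    OrtAllocator::Info = [](const OrtAllocator* this_) {
      return static_cast<const MyAllocator*>(this_)->Info();
    };
  }

  void* Alloc(size_t size) { return std::malloc(size); }  // cudaMalloc in the real EP
  void Free(void* p) { std::free(p); }
  const OrtMemoryInfo* Info() const { return mem_info_; }

  const OrtMemoryInfo* mem_info_ = nullptr;  // not owned, hence const
};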

plugin_execution_providers/tensorrt/tensorrt_execution_provider.cc

Lines changed: 17 additions & 40 deletions

@@ -1156,7 +1156,7 @@ OrtStatus* TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(OrtEp* this
     weight_stripped_engine_refit_ = true;
   }
 
-  std::unique_ptr<nvinfer1::IHostMemory> serialized_engine = nullptr;
+  std::unique_ptr<nvinfer1::IHostMemory> serialized_engine;
 
   if (!has_dynamic_shape) {
     std::string timing_cache_path = "";
@@ -1258,7 +1258,7 @@ OrtStatus* TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(OrtEp* this
   }
 
   serialized_engine =
-      std::make_unique<nvinfer1::IHostMemory>(trt_builder->buildSerializedNetwork(*trt_network, *trt_config));
+      std::unique_ptr<nvinfer1::IHostMemory>(trt_builder->buildSerializedNetwork(*trt_network, *trt_config));
 
   if (serialized_engine == nullptr) {
     std::string err_msg = "TensorRT EP failed to create engine from network for fused node: " + fused_node_name;
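
This hunk is a genuine compile fix: `nvinfer1::IHostMemory` is an abstract interface, and `std::make_unique<T>(args...)` constructs a brand-new `T` from its arguments, so it can neither instantiate the abstract class nor adopt an existing pointer. `buildSerializedNetwork` already returns an owning raw pointer that only needs wrapping. Reduced sketch of the distinction, using the same names as the hunk:

// std::make_unique<nvinfer1::IHostMemory>(raw) does not compile: it would try to
// construct a new (abstract) IHostMemory from the pointer. The unique_ptr
// constructor simply adopts ownership of the pointer TensorRT returns:
std::unique_ptr<nvinfer1::IHostMemory> serialized_engine{
    trt_builder->buildSerializedNetwork(*trt_network, *trt_config)};
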
@@ -1390,32 +1390,9 @@ OrtStatus* TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(OrtEp* this
   input_shape_ranges_[fused_node_name] = input_implicit_shape_ranges;
   profiles_.emplace(fused_node_name, std::move(trt_profiles));
 
-  /*
-  // For dynamic shape input model, firstly TRT EP creates a model proto which includes inputs, outputs and empty
-  // engine. TRT EP will serialize the model at inference time due to engine can be updated and the updated engine
-  // should be included in the model. However, if the embed_mode is 0 (only includes engine path), TRT EP will serialize
-  // it here.
-  if (dump_ep_context_model_ && has_dynamic_shape) {
-    // "ep_cache_context" node attribute should be a relative path to context model directory
-    if (ep_cache_context_attr_.empty()) {
-      auto cache_file_name = std::filesystem::path(engine_cache_path).filename();
-      ep_cache_context_attr_ = std::filesystem::path(engine_cache_relative_path_to_context_model_dir)
-                                   .append(cache_file_name.string())
-                                   .string();
-    }
-    std::string compute_capability_hw_compat = compute_capability_;
-    if (engine_cache_enable_ && engine_hw_compatible_) {
-      compute_capability_hw_compat = "80+";
-    }
-    model_proto_.reset(CreateCtxModel(graph_body_viewer, ep_cache_context_attr_, nullptr, 0, ep_context_embed_mode_,
-                                      compute_capability_hw_compat, model_path_, GetLogger()));
-    if (ep_context_embed_mode_ == 0) {
-      DumpCtxModel(model_proto_.get(), ctx_model_path_);
-    }
-  }
-  */
 
-  std::unique_ptr<EPContextNodeHelper> ep_ctx_node_helper = std::make_unique<EPContextNodeHelper>(graph, fused_node);
+  // Create EP Context nodes
+  std::unique_ptr<EPContextNodeHelper> ep_ctx_node_helper = std::make_unique<EPContextNodeHelper>(*ep, graph, fused_node);
   if (dump_ep_context_model_) {
     std::string compute_capability_hw_compat = compute_capability_;
     if (engine_cache_enable_ && engine_hw_compatible_) {
@@ -1490,6 +1467,8 @@ OrtStatus* TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(OrtEp* this
                                          engine_hw_compatible_,
                                          sync_stream_after_enqueue_};
 
+  ep->compute_states_[fused_node_name] = std::move(compute_state);
+
   // Update the OrtNodeComputeInfo associated with the graph.
   auto ep_node_compute_info = std::make_unique<TRTEpNodeComputeInfo>(*ep);
   *node_compute_info = ep_node_compute_info.release();
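
Together with the header change further down, this wires up the per-node state lifecycle: the state is registered in the EP's now-public `compute_states_` map at compile time and looked up by fused-node name in `CreateStateImpl` at run time. A condensed, self-contained sketch of that flow (simplified hypothetical types; the real state is `TensorrtComputeState`):

#include <memory>
#include <string>
#include <unordered_map>

struct State { std::string engine_cache_path; };  // stands in for TensorrtComputeState

struct Ep {
  std::unordered_map<std::string, std::unique_ptr<State>> compute_states_;
};

// Compile time: register the state under the fused node's name.
void Compile(Ep& ep, const std::string& fused_node_name) {
  ep.compute_states_[fused_node_name] = std::make_unique<State>();
}

// Session run: look the state up and hand it back through the out-parameter.
bool CreateState(Ep& ep, const std::string& fused_node_name, void** compute_state) {
  auto it = ep.compute_states_.find(fused_node_name);
  if (it == ep.compute_states_.end()) return false;
  *compute_state = it->second.get();
  return true;
}
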
@@ -1554,10 +1533,10 @@ OrtStatus* ORT_API_CALL TensorrtExecutionProvider::GetCapabilityImpl(OrtEp* this
   auto supported_control_flow_op = [&](const OrtNode* node) {
     OrtStatus* status = nullptr;
     size_t num_subgraphs = 0;
-    RETURN_FALSE_AND_PRINT_IF_ERROR(ort_api.Node_GetNumSubgraphs(node, &num_subgraphs), ort_api);
+    RETURN_FALSE_AND_PRINT_IF_ERROR(ort_api.Node_GetNumSubgraphs(node, &num_subgraphs));
 
     std::vector<const OrtGraph*> node_subgraphs(num_subgraphs);
-    RETURN_FALSE_AND_PRINT_IF_ERROR(ort_api.Node_GetSubgraphs(node, node_subgraphs.data(), node_subgraphs.size(), nullptr), ort_api);
+    RETURN_FALSE_AND_PRINT_IF_ERROR(ort_api.Node_GetSubgraphs(node, node_subgraphs.data(), node_subgraphs.size(), nullptr));
 
 
     // Iterate the node's subgraphs
@@ -1566,7 +1545,7 @@ OrtStatus* ORT_API_CALL TensorrtExecutionProvider::GetCapabilityImpl(OrtEp* this
 
     // Get number of subgraph's nodes
     size_t num_subgraph_nodes = 0;
-    RETURN_FALSE_AND_PRINT_IF_ERROR(ort_api.Graph_GetNumNodes(subgraph, &num_subgraph_nodes), ort_api);
+    RETURN_FALSE_AND_PRINT_IF_ERROR(ort_api.Graph_GetNumNodes(subgraph, &num_subgraph_nodes));
 
     // TRT EP should consider the empty subgraph is fully supported by TRT.
     if (num_subgraph_nodes == 0) {
@@ -1926,13 +1905,11 @@ OrtStatus* TensorrtExecutionProvider::RefitEngine(
 /// </summary>
 TensorrtExecutionProvider::TensorrtExecutionProvider(TensorrtExecutionProviderFactory& factory,
                                                      const std::string& name,
-                                                     const OrtHardwareDevice& device,
                                                      const OrtSessionOptions& session_options,
                                                      const OrtLogger& logger)
     : ApiPtrs{static_cast<const ApiPtrs&>(factory)},
       factory_(factory),
       name_{name},
-      hardware_device_{device},
       session_options_{session_options},
       logger_{logger} {
@@ -2176,7 +2153,7 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(TensorrtExecutionProviderFa
  * Please refer to ParserProfileShapes() for more details)
  *
  */
-  bool status = true;
+  // bool status = true;
   // if (status) {
   //   status = ParseProfileShapes(profile_min_shapes, profile_min_shapes_);
   //   if (!status) {
@@ -2266,14 +2243,14 @@ OrtStatus* TRTEpNodeComputeInfo::CreateStateImpl(OrtNodeComputeInfo* this_ptr, O
   TensorrtExecutionProvider& ep = node_compute_info->ep;
 
   std::string fused_node_name = ep.ep_api.NodeComputeContext_NodeName(compute_context);
-  auto state_it = ep.GetComputeStates().find(fused_node_name);
-  if (state_it == ep.GetComputeStates().end()) {
+  auto state_it = ep.compute_states_.find(fused_node_name);
+  if (state_it == ep.compute_states_.end()) {
     std::string message = "Unable to TensorRT EP's compute state for fused node with name " + fused_node_name;
     return ep.ort_api.CreateStatus(ORT_EP_FAIL, message.c_str());
   }
 
-  TensorrtComputeState& compute_state = *state_it->second;
-  *compute_state = &compute_state;
+  TensorrtComputeState& trt_ep_compute_state = *state_it->second;
+  *compute_state = &trt_ep_compute_state;
   return nullptr;
 }
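
The rename is the compile fix here: the old local reference reused the name of the `void**` out-parameter, so `*compute_state = &compute_state;` tried to dereference a `TensorrtComputeState&` (which has no `operator*`) instead of assigning through the parameter. Reduced to its essence:

struct State {};

void CreateState(State& stored, void** compute_state) {
  // Old: `State& compute_state = stored;` shadowed the parameter, so the next
  // line dereferenced a State& and failed to compile.
  State& state = stored;  // distinct name keeps the out-parameter visible
  *compute_state = &state;
}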

@@ -2335,7 +2312,7 @@ OrtStatus* TRTEpNodeComputeInfo::ComputeImpl(OrtNodeComputeInfo* this_ptr, void*
   bool context_update = false;
   std::unordered_set<std::string> input_names;
 
-  std::unordered_map<std::string, DDSOutputAllocatorMap> dds_output_allocator_maps = ep.GetDDSOutputAllocators();
+  std::unordered_map<std::string, DDSOutputAllocatorMap>& dds_output_allocator_maps = ep.GetDDSOutputAllocators();
   auto& dds_output_allocator_map = dds_output_allocator_maps[fused_node_name];
 
   // Get default OrtMemoryInfo from factory
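
Binding a reference instead of copying matters beyond compilation: with the copy, any DDS output allocators inserted during this call landed in a temporary map and were discarded when `ComputeImpl` returned. A minimal illustration of the difference:

#include <string>
#include <unordered_map>

std::unordered_map<std::string, int> owned_map;
std::unordered_map<std::string, int>& GetMap() { return owned_map; }

void Compute() {
  auto  copied = GetMap();  // old: mutations hit a temporary, lost on return
  auto& live   = GetMap();  // new: mutations persist in the EP-owned map
  live["fused_node"] = 1;
  (void)copied;
}
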
@@ -2911,7 +2888,7 @@ OrtStatus* TRTEpNodeComputeInfo::ComputeImpl(OrtNodeComputeInfo* this_ptr, void*
 
 void TRTEpNodeComputeInfo::ReleaseStateImpl(OrtNodeComputeInfo* this_ptr, void* compute_state) {
   (void)this_ptr;
-  TensorrtComputeState& compute_state = *reinterpret_cast<TensorrtComputeState*>(compute_state);
-  (void)compute_state;
+  TensorrtComputeState& trt_ep_compute_state = *reinterpret_cast<TensorrtComputeState*>(compute_state);
+  (void)trt_ep_compute_state;
   // Do nothing for here.
 }

plugin_execution_providers/tensorrt/tensorrt_execution_provider.h

Lines changed: 5 additions & 11 deletions

@@ -13,6 +13,7 @@
 #include <string>
 #include <unordered_set>
 #include <mutex>
+#include <gsl/span>
 
 #ifdef _WIN32
 #define EXPORT_API __declspec(dllexport)
@@ -231,16 +232,18 @@ static const std::string k_ep_ctx_onnx_model_filename = "onnx_model_filename";
 /// </summary>
 struct TensorrtExecutionProvider : public OrtEp, public ApiPtrs {
   TensorrtExecutionProvider(TensorrtExecutionProviderFactory& factory, const std::string& name,
-                            const OrtHardwareDevice& device, const OrtSessionOptions& session_options,
+                            const OrtSessionOptions& session_options,
                             const OrtLogger& logger);
   ~TensorrtExecutionProvider();
 
   TensorrtExecutionProviderFactory& factory_;
   std::string name_;
-  const OrtHardwareDevice& hardware_device_;
   const OrtSessionOptions& session_options_;
   const OrtLogger& logger_;
 
+  std::unordered_map<std::string, std::unique_ptr<TensorrtComputeState>> compute_states_;
+  std::unordered_map<std::string, std::unique_ptr<TensorrtComputeStateForEPContext>> compute_states_for_ep_context_;
+
   SubGraphCollection_t GetSupportedList(SubGraphCollection_t supported_nodes_list, int iterations,
                                         const int max_iterations, const OrtGraph* graph, bool* early_termination) const;
 
@@ -262,12 +265,6 @@ struct TensorrtExecutionProvider : public OrtEp, public ApiPtrs {
                          nvinfer1::ICudaEngine* trt_engine, bool serialize_refitted_engine,
                          bool detailed_build_log);
 
-  std::unordered_map<std::string, std::unique_ptr<TensorrtComputeState>>& GetComputeStates() { return compute_states_; }
-
-  std::unordered_map<std::string, std::unique_ptr<TensorrtComputeState>>& GetComputeStatesForEPContext() {
-    return compute_states_;
-  }
-
   void GetAllocator(OrtAllocator** alloc) const { *alloc = alloc_; }
 
   void SetAllocator(OrtAllocator* alloc) { alloc_ = alloc; }
@@ -415,9 +412,6 @@ struct TensorrtExecutionProvider : public OrtEp, public ApiPtrs {
   std::unordered_map<std::string, std::vector<nvinfer1::IOptimizationProfile*>> profiles_;
   std::unordered_map<std::string, DDSOutputAllocatorMap> dds_output_allocator_maps_;
 
-  std::unordered_map<std::string, std::unique_ptr<TensorrtComputeState>> compute_states_;
-  std::unordered_map<std::string, std::unique_ptr<TensorrtComputeStateForEPContext>> compute_states_for_ep_context;
-
   // for external stream, we need to create its cudnn/cublass handle before cuda EP enable cuda graph capture
   // cudnnHandle_t external_cudnn_handle_ = nullptr;
   // cublasHandle_t external_cublas_handle_ = nullptr;

plugin_execution_providers/tensorrt/tensorrt_execution_provider_utils.h

Lines changed: 9 additions & 0 deletions

@@ -82,6 +82,15 @@ std::string ComposeString(Args&&... args) {
   }                                                                   \
 } while (0)
 
+#define RETURN_FALSE_AND_PRINT_IF_ERROR(fn)                            \
+  do {                                                                 \
+    OrtStatus* status = (fn);                                          \
+    if (status != nullptr) {                                           \
+      std::cerr << Ort::GetApi().GetErrorMessage(status) << std::endl; \
+      return false;                                                    \
+    }                                                                  \
+  } while (0)
+
 // Helper to release Ort one or more objects obtained from the public C API at the end of their scope.
 template <typename T>
 struct DeferOrtRelease {
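
The new macro mirrors the error-printing helper above it but returns `false` instead of a status, which is why the `GetCapabilityImpl` call sites earlier in this commit dropped their second argument. A usage sketch inside a boolean predicate (the surrounding lambda is reduced from `GetCapabilityImpl`; `ort_api` comes from the enclosing scope):

auto supported_control_flow_op = [&](const OrtNode* node) -> bool {
  size_t num_subgraphs = 0;
  // On error: prints the OrtStatus message to stderr and returns false from the lambda.
  RETURN_FALSE_AND_PRINT_IF_ERROR(ort_api.Node_GetNumSubgraphs(node, &num_subgraphs));
  return num_subgraphs == 0;
};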

plugin_execution_providers/tensorrt/tensorrt_provider_factory.cc

Lines changed: 7 additions & 8 deletions

@@ -31,7 +31,7 @@ TensorrtExecutionProviderFactory::TensorrtExecutionProviderFactory(const char* e
 
   // Default GPU allocator OrtMemoryInfo
   OrtMemoryInfo* mem_info = nullptr;
-  auto* status = ort_api.CreateMemoryInfo_V2("ExampleEP GPU", OrtMemoryInfoDeviceType_GPU,
+  auto* status = ort_api.CreateMemoryInfo_V2("Cuda", OrtMemoryInfoDeviceType_GPU,
                                              /*vendor*/ 0x10DE, /* device_id */ 0, OrtDeviceMemoryType_DEFAULT,
                                              /*alignment*/ 0, OrtAllocatorType::OrtDeviceAllocator, &mem_info);
   assert(status == nullptr);  // should never fail.
@@ -40,7 +40,7 @@ TensorrtExecutionProviderFactory::TensorrtExecutionProviderFactory(const char* e
   // CUDA PINNED allocator OrtMemoryInfo
   // HOST_ACCESSIBLE memory should use the non-CPU device type
   mem_info = nullptr;
-  status = ort_api.CreateMemoryInfo_V2("ExampleEP GPU pinned", OrtMemoryInfoDeviceType_GPU,
+  status = ort_api.CreateMemoryInfo_V2("CudaPinned", OrtMemoryInfoDeviceType_GPU,
                                        /*vendor*/ 0x10DE, /* device_id */ 0, OrtDeviceMemoryType_HOST_ACCESSIBLE,
                                        /*alignment*/ 0, OrtAllocatorType::OrtDeviceAllocator, &mem_info);
   assert(status == nullptr);  // should never fail.
@@ -56,12 +56,12 @@ TensorrtExecutionProviderFactory::TensorrtExecutionProviderFactory(const char* e
   data_transfer_impl_.reset();  // but we're CPU only so we return nullptr for the IDataTransfer.
 }
 
-const char* ORT_API_CALL TensorrtExecutionProviderFactory::GetNameImpl(const OrtEpFactory* this_ptr) {
+const char* ORT_API_CALL TensorrtExecutionProviderFactory::GetNameImpl(const OrtEpFactory* this_ptr) noexcept {
   const auto* factory = static_cast<const TensorrtExecutionProviderFactory*>(this_ptr);
   return factory->ep_name_.c_str();
 }
 
-const char* ORT_API_CALL TensorrtExecutionProviderFactory::GetVendorImpl(const OrtEpFactory* this_ptr) {
+const char* ORT_API_CALL TensorrtExecutionProviderFactory::GetVendorImpl(const OrtEpFactory* this_ptr) noexcept {
   const auto* factory = static_cast<const TensorrtExecutionProviderFactory*>(this_ptr);
   return factory->vendor_.c_str();
 }
@@ -72,7 +72,7 @@ OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::GetSupportedDevicesImp
                                                                               size_t num_devices,
                                                                               OrtEpDevice** ep_devices,
                                                                               size_t max_ep_devices,
-                                                                              size_t* p_num_ep_devices) {
+                                                                              size_t* p_num_ep_devices) noexcept {
   size_t& num_ep_devices = *p_num_ep_devices;
   auto* factory = static_cast<TensorrtExecutionProviderFactory*>(this_ptr);
 
@@ -133,8 +133,7 @@ OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::CreateEpImpl(
     _In_reads_(num_devices) const OrtKeyValuePairs* const* /*ep_metadata*/,
     _In_ size_t num_devices,
    _In_ const OrtSessionOptions* session_options,
-    _In_ const OrtLogger* logger,
-    _Out_ OrtEp** ep) {
+    _In_ const OrtLogger* logger, _Out_ OrtEp** ep) noexcept {
   auto* factory = static_cast<TensorrtExecutionProviderFactory*>(this_ptr);
   *ep = nullptr;
 
@@ -161,7 +160,7 @@ OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::CreateEpImpl(
   return nullptr;
 }
 
-void ORT_API_CALL TensorrtExecutionProviderFactory::ReleaseEpImpl(OrtEpFactory* /*this_ptr*/, OrtEp* ep) {
+void ORT_API_CALL TensorrtExecutionProviderFactory::ReleaseEpImpl(OrtEpFactory* /*this_ptr*/, OrtEp* ep) noexcept {
  TensorrtExecutionProvider* trt_ep = static_cast<TensorrtExecutionProvider*>(ep);
   delete trt_ep;
 }
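
The added `noexcept` qualifiers bring these definitions in line with the factory's function-pointer declarations: since C++17, `noexcept` is part of a function's type, so a pointer declared `noexcept` will not accept a potentially-throwing function. A minimal illustration (hypothetical alias name):

using GetNameFn = const char* (*)(const void* factory) noexcept;

const char* GetNameThrowing(const void*) { return "ep"; }
const char* GetNameNoexcept(const void*) noexcept { return "ep"; }

// GetNameFn bad = GetNameThrowing;  // error since C++17: discards noexcept
GetNameFn ok = GetNameNoexcept;      // matches the declared function type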

plugin_execution_providers/tensorrt/tensorrt_provider_factory.h

Lines changed: 2 additions & 0 deletions

@@ -1,3 +1,5 @@
+#pragma once
+
 #include "tensorrt_execution_provider_utils.h"
 #include "tensorrt_execution_provider_data_transfer.h"
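
`#pragma once` gives this header the include guard it was missing; without one, a translation unit that reaches the header through two include paths hits redefinition errors. The classic guard form, with a hypothetical guard macro, is equivalent:

#ifndef TENSORRT_PROVIDER_FACTORY_H_  // hypothetical guard name
#define TENSORRT_PROVIDER_FACTORY_H_

#include "tensorrt_execution_provider_utils.h"
#include "tensorrt_execution_provider_data_transfer.h"
// ... rest of the header ...

#endif  // TENSORRT_PROVIDER_FACTORY_H_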

plugin_execution_providers/tensorrt/utils/ort_graph_to_proto.h

Lines changed: 1 addition & 1 deletion

@@ -81,7 +81,7 @@
 #define INCLUDE_ONNXRUNTIME_CORE_PROVIDERS_UTILS_ORT_GRAPH_TO_PROTO_H_
 
 #include <functional>
-#include "core/session/onnxruntime_cxx_api.h"
+#include "onnxruntime_cxx_api.h"
 #include "onnx/onnx_pb.h"
 
 namespace OrtEpUtils {
