Skip to content

Commit 5f46b68

Browse files
committed
fix mem leak for OrtAllocator
1 parent edd4b34 commit 5f46b68

File tree

4 files changed

+68
-50
lines changed

4 files changed

+68
-50
lines changed

plugin_execution_providers/tensorrt/tensorrt_execution_provider.cc

Lines changed: 22 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2071,11 +2071,15 @@ OrtStatus* TensorrtExecutionProvider::RefitEngine(
20712071
#endif
20722072
}
20732073

2074-
TensorrtExecutionProvider::~TensorrtExecutionProvider() = default;
2074+
TensorrtExecutionProvider::~TensorrtExecutionProvider() {
2075+
if (alloc_ != nullptr) {
2076+
ort_api.ReleaseAllocator(alloc_);
2077+
}
2078+
}
20752079

20762080
/// <summary>
20772081
///
2078-
/// Plugin TensorRT EP that implements OrtEp
2082+
/// Plugin TensorRT EP implementing OrtEp
20792083
///
20802084
/// </summary>
20812085
TensorrtExecutionProvider::TensorrtExecutionProvider(TensorrtExecutionProviderFactory& factory,
@@ -2494,18 +2498,17 @@ OrtStatus* TRTEpNodeComputeInfo::ComputeImpl(OrtNodeComputeInfo* this_ptr, void*
24942498
auto& dds_output_allocator_map = dds_output_allocator_maps[fused_node_name];
24952499

24962500
// Get default OrtMemoryInfo from factory
2497-
// Get allocator from OrtKernelContext
24982501
const OrtMemoryInfo* mem_info = nullptr;
2499-
if (ep.factory_.device_id_to_cuda_gpu_memory_info_map.find(device_id) !=
2500-
ep.factory_.device_id_to_cuda_gpu_memory_info_map.end()) {
2501-
mem_info = ep.factory_.device_id_to_cuda_gpu_memory_info_map[device_id];
2502+
if (ep.factory_.cuda_gpu_memory_infos.find(device_id) !=
2503+
ep.factory_.cuda_gpu_memory_infos.end()) {
2504+
mem_info = ep.factory_.cuda_gpu_memory_infos[device_id].get();
25022505
}
2503-
OrtAllocator* alloc = nullptr;
2504-
ep.GetAllocator(&alloc);
2505-
if (alloc == nullptr) {
2506-
Ort::ThrowOnError(ep.ort_api.KernelContext_GetAllocator(kernel_context, mem_info, &alloc));
2507-
ep.SetAllocator(alloc);
2506+
2507+
// Get allocator from OrtKernelContext
2508+
if (ep.alloc_ == nullptr) {
2509+
Ort::ThrowOnError(ep.ort_api.KernelContext_GetAllocator(kernel_context, mem_info, &ep.alloc_));
25082510
}
2511+
OrtAllocator* alloc = ep.alloc_;
25092512

25102513
void* cuda_stream;
25112514
Ort::ThrowOnError(ep.ort_api.KernelContext_GetGPUComputeStream(kernel_context, &cuda_stream));
@@ -3134,18 +3137,17 @@ OrtStatus* TRTEpEpContextNodeComputeInfo::ComputeImpl(OrtNodeComputeInfo* this_p
31343137
std::unordered_map<std::string, std::vector<int64_t>> shape_tensor_values_int64; // same as above but for int64 shape tensor input
31353138

31363139
// Get default OrtMemoryInfo from factory
3137-
// Get allocator from OrtKernelContext
31383140
const OrtMemoryInfo* mem_info = nullptr;
3139-
if (ep.factory_.device_id_to_cuda_gpu_memory_info_map.find(device_id) !=
3140-
ep.factory_.device_id_to_cuda_gpu_memory_info_map.end()) {
3141-
mem_info = ep.factory_.device_id_to_cuda_gpu_memory_info_map[device_id];
3141+
if (ep.factory_.cuda_gpu_memory_infos.find(device_id) !=
3142+
ep.factory_.cuda_gpu_memory_infos.end()) {
3143+
mem_info = ep.factory_.cuda_gpu_memory_infos[device_id].get();
31423144
}
3143-
OrtAllocator* alloc = nullptr;
3144-
ep.GetAllocator(&alloc);
3145-
if (alloc == nullptr) {
3146-
Ort::ThrowOnError(ep.ort_api.KernelContext_GetAllocator(kernel_context, mem_info, &alloc));
3147-
ep.SetAllocator(alloc);
3145+
3146+
// Get allocator from OrtKernelContext
3147+
if (ep.alloc_ == nullptr) {
3148+
Ort::ThrowOnError(ep.ort_api.KernelContext_GetAllocator(kernel_context, mem_info, &ep.alloc_));
31483149
}
3150+
OrtAllocator* alloc = ep.alloc_;
31493151

31503152
void* cuda_stream;
31513153
Ort::ThrowOnError(ep.ort_api.KernelContext_GetGPUComputeStream(kernel_context, &cuda_stream));

plugin_execution_providers/tensorrt/tensorrt_execution_provider.h

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -264,10 +264,6 @@ struct TensorrtExecutionProvider : public OrtEp, public ApiPtrs {
264264
const void* onnx_model_bytestream, size_t onnx_model_bytestream_size,
265265
nvinfer1::ICudaEngine* trt_engine, bool serialize_refitted_engine,
266266
bool detailed_build_log);
267-
268-
void GetAllocator(OrtAllocator** alloc) const { *alloc = alloc_; }
269-
270-
void SetAllocator(OrtAllocator* alloc) { alloc_ = alloc; }
271267

272268
std::unordered_map<std::string, DDSOutputAllocatorMap>& GetDDSOutputAllocators() {
273269
return dds_output_allocator_maps_;
@@ -314,6 +310,10 @@ struct TensorrtExecutionProvider : public OrtEp, public ApiPtrs {
314310
bool external_stream_ = false;
315311
cudaStream_t stream_ = nullptr;
316312

313+
// The OrtAllocator object will be obtained during EP compute time
314+
// and should be kept for the lifetime of TRT EP object.
315+
OrtAllocator* alloc_ = nullptr;
316+
317317
private:
318318
static const char* ORT_API_CALL GetNameImpl(const OrtEp* this_ptr) noexcept;
319319
static OrtStatus* ORT_API_CALL GetCapabilityImpl(OrtEp* this_ptr, const OrtGraph* graph,
@@ -375,10 +375,6 @@ struct TensorrtExecutionProvider : public OrtEp, public ApiPtrs {
375375
std::string cache_prefix_;
376376
bool engine_hw_compatible_ = false;
377377

378-
// The OrtAllocator object will be get during ep compute time
379-
// and should be kept for the lifetime of TRT EP object.
380-
OrtAllocator* alloc_ = nullptr;
381-
382378
// For create/dump EP context node model
383379
bool dump_ep_context_model_ = false;
384380
std::string ep_context_file_path_;

plugin_execution_providers/tensorrt/tensorrt_provider_factory.cc

Lines changed: 35 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ OrtStatus* TensorrtExecutionProviderFactory::CreateMemoryInfoForDevices(int num_
5757
/* device_id */ device_id, OrtDeviceMemoryType_DEFAULT,
5858
/*alignment*/ 0, OrtAllocatorType::OrtDeviceAllocator, &mem_info));
5959

60-
cuda_gpu_memory_infos.emplace_back(MemoryInfoUniquePtr(mem_info, ort_api.ReleaseMemoryInfo));
60+
cuda_gpu_memory_infos[device_id] = MemoryInfoUniquePtr(mem_info, ort_api.ReleaseMemoryInfo);
6161

6262
// HOST_ACCESSIBLE memory should use the non-CPU device type
6363
mem_info = nullptr;
@@ -66,7 +66,7 @@ OrtStatus* TensorrtExecutionProviderFactory::CreateMemoryInfoForDevices(int num_
6666
/* device_id */ device_id, OrtDeviceMemoryType_HOST_ACCESSIBLE,
6767
/*alignment*/ 0, OrtAllocatorType::OrtDeviceAllocator, &mem_info));
6868

69-
cuda_pinned_memory_infos.emplace_back(MemoryInfoUniquePtr(mem_info, ort_api.ReleaseMemoryInfo));
69+
cuda_pinned_memory_infos[device_id] = MemoryInfoUniquePtr(mem_info, ort_api.ReleaseMemoryInfo);
7070
}
7171

7272
return nullptr;
@@ -196,35 +196,50 @@ void ORT_API_CALL TensorrtExecutionProviderFactory::ReleaseEpImpl(OrtEpFactory*
196196
delete trt_ep;
197197
}
198198

199-
OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::CreateAllocatorImpl(
200-
OrtEpFactory* this_ptr, const OrtMemoryInfo* memory_info,
201-
const OrtKeyValuePairs* /*allocator_options*/,
202-
OrtAllocator** allocator) noexcept {
199+
OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::CreateAllocatorImpl(OrtEpFactory* this_ptr,
200+
const OrtMemoryInfo* memory_info,
201+
const OrtKeyValuePairs* /*allocator_options*/,
202+
OrtAllocator** allocator) noexcept {
203203
auto& factory = *static_cast<TensorrtExecutionProviderFactory*>(this_ptr);
204-
*allocator = nullptr;
205204

206-
// NOTE: The factory implementation can return a shared OrtAllocator* instead of creating a new instance on each call.
207-
// To do this just make ReleaseAllocatorImpl a no-op.
205+
// NOTE: The factory implementation is free to return a shared OrtAllocator* instance instead of creating a new
206+
// allocator on each call. To do this have an allocator instance as an OrtEpFactory class member and make
207+
// ReleaseAllocatorImpl a no-op.
208208

209-
// NOTE: If OrtMemoryInfo has allocator type (call MemoryInfoGetType) of OrtArenaAllocator, an ORT BFCArena
210-
// will be added to wrap the returned OrtAllocator. The EP is free to implement its own arena, and if it
211-
// wants to do this the OrtMemoryInfo MUST be created with an allocator type of OrtDeviceAllocator.
212-
213-
// NOTE: The OrtMemoryInfo pointer should only ever be coming straight from an OrtEpDevice, and pointer based
214-
// matching should work.
209+
// NOTE: EP should implement its own arena logic. ep_arena.cc/h is provided as a reference and we use it here for
210+
// device memory. `allocator_options` can be used for arena configuration and there is a helper in ep_arena.h
211+
// to convert from OrtKeyValuePairs to the same arena config settings that ORT uses.
212+
// You are of course free to have completely different settings.
215213

216214
const OrtMemoryDevice* mem_device = factory.ep_api.MemoryInfo_GetMemoryDevice(memory_info);
217215
uint32_t device_id = factory.ep_api.MemoryDevice_GetDeviceId(mem_device);
218216

219217
if (factory.ep_api.MemoryDevice_GetMemoryType(mem_device) == OrtDeviceMemoryType_DEFAULT) {
218+
// use the one that was previously created
219+
if (factory.cuda_gpu_allocators.find(device_id) != factory.cuda_gpu_allocators.end()) {
220+
*allocator = factory.cuda_gpu_allocators[device_id].get();
221+
return nullptr;
222+
}
223+
220224
// create a CUDA allocator
221225
auto cuda_allocator = std::make_unique<CUDAAllocator>(memory_info, static_cast<DeviceId>(device_id));
222-
factory.device_id_to_cuda_gpu_memory_info_map[device_id] = memory_info;
223-
*allocator = cuda_allocator.release();
226+
227+
*allocator = cuda_allocator.get();
228+
factory.cuda_gpu_allocators[device_id] = std::move(cuda_allocator);
229+
224230
} else if (factory.ep_api.MemoryDevice_GetMemoryType(mem_device) == OrtDeviceMemoryType_HOST_ACCESSIBLE) {
231+
// use the one that previously created
232+
if (factory.cuda_pinned_allocators.find(device_id) != factory.cuda_pinned_allocators.end()) {
233+
*allocator = factory.cuda_pinned_allocators[device_id].get();
234+
return nullptr;
235+
}
236+
225237
// create a CUDA PINNED allocator
226238
auto cuda_pinned_allocator = std::make_unique<CUDAPinnedAllocator>(memory_info);
227-
*allocator = cuda_pinned_allocator.release();
239+
240+
*allocator = cuda_pinned_allocator.get();
241+
factory.cuda_pinned_allocators[device_id] = std::move(cuda_pinned_allocator);
242+
228243
} else {
229244
return factory.ort_api.CreateStatus(ORT_INVALID_ARGUMENT,
230245
"INTERNAL ERROR! Unknown memory info provided to CreateAllocator. "
@@ -236,7 +251,8 @@ OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::CreateAllocatorImpl(
236251

237252
void ORT_API_CALL TensorrtExecutionProviderFactory::ReleaseAllocatorImpl(OrtEpFactory* /*this*/,
238253
OrtAllocator* allocator) noexcept {
239-
delete static_cast<CUDAAllocator*>(allocator);
254+
// no-op. The allocators will be shared across sessions.
255+
// delete static_cast<CUDAAllocator*>(allocator);
240256
}
241257

242258
OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::CreateDataTransferImpl(

plugin_execution_providers/tensorrt/tensorrt_provider_factory.h

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
#include "ep_utils.h"
44
#include "tensorrt_execution_provider_data_transfer.h"
5+
#include "cuda_allocator.h"
56

67
using MemoryInfoUniquePtr = std::unique_ptr<OrtMemoryInfo, std::function<void(OrtMemoryInfo*)>>;
78

@@ -17,9 +18,12 @@ struct TensorrtExecutionProviderFactory : public OrtEpFactory, public ApiPtrs {
1718
// CUDA gpu memory and CUDA pinned memory are required for allocator and data transfer, these are the OrtMemoryInfo
1819
// instance required for that.
1920
// Current TRT EP implementation uses one default OrtMemoryInfo and one host accessible OrtMemoryInfo per ep device.
20-
std::vector<MemoryInfoUniquePtr> cuda_gpu_memory_infos;
21-
std::vector<MemoryInfoUniquePtr> cuda_pinned_memory_infos;
22-
std::unordered_map<uint32_t, const OrtMemoryInfo*> device_id_to_cuda_gpu_memory_info_map; // device id -> OrtMemoryInfo
21+
std::unordered_map<uint32_t, MemoryInfoUniquePtr> cuda_gpu_memory_infos; // device id -> memory info
22+
std::unordered_map<uint32_t, MemoryInfoUniquePtr> cuda_pinned_memory_infos;
23+
24+
// Keeps allocators per ep device in factory so they can be shared across sessions.
25+
std::unordered_map<uint32_t, std::unique_ptr<CUDAAllocator>> cuda_gpu_allocators; // device id -> allocator
26+
std::unordered_map<uint32_t, std::unique_ptr<CUDAPinnedAllocator>> cuda_pinned_allocators;
2327

2428
std::vector<const OrtMemoryDevice*> cuda_gpu_mem_devices;
2529
std::vector<const OrtMemoryDevice*> cuda_pinned_mem_devices;

0 commit comments

Comments
 (0)