Skip to content

Commit 5f46b68

Browse files
committed
fix mem leak for OrtAllocator
1 parent edd4b34 commit 5f46b68

File tree

4 files changed

+68
-50
lines changed

4 files changed

+68
-50
lines changed

plugin_execution_providers/tensorrt/tensorrt_execution_provider.cc

Lines changed: 22 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2071,11 +2071,15 @@ OrtStatus* TensorrtExecutionProvider::RefitEngine(
20712071
#endif
20722072
}
20732073

2074-
TensorrtExecutionProvider::~TensorrtExecutionProvider() = default;
2074+
TensorrtExecutionProvider::~TensorrtExecutionProvider() {
2075+
if (alloc_ != nullptr) {
2076+
ort_api.ReleaseAllocator(alloc_);
2077+
}
2078+
}
20752079

20762080
/// <summary>
20772081
///
2078-
/// Plugin TensorRT EP that implements OrtEp
2082+
/// Plugin TensorRT EP implementing OrtEp
20792083
///
20802084
/// </summary>
20812085
TensorrtExecutionProvider::TensorrtExecutionProvider(TensorrtExecutionProviderFactory& factory,
@@ -2494,18 +2498,17 @@ OrtStatus* TRTEpNodeComputeInfo::ComputeImpl(OrtNodeComputeInfo* this_ptr, void*
24942498
auto& dds_output_allocator_map = dds_output_allocator_maps[fused_node_name];
24952499

24962500
// Get default OrtMemoryInfo from factory
2497-
// Get allocator from OrtKernelContext
24982501
const OrtMemoryInfo* mem_info = nullptr;
2499-
if (ep.factory_.device_id_to_cuda_gpu_memory_info_map.find(device_id) !=
2500-
ep.factory_.device_id_to_cuda_gpu_memory_info_map.end()) {
2501-
mem_info = ep.factory_.device_id_to_cuda_gpu_memory_info_map[device_id];
2502+
if (ep.factory_.cuda_gpu_memory_infos.find(device_id) !=
2503+
ep.factory_.cuda_gpu_memory_infos.end()) {
2504+
mem_info = ep.factory_.cuda_gpu_memory_infos[device_id].get();
25022505
}
2503-
OrtAllocator* alloc = nullptr;
2504-
ep.GetAllocator(&alloc);
2505-
if (alloc == nullptr) {
2506-
Ort::ThrowOnError(ep.ort_api.KernelContext_GetAllocator(kernel_context, mem_info, &alloc));
2507-
ep.SetAllocator(alloc);
2506+
2507+
// Get allocator from OrtKernelContext
2508+
if (ep.alloc_ == nullptr) {
2509+
Ort::ThrowOnError(ep.ort_api.KernelContext_GetAllocator(kernel_context, mem_info, &ep.alloc_));
25082510
}
2511+
OrtAllocator* alloc = ep.alloc_;
25092512

25102513
void* cuda_stream;
25112514
Ort::ThrowOnError(ep.ort_api.KernelContext_GetGPUComputeStream(kernel_context, &cuda_stream));
@@ -3134,18 +3137,17 @@ OrtStatus* TRTEpEpContextNodeComputeInfo::ComputeImpl(OrtNodeComputeInfo* this_p
31343137
std::unordered_map<std::string, std::vector<int64_t>> shape_tensor_values_int64; // same as above but for int64 shape tensor input
31353138

31363139
// Get default OrtMemoryInfo from factory
3137-
// Get allocator from OrtKernelContext
31383140
const OrtMemoryInfo* mem_info = nullptr;
3139-
if (ep.factory_.device_id_to_cuda_gpu_memory_info_map.find(device_id) !=
3140-
ep.factory_.device_id_to_cuda_gpu_memory_info_map.end()) {
3141-
mem_info = ep.factory_.device_id_to_cuda_gpu_memory_info_map[device_id];
3141+
if (ep.factory_.cuda_gpu_memory_infos.find(device_id) !=
3142+
ep.factory_.cuda_gpu_memory_infos.end()) {
3143+
mem_info = ep.factory_.cuda_gpu_memory_infos[device_id].get();
31423144
}
3143-
OrtAllocator* alloc = nullptr;
3144-
ep.GetAllocator(&alloc);
3145-
if (alloc == nullptr) {
3146-
Ort::ThrowOnError(ep.ort_api.KernelContext_GetAllocator(kernel_context, mem_info, &alloc));
3147-
ep.SetAllocator(alloc);
3145+
3146+
// Get allocator from OrtKernelContext
3147+
if (ep.alloc_ == nullptr) {
3148+
Ort::ThrowOnError(ep.ort_api.KernelContext_GetAllocator(kernel_context, mem_info, &ep.alloc_));
31483149
}
3150+
OrtAllocator* alloc = ep.alloc_;
31493151

31503152
void* cuda_stream;
31513153
Ort::ThrowOnError(ep.ort_api.KernelContext_GetGPUComputeStream(kernel_context, &cuda_stream));

plugin_execution_providers/tensorrt/tensorrt_execution_provider.h

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -264,10 +264,6 @@ struct TensorrtExecutionProvider : public OrtEp, public ApiPtrs {
264264
const void* onnx_model_bytestream, size_t onnx_model_bytestream_size,
265265
nvinfer1::ICudaEngine* trt_engine, bool serialize_refitted_engine,
266266
bool detailed_build_log);
267-
268-
void GetAllocator(OrtAllocator** alloc) const { *alloc = alloc_; }
269-
270-
void SetAllocator(OrtAllocator* alloc) { alloc_ = alloc; }
271267

272268
std::unordered_map<std::string, DDSOutputAllocatorMap>& GetDDSOutputAllocators() {
273269
return dds_output_allocator_maps_;
@@ -314,6 +310,10 @@ struct TensorrtExecutionProvider : public OrtEp, public ApiPtrs {
314310
bool external_stream_ = false;
315311
cudaStream_t stream_ = nullptr;
316312

313+
// The OrtAllocator object will be obtained during EP compute time
314+
// and should be kept for the lifetime of TRT EP object.
315+
OrtAllocator* alloc_ = nullptr;
316+
317317
private:
318318
static const char* ORT_API_CALL GetNameImpl(const OrtEp* this_ptr) noexcept;
319319
static OrtStatus* ORT_API_CALL GetCapabilityImpl(OrtEp* this_ptr, const OrtGraph* graph,
@@ -375,10 +375,6 @@ struct TensorrtExecutionProvider : public OrtEp, public ApiPtrs {
375375
std::string cache_prefix_;
376376
bool engine_hw_compatible_ = false;
377377

378-
// The OrtAllocator object will be get during ep compute time
379-
// and should be kept for the lifetime of TRT EP object.
380-
OrtAllocator* alloc_ = nullptr;
381-
382378
// For create/dump EP context node model
383379
bool dump_ep_context_model_ = false;
384380
std::string ep_context_file_path_;

plugin_execution_providers/tensorrt/tensorrt_provider_factory.cc

Lines changed: 35 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ OrtStatus* TensorrtExecutionProviderFactory::CreateMemoryInfoForDevices(int num_
5757
/* device_id */ device_id, OrtDeviceMemoryType_DEFAULT,
5858
/*alignment*/ 0, OrtAllocatorType::OrtDeviceAllocator, &mem_info));
5959

60-
cuda_gpu_memory_infos.emplace_back(MemoryInfoUniquePtr(mem_info, ort_api.ReleaseMemoryInfo));
60+
cuda_gpu_memory_infos[device_id] = MemoryInfoUniquePtr(mem_info, ort_api.ReleaseMemoryInfo);
6161

6262
// HOST_ACCESSIBLE memory should use the non-CPU device type
6363
mem_info = nullptr;
@@ -66,7 +66,7 @@ OrtStatus* TensorrtExecutionProviderFactory::CreateMemoryInfoForDevices(int num_
6666
/* device_id */ device_id, OrtDeviceMemoryType_HOST_ACCESSIBLE,
6767
/*alignment*/ 0, OrtAllocatorType::OrtDeviceAllocator, &mem_info));
6868

69-
cuda_pinned_memory_infos.emplace_back(MemoryInfoUniquePtr(mem_info, ort_api.ReleaseMemoryInfo));
69+
cuda_pinned_memory_infos[device_id] = MemoryInfoUniquePtr(mem_info, ort_api.ReleaseMemoryInfo);
7070
}
7171

7272
return nullptr;
@@ -196,35 +196,50 @@ void ORT_API_CALL TensorrtExecutionProviderFactory::ReleaseEpImpl(OrtEpFactory*
196196
delete trt_ep;
197197
}
198198

199-
OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::CreateAllocatorImpl(
200-
OrtEpFactory* this_ptr, const OrtMemoryInfo* memory_info,
201-
const OrtKeyValuePairs* /*allocator_options*/,
202-
OrtAllocator** allocator) noexcept {
199+
OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::CreateAllocatorImpl(OrtEpFactory* this_ptr,
200+
const OrtMemoryInfo* memory_info,
201+
const OrtKeyValuePairs* /*allocator_options*/,
202+
OrtAllocator** allocator) noexcept {
203203
auto& factory = *static_cast<TensorrtExecutionProviderFactory*>(this_ptr);
204-
*allocator = nullptr;
205204

206-
// NOTE: The factory implementation can return a shared OrtAllocator* instead of creating a new instance on each call.
207-
// To do this just make ReleaseAllocatorImpl a no-op.
205+
// NOTE: The factory implementation is free to return a shared OrtAllocator* instance instead of creating a new
206+
// allocator on each call. To do this have an allocator instance as an OrtEpFactory class member and make
207+
// ReleaseAllocatorImpl a no-op.
208208

209-
// NOTE: If OrtMemoryInfo has allocator type (call MemoryInfoGetType) of OrtArenaAllocator, an ORT BFCArena
210-
// will be added to wrap the returned OrtAllocator. The EP is free to implement its own arena, and if it
211-
// wants to do this the OrtMemoryInfo MUST be created with an allocator type of OrtDeviceAllocator.
212-
213-
// NOTE: The OrtMemoryInfo pointer should only ever be coming straight from an OrtEpDevice, and pointer based
214-
// matching should work.
209+
// NOTE: EP should implement its own arena logic. ep_arena.cc/h is provided as a reference and we use it here for
210+
// device memory. `allocator_options` can be used for arena configuration and there is a helper in ep_arena.h
211+
// to convert from OrtKeyValuePairs to the same arena config settings that ORT uses.
212+
// You are of course free to have completely different settings.
215213

216214
const OrtMemoryDevice* mem_device = factory.ep_api.MemoryInfo_GetMemoryDevice(memory_info);
217215
uint32_t device_id = factory.ep_api.MemoryDevice_GetDeviceId(mem_device);
218216

219217
if (factory.ep_api.MemoryDevice_GetMemoryType(mem_device) == OrtDeviceMemoryType_DEFAULT) {
218+
// use the one that was previously created
219+
if (factory.cuda_gpu_allocators.find(device_id) != factory.cuda_gpu_allocators.end()) {
220+
*allocator = factory.cuda_gpu_allocators[device_id].get();
221+
return nullptr;
222+
}
223+
220224
// create a CUDA allocator
221225
auto cuda_allocator = std::make_unique<CUDAAllocator>(memory_info, static_cast<DeviceId>(device_id));
222-
factory.device_id_to_cuda_gpu_memory_info_map[device_id] = memory_info;
223-
*allocator = cuda_allocator.release();
226+
227+
*allocator = cuda_allocator.get();
228+
factory.cuda_gpu_allocators[device_id] = std::move(cuda_allocator);
229+
224230
} else if (factory.ep_api.MemoryDevice_GetMemoryType(mem_device) == OrtDeviceMemoryType_HOST_ACCESSIBLE) {
231+
// use the one that previously created
232+
if (factory.cuda_pinned_allocators.find(device_id) != factory.cuda_pinned_allocators.end()) {
233+
*allocator = factory.cuda_pinned_allocators[device_id].get();
234+
return nullptr;
235+
}
236+
225237
// create a CUDA PINNED allocator
226238
auto cuda_pinned_allocator = std::make_unique<CUDAPinnedAllocator>(memory_info);
227-
*allocator = cuda_pinned_allocator.release();
239+
240+
*allocator = cuda_pinned_allocator.get();
241+
factory.cuda_pinned_allocators[device_id] = std::move(cuda_pinned_allocator);
242+
228243
} else {
229244
return factory.ort_api.CreateStatus(ORT_INVALID_ARGUMENT,
230245
"INTERNAL ERROR! Unknown memory info provided to CreateAllocator. "
@@ -236,7 +251,8 @@ OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::CreateAllocatorImpl(
236251

237252
void ORT_API_CALL TensorrtExecutionProviderFactory::ReleaseAllocatorImpl(OrtEpFactory* /*this*/,
238253
OrtAllocator* allocator) noexcept {
239-
delete static_cast<CUDAAllocator*>(allocator);
254+
// no-op. The allocators will be shared across sessions.
255+
// delete static_cast<CUDAAllocator*>(allocator);
240256
}
241257

242258
OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::CreateDataTransferImpl(

plugin_execution_providers/tensorrt/tensorrt_provider_factory.h

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
#include "ep_utils.h"
44
#include "tensorrt_execution_provider_data_transfer.h"
5+
#include "cuda_allocator.h"
56

67
using MemoryInfoUniquePtr = std::unique_ptr<OrtMemoryInfo, std::function<void(OrtMemoryInfo*)>>;
78

@@ -17,9 +18,12 @@ struct TensorrtExecutionProviderFactory : public OrtEpFactory, public ApiPtrs {
1718
// CUDA gpu memory and CUDA pinned memory are required for allocator and data transfer, these are the OrtMemoryInfo
1819
// instance required for that.
1920
// Current TRT EP implementation uses one default OrtMemoryInfo and one host accessible OrtMemoryInfo per ep device.
20-
std::vector<MemoryInfoUniquePtr> cuda_gpu_memory_infos;
21-
std::vector<MemoryInfoUniquePtr> cuda_pinned_memory_infos;
22-
std::unordered_map<uint32_t, const OrtMemoryInfo*> device_id_to_cuda_gpu_memory_info_map; // device id -> OrtMemoryInfo
21+
std::unordered_map<uint32_t, MemoryInfoUniquePtr> cuda_gpu_memory_infos; // device id -> memory info
22+
std::unordered_map<uint32_t, MemoryInfoUniquePtr> cuda_pinned_memory_infos;
23+
24+
// Keeps allocators per ep device in factory so they can be shared across sessions.
25+
std::unordered_map<uint32_t, std::unique_ptr<CUDAAllocator>> cuda_gpu_allocators; // device id -> allocator
26+
std::unordered_map<uint32_t, std::unique_ptr<CUDAPinnedAllocator>> cuda_pinned_allocators;
2327

2428
std::vector<const OrtMemoryDevice*> cuda_gpu_mem_devices;
2529
std::vector<const OrtMemoryDevice*> cuda_pinned_mem_devices;

0 commit comments

Comments
 (0)