Commit be453b1 (parent 7851a1c)

add allocator and data transfer

11 files changed: +322 −763 lines

plugin_execution_providers/tensorrt/tensorrt_cuda_allocator.cc renamed to plugin_execution_providers/tensorrt/cuda_allocator.cc

Lines changed: 1 addition & 4 deletions

@@ -3,11 +3,10 @@
 
 #include <cassert>
 #include <cuda_runtime_api.h>
-#include "tensorrt_cuda_allocator.h"
+#include "cuda_allocator.h"
 
 void CUDA_RETURN_IF_ERROR(cudaError_t res);
 
-namespace onnxruntime {
 void CUDAAllocator::CheckDevice(bool throw_when_fail) const {
 #ifndef NDEBUG
   // check device to match at debug build
@@ -75,5 +74,3 @@ void CUDAPinnedAllocator::Free(void* p) {
 const OrtMemoryInfo* CUDAPinnedAllocator::Info() const {
   return mem_info_;
 }
-
-}  // namespace onnxruntime
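The diff only forward-declares CUDA_RETURN_IF_ERROR; its definition is not part of this commit. Purely as a sketch under that assumption (a void helper with this signature that fails hard on a CUDA error), it might look like:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime_api.h>

// Hypothetical sketch - not the definition from this commit.
// Checks a CUDA runtime result and aborts with a message on failure.
void CUDA_RETURN_IF_ERROR(cudaError_t res) {
  if (res != cudaSuccess) {
    std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(res));
    std::abort();
  }
}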

plugin_execution_providers/tensorrt/tensorrt_cuda_allocator.h renamed to plugin_execution_providers/tensorrt/cuda_allocator.h

Lines changed: 4 additions & 8 deletions

@@ -7,16 +7,13 @@
 #define ORT_API_MANUAL_INIT
 #include "onnxruntime_cxx_api.h"
 
-namespace onnxruntime {
-
-// Following names are originally defined in allocator.h
 constexpr const char* CUDA_ALLOCATOR = "Cuda";
 constexpr const char* CUDA_PINNED_ALLOCATOR = "CudaPinned";
 
 using DeviceId = int16_t;
 
 struct CUDAAllocator : OrtAllocator {
-  CUDAAllocator(DeviceId device_id, const char* name = onnxruntime::CUDA_ALLOCATOR) {
+  CUDAAllocator(DeviceId device_id, const char* name = CUDA_ALLOCATOR) {
     OrtAllocator::version = ORT_API_VERSION;
     OrtAllocator::Alloc = [](OrtAllocator* this_, size_t size) { return static_cast<CUDAAllocator*>(this_)->Alloc(size); };
     OrtAllocator::Free = [](OrtAllocator* this_, void* p) { static_cast<CUDAAllocator*>(this_)->Free(p); };
@@ -31,6 +28,7 @@ struct CUDAAllocator : OrtAllocator {
                        OrtMemType::OrtMemTypeDefault,
                        &mem_info_);
   }
+  // TODO: Handle destructor
   //~CUDAAllocator();
 
   void* Alloc(size_t size);
@@ -50,7 +48,7 @@ struct CUDAAllocator : OrtAllocator {
 };
 
 struct CUDAPinnedAllocator : OrtAllocator {
-  CUDAPinnedAllocator(const char* name = onnxruntime::CUDA_PINNED_ALLOCATOR) {
+  CUDAPinnedAllocator(const char* name = CUDA_PINNED_ALLOCATOR) {
     OrtAllocator::version = ORT_API_VERSION;
     OrtAllocator::Alloc = [](OrtAllocator* this_, size_t size) { return static_cast<CUDAPinnedAllocator*>(this_)->Alloc(size); };
     OrtAllocator::Free = [](OrtAllocator* this_, void* p) { static_cast<CUDAPinnedAllocator*>(this_)->Free(p); };
@@ -62,6 +60,7 @@ struct CUDAPinnedAllocator : OrtAllocator {
                        OrtMemType::OrtMemTypeDefault,
                        &mem_info_);
   }
+  // TODO: Handle destructor
   //~CUDAPinnedAllocator();
 
   void* Alloc(size_t size);
@@ -77,6 +76,3 @@ struct CUDAPinnedAllocator : OrtAllocator {
   DeviceId device_id_ = 0;
   OrtMemoryInfo* mem_info_ = nullptr;
 };
-
-
-}  // namespace onnxruntime
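For orientation only (not part of the commit), a minimal sketch of how this allocator is exercised through the OrtAllocator function-pointer interface. It assumes the ORT API has already been initialized (the header builds with ORT_API_MANUAL_INIT) and that a CUDA device is available:

#include "cuda_allocator.h"

// Hypothetical usage sketch: allocate and free device memory through the
// OrtAllocator members that CUDAAllocator wires up in its constructor.
void AllocatorDemo() {
  CUDAAllocator cuda_alloc(/*device_id*/ 0);  // CUDAAllocator is-a OrtAllocator
  OrtAllocator* base = &cuda_alloc;

  void* p = base->Alloc(base, 1024);  // dispatches to CUDAAllocator::Alloc
  // ... use p as device memory ...
  base->Free(base, p);                // dispatches to CUDAAllocator::Free
}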

plugin_execution_providers/tensorrt/tensorrt_execution_provider.cc

Lines changed: 8 additions & 676 deletions
Large diffs are not rendered by default.

plugin_execution_providers/tensorrt/tensorrt_execution_provider.h

Lines changed: 5 additions & 7 deletions

@@ -152,6 +152,9 @@ class OutputAllocator : public nvinfer1::IOutputAllocator {
 
 using ShapeRangesMap = std::unordered_map<std::string, std::unordered_map<size_t, std::vector<std::vector<int64_t>>>>;
 
+template <typename T>
+using IAllocatorUniquePtr = std::unique_ptr<T, std::function<void(T*)>>;
+
 struct TensorrtComputeState {
   std::string fused_node_name;
   nvinfer1::IBuilder* builder;
@@ -168,14 +171,14 @@
   bool int8_calibration_cache_available = false;
   bool dla_enable = false;
   int dla_core = 0;
-  size_t* max_workspace_size_ptr = nullptr;
   std::string trt_node_name_with_precision;
   bool engine_cache_enable = false;
   std::string engine_cache_path;
   nvinfer1::IRuntime* runtime = nullptr;
   std::vector<nvinfer1::IOptimizationProfile*> profiles;
   bool context_memory_sharing_enable = false;
   size_t* max_context_mem_size_ptr = nullptr;
+  IAllocatorUniquePtr<void>* context_memory = nullptr;
   std::unordered_map<std::string, float> dynamic_range_map;
   bool engine_decryption_enable = false;
   int (*engine_decryption)(const char*, char*, size_t*) = nullptr;
@@ -215,11 +218,6 @@ static const std::string k_cc_hw_compatible = "80+";
 static const std::string k_ep_ctx_hardware_architecture = "hardware_architecture";
 static const std::string k_ep_ctx_onnx_model_filename = "onnx_model_filename";
 
-struct ApiPtrs {
-  const OrtApi& ort_api;
-  const OrtEpApi& ep_api;
-};
-
 /// <summary>
 ///
 /// Plugin TensorRT EP OrtNodeComputeInfo that represents the computation function for a compiled OrtGraph.
@@ -346,7 +344,7 @@ struct TensorrtExecutionProvider : OrtEp, ApiPtrs {
   bool context_memory_sharing_enable_ = false;
   bool layer_norm_fp32_fallback_ = false;
   size_t max_ctx_mem_size_ = 0;
-  // IAllocatorUniquePtr<void> context_memory_ = nullptr;
+  IAllocatorUniquePtr<void> context_memory_ = nullptr;
   mutable char model_path_[4096] = {};  // Reserved for max path length
   bool engine_decryption_enable_ = false;
   int (*engine_decryption_)(const char*, char*, size_t*) = nullptr;
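The new IAllocatorUniquePtr alias pairs a raw allocation with a type-erased deleter. As an illustration only (the MakeUniquePtrFromOrtAllocator helper below is hypothetical, not something this commit defines), the alias is typically populated by binding the deleter to the allocator that produced the memory, e.g. for the shared context memory:

// Hypothetical helper, shown only to illustrate the IAllocatorUniquePtr alias.
template <typename T>
IAllocatorUniquePtr<T> MakeUniquePtrFromOrtAllocator(OrtAllocator* allocator, size_t bytes) {
  void* p = allocator->Alloc(allocator, bytes);
  // The deleter captures the allocator so the memory is returned to it on reset/destruction.
  return IAllocatorUniquePtr<T>(static_cast<T*>(p),
                                [allocator](T* ptr) { allocator->Free(allocator, ptr); });
}

// e.g. (sketch) growing the shared execution-context memory:
// context_memory_ = MakeUniquePtrFromOrtAllocator<void>(cuda_allocator, *max_context_mem_size_ptr);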
plugin_execution_providers/tensorrt/tensorrt_execution_provider_data_transfer.cc

Lines changed: 99 additions & 0 deletions

@@ -0,0 +1,99 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "tensorrt_execution_provider_data_transfer.h"
+
+#include <cassert>
+#include <gsl/span>
+
+void CUDA_RETURN_IF_ERROR(cudaError_t res);
+
+/*static*/
+bool ORT_API_CALL TRTEpDataTransfer::CanCopyImpl(void* this_ptr,
+                                                 const OrtMemoryDevice* src_memory_device,
+                                                 const OrtMemoryDevice* dst_memory_device) noexcept {
+  auto& impl = *static_cast<TRTEpDataTransfer*>(this_ptr);
+  bool src_is_our_device = impl.ep_api.MemoryDevice_AreEqual(src_memory_device, impl.device_mem_info);
+  bool dst_is_our_device = impl.ep_api.MemoryDevice_AreEqual(dst_memory_device, impl.device_mem_info);
+
+  return src_is_our_device || dst_is_our_device;
+}
+
+// Function to copy one or more tensors.
+// The implementation can optionally use an async copy if a stream is available for the input.
+/*static*/
+OrtStatus* ORT_API_CALL TRTEpDataTransfer::CopyTensorsImpl(void* this_ptr,
+                                                           const OrtValue** src_tensors_ptr,
+                                                           OrtValue** dst_tensors_ptr,
+                                                           OrtSyncStream** streams_ptr,
+                                                           size_t num_tensors) noexcept {
+  auto& impl = *static_cast<TRTEpDataTransfer*>(this_ptr);
+
+  auto src_tensors = gsl::make_span<const OrtValue*>(src_tensors_ptr, num_tensors);
+  auto dst_tensors = gsl::make_span<OrtValue*>(dst_tensors_ptr, num_tensors);
+  auto streams = gsl::make_span<OrtSyncStream*>(streams_ptr, num_tensors);
+
+  for (size_t i = 0; i < num_tensors; ++i) {
+    // NOTE: Stream support will be a separate PR. Ignore the streams_ptr values for now.
+
+    const OrtMemoryDevice* src_device = nullptr;
+    const OrtMemoryDevice* dst_device = nullptr;
+    RETURN_IF_ERROR(impl.ep_api.Value_GetMemoryDevice(src_tensors[i], &src_device));
+    RETURN_IF_ERROR(impl.ep_api.Value_GetMemoryDevice(dst_tensors[i], &dst_device));
+
+    OrtMemoryInfoDeviceType src_device_type = impl.ep_api.MemoryDevice_GetDeviceType(src_device);
+    OrtMemoryInfoDeviceType dst_device_type = impl.ep_api.MemoryDevice_GetDeviceType(dst_device);
+    OrtDeviceMemoryType src_mem_type = impl.ep_api.MemoryDevice_GetMemoryType(src_device);
+    OrtDeviceMemoryType dst_mem_type = impl.ep_api.MemoryDevice_GetMemoryType(dst_device);
+    bool copy_involves_pinned_memory = src_mem_type == OrtDeviceMemoryType_HOST_ACCESSIBLE ||
+                                       dst_mem_type == OrtDeviceMemoryType_HOST_ACCESSIBLE;
+
+    const void* src_data = nullptr;
+    void* dst_data = nullptr;
+    RETURN_IF_ERROR(impl.ort_api.GetTensorData(src_tensors[i], &src_data));
+    RETURN_IF_ERROR(impl.ort_api.GetTensorMutableData(dst_tensors[i], &dst_data));
+
+    size_t bytes = 0;
+    RETURN_IF_ERROR(impl.ort_api.GetTensorSizeInBytes(src_tensors[i], &bytes));
+
+    // For the sync version of memcpy, launch to the CUDA default stream.
+    if (dst_device_type == OrtMemoryInfoDeviceType_GPU) {
+      if (src_device_type == OrtMemoryInfoDeviceType_GPU) {
+        // GPU -> GPU
+        // Copy only if the two addresses are different and bytes > 0.
+        if (dst_data != src_data && bytes > 0) {
+          CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyDeviceToDevice));
+          // For device memory to device memory copy, no host-side synchronization is performed by cudaMemcpy.
+          // see https://docs.nvidia.com/cuda/cuda-runtime-api/api-sync-behavior.html
+          CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(nullptr));
+        }
+      } else {
+        // CPU -> GPU, this is blocking
+        CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyHostToDevice));
+        if (src_mem_type != OrtDeviceMemoryType_HOST_ACCESSIBLE) {
+          // For cudaMemcpy from pageable host memory to device memory, DMA to the final destination may not have completed.
+          // see https://docs.nvidia.com/cuda/cuda-runtime-api/api-sync-behavior.html
+          CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(nullptr));
+        }
+      }
+    } else if (src_device_type == OrtMemoryInfoDeviceType_GPU) {
+      // GPU -> CPU, this is blocking
+      CUDA_RETURN_IF_ERROR(cudaMemcpy(dst_data, src_data, bytes, cudaMemcpyDeviceToHost));
+    } else {
+      // CPU -> CPU involves a copy to/from pinned memory and a synchronize may be required first
+      // ORT_ENFORCE(dst_data != src_data);
+      memcpy(dst_data, src_data, bytes);
+    }
+  }
+
+  return nullptr;
+}
+
+/*static*/
+void ORT_API_CALL TRTEpDataTransfer::ReleaseImpl(void* this_ptr) noexcept {
+  // In our setup the factory owns a shared ExampleDataTransfer instance, so it will do the cleanup and we ignore
+  // the call to Release from the plugin_ep::DataTransfer dtor (see /onnxruntime/core/framework/plugin_data_transfer.h).
+  //
+  // If you create a new instance on each call to OrtEpFactory::CreateDataTransfer, you call `delete` here.
+  delete static_cast<TRTEpDataTransfer*>(this_ptr);
+}
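The loop above deliberately ignores streams_ptr (stream support is deferred to a separate PR). Purely as a sketch of where that work would slot in, and assuming a hypothetical helper that exposes the underlying cudaStream_t of an OrtSyncStream, the async path could look roughly like:

// Hypothetical: GetNativeCudaStream is not an API defined in this commit.
cudaStream_t stream = GetNativeCudaStream(streams[i]);
if (stream != nullptr) {
  // Enqueue on the caller-provided stream instead of the default stream.
  CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst_data, src_data, bytes, cudaMemcpyDefault, stream));
  // Synchronization then becomes the responsibility of the caller / the ORT sync-stream machinery.
} else {
  // Fall back to the synchronous copies shown above.
}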
plugin_execution_providers/tensorrt/tensorrt_execution_provider_data_transfer.h

Lines changed: 30 additions & 0 deletions

@@ -0,0 +1,30 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "tensorrt_execution_provider_utils.h"
+
+struct TRTEpDataTransfer : OrtDataTransferImpl, ApiPtrs {
+  TRTEpDataTransfer(ApiPtrs api_ptrs, const OrtMemoryDevice* device_mem_info_,
+                    const OrtMemoryDevice* shared_mem_info_ = nullptr)
+      : ApiPtrs(api_ptrs), device_mem_info{device_mem_info_}, shared_mem_info{shared_mem_info_} {
+    CanCopy = CanCopyImpl;
+    CopyTensors = CopyTensorsImpl;
+    Release = ReleaseImpl;
+  }
+
+  static bool ORT_API_CALL CanCopyImpl(void* this_ptr, const OrtMemoryDevice* src_memory_device,
+                                       const OrtMemoryDevice* dst_memory_device) noexcept;
+
+  // Function to copy one or more tensors.
+  // The implementation can optionally use an async copy if a stream is available for the input.
+  static OrtStatus* ORT_API_CALL CopyTensorsImpl(void* this_ptr, const OrtValue** src_tensors_ptr,
+                                                 OrtValue** dst_tensors_ptr, OrtSyncStream** streams_ptr,
+                                                 size_t num_tensors) noexcept;
+  static void ORT_API_CALL ReleaseImpl(void* this_ptr) noexcept;
+
+ private:
+  const OrtMemoryDevice* device_mem_info;
+  const OrtMemoryDevice* shared_mem_info;
+};
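For context (not part of this diff): the constructor wires CanCopy, CopyTensors and Release to the static implementations, so an EP factory can hand the object straight out as an OrtDataTransferImpl*. A minimal sketch, assuming the factory already holds ApiPtrs and the device's OrtMemoryDevice* (names below are placeholders):

// Hypothetical sketch of the factory side.
OrtStatus* CreateTrtDataTransfer(ApiPtrs apis,
                                 const OrtMemoryDevice* device_mem_info,
                                 OrtDataTransferImpl** out) {
  *out = new TRTEpDataTransfer(apis, device_mem_info);  // released later via ReleaseImpl -> delete
  return nullptr;  // success
}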

plugin_execution_providers/tensorrt/tensorrt_execution_provider_info.cc

Lines changed: 0 additions & 2 deletions

@@ -7,7 +7,6 @@
 #include "provider_options_utils.h"
 #include "cuda/cuda_common.h"
 
-namespace onnxruntime {
 namespace tensorrt {
 namespace provider_option_names {
 constexpr const char* kDeviceId = "device_id";
@@ -336,4 +335,3 @@ TensorrtExecutionProviderInfo TensorrtExecutionProviderInfo::FromProviderOptions
 //  trt_provider_options_v2.trt_ep_context_file_path = copy_string_if_needed(internal_options.ep_context_file_path);
 //  trt_provider_options_v2.trt_engine_hw_compatible = internal_options.engine_hw_compatible;
 //}
-}  // namespace onnxruntime

plugin_execution_providers/tensorrt/tensorrt_execution_provider_info.h

Lines changed: 1 addition & 3 deletions

@@ -9,7 +9,6 @@
 
 #define TRT_DEFAULT_OPTIMIZER_LEVEL 3
 
-namespace onnxruntime {
 // Information needed to construct trt execution providers.
 struct TensorrtExecutionProviderInfo {
   int device_id{0};
@@ -55,11 +54,10 @@ struct TensorrtExecutionProviderInfo {
   std::string engine_cache_prefix{""};
   bool engine_hw_compatible{false};
 
-  static TensorrtExecutionProviderInfo FromProviderOptions(const ProviderOptions& options);
+  static TensorrtExecutionProviderInfo FromProviderOptions(const onnxruntime::ProviderOptions& options);
   // static ProviderOptions ToProviderOptions(const TensorrtExecutionProviderInfo& info);
   // static ProviderOptions ToProviderOptions(const OrtTensorRTProviderOptionsV2& info);
   // static void UpdateProviderOptions(void* provider_options, const ProviderOptions& options, bool string_copy);
   //
   // std::vector<OrtCustomOpDomain*> custom_op_domain_list;
 };
-}  // namespace onnxruntime

plugin_execution_providers/tensorrt/tensorrt_execution_provider_utils.h

Lines changed: 35 additions & 9 deletions

@@ -1,20 +1,45 @@
+#define ORT_API_MANUAL_INIT
+#include "onnxruntime_cxx_api.h"
+#undef ORT_API_MANUAL_INIT
+
+#include "flatbuffers/idl.h"
+#include "ort_trt_int8_cal_table.fbs.h"
+// #include "core/providers/cuda/cuda_pch.h"
+// #include "core/common/path_string.h"
+// #include "core/framework/murmurhash3.h"
+
+#include "nv_includes.h"
+
 #include <fstream>
 #include <unordered_map>
 #include <string>
 #include <vector>
 #include <sstream>
 #include <iostream>
 #include <filesystem>
-#include "flatbuffers/idl.h"
-#include "ort_trt_int8_cal_table.fbs.h"
-#include <NvInferVersion.h>
-//#include "core/providers/cuda/cuda_pch.h"
-//#include "core/common/path_string.h"
-//#include "core/framework/murmurhash3.h"
 
-namespace fs = std::filesystem;
+#define RETURN_IF_ERROR(fn)        \
+  do {                             \
+    OrtStatus* _status = (fn);     \
+    if (_status != nullptr) {      \
+      return _status;              \
+    }                              \
+  } while (0)
+
+#define RETURN_IF(cond, ort_api, msg)                      \
+  do {                                                     \
+    if ((cond)) {                                          \
+      return (ort_api).CreateStatus(ORT_EP_FAIL, (msg));   \
+    }                                                      \
+  } while (0)
+
+struct ApiPtrs {
+  const OrtApi& ort_api;
+  const OrtEpApi& ep_api;
+  const OrtModelEditorApi& model_editor_api;
+};
 
-//namespace onnxruntime {
+namespace fs = std::filesystem;
 
 // Check if cycle exists in the graph after partitioning
 /*
@@ -143,6 +168,7 @@ std::vector<std::string> SplitToStringVec(std::string const& s, char separator)
   return splitted;
 }
 
+/*
 nvinfer1::TacticSources GetTacticSourceFromString(std::string& tactic_string) {
   nvinfer1::TacticSources disabledTactics = 0;
   nvinfer1::TacticSources enabledTactics = 0;
@@ -197,6 +223,7 @@ nvinfer1::TacticSources GetTacticSourceFromString(std::string& tactic_string) {
   }
   return enabledTactics & ~disabledTactics;
 }
+*/
 
 inline std::vector<char> loadTimingCacheFile(const std::string inFileName) {
   std::ifstream iFile(inFileName, std::ios::in | std::ios::binary);
@@ -968,4 +995,3 @@ std::string GetCacheSuffix(const std::string& fused_node_name, const std::string
   }
   return "";
 }
-//}  // namespace onnxruntime
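The two status macros added at the top of this header are used throughout the EP sources. A short usage sketch (the ReadTensorData function below is illustrative, not from the commit):

// RETURN_IF turns a predicate into an ORT_EP_FAIL status; RETURN_IF_ERROR propagates
// a non-null OrtStatus* from a nested OrtApi call.
OrtStatus* ReadTensorData(const ApiPtrs& apis, const OrtValue* value, const void** data) {
  RETURN_IF(value == nullptr, apis.ort_api, "value must not be null");
  RETURN_IF_ERROR(apis.ort_api.GetTensorData(value, data));
  return nullptr;  // success
}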
