Skip to content

Commit ccf20da

Browse files
committed
update and sync with latest ep c api
1 parent da0f9c6 commit ccf20da

File tree

4 files changed

+37
-35
lines changed

plugin_execution_providers/tensorrt/tensorrt_execution_provider_data_transfer.cc

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,10 @@
99
void CUDA_RETURN_IF_ERROR(cudaError_t res);
1010

1111
/*static*/
12-
bool ORT_API_CALL TRTEpDataTransfer::CanCopyImpl(void* this_ptr, const OrtMemoryDevice* src_memory_device,
12+
bool ORT_API_CALL TRTEpDataTransfer::CanCopyImpl(const OrtDataTransferImpl* this_ptr,
13+
const OrtMemoryDevice* src_memory_device,
1314
const OrtMemoryDevice* dst_memory_device) noexcept {
14-
auto& impl = *static_cast<TRTEpDataTransfer*>(this_ptr);
15+
auto& impl = *static_cast<const TRTEpDataTransfer*>(this_ptr);
1516

1617
auto it = std::find_if(impl.cuda_gpu_mem_devices_.begin(), impl.cuda_gpu_mem_devices_.end(),
1718
[&impl, &src_memory_device, &dst_memory_device](const OrtMemoryDevice* memory_device) {
@@ -29,7 +30,7 @@ bool ORT_API_CALL TRTEpDataTransfer::CanCopyImpl(void* this_ptr, const OrtMemory
2930
// function to copy one or more tensors.
3031
// implementation can optionally use async copy if a stream is available for the input.
3132
/*static*/
32-
OrtStatus* ORT_API_CALL TRTEpDataTransfer::CopyTensorsImpl(void* this_ptr,
33+
OrtStatus* ORT_API_CALL TRTEpDataTransfer::CopyTensorsImpl(OrtDataTransferImpl* this_ptr,
3334
const OrtValue** src_tensors_ptr,
3435
OrtValue** dst_tensors_ptr,
3536
OrtSyncStream** streams_ptr,
@@ -97,10 +98,10 @@ OrtStatus* ORT_API_CALL TRTEpDataTransfer::CopyTensorsImpl(void* this_ptr,
9798
}
9899

99100
/*static*/
100-
void ORT_API_CALL TRTEpDataTransfer::ReleaseImpl(void* this_ptr) noexcept {
101+
void ORT_API_CALL TRTEpDataTransfer::ReleaseImpl(OrtDataTransferImpl* this_ptr) noexcept {
101102
// In our setup the factory owns a shared ExampleDataTransfer instance so it will do the cleanup, and we ignore
102103
// the call to Release from the plugin_ep::DataTransfer dtor (see /onnxruntime/core/framework/plugin_data_transfer.h)
103104
//
104105
// If you create a new instance on each call to OrtEpFactory::CreateDataTransfer you call `delete` here
105-
delete static_cast<TRTEpDataTransfer*>(this_ptr);
106+
delete this_ptr;
106107
}

plugin_execution_providers/tensorrt/tensorrt_execution_provider_data_transfer.h

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,27 +4,28 @@
44
#pragma once
55

66
#include "ep_utils.h"
7+
#include "onnxruntime_c_api.h"
78

89
struct TRTEpDataTransfer : OrtDataTransferImpl, ApiPtrs {
9-
TRTEpDataTransfer(ApiPtrs api_ptrs, std::vector<const OrtMemoryDevice*> device_mem_infos,
10-
std::vector<const OrtMemoryDevice*> shared_mem_infos)
10+
TRTEpDataTransfer(ApiPtrs api_ptrs, std::vector<const OrtMemoryDevice*>& device_mem_infos,
11+
std::vector<const OrtMemoryDevice*>& shared_mem_infos)
1112
: ApiPtrs(api_ptrs), cuda_gpu_mem_devices_{device_mem_infos}, cuda_pinned_mem_devices_{shared_mem_infos} {
1213
CanCopy = CanCopyImpl;
1314
CopyTensors = CopyTensorsImpl;
1415
Release = ReleaseImpl;
1516
}
1617

17-
static bool ORT_API_CALL CanCopyImpl(void* this_ptr, const OrtMemoryDevice* src_memory_device,
18+
static bool ORT_API_CALL CanCopyImpl(const OrtDataTransferImpl* this_ptr, const OrtMemoryDevice* src_memory_device,
1819
const OrtMemoryDevice* dst_memory_device) noexcept;
1920

2021
// function to copy one or more tensors.
2122
// implementation can optionally use async copy if a stream is available for the input.
22-
static OrtStatus* ORT_API_CALL CopyTensorsImpl(void* this_ptr, const OrtValue** src_tensors_ptr,
23+
static OrtStatus* ORT_API_CALL CopyTensorsImpl(OrtDataTransferImpl* this_ptr, const OrtValue** src_tensors_ptr,
2324
OrtValue** dst_tensors_ptr, OrtSyncStream** streams_ptr,
2425
size_t num_tensors) noexcept;
25-
static void ORT_API_CALL ReleaseImpl(void* this_ptr) noexcept;
26+
static void ORT_API_CALL ReleaseImpl(OrtDataTransferImpl* this_ptr) noexcept;
2627

2728
private:
28-
std::vector<const OrtMemoryDevice*> cuda_gpu_mem_devices_;
29-
std::vector<const OrtMemoryDevice*> cuda_pinned_mem_devices_;
29+
std::vector<const OrtMemoryDevice*>& cuda_gpu_mem_devices_;
30+
std::vector<const OrtMemoryDevice*>& cuda_pinned_mem_devices_;
3031
};

plugin_execution_providers/tensorrt/tensorrt_provider_factory.cc

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ TensorrtExecutionProviderFactory::TensorrtExecutionProviderFactory(const char* e
2727
ReleaseAllocator = ReleaseAllocatorImpl;
2828

2929
CreateDataTransfer = CreateDataTransferImpl;
30+
31+
IsStreamAware = IsStreamAwareImpl;
3032
}
3133

3234
const char* ORT_API_CALL TensorrtExecutionProviderFactory::GetNameImpl(const OrtEpFactory* this_ptr) noexcept {
@@ -80,24 +82,19 @@ OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::GetSupportedDevicesImp
8082
size_t& num_ep_devices = *p_num_ep_devices;
8183
auto* factory = static_cast<TensorrtExecutionProviderFactory*>(this_ptr);
8284

85+
// Create two memory infos per device.
86+
// The memory info is required to create allocator and gpu data transfer.
8387
int num_cuda_devices = 0;
8488
cudaGetDeviceCount(&num_cuda_devices);
8589
RETURN_IF_ERROR(factory->CreateMemoryInfoForDevices(num_cuda_devices));
8690

87-
std::vector<const OrtMemoryDevice*> cuda_gpu_mem_devices;
88-
std::vector<const OrtMemoryDevice*> cuda_pinned_mem_devices;
8991
int32_t device_id = 0;
9092

9193
for (size_t i = 0; i < num_devices && num_ep_devices < max_ep_devices; ++i) {
9294
// C API
9395
const OrtHardwareDevice& device = *devices[i];
94-
if (factory->ort_api.HardwareDevice_Type(&device) == OrtHardwareDeviceType::OrtHardwareDeviceType_GPU) {
95-
96-
// workaround for duplicate devices when using remote desktop.
97-
if (device_id > 0) {
98-
continue;
99-
}
10096

97+
if (factory->ort_api.HardwareDevice_Type(&device) == OrtHardwareDeviceType::OrtHardwareDeviceType_GPU) {
10198
// These can be returned as nullptr if you have nothing to add.
10299
OrtKeyValuePairs* ep_metadata = nullptr;
103100
OrtKeyValuePairs* ep_options = nullptr;
@@ -129,8 +126,8 @@ OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::GetSupportedDevicesImp
129126
RETURN_IF_ERROR(factory->ep_api.EpDevice_AddAllocatorInfo(ep_device, cuda_pinned_mem_info));
130127

131128
// Get memory device from memory info for gpu data transfer
132-
cuda_gpu_mem_devices.push_back(factory->ep_api.MemoryInfo_GetMemoryDevice(cuda_gpu_mem_info));
133-
cuda_pinned_mem_devices.push_back(factory->ep_api.MemoryInfo_GetMemoryDevice(cuda_pinned_mem_info));
129+
factory->cuda_gpu_mem_devices.push_back(factory->ep_api.MemoryInfo_GetMemoryDevice(cuda_gpu_mem_info));
130+
factory->cuda_pinned_mem_devices.push_back(factory->ep_api.MemoryInfo_GetMemoryDevice(cuda_pinned_mem_info));
134131

135132
ep_devices[num_ep_devices++] = ep_device;
136133
++device_id;
@@ -152,10 +149,12 @@ OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::GetSupportedDevicesImp
152149

153150
// Create gpu data transfer
154151
auto data_transfer_impl = std::make_unique<TRTEpDataTransfer>(static_cast<const ApiPtrs&>(*factory),
155-
cuda_gpu_mem_devices, // device memory
156-
cuda_pinned_mem_devices // shared memory
152+
factory->cuda_gpu_mem_devices, // device memory
153+
factory->cuda_pinned_mem_devices // shared memory
157154
);
158-
factory->SetGPUDataTransfer(std::move(data_transfer_impl));
155+
156+
factory->data_transfer_impl = std::move(data_transfer_impl);
157+
159158
return nullptr;
160159
}
161160

@@ -244,13 +243,13 @@ OrtStatus* ORT_API_CALL TensorrtExecutionProviderFactory::CreateDataTransferImpl
244243
OrtEpFactory* this_ptr,
245244
OrtDataTransferImpl** data_transfer) noexcept {
246245
auto& factory = *static_cast<TensorrtExecutionProviderFactory*>(this_ptr);
247-
*data_transfer = factory.data_transfer_impl_.get();
246+
*data_transfer = factory.data_transfer_impl.get();
248247

249248
return nullptr;
250249
}
251250

252-
void TensorrtExecutionProviderFactory::SetGPUDataTransfer(std::unique_ptr<TRTEpDataTransfer> gpu_data_transfer) {
253-
data_transfer_impl_ = std::move(gpu_data_transfer);
251+
bool ORT_API_CALL TensorrtExecutionProviderFactory::IsStreamAwareImpl(const OrtEpFactory* /*this_ptr*/) noexcept {
252+
return false;
254253
}
255254

256255
// To make symbols visible on macOS/iOS
@@ -265,6 +264,7 @@ extern "C" {
265264
// Public symbols
266265
//
267266
EXPORT_SYMBOL OrtStatus* CreateEpFactories(const char* registration_name, const OrtApiBase* ort_api_base,
267+
const OrtLogger*,
268268
OrtEpFactory** factories, size_t max_factories, size_t* num_factories) {
269269
const OrtApi* ort_api = ort_api_base->GetApi(ORT_API_VERSION);
270270
const OrtEpApi* ort_ep_api = ort_api->GetEpApi();
@@ -285,7 +285,7 @@ EXPORT_SYMBOL OrtStatus* CreateEpFactories(const char* registration_name, const
285285
}
286286

287287
EXPORT_SYMBOL OrtStatus* ReleaseEpFactory(OrtEpFactory* factory) {
288-
delete factory;
288+
delete static_cast<TensorrtExecutionProviderFactory*>(factory);
289289
return nullptr;
290290
}
291291

plugin_execution_providers/tensorrt/tensorrt_provider_factory.h

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,6 @@ struct TensorrtExecutionProviderFactory : public OrtEpFactory, public ApiPtrs {
1212
public:
1313
TensorrtExecutionProviderFactory(const char* ep_name, ApiPtrs apis);
1414

15-
const OrtMemoryInfo* GetDefaultGpuMemInfoForDeviceId(uint32_t device_id) const;
16-
17-
const OrtMemoryInfo* GetHostAccessibleMemInfoForDeviceId(uint32_t device_id) const;
18-
1915
OrtStatus* CreateMemoryInfoForDevices(int num_devices);
2016

2117
// CUDA gpu memory and CUDA pinned memory are required for allocator and data transfer, these are the OrtMemoryInfo
@@ -25,6 +21,10 @@ struct TensorrtExecutionProviderFactory : public OrtEpFactory, public ApiPtrs {
2521
std::vector<MemoryInfoUniquePtr> cuda_pinned_memory_infos;
2622
std::unordered_map<uint32_t, const OrtMemoryInfo*> device_id_to_cuda_gpu_memory_info_map; // device id -> OrtMemoryInfo
2723

24+
std::vector<const OrtMemoryDevice*> cuda_gpu_mem_devices;
25+
std::vector<const OrtMemoryDevice*> cuda_pinned_mem_devices;
26+
std::unique_ptr<TRTEpDataTransfer> data_transfer_impl; // data transfer implementation for this factory
27+
2828
private:
2929
static const char* ORT_API_CALL GetNameImpl(const OrtEpFactory* this_ptr) noexcept;
3030

@@ -53,11 +53,11 @@ struct TensorrtExecutionProviderFactory : public OrtEpFactory, public ApiPtrs {
5353
static OrtStatus* ORT_API_CALL CreateDataTransferImpl(OrtEpFactory* this_ptr,
5454
OrtDataTransferImpl** data_transfer) noexcept;
5555

56+
static bool ORT_API_CALL IsStreamAwareImpl(const OrtEpFactory* /*this_ptr*/) noexcept;
57+
5658
void SetGPUDataTransfer(std::unique_ptr<TRTEpDataTransfer> gpu_data_transfer);
5759

5860
const std::string ep_name_; // EP name
5961
const std::string vendor_{"Nvidia"}; // EP vendor name
6062
const std::string ep_version_{"0.1.0"}; // EP version
61-
62-
std::unique_ptr<TRTEpDataTransfer> data_transfer_impl_; // data transfer implementation for this factory
6363
};

0 commit comments

Comments (0)