NvTensorRtRtx dependency on CUDA device name removed (microsoft#1485)

BLSharda · web-flow · commit b8904ac2864c · 2025-05-28T21:31:27.000-07:00
diff --git a/src/cuda/interface.cpp b/src/cuda/interface.cpp
@@ -9,6 +9,10 @@
 #include "kernels.h"
 #include <cstdarg>
 
+#if defined(_WIN32) || defined(_WIN64)  // Windows builds only
+#define strcasecmp _stricmp  // the strcasecmp call in GetInterface() resolves to _stricmp on these builds
+#endif  // defined(_WIN32) || defined(_WIN64)
+
 namespace Generators {
 
 GenaiInterface* gp_genai{};
@@ -68,16 +72,14 @@ struct GpuMemory final : DeviceBuffer {
   bool owned_;  // If we own the memory, we delete it on destruction
 };
 
-struct CudaInterfaceImpl final : DeviceInterface {
-  CudaInterfaceImpl() {
+struct CudaInterfaceImplBase : DeviceInterface {
+  CudaInterfaceImplBase() {
     g_stream.Create();
   }
 
-  ~CudaInterfaceImpl() {
+  ~CudaInterfaceImplBase() {
   }
 
-  DeviceType GetType() const override { return DeviceType::CUDA; }
-
   void InitOrt(const OrtApi& api, Ort::Allocator& allocator) override {
     Ort::api = &api;
     assert(!ort_allocator_);
@@ -164,6 +166,14 @@ struct CudaInterfaceImpl final : DeviceInterface {
   }
 };
 
+struct CudaInterfaceImpl final : CudaInterfaceImplBase {  // CudaInterfaceImplBase reported as the CUDA device type
+  DeviceType GetType() const override { return DeviceType::CUDA; }  // the only member this struct adds
+};
+
+struct NvTensorRtRtxInterfaceImpl final : CudaInterfaceImplBase {  // same base as CudaInterfaceImpl; differs only in GetType()
+  DeviceType GetType() const override { return DeviceType::NvTensorRtRtx; }  // the only member this struct adds
+};
+
 std::unique_ptr<DeviceInterface> g_cuda_device;
 
 DeviceInterface& GetCudaDeviceInterface() { return *g_cuda_device; }
@@ -205,9 +215,13 @@ void operator delete(void* p, size_t /*size*/) noexcept { Generators::gp_genai->
 #endif
 
 extern "C" {
-Generators::DeviceInterface* GetInterface(GenaiInterface* p_genai) {
+Generators::DeviceInterface* GetInterface(GenaiInterface* p_genai, const char* deviceType) {  // deviceType: device name, compared case-insensitively; may be null
   Generators::gp_genai = p_genai;
-  Generators::g_cuda_device = std::make_unique<Generators::CudaInterfaceImpl>();
+  if (deviceType != nullptr && strcasecmp(deviceType, "NvTensorRtRtx") == 0) {  // null guard: strcasecmp must not receive a null pointer
+    Generators::g_cuda_device = std::make_unique<Generators::NvTensorRtRtxInterfaceImpl>();  // NOTE(review): each call replaces g_cuda_device, so a pointer returned by an earlier call no longer refers to a live object
+  } else {  // null or any other name selects the plain CUDA implementation
+    Generators::g_cuda_device = std::make_unique<Generators::CudaInterfaceImpl>();
+  }
   return Generators::g_cuda_device.get();
 }
 }
diff --git a/src/generators.cpp b/src/generators.cpp
@@ -178,7 +178,8 @@ struct LibraryHandle {
 };
 #endif
 
-DeviceInterface* GetCudaInterface() {
+DeviceInterface* GetCudaInterface(DeviceType type) {
+  assert(type == DeviceType::NvTensorRtRtx || type == DeviceType::CUDA);
   try {
 #if defined(_WIN32)
     static LibraryHandle library{"onnxruntime-genai-cuda.dll"};
@@ -190,8 +191,10 @@ DeviceInterface* GetCudaInterface() {
     if (!library)
       throw std::runtime_error("Shared library load failure (see first error)");
 
-    Generators::DeviceInterface* GetInterface(GenaiInterface * p_genai);
-    static DeviceInterface* cuda_interface = reinterpret_cast<decltype(&GetInterface)>(library.GetSymbol("GetInterface"))(&g_genai);
+    Generators::DeviceInterface* GetInterface(GenaiInterface * p_genai, const char* deviceType);
+    static DeviceInterface* cuda_interface =
+        reinterpret_cast<decltype(&GetInterface)>(
+            library.GetSymbol("GetInterface"))(&g_genai, to_string(type).c_str());
 
     return cuda_interface;
   } catch (const std::exception& e) {
@@ -213,6 +216,8 @@ std::string to_string(DeviceType device_type) {
       return "QnnWithSharedMemory";
     case DeviceType::OpenVINO:
       return "OpenVINO";
+    case DeviceType::NvTensorRtRtx:
+      return "NvTensorRtRtx";
     default:
       throw std::runtime_error("Unknown device type");
   }
@@ -224,7 +229,8 @@ DeviceInterface* GetDeviceInterface(DeviceType type) {
     case DeviceType::CPU:
       return GetCpuInterface();
     case DeviceType::CUDA:
-      return GetCudaInterface();
+    case DeviceType::NvTensorRtRtx:
+      return GetCudaInterface(type);
 #if USE_DML
     case DeviceType::DML:
       return GetDmlInterface();
diff --git a/src/models/model.cpp b/src/models/model.cpp
@@ -483,14 +483,10 @@ DeviceInterface* SetProviderSessionOptions(OrtSessionOptions& session_options,
         session_options.AddConfigEntry("session.inter_op.allow_spinning", "0");
         session_options.AddConfigEntry("session.intra_op.allow_spinning", "0");
       } else if (provider_options.name == "NvTensorRtRtx") {
-        // After setting the NvTensorRtRtx provider in Onnxruntime, GenAI will then treat it as the cuda device.
-        session_options.AddConfigEntry("ep.nvtensorrtrtxexecutionprovider.nv_cuda_graph_enable", "1");
-
         if (IsMultiProfileEnabled(config.model.decoder.session_options)) {
           ConfigureMultiProfile(config, session_options);
         }
-
-        p_device = GetDeviceInterface(DeviceType::CUDA);
+        p_device = GetDeviceInterface(DeviceType::NvTensorRtRtx);
       }
 
       std::vector<const char*> keys, values;
@@ -536,7 +532,7 @@ void EnsureDeviceOrtInit(DeviceInterface& device, const Config& config) {
   // This ensures memory allocated on-device for model inputs/outputs is valid for the lifetime of GenAI.
 
   // Names for the device types used by 'SetProviderSessionOptions'
-  static const char* device_type_names[] = {"CPU (Not used, see above)", "cuda", "DML", "WebGPU", "QNN", "OpenVINO (Not used, see above)"};
+  static const char* device_type_names[] = {"CPU (Not used, see above)", "cuda", "DML", "WebGPU", "QNN", "OpenVINO (Not used, see above)", "NvTensorRtRtx"};
   static_assert(std::size(device_type_names) == static_cast<size_t>(DeviceType::MAX));
 
   // Create an OrtSessionOptions and set the options to use the DeviceType we're using here
@@ -555,7 +551,7 @@ void EnsureDeviceOrtInit(DeviceInterface& device, const Config& config) {
   allocator.session_ = OrtSession::Create(GetOrtEnv(), g_trivial_model, sizeof(g_trivial_model), session_options.get());
 
   // Names for the device memory types used by 'OrtMemoryInfo::Create'
-  static const char* device_memory_type_names[] = {"CPU (Not used, see above)", "Cuda", "DML", "WebGPU_Buffer", "QnnHtpShared", "OpenVINO (Not used, see above)"};
+  static const char* device_memory_type_names[] = {"CPU (Not used, see above)", "Cuda", "DML", "WebGPU_Buffer", "QnnHtpShared", "OpenVINO (Not used, see above)", "Cuda"};
   static_assert(std::size(device_memory_type_names) == static_cast<size_t>(DeviceType::MAX));
 
   // Get the allocator from the OrtSession for the DeviceType (it's called 'AllocatorCreate' but it's really 'AllocatorGet')
diff --git a/src/smartptrs.h b/src/smartptrs.h
@@ -91,6 +91,7 @@ enum struct DeviceType {
   WEBGPU,
   QNN,
   OpenVINO,
+  NvTensorRtRtx,
   MAX
 };