Commit 7adf3b7

NvTensorRTRTx: Enable CUDA graph via config and fix attention_mask shape handling (#1594)
- Add an option to enable CUDA graph for the NvTensorRTRTx EP through the provider config.
- Fix handling of attention_mask shapes when `enable_cuda_graph` is false for NvTensorRTRTx:
  - When `past_present_share_buffer` (in-place KV cache) is enabled, NvTensorRTRTx expects an attention_mask of shape `[b, max_seq_len]` with masking applied. Previously, this shape was only sent when both `past_present_share_buffer` and graph capture were enabled. This PR ensures the correct shape is passed to TRT for the in-place KV cache, aligning with expected behavior.

@baijumeswani for review
1 parent 070a034 commit 7adf3b7
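
For reviewers trying the change out, here is a minimal sketch of a call site (assumed, not part of this commit) that drives the new option through the helpers declared in `src/config.h`. Only the strings `"NvTensorRtRtx"`, `"enable_cuda_graph"`, and `"1"` are taken from the commit; obtaining `config` is assumed to happen elsewhere:

```cpp
// Hypothetical call site, assuming a Generators::Config `config` loaded
// elsewhere. The provider and option strings are exactly what
// IsGraphCaptureEnabled() matches on after this commit.
Generators::SetProviderOption(config, "NvTensorRtRtx", "enable_cuda_graph", "1");

// Reports true only when the option above is set to "1"; previously the
// NvTensorRtRtx branch returned true unconditionally.
bool graph_capture =
    Generators::IsGraphCaptureEnabled(config.model.decoder.session_options);
```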

File tree

5 files changed: +31 −11 lines changed

src/config.cpp

Lines changed: 7 additions & 2 deletions
```diff
@@ -828,7 +828,7 @@ void SetProviderOption(Config& config, std::string_view provider_name, std::stri
   JSON::Parse(element, json.str());
 }
 
-bool IsGraphCaptureEnabled(Config::SessionOptions& session_options) {
+bool IsGraphCaptureEnabled(const Config::SessionOptions& session_options) {
   for (const auto& provider : session_options.providers) {
     const auto provider_options = std::find_if(session_options.provider_options.begin(),
                                                session_options.provider_options.end(),
@@ -846,7 +846,12 @@ bool IsGraphCaptureEnabled(Config::SessionOptions& session_options) {
       } else if (provider_options->name == "DML") {
        return true;
      } else if (provider_options->name == "NvTensorRtRtx") {
-        return true;
+        for (const auto& value : provider_options->options) {
+          if (value.first == "enable_cuda_graph" && value.second == "1") {
+            return true;
+          }
+        }
+        return false;
      }
    }
  }
```

src/config.h

Lines changed: 1 addition & 1 deletion
```diff
@@ -276,7 +276,7 @@ void SetSearchBool(Config::Search& search, std::string_view name, bool value);
 void ClearProviders(Config& config);
 void SetProviderOption(Config& config, std::string_view provider_name, std::string_view option_name, std::string_view option_value);
 void OverlayConfig(Config& config, std::string_view json);
-bool IsGraphCaptureEnabled(Config::SessionOptions& session_options);
+bool IsGraphCaptureEnabled(const Config::SessionOptions& session_options);
 bool IsMultiProfileEnabled(const Config::SessionOptions& session_options);
 
 }  // namespace Generators
```

src/models/model.cpp

Lines changed: 3 additions & 0 deletions
```diff
@@ -541,6 +541,9 @@ DeviceInterface* SetProviderSessionOptions(OrtSessionOptions& session_options,
     if (IsMultiProfileEnabled(config.model.decoder.session_options)) {
       ConfigureMultiProfile(config, session_options);
     }
+    if (IsGraphCaptureEnabled(config.model.decoder.session_options)) {
+      session_options.AddConfigEntry("ep.nvtensorrtrtxexecutionprovider.nv_cuda_graph_enable", "1");
+    }
     p_device = GetDeviceInterface(DeviceType::NvTensorRtRtx);
   }
```

src/models/position_inputs.cpp

Lines changed: 14 additions & 8 deletions
```diff
@@ -141,7 +141,7 @@ void DefaultPositionInputs::UpdatePositionIDs(int total_length, int new_kv_lengt
 }
 
 void DefaultPositionInputs::CreateNextAttentionMaskTensor(int total_length) {
-  if (state_.params_->use_graph_capture)
+  if (ShouldUseStaticMaskHandling())
     return;
   attention_mask_shape_[1] = total_length;
   attention_mask_next_->CreateTensor(attention_mask_shape_);
@@ -154,26 +154,26 @@ void DefaultPositionInputs::UpdateAttentionMask(int total_length, int new_kv_len
   CreateNextAttentionMaskTensor(total_length);
 
   // Update the attention mask on the device. If it fails, copy to CPU, update there, and copy back to device.
-  if (!model_.p_device_inputs_->UpdateAttentionMask(state_.params_->use_graph_capture ? nullptr : attention_mask_next_->GetMutableRawData(),
+  if (!model_.p_device_inputs_->UpdateAttentionMask(ShouldUseStaticMaskHandling() ? nullptr : attention_mask_next_->GetMutableRawData(),
                                                     attention_mask_->GetMutableRawData(),
                                                     static_cast<int>(attention_mask_shape_[0]),
                                                     new_kv_length,
                                                     total_length,
                                                     state_.params_->search.max_length,
-                                                    state_.params_->use_graph_capture,
+                                                    ShouldUseStaticMaskHandling(),
                                                     type_)) {
     // auto* attention_mask_next_span = state_.params_->use_graph_capture ? &attention_mask_next_->GetByteSpan() : nullptr;
     DeviceSpan<uint8_t> attention_mask_next_span;
-    if (!state_.params_->use_graph_capture)
+    if (!ShouldUseStaticMaskHandling())
       attention_mask_next_span = attention_mask_next_->GetByteSpan();
     auto attention_mask_span = attention_mask_->GetByteSpan();
-    GetDeviceInterface(DeviceType::CPU)->UpdateAttentionMask(state_.params_->use_graph_capture ? nullptr : attention_mask_next_span.CopyDeviceToCpu().data(), attention_mask_span.CopyDeviceToCpu().data(), static_cast<int>(attention_mask_shape_[0]), new_kv_length, total_length, state_.params_->search.max_length, state_.params_->use_graph_capture, type_);
-    if (!state_.params_->use_graph_capture)
+    GetDeviceInterface(DeviceType::CPU)->UpdateAttentionMask(ShouldUseStaticMaskHandling() ? nullptr : attention_mask_next_span.CopyDeviceToCpu().data(), attention_mask_span.CopyDeviceToCpu().data(), static_cast<int>(attention_mask_shape_[0]), new_kv_length, total_length, state_.params_->search.max_length, ShouldUseStaticMaskHandling(), type_);
+    if (!ShouldUseStaticMaskHandling())
       attention_mask_next_span.CopyCpuToDevice();
     attention_mask_span.CopyCpuToDevice();
   }
 
-  if (!state_.params_->use_graph_capture) {
+  if (!ShouldUseStaticMaskHandling()) {
     attention_mask_->ort_tensor_ = std::move(attention_mask_next_->ort_tensor_);
     state_.inputs_[mask_input_index_] = attention_mask_->GetOrtTensor();
   }
@@ -256,7 +256,7 @@ void DefaultPositionInputs::CreateAndInitializeAttentionMask(DeviceSpan<int32_t>
     }
   }
 
-  if (state_.params_->use_graph_capture) {
+  if (ShouldUseStaticMaskHandling()) {
     InitializeStaticMask<T>(*attention_mask);
   } else {
     attention_mask = model_.ExpandInputs(attention_mask, state_.params_->search.num_beams);
@@ -291,6 +291,12 @@ void DefaultPositionInputs::RewindMask(size_t index) {
   }
 }
 
+bool DefaultPositionInputs::ShouldUseStaticMaskHandling() const {
+  return state_.params_->use_graph_capture ||
+         (state_.params_->search.past_present_share_buffer &&
+          model_.p_device_->GetType() == DeviceType::NvTensorRtRtx);
+}
+
 // TODO: SlidingWindow does not support graph capture
 WindowedPositionInputs::WindowedPositionInputs(State& state)
     : state_{state} {
```
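
The new `ShouldUseStaticMaskHandling()` predicate extends static mask handling from the graph-capture case to the NvTensorRtRtx in-place KV cache case. As a standalone sketch of what "static" means here (an illustrative helper, not repository code), the mask keeps the fixed `[b, max_seq_len]` shape from the commit message and only its contents advance as tokens are generated:

```cpp
#include <cstdint>
#include <vector>

// Standalone illustration of the static attention_mask layout described in
// the commit message: a fixed [batch, max_seq_len] buffer where positions
// below total_length are valid (1) and the rest stay masked out (0).
std::vector<int32_t> MakeStaticMask(int batch, int max_seq_len, int total_length) {
  std::vector<int32_t> mask(static_cast<size_t>(batch) * max_seq_len, 0);
  for (int b = 0; b < batch; ++b) {
    for (int i = 0; i < total_length; ++i) {
      mask[static_cast<size_t>(b) * max_seq_len + i] = 1;  // valid positions
    }
    // Positions [total_length, max_seq_len) remain 0: not-yet-used slots.
  }
  return mask;
}

// e.g. batch=1, max_seq_len=8, total_length=3 -> 1 1 1 0 0 0 0 0
```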

src/models/position_inputs.h

Lines changed: 6 additions & 0 deletions
```diff
@@ -38,6 +38,12 @@ struct DefaultPositionInputs : PositionInputs {
 
   void RewindMask(size_t index);
 
+  // This returns true when either:
+  // 1. Graph capture is enabled, OR
+  // 2. Past-present buffer sharing is enabled AND the device is NvTensorRtRtx
+  // Both scenarios require static mask allocation and special shape handling for optimization
+  bool ShouldUseStaticMaskHandling() const;
+
   const Model& model_;
   State& state_;
   std::string attention_mask_name_;
```
