From 6f6c44e07bd0eecc401ce14eccf64234681fd440 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Fri, 23 Jan 2026 01:14:41 +0000 Subject: [PATCH 01/50] pass TRT graph state up and down call stack and cache it in RFDetrObjDetTRT class --- .../inference_models/models/common/trt.py | 9 +++++ .../rfdetr/rfdetr_object_detection_trt.py | 35 +++++++++++++------ 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/inference_models/inference_models/models/common/trt.py b/inference_models/inference_models/models/common/trt.py index d692f567a9..3b86dfef86 100644 --- a/inference_models/inference_models/models/common/trt.py +++ b/inference_models/inference_models/models/common/trt.py @@ -1,4 +1,5 @@ from typing import List, Tuple +from dataclasses import dataclass import torch @@ -57,6 +58,14 @@ def log(self, severity: trt.ILogger.Severity, msg: str) -> None: def get_memory(self) -> List[Tuple[trt.ILogger.Severity, str]]: return self._memory +import pycuda.driver as cuda +@dataclass +class TRTCudaGraphState: + cuda_graph: cuda.GraphExec + cuda_stream: torch.cuda.Stream + input_pointer: int + output_pointers: List[int] + def get_trt_engine_inputs_and_outputs( engine: trt.ICudaEngine, diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py index b3833fc4f8..a250bed25f 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py @@ -36,6 +36,7 @@ get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, + TRTCudaGraphState, ) from inference_models.models.rfdetr.class_remapping import ( ClassesReMapping, @@ -197,19 +198,33 @@ def pre_process( ) def forward( - self, pre_processed_images: torch.Tensor, **kwargs + self, pre_processed_images: torch.Tensor, use_cuda_graph: bool = False, **kwargs ) -> Tuple[torch.Tensor, 
torch.Tensor]: with self._lock: with use_cuda_context(context=self._cuda_context): - detections, labels = infer_from_trt_engine( - pre_processed_images=pre_processed_images, - trt_config=self._trt_config, - engine=self._engine, - context=self._execution_context, - device=self._device, - input_name=self._input_name, - outputs=self._output_names, - ) + if use_cuda_graph: + detections, labels, trt_cuda_graph_state = infer_from_trt_engine( + pre_processed_images=pre_processed_images, + trt_config=self._trt_config, + engine=self._engine, + context=self._execution_context, + device=self._device, + input_name=self._input_name, + outputs=self._output_names, + use_cuda_graph=True, + ) + self._trt_cuda_graph_state = trt_cuda_graph_state + else: + detections, labels = infer_from_trt_engine( + pre_processed_images=pre_processed_images, + trt_config=self._trt_config, + engine=self._engine, + context=self._execution_context, + device=self._device, + input_name=self._input_name, + outputs=self._output_names, + use_cuda_graph=False, + ) return detections, labels def post_process( From 549ca10bc2789424de5671bd0b2e0f24a04e1aa1 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Fri, 23 Jan 2026 01:34:53 +0000 Subject: [PATCH 02/50] actually passing it up and down the stack --- .../inference_models/models/common/trt.py | 118 ++++++++++++++++-- .../rfdetr/rfdetr_object_detection_trt.py | 4 +- 2 files changed, 112 insertions(+), 10 deletions(-) diff --git a/inference_models/inference_models/models/common/trt.py b/inference_models/inference_models/models/common/trt.py index 3b86dfef86..fead8056af 100644 --- a/inference_models/inference_models/models/common/trt.py +++ b/inference_models/inference_models/models/common/trt.py @@ -1,4 +1,4 @@ -from typing import List, Tuple +from typing import List, Tuple, Optional from dataclasses import dataclass import torch @@ -64,8 +64,12 @@ class TRTCudaGraphState: cuda_graph: cuda.GraphExec cuda_stream: torch.cuda.Stream input_pointer: int + 
input_shape: Tuple[int, ...] output_pointers: List[int] + output_shapes: List[Tuple[int, ...]] + def has_changed_shape(self, input_shape: Tuple[int, ...], output_shapes: List[Tuple[int, ...]]) -> bool: + return self.input_shape != input_shape or self.output_shapes != output_shapes def get_trt_engine_inputs_and_outputs( engine: trt.ICudaEngine, @@ -137,7 +141,9 @@ def infer_from_trt_engine( device: torch.device, input_name: str, outputs: List[str], -) -> List[torch.Tensor]: + use_cuda_graph: bool = False, + trt_cuda_graph_state: Optional[TRTCudaGraphState] = None, +) -> Tuple[List[torch.Tensor], List[torch.Tensor], TRTCudaGraphState]: """Run inference using a TensorRT engine. Executes inference on preprocessed images using a TensorRT engine and execution @@ -239,6 +245,8 @@ def infer_from_trt_engine( outputs=outputs, min_batch_size=trt_config.static_batch_size, max_batch_size=trt_config.static_batch_size, + use_cuda_graph=use_cuda_graph, + trt_cuda_graph_state=trt_cuda_graph_state, ) return infer_from_trt_engine_with_batch_size_boundaries( pre_processed_images=pre_processed_images, @@ -249,6 +257,8 @@ def infer_from_trt_engine( outputs=outputs, min_batch_size=trt_config.dynamic_batch_size_min, max_batch_size=trt_config.dynamic_batch_size_max, + use_cuda_graph=use_cuda_graph, + trt_cuda_graph_state=trt_cuda_graph_state, ) @@ -261,7 +271,9 @@ def infer_from_trt_engine_with_batch_size_boundaries( outputs: List[str], min_batch_size: int, max_batch_size: int, -) -> List[torch.Tensor]: + use_cuda_graph: bool = False, + trt_cuda_graph_state: Optional[TRTCudaGraphState] = None, +) -> Tuple[List[torch.Tensor], TRTCudaGraphState]: if pre_processed_images.shape[0] <= max_batch_size: reminder = min_batch_size - pre_processed_images.shape[0] if reminder > 0: @@ -276,17 +288,19 @@ def infer_from_trt_engine_with_batch_size_boundaries( ), dim=0, ) - results = execute_trt_engine( + results, trt_cuda_graph_state = execute_trt_engine( pre_processed_images=pre_processed_images, 
engine=engine, context=context, device=device, input_name=input_name, outputs=outputs, + use_cuda_graph=use_cuda_graph, + trt_cuda_graph_state=trt_cuda_graph_state, ) if reminder > 0: results = [r[:-reminder] for r in results] - return results + return results, trt_cuda_graph_state all_results = [] for _ in outputs: all_results.append([]) @@ -305,19 +319,21 @@ def infer_from_trt_engine_with_batch_size_boundaries( ), dim=0, ) - results = execute_trt_engine( + results, trt_cuda_graph_state = execute_trt_engine( pre_processed_images=batch, engine=engine, context=context, device=device, input_name=input_name, outputs=outputs, + use_cuda_graph=use_cuda_graph, + trt_cuda_graph_state=trt_cuda_graph_state, ) if reminder > 0: results = [r[:-reminder] for r in results] for partial_result, all_result_element in zip(results, all_results): all_result_element.append(partial_result) - return [torch.cat(e, dim=0).contiguous() for e in all_results] + return [torch.cat(e, dim=0).contiguous() for e in all_results], trt_cuda_graph_state def execute_trt_engine( @@ -327,7 +343,91 @@ def execute_trt_engine( device: torch.device, input_name: str, outputs: List[str], -) -> List[torch.Tensor]: + use_cuda_graph: bool = False, + trt_cuda_graph_state: Optional[TRTCudaGraphState] = None, +) -> Tuple[List[torch.Tensor], Optional[TRTCudaGraphState]]: + + if use_cuda_graph: + + batch_size = pre_processed_images.shape[0] + results = [] + for output in outputs: + output_tensor_shape = engine.get_tensor_shape(output) + output_tensor_type = trt_dtype_to_torch(engine.get_tensor_dtype(output)) + result = torch.empty( + (batch_size,) + output_tensor_shape[1:], + dtype=output_tensor_type, + device=device, + ) + context.set_tensor_address(output, result.data_ptr()) + results.append(result) + status = context.set_input_shape(input_name, tuple(pre_processed_images.shape)) + if not status: + raise ModelRuntimeError( + message="Failed to set TRT model input shape during forward pass from the model.", + 
help_url="https://todo", + ) + status = context.set_tensor_address(input_name, pre_processed_images.data_ptr()) + if not status: + raise ModelRuntimeError( + message="Failed to set input tensor data pointer during forward pass from the model.", + help_url="https://todo", + ) + stream = torch.cuda.Stream(device=device) + status = context.execute_async_v3(stream_handle=stream.cuda_stream) + if not status: + raise ModelRuntimeError( + message="Failed to complete inference from TRT model", + help_url="https://todo", + ) + stream.synchronize() + return results, None + + else: + + batch_size = pre_processed_images.shape[0] + results = [] + for output in outputs: + output_tensor_shape = engine.get_tensor_shape(output) + output_tensor_type = trt_dtype_to_torch(engine.get_tensor_dtype(output)) + result = torch.empty( + (batch_size,) + output_tensor_shape[1:], + dtype=output_tensor_type, + device=device, + ) + context.set_tensor_address(output, result.data_ptr()) + results.append(result) + status = context.set_input_shape(input_name, tuple(pre_processed_images.shape)) + if not status: + raise ModelRuntimeError( + message="Failed to set TRT model input shape during forward pass from the model.", + help_url="https://todo", + ) + status = context.set_tensor_address(input_name, pre_processed_images.data_ptr()) + if not status: + raise ModelRuntimeError( + message="Failed to set input tensor data pointer during forward pass from the model.", + help_url="https://todo", + ) + stream = torch.cuda.Stream(device=device) + status = context.execute_async_v3(stream_handle=stream.cuda_stream) + if not status: + raise ModelRuntimeError( + message="Failed to complete inference from TRT model", + help_url="https://todo", + ) + stream.synchronize() + return results, None + + +def execute_trt_engine_with_cuda_graph( + pre_processed_images: torch.Tensor, + engine: trt.ICudaEngine, + context: trt.IExecutionContext, + device: torch.device, + input_name: str, + outputs: List[str], +) -> 
Tuple[List[torch.Tensor], TRTCudaGraphState]: batch_size = pre_processed_images.shape[0] results = [] for output in outputs: @@ -360,7 +460,7 @@ def execute_trt_engine( help_url="https://todo", ) stream.synchronize() - return results + return results, None def trt_dtype_to_torch(trt_dtype): diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py index a250bed25f..469ea0fd3c 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py @@ -177,6 +177,7 @@ def __init__( self._cuda_context = cuda_context self._execution_context = execution_context self._trt_config = trt_config + self._trt_cuda_graph_state = None self._lock = threading.Lock() @property @@ -212,10 +213,11 @@ def forward( input_name=self._input_name, outputs=self._output_names, use_cuda_graph=True, + trt_cuda_graph_state=self._trt_cuda_graph_state, ) self._trt_cuda_graph_state = trt_cuda_graph_state else: - detections, labels = infer_from_trt_engine( + detections, labels, _ = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, From 6412efef3c55b22b743498cce77da494a63d464d Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Fri, 23 Jan 2026 02:14:29 +0000 Subject: [PATCH 03/50] three-branch solution --- .../profile_rfdetr_trt_cudagraphs.py | 70 +++++++++ .../inference_models/models/common/trt.py | 135 ++++++++++-------- .../rfdetr/rfdetr_object_detection_trt.py | 4 +- 3 files changed, 150 insertions(+), 59 deletions(-) create mode 100644 inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py diff --git a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py new file mode 100644 index 0000000000..89fce9a0da 
--- /dev/null +++ b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py @@ -0,0 +1,70 @@ +import os +import time + +import cv2 +import torch +from tqdm import tqdm + +from inference_models import AutoModel + +IMAGE_PATH_WARMUP = "/home/mkaic/inference/tests/inference/unit_tests/core/utils/assets/1.jpg" +# IMAGE_PATH_PROFILING = IMAGE_PATH_WARMUP +IMAGE_PATH_PROFILING = "/home/mkaic/inference/tests/workflows/integration_tests/execution/assets/car.jpg" +DEVICE = os.environ.get("DEVICE", "cuda:0") +CYCLES = 500 +WARMUP = 50 + + +def main() -> None: + image = cv2.imread(IMAGE_PATH_WARMUP) + model = AutoModel.from_pretrained( + model_id_or_path="rfdetr-nano", device=torch.device(DEVICE), backend="trt" + ) + + image_warmup = cv2.imread(IMAGE_PATH_WARMUP) + pre_processed_warmup, metadata = model.pre_process(image_warmup) + print(f"Pre-processed image shape: {pre_processed_warmup.shape}") + + print(f"Warming up ({WARMUP} iterations each)...") + for _ in range(WARMUP): + model.forward(pre_processed_warmup, use_cuda_graph=False) + model.forward(pre_processed_warmup, use_cuda_graph=True) + # torch.cuda.synchronize() + + print(f"Profiling ({CYCLES} iterations each)...") + image_profiling = cv2.imread(IMAGE_PATH_PROFILING) + pre_processed_profiling, metadata = model.pre_process(image_profiling) + print(f"Pre-processed image shape: {pre_processed_profiling.shape}") + + start = time.perf_counter() + for _ in tqdm(range(CYCLES), desc="Without CUDA graphs"): + model.forward(pre_processed_profiling, use_cuda_graph=False) + # torch.cuda.synchronize() + baseline_fps = CYCLES / (time.perf_counter() - start) + + start = time.perf_counter() + for _ in tqdm(range(CYCLES), desc="With CUDA graphs"): + model.forward(pre_processed_profiling, use_cuda_graph=True) + # torch.cuda.synchronize() + cudagraph_fps = CYCLES / (time.perf_counter() - start) + + result_baseline = model.forward(pre_processed_profiling, use_cuda_graph=False) + result_cudagraph = 
model.forward(pre_processed_profiling, use_cuda_graph=True) + # torch.cuda.synchronize() + + print(f"Result baseline: {result_baseline}") + print(f"Result cudagraph: {result_cudagraph}") + + dets_match = torch.allclose(result_baseline[0], result_cudagraph[0], atol=1e-4) + labels_match = torch.allclose(result_baseline[1], result_cudagraph[1], atol=1e-4) + + print(f"\n{'='*50}") + print(f"Forward pass FPS (no CUDA graphs): {baseline_fps:.1f}") + print(f"Forward pass FPS (CUDA graphs): {cudagraph_fps:.1f}") + print(f"Speedup: {cudagraph_fps / baseline_fps:.2f}x") + print(f"Outputs match: dets={dets_match}, labels={labels_match}") + print(f"{'='*50}") + + +if __name__ == "__main__": + main() diff --git a/inference_models/inference_models/models/common/trt.py b/inference_models/inference_models/models/common/trt.py index fead8056af..96924da147 100644 --- a/inference_models/inference_models/models/common/trt.py +++ b/inference_models/inference_models/models/common/trt.py @@ -58,18 +58,17 @@ def log(self, severity: trt.ILogger.Severity, msg: str) -> None: def get_memory(self) -> List[Tuple[trt.ILogger.Severity, str]]: return self._memory -import pycuda.driver as cuda + @dataclass class TRTCudaGraphState: - cuda_graph: cuda.GraphExec + cuda_graph: torch.cuda.CUDAGraph cuda_stream: torch.cuda.Stream - input_pointer: int - input_shape: Tuple[int, ...] 
- output_pointers: List[int] - output_shapes: List[Tuple[int, ...]] + input_buffer: torch.Tensor + output_buffers: List[torch.Tensor] + + def has_changed_shape(self, input_shape: Tuple[int, ...]) -> bool: + return tuple(self.input_buffer.shape) != input_shape - def has_changed_shape(self, input_shape: Tuple[int, ...], output_shapes: List[Tuple[int, ...]]) -> bool: - return self.input_shape != input_shape or self.output_shapes != output_shapes def get_trt_engine_inputs_and_outputs( engine: trt.ICudaEngine, @@ -143,7 +142,7 @@ def infer_from_trt_engine( outputs: List[str], use_cuda_graph: bool = False, trt_cuda_graph_state: Optional[TRTCudaGraphState] = None, -) -> Tuple[List[torch.Tensor], List[torch.Tensor], TRTCudaGraphState]: +) -> Tuple[List[torch.Tensor], TRTCudaGraphState]: """Run inference using a TensorRT engine. Executes inference on preprocessed images using a TensorRT engine and execution @@ -347,44 +346,40 @@ def execute_trt_engine( trt_cuda_graph_state: Optional[TRTCudaGraphState] = None, ) -> Tuple[List[torch.Tensor], Optional[TRTCudaGraphState]]: - if use_cuda_graph: - - batch_size = pre_processed_images.shape[0] - results = [] - for output in outputs: - output_tensor_shape = engine.get_tensor_shape(output) - output_tensor_type = trt_dtype_to_torch(engine.get_tensor_dtype(output)) - result = torch.empty( - (batch_size,) + output_tensor_shape[1:], - dtype=output_tensor_type, - device=device, - ) - context.set_tensor_address(output, result.data_ptr()) - results.append(result) - status = context.set_input_shape(input_name, tuple(pre_processed_images.shape)) - if not status: - raise ModelRuntimeError( - message="Failed to set TRT model input shape during forward pass from the model.", - help_url="https://todo", + if trt_cuda_graph_state is not None: + input_shape = tuple(pre_processed_images.shape) + if trt_cuda_graph_state.has_changed_shape(input_shape): + LOGGER.warning( + f"Input shape changed from {tuple(trt_cuda_graph_state.input_buffer.shape)} " + 
f"to {input_shape}. Recapturing CUDA graph." ) - status = context.set_tensor_address(input_name, pre_processed_images.data_ptr()) - if not status: - raise ModelRuntimeError( - message="Failed to set input tensor data pointer during forward pass from the model.", - help_url="https://todo", - ) - stream = torch.cuda.Stream(device=device) - status = context.execute_async_v3(stream_handle=stream.cuda_stream) - if not status: - raise ModelRuntimeError( - message="Failed to complete inference from TRT model", - help_url="https://todo", + return _capture_cuda_graph( + pre_processed_images=pre_processed_images, + engine=engine, + context=context, + device=device, + input_name=input_name, + outputs=outputs, ) + + stream = trt_cuda_graph_state.cuda_stream + trt_cuda_graph_state.input_buffer.copy_(pre_processed_images) + trt_cuda_graph_state.cuda_graph.replay() stream.synchronize() - return results, None + results = [buf.clone() for buf in trt_cuda_graph_state.output_buffers] + return results, trt_cuda_graph_state - else: + elif use_cuda_graph: + return _capture_cuda_graph( + pre_processed_images=pre_processed_images, + engine=engine, + context=context, + device=device, + input_name=input_name, + outputs=outputs, + ) + else: batch_size = pre_processed_images.shape[0] results = [] for output in outputs: @@ -420,7 +415,7 @@ def execute_trt_engine( return results, None -def execute_trt_engine_with_cuda_graph( +def _capture_cuda_graph( pre_processed_images: torch.Tensor, engine: trt.ICudaEngine, context: trt.IExecutionContext, @@ -429,38 +424,64 @@ def execute_trt_engine_with_cuda_graph( outputs: List[str], ) -> Tuple[List[torch.Tensor], TRTCudaGraphState]: batch_size = pre_processed_images.shape[0] - results = [] + + input_buffer = torch.empty_like(pre_processed_images, device=device) + input_buffer.copy_(pre_processed_images) + + output_buffers = [] for output in outputs: output_tensor_shape = engine.get_tensor_shape(output) output_tensor_type = 
trt_dtype_to_torch(engine.get_tensor_dtype(output)) - result = torch.empty( + output_buffer = torch.empty( (batch_size,) + output_tensor_shape[1:], dtype=output_tensor_type, device=device, ) - context.set_tensor_address(output, result.data_ptr()) - results.append(result) + context.set_tensor_address(output, output_buffer.data_ptr()) + output_buffers.append(output_buffer) + status = context.set_input_shape(input_name, tuple(pre_processed_images.shape)) if not status: raise ModelRuntimeError( - message="Failed to set TRT model input shape during forward pass from the model.", + message="Failed to set TRT model input shape during CUDA graph capture.", help_url="https://todo", ) - status = context.set_tensor_address(input_name, pre_processed_images.data_ptr()) + status = context.set_tensor_address(input_name, input_buffer.data_ptr()) if not status: raise ModelRuntimeError( - message="Failed to set input tensor data pointer during forward pass from the model.", + message="Failed to set input tensor data pointer during CUDA graph capture.", help_url="https://todo", ) + stream = torch.cuda.Stream(device=device) - status = context.execute_async_v3(stream_handle=stream.cuda_stream) - if not status: - raise ModelRuntimeError( - message="Failed to complete inference from TRT model", - help_url="https://todo", - ) + with torch.cuda.stream(stream): + status = context.execute_async_v3(stream_handle=stream.cuda_stream) + if not status: + raise ModelRuntimeError( + message="Failed to execute TRT model warmup before CUDA graph capture.", + help_url="https://todo", + ) stream.synchronize() - return results, None + + cuda_graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(cuda_graph, stream=stream): + status = context.execute_async_v3(stream_handle=stream.cuda_stream) + if not status: + raise ModelRuntimeError( + message="Failed to capture CUDA graph from TRT model execution.", + help_url="https://todo", + ) + + + trt_cuda_graph_state = TRTCudaGraphState( + cuda_graph=cuda_graph, 
+ cuda_stream=stream, + input_buffer=input_buffer, + output_buffers=output_buffers, + ) + + results = [buf.clone() for buf in output_buffers] + return results, trt_cuda_graph_state def trt_dtype_to_torch(trt_dtype): diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py index 469ea0fd3c..ec5dc4cd21 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py @@ -204,7 +204,7 @@ def forward( with self._lock: with use_cuda_context(context=self._cuda_context): if use_cuda_graph: - detections, labels, trt_cuda_graph_state = infer_from_trt_engine( + (detections, labels), trt_cuda_graph_state = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -217,7 +217,7 @@ def forward( ) self._trt_cuda_graph_state = trt_cuda_graph_state else: - detections, labels, _ = infer_from_trt_engine( + (detections, labels), _ = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, From 08888e396682c7e24e29b007c1355f670c6b2c5b Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Fri, 23 Jan 2026 04:00:34 +0000 Subject: [PATCH 04/50] avoid breaking things due to change in infer_with_trt_engine API --- .../inference_models/models/common/trt.py | 17 ++++++---- .../rfdetr_instance_segmentation_trt.py | 3 +- .../rfdetr/rfdetr_object_detection_trt.py | 33 +++++++------------ .../yolact_instance_segmentation_trt.py | 2 +- .../yolonas/yolonas_object_detection_trt.py | 2 +- .../yolov5_instance_segmentation_trt.py | 2 +- .../yolov7_instance_segmentation_trt.py | 2 +- .../yolov8_instance_segmentation_trt.py | 2 +- 8 files changed, 28 insertions(+), 35 deletions(-) diff --git a/inference_models/inference_models/models/common/trt.py 
b/inference_models/inference_models/models/common/trt.py index 96924da147..19767e89c0 100644 --- a/inference_models/inference_models/models/common/trt.py +++ b/inference_models/inference_models/models/common/trt.py @@ -142,7 +142,7 @@ def infer_from_trt_engine( outputs: List[str], use_cuda_graph: bool = False, trt_cuda_graph_state: Optional[TRTCudaGraphState] = None, -) -> Tuple[List[torch.Tensor], TRTCudaGraphState]: +) -> Tuple[List[torch.Tensor], Optional[TRTCudaGraphState]]: """Run inference using a TensorRT engine. Executes inference on preprocessed images using a TensorRT engine and execution @@ -272,7 +272,7 @@ def infer_from_trt_engine_with_batch_size_boundaries( max_batch_size: int, use_cuda_graph: bool = False, trt_cuda_graph_state: Optional[TRTCudaGraphState] = None, -) -> Tuple[List[torch.Tensor], TRTCudaGraphState]: +) -> Tuple[List[torch.Tensor], Optional[TRTCudaGraphState]]: if pre_processed_images.shape[0] <= max_batch_size: reminder = min_batch_size - pre_processed_images.shape[0] if reminder > 0: @@ -363,10 +363,12 @@ def execute_trt_engine( ) stream = trt_cuda_graph_state.cuda_stream - trt_cuda_graph_state.input_buffer.copy_(pre_processed_images) - trt_cuda_graph_state.cuda_graph.replay() + with torch.cuda.stream(stream): + trt_cuda_graph_state.input_buffer.copy_(pre_processed_images) + trt_cuda_graph_state.cuda_graph.replay() + results = [buf.clone() for buf in trt_cuda_graph_state.output_buffers] stream.synchronize() - results = [buf.clone() for buf in trt_cuda_graph_state.output_buffers] + return results, trt_cuda_graph_state elif use_cuda_graph: @@ -471,8 +473,10 @@ def _capture_cuda_graph( message="Failed to capture CUDA graph from TRT model execution.", help_url="https://todo", ) + with torch.cuda.stream(stream): + results = [buf.clone() for buf in output_buffers] + stream.synchronize() - trt_cuda_graph_state = TRTCudaGraphState( cuda_graph=cuda_graph, cuda_stream=stream, @@ -480,7 +484,6 @@ def _capture_cuda_graph( 
output_buffers=output_buffers, ) - results = [buf.clone() for buf in output_buffers] return results, trt_cuda_graph_state diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py index c4e9223023..78e2be9d50 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py @@ -33,6 +33,7 @@ get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, + TRTCudaGraphState, ) from inference_models.models.rfdetr.class_remapping import ( ClassesReMapping, @@ -198,7 +199,7 @@ def forward( ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: with self._lock: with use_cuda_context(context=self._cuda_context): - detections, labels, masks = infer_from_trt_engine( + (detections, labels, masks), _ = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py index ec5dc4cd21..b65d03348d 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py @@ -203,30 +203,19 @@ def forward( ) -> Tuple[torch.Tensor, torch.Tensor]: with self._lock: with use_cuda_context(context=self._cuda_context): + (detections, labels), trt_cuda_graph_state = infer_from_trt_engine( + pre_processed_images=pre_processed_images, + trt_config=self._trt_config, + engine=self._engine, + context=self._execution_context, + device=self._device, + input_name=self._input_name, + outputs=self._output_names, + use_cuda_graph=use_cuda_graph, + trt_cuda_graph_state=self._trt_cuda_graph_state if use_cuda_graph else None, + ) 
if use_cuda_graph: - (detections, labels), trt_cuda_graph_state = infer_from_trt_engine( - pre_processed_images=pre_processed_images, - trt_config=self._trt_config, - engine=self._engine, - context=self._execution_context, - device=self._device, - input_name=self._input_name, - outputs=self._output_names, - use_cuda_graph=True, - trt_cuda_graph_state=self._trt_cuda_graph_state, - ) self._trt_cuda_graph_state = trt_cuda_graph_state - else: - (detections, labels), _ = infer_from_trt_engine( - pre_processed_images=pre_processed_images, - trt_config=self._trt_config, - engine=self._engine, - context=self._execution_context, - device=self._device, - input_name=self._input_name, - outputs=self._output_names, - use_cuda_graph=False, - ) return detections, labels def post_process( diff --git a/inference_models/inference_models/models/yolact/yolact_instance_segmentation_trt.py b/inference_models/inference_models/models/yolact/yolact_instance_segmentation_trt.py index cc3cbeedaf..ea6ebe6cf0 100644 --- a/inference_models/inference_models/models/yolact/yolact_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/yolact/yolact_instance_segmentation_trt.py @@ -192,7 +192,7 @@ def forward( all_proto_data, ) = ([], [], [], [], []) for image in pre_processed_images: - loc_data, conf_data, mask_data, prior_data, proto_data = ( + (loc_data, conf_data, mask_data, prior_data, proto_data), _ = ( infer_from_trt_engine( pre_processed_images=image.unsqueeze(0).contiguous(), trt_config=self._trt_config, diff --git a/inference_models/inference_models/models/yolonas/yolonas_object_detection_trt.py b/inference_models/inference_models/models/yolonas/yolonas_object_detection_trt.py index 6f561d58e4..39822ff34b 100644 --- a/inference_models/inference_models/models/yolonas/yolonas_object_detection_trt.py +++ b/inference_models/inference_models/models/yolonas/yolonas_object_detection_trt.py @@ -187,7 +187,7 @@ def pre_process( def forward(self, pre_processed_images: 
torch.Tensor, **kwargs) -> torch.Tensor: with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): - results = infer_from_trt_engine( + results, _ = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, diff --git a/inference_models/inference_models/models/yolov5/yolov5_instance_segmentation_trt.py b/inference_models/inference_models/models/yolov5/yolov5_instance_segmentation_trt.py index ee7180e10d..a18b743b90 100644 --- a/inference_models/inference_models/models/yolov5/yolov5_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/yolov5/yolov5_instance_segmentation_trt.py @@ -187,7 +187,7 @@ def forward( ) -> Tuple[torch.Tensor, torch.Tensor]: with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): - instances, protos = infer_from_trt_engine( + (instances, protos), _ = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, diff --git a/inference_models/inference_models/models/yolov7/yolov7_instance_segmentation_trt.py b/inference_models/inference_models/models/yolov7/yolov7_instance_segmentation_trt.py index 15ef8a13ee..abcc82a78c 100644 --- a/inference_models/inference_models/models/yolov7/yolov7_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/yolov7/yolov7_instance_segmentation_trt.py @@ -185,7 +185,7 @@ def forward( ) -> Tuple[torch.Tensor, torch.Tensor]: with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): - instances, protos = infer_from_trt_engine( + (instances, protos), _ = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, diff --git a/inference_models/inference_models/models/yolov8/yolov8_instance_segmentation_trt.py b/inference_models/inference_models/models/yolov8/yolov8_instance_segmentation_trt.py index 8a32a117ae..f2cf1d7953 100644 --- 
a/inference_models/inference_models/models/yolov8/yolov8_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/yolov8/yolov8_instance_segmentation_trt.py @@ -195,7 +195,7 @@ def forward( ) -> Tuple[torch.Tensor, torch.Tensor]: with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): - instances, protos = infer_from_trt_engine( + (instances, protos), _ = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, From adda4aa825d8d5443dac365357e251b7b6d5776f Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Fri, 23 Jan 2026 04:33:48 +0000 Subject: [PATCH 05/50] update unpacking in the rest of the TRT.py files --- .../profile_rfdetr_trt_cudagraphs.py | 38 +++++++++++++------ .../deep_lab_v3_plus_segmentation_trt.py | 5 ++- .../resnet/resnet_classification_trt.py | 10 +++-- .../models/vit/vit_classification_trt.py | 10 +++-- .../yolov10/yolov10_object_detection_trt.py | 5 ++- .../yolov5/yolov5_object_detection_trt.py | 5 ++- .../yolov8/yolov8_key_points_detection_trt.py | 5 ++- .../yolov8/yolov8_object_detection_trt.py | 5 ++- 8 files changed, 53 insertions(+), 30 deletions(-) diff --git a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py index 89fce9a0da..27218b0c08 100644 --- a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py +++ b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py @@ -11,14 +11,14 @@ # IMAGE_PATH_PROFILING = IMAGE_PATH_WARMUP IMAGE_PATH_PROFILING = "/home/mkaic/inference/tests/workflows/integration_tests/execution/assets/car.jpg" DEVICE = os.environ.get("DEVICE", "cuda:0") -CYCLES = 500 +CYCLES = 10_000 WARMUP = 50 def main() -> None: image = cv2.imread(IMAGE_PATH_WARMUP) model = AutoModel.from_pretrained( - model_id_or_path="rfdetr-nano", device=torch.device(DEVICE), backend="trt" + 
model_id_or_path="rfdetr-seg-preview", device=torch.device(DEVICE), backend="trt" ) image_warmup = cv2.imread(IMAGE_PATH_WARMUP) @@ -48,21 +48,35 @@ def main() -> None: # torch.cuda.synchronize() cudagraph_fps = CYCLES / (time.perf_counter() - start) - result_baseline = model.forward(pre_processed_profiling, use_cuda_graph=False) - result_cudagraph = model.forward(pre_processed_profiling, use_cuda_graph=True) - # torch.cuda.synchronize() - - print(f"Result baseline: {result_baseline}") - print(f"Result cudagraph: {result_cudagraph}") - - dets_match = torch.allclose(result_baseline[0], result_cudagraph[0], atol=1e-4) - labels_match = torch.allclose(result_baseline[1], result_cudagraph[1], atol=1e-4) + expected_warmup = model.forward(pre_processed_warmup, use_cuda_graph=False) + expected_profiling = model.forward(pre_processed_profiling, use_cuda_graph=False) + + print("Testing for race conditions (alternating inputs 20 times)...") + all_match = True + for i in range(20): + if i % 2 == 0: + result = model.forward(pre_processed_warmup, use_cuda_graph=True) + expected = expected_warmup + img_name = "warmup" + else: + result = model.forward(pre_processed_profiling, use_cuda_graph=True) + expected = expected_profiling + img_name = "profiling" + + dets_match = torch.allclose(result[0], expected[0], atol=1e-6) + labels_match = torch.allclose(result[1], expected[1], atol=1e-6) + if not (dets_match and labels_match): + print(f" MISMATCH at iteration {i} ({img_name}): dets={dets_match}, labels={labels_match}") + all_match = False + + if all_match: + print(" All 20 iterations matched expected outputs.") print(f"\n{'='*50}") print(f"Forward pass FPS (no CUDA graphs): {baseline_fps:.1f}") print(f"Forward pass FPS (CUDA graphs): {cudagraph_fps:.1f}") print(f"Speedup: {cudagraph_fps / baseline_fps:.2f}x") - print(f"Outputs match: dets={dets_match}, labels={labels_match}") + print(f"Race condition test: {'PASSED' if all_match else 'FAILED'}") print(f"{'='*50}") diff --git 
a/inference_models/inference_models/models/deep_lab_v3_plus/deep_lab_v3_plus_segmentation_trt.py b/inference_models/inference_models/models/deep_lab_v3_plus/deep_lab_v3_plus_segmentation_trt.py index 9976a86761..050908b1fe 100644 --- a/inference_models/inference_models/models/deep_lab_v3_plus/deep_lab_v3_plus_segmentation_trt.py +++ b/inference_models/inference_models/models/deep_lab_v3_plus/deep_lab_v3_plus_segmentation_trt.py @@ -185,7 +185,7 @@ def forward( ) -> torch.Tensor: with self._lock: with use_cuda_context(context=self._cuda_context): - return infer_from_trt_engine( + results, _ = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -193,7 +193,8 @@ def forward( device=self._device, input_name=self._input_name, outputs=self._output_names, - )[0] + ) + return results[0] def post_process( self, diff --git a/inference_models/inference_models/models/resnet/resnet_classification_trt.py b/inference_models/inference_models/models/resnet/resnet_classification_trt.py index 34de7058e3..e0b2621a55 100644 --- a/inference_models/inference_models/models/resnet/resnet_classification_trt.py +++ b/inference_models/inference_models/models/resnet/resnet_classification_trt.py @@ -185,7 +185,7 @@ def forward( ) -> torch.Tensor: with self._lock: with use_cuda_context(context=self._cuda_context): - return infer_from_trt_engine( + results, _ = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -193,7 +193,8 @@ def forward( device=self._device, input_name=self._input_name, outputs=self._output_names, - )[0] + ) + return results[0] def post_process( self, @@ -335,7 +336,7 @@ def forward( ) -> torch.Tensor: with self._lock: with use_cuda_context(context=self._cuda_context): - return infer_from_trt_engine( + results, _ = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -343,7 
+344,8 @@ def forward( device=self._device, input_name=self._input_name, outputs=self._output_names, - )[0] + ) + return results[0] def post_process( self, diff --git a/inference_models/inference_models/models/vit/vit_classification_trt.py b/inference_models/inference_models/models/vit/vit_classification_trt.py index d04a90607e..3a0892a8c5 100644 --- a/inference_models/inference_models/models/vit/vit_classification_trt.py +++ b/inference_models/inference_models/models/vit/vit_classification_trt.py @@ -183,7 +183,7 @@ def forward( ) -> torch.Tensor: with self._lock: with use_cuda_context(context=self._cuda_context): - return infer_from_trt_engine( + results, _ = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -191,7 +191,8 @@ def forward( device=self._device, input_name=self._input_name, outputs=self._output_names, - )[0] + ) + return results[0] def post_process( self, @@ -331,7 +332,7 @@ def forward( ) -> torch.Tensor: with self._lock: with use_cuda_context(context=self._cuda_context): - return infer_from_trt_engine( + results, _ = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -339,7 +340,8 @@ def forward( device=self._device, input_name=self._input_name, outputs=self._output_names, - )[0] + ) + return results[0] def post_process( self, diff --git a/inference_models/inference_models/models/yolov10/yolov10_object_detection_trt.py b/inference_models/inference_models/models/yolov10/yolov10_object_detection_trt.py index ec2dcf3cdb..ff25e019c2 100644 --- a/inference_models/inference_models/models/yolov10/yolov10_object_detection_trt.py +++ b/inference_models/inference_models/models/yolov10/yolov10_object_detection_trt.py @@ -177,7 +177,7 @@ def pre_process( def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): - return 
infer_from_trt_engine( + results, _ = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -185,7 +185,8 @@ def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: device=self._device, input_name=self._input_name, outputs=self._output_names, - )[0] + ) + return results[0] def post_process( self, diff --git a/inference_models/inference_models/models/yolov5/yolov5_object_detection_trt.py b/inference_models/inference_models/models/yolov5/yolov5_object_detection_trt.py index ff7d376e07..a423033cba 100644 --- a/inference_models/inference_models/models/yolov5/yolov5_object_detection_trt.py +++ b/inference_models/inference_models/models/yolov5/yolov5_object_detection_trt.py @@ -175,7 +175,7 @@ def pre_process( def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): - return infer_from_trt_engine( + results, _ = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -183,7 +183,8 @@ def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: device=self._device, input_name=self._input_name, outputs=self._output_names, - )[0] + ) + return results[0] def post_process( self, diff --git a/inference_models/inference_models/models/yolov8/yolov8_key_points_detection_trt.py b/inference_models/inference_models/models/yolov8/yolov8_key_points_detection_trt.py index 898beebb04..12e8630c1f 100644 --- a/inference_models/inference_models/models/yolov8/yolov8_key_points_detection_trt.py +++ b/inference_models/inference_models/models/yolov8/yolov8_key_points_detection_trt.py @@ -210,7 +210,7 @@ def pre_process( def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): - return infer_from_trt_engine( + 
results, _ = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -218,7 +218,8 @@ def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: device=self._device, input_name=self._input_name, outputs=self._output_names, - )[0] + ) + return results[0] def post_process( self, diff --git a/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py b/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py index aef07c3fad..29b9d7bfe8 100644 --- a/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py +++ b/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py @@ -183,7 +183,7 @@ def pre_process( def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: with self._lock: with use_cuda_context(context=self._cuda_context): - return infer_from_trt_engine( + results, _ = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -191,7 +191,8 @@ def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: device=self._device, input_name=self._input_name, outputs=self._output_names, - )[0] + ) + return results[0] def post_process( self, From 97fdcf0e54d4541e640110e6fbbd024a7296bfc6 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Fri, 23 Jan 2026 04:51:47 +0000 Subject: [PATCH 06/50] clean up profiling script --- .../profile_rfdetr_trt_cudagraphs.py | 92 ++++++++++--------- 1 file changed, 49 insertions(+), 43 deletions(-) diff --git a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py index 27218b0c08..c60ecea6c5 100644 --- a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py +++ b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py @@ -2,81 +2,87 @@ import 
time import cv2 +import numpy as np import torch from tqdm import tqdm from inference_models import AutoModel -IMAGE_PATH_WARMUP = "/home/mkaic/inference/tests/inference/unit_tests/core/utils/assets/1.jpg" -# IMAGE_PATH_PROFILING = IMAGE_PATH_WARMUP -IMAGE_PATH_PROFILING = "/home/mkaic/inference/tests/workflows/integration_tests/execution/assets/car.jpg" +IMAGE_PATH_WARMUP = os.environ.get("IMAGE_PATH_WARMUP", None) +IMAGE_PATH_PROFILING = os.environ.get("IMAGE_PATH_PROFILING", None) DEVICE = os.environ.get("DEVICE", "cuda:0") -CYCLES = 10_000 -WARMUP = 50 +CYCLES = int(os.environ.get("CYCLES", "10_000")) +WARMUP = int(os.environ.get("WARMUP", "50")) def main() -> None: - image = cv2.imread(IMAGE_PATH_WARMUP) + model = AutoModel.from_pretrained( - model_id_or_path="rfdetr-seg-preview", device=torch.device(DEVICE), backend="trt" + model_id_or_path="rfdetr-nano", device=torch.device(DEVICE), backend="trt" ) - image_warmup = cv2.imread(IMAGE_PATH_WARMUP) - pre_processed_warmup, metadata = model.pre_process(image_warmup) - print(f"Pre-processed image shape: {pre_processed_warmup.shape}") + if IMAGE_PATH_WARMUP is not None: + image_warmup = cv2.imread(IMAGE_PATH_WARMUP) + else: + image_warmup = (np.random.rand(224, 224, 3) * 255).astype(np.uint8) + + if IMAGE_PATH_PROFILING is not None: + image_profiling = cv2.imread(IMAGE_PATH_PROFILING) + else: + image_profiling = (np.random.rand(224, 224, 3) * 255).astype(np.uint8) + + pre_processed_warmup, _ = model.pre_process(image_warmup) + pre_processed_profiling, _ = model.pre_process(image_profiling) - print(f"Warming up ({WARMUP} iterations each)...") for _ in range(WARMUP): model.forward(pre_processed_warmup, use_cuda_graph=False) model.forward(pre_processed_warmup, use_cuda_graph=True) - # torch.cuda.synchronize() - print(f"Profiling ({CYCLES} iterations each)...") - image_profiling = cv2.imread(IMAGE_PATH_PROFILING) - pre_processed_profiling, metadata = model.pre_process(image_profiling) - print(f"Pre-processed image 
shape: {pre_processed_profiling.shape}") + expected_output_warmup_image = model.forward( + pre_processed_warmup, use_cuda_graph=False + ) + expected_output_profiling_image = model.forward( + pre_processed_profiling, use_cuda_graph=False + ) + + cudagraph_output_warmup_image = model.forward( + pre_processed_warmup, use_cuda_graph=True + ) + cudagraph_output_profiling_image = model.forward( + pre_processed_profiling, use_cuda_graph=True + ) + + assert torch.allclose( + expected_output_warmup_image[0], cudagraph_output_warmup_image[0], atol=1e-6 + ) + assert torch.allclose( + expected_output_profiling_image[0], + cudagraph_output_profiling_image[0], + atol=1e-6, + ) + assert torch.allclose( + expected_output_warmup_image[1], cudagraph_output_warmup_image[1], atol=1e-6 + ) + assert torch.allclose( + expected_output_profiling_image[1], + cudagraph_output_profiling_image[1], + atol=1e-6, + ) start = time.perf_counter() for _ in tqdm(range(CYCLES), desc="Without CUDA graphs"): model.forward(pre_processed_profiling, use_cuda_graph=False) - # torch.cuda.synchronize() baseline_fps = CYCLES / (time.perf_counter() - start) start = time.perf_counter() for _ in tqdm(range(CYCLES), desc="With CUDA graphs"): model.forward(pre_processed_profiling, use_cuda_graph=True) - # torch.cuda.synchronize() cudagraph_fps = CYCLES / (time.perf_counter() - start) - expected_warmup = model.forward(pre_processed_warmup, use_cuda_graph=False) - expected_profiling = model.forward(pre_processed_profiling, use_cuda_graph=False) - - print("Testing for race conditions (alternating inputs 20 times)...") - all_match = True - for i in range(20): - if i % 2 == 0: - result = model.forward(pre_processed_warmup, use_cuda_graph=True) - expected = expected_warmup - img_name = "warmup" - else: - result = model.forward(pre_processed_profiling, use_cuda_graph=True) - expected = expected_profiling - img_name = "profiling" - - dets_match = torch.allclose(result[0], expected[0], atol=1e-6) - labels_match = 
torch.allclose(result[1], expected[1], atol=1e-6) - if not (dets_match and labels_match): - print(f" MISMATCH at iteration {i} ({img_name}): dets={dets_match}, labels={labels_match}") - all_match = False - - if all_match: - print(" All 20 iterations matched expected outputs.") - print(f"\n{'='*50}") print(f"Forward pass FPS (no CUDA graphs): {baseline_fps:.1f}") print(f"Forward pass FPS (CUDA graphs): {cudagraph_fps:.1f}") print(f"Speedup: {cudagraph_fps / baseline_fps:.2f}x") - print(f"Race condition test: {'PASSED' if all_match else 'FAILED'}") print(f"{'='*50}") From 470addb4816a2b1e938b7922f07ac3f87e8a716a Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Fri, 23 Jan 2026 05:19:53 +0000 Subject: [PATCH 07/50] remove tqdm from profiling script --- .../profile_rfdetr_trt_cudagraphs.py | 74 +++++++------------ 1 file changed, 28 insertions(+), 46 deletions(-) diff --git a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py index c60ecea6c5..5728b6eb4a 100644 --- a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py +++ b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py @@ -4,12 +4,11 @@ import cv2 import numpy as np import torch -from tqdm import tqdm from inference_models import AutoModel -IMAGE_PATH_WARMUP = os.environ.get("IMAGE_PATH_WARMUP", None) -IMAGE_PATH_PROFILING = os.environ.get("IMAGE_PATH_PROFILING", None) +IMAGE_1 = os.environ.get("IMAGE_PATH_WARMUP", None) +IMAGE_2 = os.environ.get("IMAGE_PATH_PROFILING", None) DEVICE = os.environ.get("DEVICE", "cuda:0") CYCLES = int(os.environ.get("CYCLES", "10_000")) WARMUP = int(os.environ.get("WARMUP", "50")) @@ -21,62 +20,45 @@ def main() -> None: model_id_or_path="rfdetr-nano", device=torch.device(DEVICE), backend="trt" ) - if IMAGE_PATH_WARMUP is not None: - image_warmup = cv2.imread(IMAGE_PATH_WARMUP) + if IMAGE_1 is not None: + image_1 = cv2.imread(IMAGE_1) else: - 
image_warmup = (np.random.rand(224, 224, 3) * 255).astype(np.uint8) + image_1 = (np.random.rand(224, 224, 3) * 255).astype(np.uint8) - if IMAGE_PATH_PROFILING is not None: - image_profiling = cv2.imread(IMAGE_PATH_PROFILING) + if IMAGE_2 is not None: + image_2 = cv2.imread(IMAGE_2) else: - image_profiling = (np.random.rand(224, 224, 3) * 255).astype(np.uint8) + image_2 = (np.random.rand(224, 224, 3) * 255).astype(np.uint8) - pre_processed_warmup, _ = model.pre_process(image_warmup) - pre_processed_profiling, _ = model.pre_process(image_profiling) + pre_processed_1, _ = model.pre_process(image_1) + pre_processed_2, _ = model.pre_process(image_2) - for _ in range(WARMUP): - model.forward(pre_processed_warmup, use_cuda_graph=False) - model.forward(pre_processed_warmup, use_cuda_graph=True) - expected_output_warmup_image = model.forward( - pre_processed_warmup, use_cuda_graph=False - ) - expected_output_profiling_image = model.forward( - pre_processed_profiling, use_cuda_graph=False - ) + expected_output_1_no_cuda_graph = model.forward(pre_processed_1, use_cuda_graph=False) + expected_output_2_no_cuda_graph = model.forward(pre_processed_2, use_cuda_graph=False) - cudagraph_output_warmup_image = model.forward( - pre_processed_warmup, use_cuda_graph=True - ) - cudagraph_output_profiling_image = model.forward( - pre_processed_profiling, use_cuda_graph=True - ) + expected_output_1_capture_cuda_graph = model.forward(pre_processed_1, use_cuda_graph=True) + expected_output_2_capture_cudagraph = model.forward(pre_processed_2, use_cuda_graph=True) - assert torch.allclose( - expected_output_warmup_image[0], cudagraph_output_warmup_image[0], atol=1e-6 - ) - assert torch.allclose( - expected_output_profiling_image[0], - cudagraph_output_profiling_image[0], - atol=1e-6, - ) - assert torch.allclose( - expected_output_warmup_image[1], cudagraph_output_warmup_image[1], atol=1e-6 - ) - assert torch.allclose( - expected_output_profiling_image[1], - cudagraph_output_profiling_image[1], - 
atol=1e-6, - ) + expected_output_1_replayed_cudagraph = model.forward(pre_processed_1, use_cuda_graph=True) + expected_output_2_replayed_cudagraph = model.forward(pre_processed_2, use_cuda_graph=True) + + for i in [0, 1]: + assert torch.allclose(expected_output_1_no_cuda_graph[i], expected_output_1_capture_cuda_graph[i], atol=1e-6) + assert torch.allclose(expected_output_2_no_cuda_graph[i], expected_output_2_capture_cudagraph[i], atol=1e-6) + assert torch.allclose(expected_output_1_no_cuda_graph[i], expected_output_1_replayed_cudagraph[i], atol=1e-6) + assert torch.allclose(expected_output_2_no_cuda_graph[i], expected_output_2_replayed_cudagraph[i], atol=1e-6) + print("Timing without CUDA graphs...") start = time.perf_counter() - for _ in tqdm(range(CYCLES), desc="Without CUDA graphs"): - model.forward(pre_processed_profiling, use_cuda_graph=False) + for _ in range(CYCLES): + model.forward(pre_processed_2, use_cuda_graph=False) baseline_fps = CYCLES / (time.perf_counter() - start) + print("Timing with CUDA graphs...") start = time.perf_counter() - for _ in tqdm(range(CYCLES), desc="With CUDA graphs"): - model.forward(pre_processed_profiling, use_cuda_graph=True) + for _ in range(CYCLES): + model.forward(pre_processed_2, use_cuda_graph=True) cudagraph_fps = CYCLES / (time.perf_counter() - start) print(f"\n{'='*50}") From 8cca2648109878d43376aff05e05bddcb3494c78 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Fri, 23 Jan 2026 05:24:33 +0000 Subject: [PATCH 08/50] format --- .../profile_rfdetr_trt_cudagraphs.py | 49 ++++++++++++++----- 1 file changed, 38 insertions(+), 11 deletions(-) diff --git a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py index 5728b6eb4a..9f74e75ef2 100644 --- a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py +++ b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py @@ -33,21 +33,48 @@ def main() -> 
None: pre_processed_1, _ = model.pre_process(image_1) pre_processed_2, _ = model.pre_process(image_2) + expected_output_1_no_cuda_graph = model.forward( + pre_processed_1, use_cuda_graph=False + ) + expected_output_2_no_cuda_graph = model.forward( + pre_processed_2, use_cuda_graph=False + ) - expected_output_1_no_cuda_graph = model.forward(pre_processed_1, use_cuda_graph=False) - expected_output_2_no_cuda_graph = model.forward(pre_processed_2, use_cuda_graph=False) - - expected_output_1_capture_cuda_graph = model.forward(pre_processed_1, use_cuda_graph=True) - expected_output_2_capture_cudagraph = model.forward(pre_processed_2, use_cuda_graph=True) + expected_output_1_capture_cuda_graph = model.forward( + pre_processed_1, use_cuda_graph=True + ) + expected_output_2_capture_cudagraph = model.forward( + pre_processed_2, use_cuda_graph=True + ) - expected_output_1_replayed_cudagraph = model.forward(pre_processed_1, use_cuda_graph=True) - expected_output_2_replayed_cudagraph = model.forward(pre_processed_2, use_cuda_graph=True) + expected_output_1_replayed_cudagraph = model.forward( + pre_processed_1, use_cuda_graph=True + ) + expected_output_2_replayed_cudagraph = model.forward( + pre_processed_2, use_cuda_graph=True + ) for i in [0, 1]: - assert torch.allclose(expected_output_1_no_cuda_graph[i], expected_output_1_capture_cuda_graph[i], atol=1e-6) - assert torch.allclose(expected_output_2_no_cuda_graph[i], expected_output_2_capture_cudagraph[i], atol=1e-6) - assert torch.allclose(expected_output_1_no_cuda_graph[i], expected_output_1_replayed_cudagraph[i], atol=1e-6) - assert torch.allclose(expected_output_2_no_cuda_graph[i], expected_output_2_replayed_cudagraph[i], atol=1e-6) + assert torch.allclose( + expected_output_1_no_cuda_graph[i], + expected_output_1_capture_cuda_graph[i], + atol=1e-6, + ) + assert torch.allclose( + expected_output_2_no_cuda_graph[i], + expected_output_2_capture_cudagraph[i], + atol=1e-6, + ) + assert torch.allclose( + 
expected_output_1_no_cuda_graph[i], + expected_output_1_replayed_cudagraph[i], + atol=1e-6, + ) + assert torch.allclose( + expected_output_2_no_cuda_graph[i], + expected_output_2_replayed_cudagraph[i], + atol=1e-6, + ) print("Timing without CUDA graphs...") start = time.perf_counter() From 5b7d0a56e336b9c71b99355014a526b87dca472f Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Fri, 23 Jan 2026 06:11:07 +0000 Subject: [PATCH 09/50] allow flag to be passed to rfdetr-seg models even though there don't seem to be TRT packages for them yet. --- .../models/rfdetr/rfdetr_instance_segmentation_trt.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py index 78e2be9d50..752fe0ad82 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py @@ -172,6 +172,7 @@ def __init__( self._cuda_context = cuda_context self._execution_context = execution_context self._trt_config = trt_config + self._trt_cuda_graph_state = None self._lock = threading.Lock() @property @@ -195,11 +196,11 @@ def pre_process( ) def forward( - self, pre_processed_images: torch.Tensor, **kwargs + self, pre_processed_images: torch.Tensor, use_cuda_graph: bool = False, **kwargs ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: with self._lock: with use_cuda_context(context=self._cuda_context): - (detections, labels, masks), _ = infer_from_trt_engine( + (detections, labels, masks), trt_cuda_graph_state = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -207,7 +208,11 @@ def forward( device=self._device, input_name=self._input_name, outputs=self._output_names, + use_cuda_graph=use_cuda_graph, + 
trt_cuda_graph_state=self._trt_cuda_graph_state if use_cuda_graph else None, ) + if use_cuda_graph: + self._trt_cuda_graph_state = trt_cuda_graph_state return detections, labels, masks def post_process( From a27ae376909abb3df1b3443ddf579d72ff5b4596 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Fri, 23 Jan 2026 16:41:01 +0000 Subject: [PATCH 10/50] reduce number of diffed files --- .../inference_models/models/common/trt.py | 72 ++++++++++++++++--- .../deep_lab_v3_plus_segmentation_trt.py | 5 +- .../resnet/resnet_classification_trt.py | 10 ++- .../rfdetr_instance_segmentation_trt.py | 35 +++++---- .../rfdetr/rfdetr_object_detection_trt.py | 35 +++++---- .../models/vit/vit_classification_trt.py | 10 ++- .../yolact_instance_segmentation_trt.py | 2 +- .../yolonas/yolonas_object_detection_trt.py | 2 +- .../yolov10/yolov10_object_detection_trt.py | 5 +- .../yolov5_instance_segmentation_trt.py | 2 +- .../yolov5/yolov5_object_detection_trt.py | 5 +- .../yolov7_instance_segmentation_trt.py | 2 +- .../yolov8_instance_segmentation_trt.py | 2 +- .../yolov8/yolov8_key_points_detection_trt.py | 5 +- .../yolov8/yolov8_object_detection_trt.py | 5 +- 15 files changed, 133 insertions(+), 64 deletions(-) diff --git a/inference_models/inference_models/models/common/trt.py b/inference_models/inference_models/models/common/trt.py index 19767e89c0..6301a68a8c 100644 --- a/inference_models/inference_models/models/common/trt.py +++ b/inference_models/inference_models/models/common/trt.py @@ -140,9 +140,7 @@ def infer_from_trt_engine( device: torch.device, input_name: str, outputs: List[str], - use_cuda_graph: bool = False, - trt_cuda_graph_state: Optional[TRTCudaGraphState] = None, -) -> Tuple[List[torch.Tensor], Optional[TRTCudaGraphState]]: +) -> List[torch.Tensor]: """Run inference using a TensorRT engine. 
Executes inference on preprocessed images using a TensorRT engine and execution @@ -235,7 +233,7 @@ def infer_from_trt_engine( - `get_trt_engine_inputs_and_outputs()`: Get engine tensor names """ if trt_config.static_batch_size is not None: - return infer_from_trt_engine_with_batch_size_boundaries( + results, _ = _infer_from_trt_engine_with_batch_size_boundaries( pre_processed_images=pre_processed_images, engine=engine, context=context, @@ -244,10 +242,68 @@ def infer_from_trt_engine( outputs=outputs, min_batch_size=trt_config.static_batch_size, max_batch_size=trt_config.static_batch_size, - use_cuda_graph=use_cuda_graph, + use_cuda_graph=False, + trt_cuda_graph_state=None, + ) + return results + results, _ = _infer_from_trt_engine_with_batch_size_boundaries( + pre_processed_images=pre_processed_images, + engine=engine, + context=context, + device=device, + input_name=input_name, + outputs=outputs, + min_batch_size=trt_config.dynamic_batch_size_min, + max_batch_size=trt_config.dynamic_batch_size_max, + use_cuda_graph=False, + trt_cuda_graph_state=None, + ) + return results + + +def infer_from_trt_engine_with_cudagraph( + pre_processed_images: torch.Tensor, + trt_config: TRTConfig, + engine: trt.ICudaEngine, + context: trt.IExecutionContext, + device: torch.device, + input_name: str, + outputs: List[str], + trt_cuda_graph_state: Optional[TRTCudaGraphState] = None, +) -> Tuple[List[torch.Tensor], Optional[TRTCudaGraphState]]: + """Run inference using a TensorRT engine with CUDA graph support. + + Similar to `infer_from_trt_engine`, but captures and replays CUDA graphs for + improved performance on repeated inference with the same input shape. + + Args: + pre_processed_images: Preprocessed input tensor on CUDA device. + trt_config: TensorRT configuration object. + engine: TensorRT CUDA engine (ICudaEngine). + context: TensorRT execution context (IExecutionContext). + device: PyTorch CUDA device. + input_name: Name of the input tensor in the TensorRT engine. 
+ outputs: List of output tensor names. + trt_cuda_graph_state: Optional state from a previous call for graph replay. + + Returns: + Tuple of (results, trt_cuda_graph_state) where results is the list of + output tensors and trt_cuda_graph_state can be passed to subsequent calls. + """ + if trt_config.static_batch_size is not None: + return _infer_from_trt_engine_with_batch_size_boundaries( + pre_processed_images=pre_processed_images, + engine=engine, + context=context, + device=device, + input_name=input_name, + outputs=outputs, + min_batch_size=trt_config.static_batch_size, + max_batch_size=trt_config.static_batch_size, + use_cuda_graph=True, trt_cuda_graph_state=trt_cuda_graph_state, ) - return infer_from_trt_engine_with_batch_size_boundaries( + return _infer_from_trt_engine_with_batch_size_boundaries( pre_processed_images=pre_processed_images, engine=engine, context=context, @@ -256,12 +312,12 @@ def infer_from_trt_engine( outputs=outputs, min_batch_size=trt_config.dynamic_batch_size_min, max_batch_size=trt_config.dynamic_batch_size_max, - use_cuda_graph=use_cuda_graph, + use_cuda_graph=True, trt_cuda_graph_state=trt_cuda_graph_state, ) -def infer_from_trt_engine_with_batch_size_boundaries( +def _infer_from_trt_engine_with_batch_size_boundaries( pre_processed_images: torch.Tensor, engine: trt.ICudaEngine, context: trt.IExecutionContext, diff --git a/inference_models/inference_models/models/deep_lab_v3_plus/deep_lab_v3_plus_segmentation_trt.py b/inference_models/inference_models/models/deep_lab_v3_plus/deep_lab_v3_plus_segmentation_trt.py index 050908b1fe..9976a86761 100644 --- a/inference_models/inference_models/models/deep_lab_v3_plus/deep_lab_v3_plus_segmentation_trt.py +++ b/inference_models/inference_models/models/deep_lab_v3_plus/deep_lab_v3_plus_segmentation_trt.py @@ -185,7 +185,7 @@ def forward( ) -> torch.Tensor: with self._lock: with use_cuda_context(context=self._cuda_context): - results, _ = infer_from_trt_engine( + return infer_from_trt_engine( 
pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -193,8 +193,7 @@ def forward( device=self._device, input_name=self._input_name, outputs=self._output_names, - ) - return results[0] + )[0] def post_process( self, diff --git a/inference_models/inference_models/models/resnet/resnet_classification_trt.py b/inference_models/inference_models/models/resnet/resnet_classification_trt.py index e0b2621a55..34de7058e3 100644 --- a/inference_models/inference_models/models/resnet/resnet_classification_trt.py +++ b/inference_models/inference_models/models/resnet/resnet_classification_trt.py @@ -185,7 +185,7 @@ def forward( ) -> torch.Tensor: with self._lock: with use_cuda_context(context=self._cuda_context): - results, _ = infer_from_trt_engine( + return infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -193,8 +193,7 @@ def forward( device=self._device, input_name=self._input_name, outputs=self._output_names, - ) - return results[0] + )[0] def post_process( self, @@ -336,7 +335,7 @@ def forward( ) -> torch.Tensor: with self._lock: with use_cuda_context(context=self._cuda_context): - results, _ = infer_from_trt_engine( + return infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -344,8 +343,7 @@ def forward( device=self._device, input_name=self._input_name, outputs=self._output_names, - ) - return results[0] + )[0] def post_process( self, diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py index 752fe0ad82..30d3533199 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py @@ -32,6 +32,7 @@ from inference_models.models.common.trt import ( 
get_trt_engine_inputs_and_outputs, infer_from_trt_engine, + infer_from_trt_engine_with_cudagraph, load_trt_model, TRTCudaGraphState, ) @@ -200,19 +201,29 @@ def forward( ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: with self._lock: with use_cuda_context(context=self._cuda_context): - (detections, labels, masks), trt_cuda_graph_state = infer_from_trt_engine( - pre_processed_images=pre_processed_images, - trt_config=self._trt_config, - engine=self._engine, - context=self._execution_context, - device=self._device, - input_name=self._input_name, - outputs=self._output_names, - use_cuda_graph=use_cuda_graph, - trt_cuda_graph_state=self._trt_cuda_graph_state if use_cuda_graph else None, - ) if use_cuda_graph: - self._trt_cuda_graph_state = trt_cuda_graph_state + (detections, labels, masks), self._trt_cuda_graph_state = ( + infer_from_trt_engine_with_cudagraph( + pre_processed_images=pre_processed_images, + trt_config=self._trt_config, + engine=self._engine, + context=self._execution_context, + device=self._device, + input_name=self._input_name, + outputs=self._output_names, + trt_cuda_graph_state=self._trt_cuda_graph_state, + ) + ) + else: + detections, labels, masks = infer_from_trt_engine( + pre_processed_images=pre_processed_images, + trt_config=self._trt_config, + engine=self._engine, + context=self._execution_context, + device=self._device, + input_name=self._input_name, + outputs=self._output_names, + ) return detections, labels, masks def post_process( diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py index b65d03348d..c5db6bbcf4 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py @@ -35,6 +35,7 @@ from inference_models.models.common.trt import ( get_trt_engine_inputs_and_outputs, infer_from_trt_engine, + 
infer_from_trt_engine_with_cudagraph, load_trt_model, TRTCudaGraphState, ) @@ -203,19 +204,29 @@ def forward( ) -> Tuple[torch.Tensor, torch.Tensor]: with self._lock: with use_cuda_context(context=self._cuda_context): - (detections, labels), trt_cuda_graph_state = infer_from_trt_engine( - pre_processed_images=pre_processed_images, - trt_config=self._trt_config, - engine=self._engine, - context=self._execution_context, - device=self._device, - input_name=self._input_name, - outputs=self._output_names, - use_cuda_graph=use_cuda_graph, - trt_cuda_graph_state=self._trt_cuda_graph_state if use_cuda_graph else None, - ) if use_cuda_graph: - self._trt_cuda_graph_state = trt_cuda_graph_state + (detections, labels), self._trt_cuda_graph_state = ( + infer_from_trt_engine_with_cudagraph( + pre_processed_images=pre_processed_images, + trt_config=self._trt_config, + engine=self._engine, + context=self._execution_context, + device=self._device, + input_name=self._input_name, + outputs=self._output_names, + trt_cuda_graph_state=self._trt_cuda_graph_state, + ) + ) + else: + detections, labels = infer_from_trt_engine( + pre_processed_images=pre_processed_images, + trt_config=self._trt_config, + engine=self._engine, + context=self._execution_context, + device=self._device, + input_name=self._input_name, + outputs=self._output_names, + ) return detections, labels def post_process( diff --git a/inference_models/inference_models/models/vit/vit_classification_trt.py b/inference_models/inference_models/models/vit/vit_classification_trt.py index 3a0892a8c5..d04a90607e 100644 --- a/inference_models/inference_models/models/vit/vit_classification_trt.py +++ b/inference_models/inference_models/models/vit/vit_classification_trt.py @@ -183,7 +183,7 @@ def forward( ) -> torch.Tensor: with self._lock: with use_cuda_context(context=self._cuda_context): - results, _ = infer_from_trt_engine( + return infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, 
engine=self._engine, @@ -191,8 +191,7 @@ def forward( device=self._device, input_name=self._input_name, outputs=self._output_names, - ) - return results[0] + )[0] def post_process( self, @@ -332,7 +331,7 @@ def forward( ) -> torch.Tensor: with self._lock: with use_cuda_context(context=self._cuda_context): - results, _ = infer_from_trt_engine( + return infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -340,8 +339,7 @@ def forward( device=self._device, input_name=self._input_name, outputs=self._output_names, - ) - return results[0] + )[0] def post_process( self, diff --git a/inference_models/inference_models/models/yolact/yolact_instance_segmentation_trt.py b/inference_models/inference_models/models/yolact/yolact_instance_segmentation_trt.py index ea6ebe6cf0..cc3cbeedaf 100644 --- a/inference_models/inference_models/models/yolact/yolact_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/yolact/yolact_instance_segmentation_trt.py @@ -192,7 +192,7 @@ def forward( all_proto_data, ) = ([], [], [], [], []) for image in pre_processed_images: - (loc_data, conf_data, mask_data, prior_data, proto_data), _ = ( + loc_data, conf_data, mask_data, prior_data, proto_data = ( infer_from_trt_engine( pre_processed_images=image.unsqueeze(0).contiguous(), trt_config=self._trt_config, diff --git a/inference_models/inference_models/models/yolonas/yolonas_object_detection_trt.py b/inference_models/inference_models/models/yolonas/yolonas_object_detection_trt.py index 39822ff34b..6f561d58e4 100644 --- a/inference_models/inference_models/models/yolonas/yolonas_object_detection_trt.py +++ b/inference_models/inference_models/models/yolonas/yolonas_object_detection_trt.py @@ -187,7 +187,7 @@ def pre_process( def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): - results, _ = 
infer_from_trt_engine( + results = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, diff --git a/inference_models/inference_models/models/yolov10/yolov10_object_detection_trt.py b/inference_models/inference_models/models/yolov10/yolov10_object_detection_trt.py index ff25e019c2..ec2dcf3cdb 100644 --- a/inference_models/inference_models/models/yolov10/yolov10_object_detection_trt.py +++ b/inference_models/inference_models/models/yolov10/yolov10_object_detection_trt.py @@ -177,7 +177,7 @@ def pre_process( def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): - results, _ = infer_from_trt_engine( + return infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -185,8 +185,7 @@ def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: device=self._device, input_name=self._input_name, outputs=self._output_names, - ) - return results[0] + )[0] def post_process( self, diff --git a/inference_models/inference_models/models/yolov5/yolov5_instance_segmentation_trt.py b/inference_models/inference_models/models/yolov5/yolov5_instance_segmentation_trt.py index a18b743b90..ee7180e10d 100644 --- a/inference_models/inference_models/models/yolov5/yolov5_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/yolov5/yolov5_instance_segmentation_trt.py @@ -187,7 +187,7 @@ def forward( ) -> Tuple[torch.Tensor, torch.Tensor]: with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): - (instances, protos), _ = infer_from_trt_engine( + instances, protos = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, diff --git a/inference_models/inference_models/models/yolov5/yolov5_object_detection_trt.py 
b/inference_models/inference_models/models/yolov5/yolov5_object_detection_trt.py index a423033cba..ff7d376e07 100644 --- a/inference_models/inference_models/models/yolov5/yolov5_object_detection_trt.py +++ b/inference_models/inference_models/models/yolov5/yolov5_object_detection_trt.py @@ -175,7 +175,7 @@ def pre_process( def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): - results, _ = infer_from_trt_engine( + return infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -183,8 +183,7 @@ def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: device=self._device, input_name=self._input_name, outputs=self._output_names, - ) - return results[0] + )[0] def post_process( self, diff --git a/inference_models/inference_models/models/yolov7/yolov7_instance_segmentation_trt.py b/inference_models/inference_models/models/yolov7/yolov7_instance_segmentation_trt.py index abcc82a78c..15ef8a13ee 100644 --- a/inference_models/inference_models/models/yolov7/yolov7_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/yolov7/yolov7_instance_segmentation_trt.py @@ -185,7 +185,7 @@ def forward( ) -> Tuple[torch.Tensor, torch.Tensor]: with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): - (instances, protos), _ = infer_from_trt_engine( + instances, protos = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, diff --git a/inference_models/inference_models/models/yolov8/yolov8_instance_segmentation_trt.py b/inference_models/inference_models/models/yolov8/yolov8_instance_segmentation_trt.py index f2cf1d7953..8a32a117ae 100644 --- a/inference_models/inference_models/models/yolov8/yolov8_instance_segmentation_trt.py +++ 
b/inference_models/inference_models/models/yolov8/yolov8_instance_segmentation_trt.py @@ -195,7 +195,7 @@ def forward( ) -> Tuple[torch.Tensor, torch.Tensor]: with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): - (instances, protos), _ = infer_from_trt_engine( + instances, protos = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, diff --git a/inference_models/inference_models/models/yolov8/yolov8_key_points_detection_trt.py b/inference_models/inference_models/models/yolov8/yolov8_key_points_detection_trt.py index 12e8630c1f..898beebb04 100644 --- a/inference_models/inference_models/models/yolov8/yolov8_key_points_detection_trt.py +++ b/inference_models/inference_models/models/yolov8/yolov8_key_points_detection_trt.py @@ -210,7 +210,7 @@ def pre_process( def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): - results, _ = infer_from_trt_engine( + return infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -218,8 +218,7 @@ def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: device=self._device, input_name=self._input_name, outputs=self._output_names, - ) - return results[0] + )[0] def post_process( self, diff --git a/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py b/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py index 29b9d7bfe8..aef07c3fad 100644 --- a/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py +++ b/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py @@ -183,7 +183,7 @@ def pre_process( def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: with self._lock: with use_cuda_context(context=self._cuda_context): - results, 
_ = infer_from_trt_engine( + return infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -191,8 +191,7 @@ def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: device=self._device, input_name=self._input_name, outputs=self._output_names, - ) - return results[0] + )[0] def post_process( self, From 04c015aef0adf53c9779424eae078ccd73f218f9 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Fri, 23 Jan 2026 16:53:14 +0000 Subject: [PATCH 11/50] don't rename existing function --- inference_models/inference_models/models/common/trt.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/inference_models/inference_models/models/common/trt.py b/inference_models/inference_models/models/common/trt.py index 6301a68a8c..b836131abe 100644 --- a/inference_models/inference_models/models/common/trt.py +++ b/inference_models/inference_models/models/common/trt.py @@ -233,7 +233,7 @@ def infer_from_trt_engine( - `get_trt_engine_inputs_and_outputs()`: Get engine tensor names """ if trt_config.static_batch_size is not None: - results, _ = _infer_from_trt_engine_with_batch_size_boundaries( + results, _ = infer_from_trt_engine_with_batch_size_boundaries( pre_processed_images=pre_processed_images, engine=engine, context=context, @@ -246,7 +246,7 @@ def infer_from_trt_engine( trt_cuda_graph_state=None, ) return results - results, _ = _infer_from_trt_engine_with_batch_size_boundaries( + results, _ = infer_from_trt_engine_with_batch_size_boundaries( pre_processed_images=pre_processed_images, engine=engine, context=context, @@ -291,7 +291,7 @@ def infer_from_trt_engine_with_cudagraph( output tensors and trt_cuda_graph_state can be passed to subsequent calls. 
""" if trt_config.static_batch_size is not None: - return _infer_from_trt_engine_with_batch_size_boundaries( + return infer_from_trt_engine_with_batch_size_boundaries( pre_processed_images=pre_processed_images, engine=engine, context=context, @@ -303,7 +303,7 @@ def infer_from_trt_engine_with_cudagraph( use_cuda_graph=True, trt_cuda_graph_state=trt_cuda_graph_state, ) - return _infer_from_trt_engine_with_batch_size_boundaries( + return infer_from_trt_engine_with_batch_size_boundaries( pre_processed_images=pre_processed_images, engine=engine, context=context, @@ -317,7 +317,7 @@ def infer_from_trt_engine_with_cudagraph( ) -def _infer_from_trt_engine_with_batch_size_boundaries( +def infer_from_trt_engine_with_batch_size_boundaries( pre_processed_images: torch.Tensor, engine: trt.ICudaEngine, context: trt.IExecutionContext, From ac50a1a8044fd600d9c331064d6d21487bd9054b Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Fri, 23 Jan 2026 20:56:44 +0000 Subject: [PATCH 12/50] add proper integration test and simplify profiling script --- .../profile_rfdetr_trt_cudagraphs.py | 66 +++----------- .../integration_tests/models/conftest.py | 10 +++ .../models/test_rfdetr_predictions_trt.py | 86 +++++++++++++++++++ 3 files changed, 106 insertions(+), 56 deletions(-) create mode 100644 inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py diff --git a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py index 9f74e75ef2..e63f5afd8b 100644 --- a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py +++ b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py @@ -7,8 +7,7 @@ from inference_models import AutoModel -IMAGE_1 = os.environ.get("IMAGE_PATH_WARMUP", None) -IMAGE_2 = os.environ.get("IMAGE_PATH_PROFILING", None) +IMAGE_PATH = os.environ.get("IMAGE_PATH", None) DEVICE = os.environ.get("DEVICE", "cuda:0") CYCLES = 
int(os.environ.get("CYCLES", "10_000")) WARMUP = int(os.environ.get("WARMUP", "50")) @@ -20,72 +19,27 @@ def main() -> None: model_id_or_path="rfdetr-nano", device=torch.device(DEVICE), backend="trt" ) - if IMAGE_1 is not None: - image_1 = cv2.imread(IMAGE_1) + if IMAGE_PATH is not None: + image = cv2.imread(IMAGE_PATH) else: - image_1 = (np.random.rand(224, 224, 3) * 255).astype(np.uint8) + image = (np.random.rand(224, 224, 3) * 255).astype(np.uint8) - if IMAGE_2 is not None: - image_2 = cv2.imread(IMAGE_2) - else: - image_2 = (np.random.rand(224, 224, 3) * 255).astype(np.uint8) - - pre_processed_1, _ = model.pre_process(image_1) - pre_processed_2, _ = model.pre_process(image_2) - - expected_output_1_no_cuda_graph = model.forward( - pre_processed_1, use_cuda_graph=False - ) - expected_output_2_no_cuda_graph = model.forward( - pre_processed_2, use_cuda_graph=False - ) - - expected_output_1_capture_cuda_graph = model.forward( - pre_processed_1, use_cuda_graph=True - ) - expected_output_2_capture_cudagraph = model.forward( - pre_processed_2, use_cuda_graph=True - ) - - expected_output_1_replayed_cudagraph = model.forward( - pre_processed_1, use_cuda_graph=True - ) - expected_output_2_replayed_cudagraph = model.forward( - pre_processed_2, use_cuda_graph=True - ) + pre_processed, _ = model.pre_process(image) - for i in [0, 1]: - assert torch.allclose( - expected_output_1_no_cuda_graph[i], - expected_output_1_capture_cuda_graph[i], - atol=1e-6, - ) - assert torch.allclose( - expected_output_2_no_cuda_graph[i], - expected_output_2_capture_cudagraph[i], - atol=1e-6, - ) - assert torch.allclose( - expected_output_1_no_cuda_graph[i], - expected_output_1_replayed_cudagraph[i], - atol=1e-6, - ) - assert torch.allclose( - expected_output_2_no_cuda_graph[i], - expected_output_2_replayed_cudagraph[i], - atol=1e-6, - ) + for _ in range(WARMUP): + model.forward(pre_processed, use_cuda_graph=False) + model.forward(pre_processed, use_cuda_graph=True) print("Timing without CUDA 
graphs...") start = time.perf_counter() for _ in range(CYCLES): - model.forward(pre_processed_2, use_cuda_graph=False) + model.forward(pre_processed, use_cuda_graph=False) baseline_fps = CYCLES / (time.perf_counter() - start) print("Timing with CUDA graphs...") start = time.perf_counter() for _ in range(CYCLES): - model.forward(pre_processed_2, use_cuda_graph=True) + model.forward(pre_processed, use_cuda_graph=True) cudagraph_fps = CYCLES / (time.perf_counter() - start) print(f"\n{'='*50}") diff --git a/inference_models/tests/integration_tests/models/conftest.py b/inference_models/tests/integration_tests/models/conftest.py index 5d7bbcddf4..71ca66f6db 100644 --- a/inference_models/tests/integration_tests/models/conftest.py +++ b/inference_models/tests/integration_tests/models/conftest.py @@ -164,6 +164,8 @@ "https://storage.googleapis.com/roboflow-tests-assets/rf-platform-models/sam2.zip" ) +RFDETR_NANO_T4_TRT_PACKAGE_URL = "https://storage.googleapis.com/roboflow-tests-assets/rf-platform-models/rfdetr-nano-t4-trt.zip" + @pytest.fixture(scope="module") def original_clip_download_dir() -> str: @@ -388,6 +390,14 @@ def coin_counting_rfdetr_nano_torch_static_crop_center_crop_package() -> str: ) +@pytest.fixture(scope="module") +def rfdetr_nano_t4_trt_package() -> str: + return download_model_package( + model_package_zip_url=RFDETR_NANO_T4_TRT_PACKAGE_URL, + package_name="rfdetr-nano-t4-trt", + ) + + @pytest.fixture(scope="module") def og_rfdetr_base_weights() -> str: package_path = os.path.join(MODELS_DIR, "og-rfdetr-base") diff --git a/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py b/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py new file mode 100644 index 0000000000..e50d6cd030 --- /dev/null +++ b/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py @@ -0,0 +1,86 @@ +import logging + +import numpy as np +import pytest +import torch + + +@pytest.mark.slow 
+@pytest.mark.trt_extras +def test_trt_cudagraph_output_matches_non_cudagraph_output( + rfdetr_nano_t4_trt_package: str, + dog_image_numpy: np.ndarray, + bike_image_numpy: np.ndarray, +) -> None: + from inference_models import AutoModel + + model = AutoModel.from_pretrained( + model_id_or_path=rfdetr_nano_t4_trt_package, + device=torch.device("cuda:0"), + ) + + pre_processed_1, _ = model.pre_process(dog_image_numpy) + pre_processed_2, _ = model.pre_process(bike_image_numpy) + + outputs = [] + for pre_processed in [pre_processed_1, pre_processed_2]: + no_graph = model.forward(pre_processed, use_cuda_graph=False) + model._trt_cuda_graph_state = None + capture_graph = model.forward(pre_processed, use_cuda_graph=True) + replay_graph = model.forward(pre_processed, use_cuda_graph=True) + + outputs.append((no_graph, capture_graph, replay_graph)) + + for image_outputs in outputs: + no_graph, capture_graph, replay_graph = image_outputs + for result_idx in range(2): + assert torch.allclose( + no_graph[result_idx], + capture_graph[result_idx], + atol=1e-6, + ) + assert torch.allclose( + no_graph[result_idx], + replay_graph[result_idx], + atol=1e-6, + ) + + # make sure that the allcloses aren't true because of buffer aliasing or something weird + # outputs should be different between images and the same between execution branches. 
+ for execution_branch_idx in range(3): + for result_idx in range(2): + assert not torch.allclose( + outputs[0][execution_branch_idx][result_idx], + outputs[1][execution_branch_idx][result_idx], + atol=1e-6, + ) + +@pytest.mark.slow +@pytest.mark.trt_extras +def test_trt_outputs_shapes( + rfdetr_nano_t4_trt_package: str, + dog_image_numpy: np.ndarray, +) -> None: + from inference_models import AutoModel + + model = AutoModel.from_pretrained( + model_id_or_path=rfdetr_nano_t4_trt_package, + device=torch.device("cuda:0"), + ) + + pre_processed, _ = model.pre_process(dog_image_numpy) + + output = model.forward(pre_processed, use_cuda_graph=False) + + assert output[0].shape == (1, 300, 4) + assert output[1].shape == (1, 300, 91) + + output = model.forward(pre_processed, use_cuda_graph=True) # capture + + assert output[0].shape == (1, 300, 4) + assert output[1].shape == (1, 300, 91) + + output = model.forward(pre_processed, use_cuda_graph=True) # replay + + assert output[0].shape == (1, 300, 4) + assert output[1].shape == (1, 300, 91) \ No newline at end of file From 951222950742ba91f1856285a980031b7858e4bb Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Fri, 23 Jan 2026 22:36:43 +0000 Subject: [PATCH 13/50] profile how long it takes to capture cuda graph --- .../profile_rfdetr_trt_cudagraphs.py | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py index e63f5afd8b..14044e849c 100644 --- a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py +++ b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py @@ -4,12 +4,13 @@ import cv2 import numpy as np import torch +from tqdm import tqdm from inference_models import AutoModel IMAGE_PATH = os.environ.get("IMAGE_PATH", None) DEVICE = os.environ.get("DEVICE", "cuda:0") -CYCLES = int(os.environ.get("CYCLES", 
"10_000")) +CYCLES = int(os.environ.get("CYCLES", "100")) WARMUP = int(os.environ.get("WARMUP", "50")) @@ -36,16 +37,27 @@ def main() -> None: model.forward(pre_processed, use_cuda_graph=False) baseline_fps = CYCLES / (time.perf_counter() - start) - print("Timing with CUDA graphs...") + print("Timing with forced CUDA graph recapture each step...") + start = time.perf_counter() + for _ in range(100): # not using CYCLES here bc this is wayyyy slower than the non-graph or the replay modes + model._trt_cuda_graph_state = None + model.forward(pre_processed, use_cuda_graph=True) + + cudagraph_recapture_fps = CYCLES / (time.perf_counter() - start) + + print("Timing with CUDA graph caching and replaying...") + model.forward(pre_processed, use_cuda_graph=True) # initial capture start = time.perf_counter() for _ in range(CYCLES): model.forward(pre_processed, use_cuda_graph=True) - cudagraph_fps = CYCLES / (time.perf_counter() - start) + cudagraph_replay_fps = CYCLES / (time.perf_counter() - start) print(f"\n{'='*50}") print(f"Forward pass FPS (no CUDA graphs): {baseline_fps:.1f}") - print(f"Forward pass FPS (CUDA graphs): {cudagraph_fps:.1f}") - print(f"Speedup: {cudagraph_fps / baseline_fps:.2f}x") + print(f"Forward pass FPS (CUDA graphs recapture): {cudagraph_recapture_fps:.1f}") + print(f"Speed factor (recapture): {cudagraph_recapture_fps / baseline_fps:.2f}x") + print(f"Forward pass FPS (CUDA graphs replay): {cudagraph_replay_fps:.1f}") + print(f"Speed factor (replay): {cudagraph_replay_fps / baseline_fps:.2f}x") print(f"{'='*50}") From d5b51f91ff3be38fead38631e899ce2dbd423fc2 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Tue, 3 Feb 2026 02:35:49 +0000 Subject: [PATCH 14/50] add LRU (shape, device, dtype) caching for CUDA graphs --- .../inference_models/models/common/trt.py | 121 +++++++++++------- .../rfdetr_instance_segmentation_trt.py | 9 +- .../rfdetr/rfdetr_object_detection_trt.py | 9 +- 3 files changed, 80 insertions(+), 59 deletions(-) diff --git 
a/inference_models/inference_models/models/common/trt.py b/inference_models/inference_models/models/common/trt.py index b836131abe..6683c14cb2 100644 --- a/inference_models/inference_models/models/common/trt.py +++ b/inference_models/inference_models/models/common/trt.py @@ -1,5 +1,6 @@ from typing import List, Tuple, Optional from dataclasses import dataclass +from collections import OrderedDict import torch @@ -35,7 +36,6 @@ class InferenceTRTLogger(trt.ILogger): - def __init__(self, with_memory: bool = False): super().__init__() self._memory: List[Tuple[trt.ILogger.Severity, str]] = [] @@ -66,8 +66,35 @@ class TRTCudaGraphState: input_buffer: torch.Tensor output_buffers: List[torch.Tensor] - def has_changed_shape(self, input_shape: Tuple[int, ...]) -> bool: - return tuple(self.input_buffer.shape) != input_shape + +class TRTCudaGraphLRUCache: + def __init__(self, capacity: int = 64): + self.cache: OrderedDict[ + Tuple[Tuple[int, ...], torch.dtype, torch.device], TRTCudaGraphState + ] = OrderedDict() + self.capacity = capacity + + def __contains__( + self, key: Tuple[Tuple[int, ...], torch.dtype, torch.device] + ) -> bool: + return key in self.cache + + def __getitem__( + self, key: Tuple[Tuple[int, ...], torch.dtype, torch.device] + ) -> TRTCudaGraphState: + value = self.cache[key] + self.cache.move_to_end(key) + return value + + def __setitem__( + self, + key: Tuple[Tuple[int, ...], torch.dtype, torch.device], + value: TRTCudaGraphState, + ): + self.cache[key] = value + self.cache.move_to_end(key) + if len(self.cache) > self.capacity: + self.cache.popitem(last=False) def get_trt_engine_inputs_and_outputs( @@ -243,7 +270,7 @@ def infer_from_trt_engine( min_batch_size=trt_config.static_batch_size, max_batch_size=trt_config.static_batch_size, use_cuda_graph=False, - trt_cuda_graph_state=None, + trt_cuda_graph_cache=None, ) return results results, _ = infer_from_trt_engine_with_batch_size_boundaries( @@ -256,7 +283,7 @@ def infer_from_trt_engine( 
min_batch_size=trt_config.dynamic_batch_size_min, max_batch_size=trt_config.dynamic_batch_size_max, use_cuda_graph=False, - trt_cuda_graph_state=None, + trt_cuda_graph_cache=None, ) return results @@ -269,8 +296,8 @@ def infer_from_trt_engine_with_cudagraph( device: torch.device, input_name: str, outputs: List[str], - trt_cuda_graph_state: Optional[TRTCudaGraphState] = None, -) -> Tuple[List[torch.Tensor], Optional[TRTCudaGraphState]]: + trt_cuda_graph_cache: Optional[TRTCudaGraphLRUCache] = None, +) -> Tuple[List[torch.Tensor], Optional[TRTCudaGraphLRUCache]]: """Run inference using a TensorRT engine with CUDA graph support. Similar to `infer_from_trt_engine`, but captures and replays CUDA graphs for @@ -284,11 +311,11 @@ def infer_from_trt_engine_with_cudagraph( device: PyTorch CUDA device. input_name: Name of the input tensor in the TensorRT engine. outputs: List of output tensor names. - trt_cuda_graph_state: Optional state from a previous call for graph replay. + trt_cuda_graph_cache: Optional state from a previous call for graph replay. Returns: - Tuple of (results, trt_cuda_graph_state) where results is the list of - output tensors and trt_cuda_graph_state can be passed to subsequent calls. + Tuple of (results, trt_cuda_graph_cache) where results is the list of + output tensors and trt_cuda_graph_cache can be passed to subsequent calls. 
""" if trt_config.static_batch_size is not None: return infer_from_trt_engine_with_batch_size_boundaries( @@ -301,7 +328,7 @@ def infer_from_trt_engine_with_cudagraph( min_batch_size=trt_config.static_batch_size, max_batch_size=trt_config.static_batch_size, use_cuda_graph=True, - trt_cuda_graph_state=trt_cuda_graph_state, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) return infer_from_trt_engine_with_batch_size_boundaries( pre_processed_images=pre_processed_images, @@ -313,7 +340,7 @@ def infer_from_trt_engine_with_cudagraph( min_batch_size=trt_config.dynamic_batch_size_min, max_batch_size=trt_config.dynamic_batch_size_max, use_cuda_graph=True, - trt_cuda_graph_state=trt_cuda_graph_state, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) @@ -327,8 +354,8 @@ def infer_from_trt_engine_with_batch_size_boundaries( min_batch_size: int, max_batch_size: int, use_cuda_graph: bool = False, - trt_cuda_graph_state: Optional[TRTCudaGraphState] = None, -) -> Tuple[List[torch.Tensor], Optional[TRTCudaGraphState]]: + trt_cuda_graph_cache: Optional[TRTCudaGraphLRUCache] = None, +) -> Tuple[List[torch.Tensor], Optional[TRTCudaGraphLRUCache]]: if pre_processed_images.shape[0] <= max_batch_size: reminder = min_batch_size - pre_processed_images.shape[0] if reminder > 0: @@ -343,7 +370,7 @@ def infer_from_trt_engine_with_batch_size_boundaries( ), dim=0, ) - results, trt_cuda_graph_state = execute_trt_engine( + results, trt_cuda_graph_cache = execute_trt_engine( pre_processed_images=pre_processed_images, engine=engine, context=context, @@ -351,11 +378,11 @@ def infer_from_trt_engine_with_batch_size_boundaries( input_name=input_name, outputs=outputs, use_cuda_graph=use_cuda_graph, - trt_cuda_graph_state=trt_cuda_graph_state, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) if reminder > 0: results = [r[:-reminder] for r in results] - return results, trt_cuda_graph_state + return results, trt_cuda_graph_cache all_results = [] for _ in outputs: all_results.append([]) @@ -374,7 +401,7 @@ def 
infer_from_trt_engine_with_batch_size_boundaries( ), dim=0, ) - results, trt_cuda_graph_state = execute_trt_engine( + results, trt_cuda_graph_cache = execute_trt_engine( pre_processed_images=batch, engine=engine, context=context, @@ -382,13 +409,13 @@ def infer_from_trt_engine_with_batch_size_boundaries( input_name=input_name, outputs=outputs, use_cuda_graph=use_cuda_graph, - trt_cuda_graph_state=trt_cuda_graph_state, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) if reminder > 0: results = [r[:-reminder] for r in results] for partial_result, all_result_element in zip(results, all_results): all_result_element.append(partial_result) - return [torch.cat(e, dim=0).contiguous() for e in all_results], trt_cuda_graph_state + return [torch.cat(e, dim=0).contiguous() for e in all_results], trt_cuda_graph_cache def execute_trt_engine( @@ -399,17 +426,20 @@ def execute_trt_engine( input_name: str, outputs: List[str], use_cuda_graph: bool = False, - trt_cuda_graph_state: Optional[TRTCudaGraphState] = None, -) -> Tuple[List[torch.Tensor], Optional[TRTCudaGraphState]]: + trt_cuda_graph_cache: Optional[TRTCudaGraphLRUCache] = None, +) -> Tuple[List[torch.Tensor], Optional[TRTCudaGraphLRUCache]]: + if use_cuda_graph: + if trt_cuda_graph_cache is None: + trt_cuda_graph_cache = TRTCudaGraphLRUCache(capacity=64) - if trt_cuda_graph_state is not None: input_shape = tuple(pre_processed_images.shape) - if trt_cuda_graph_state.has_changed_shape(input_shape): - LOGGER.warning( - f"Input shape changed from {tuple(trt_cuda_graph_state.input_buffer.shape)} " - f"to {input_shape}. Recapturing CUDA graph." 
- ) - return _capture_cuda_graph( + input_dtype = pre_processed_images.dtype + cache_key = (input_shape, input_dtype, device) + + if cache_key not in trt_cuda_graph_cache: + LOGGER.debug(f"Capturing CUDA graph for shape {input_shape}") + + results, trt_cuda_graph = _capture_cuda_graph( pre_processed_images=pre_processed_images, engine=engine, context=context, @@ -417,25 +447,18 @@ def execute_trt_engine( input_name=input_name, outputs=outputs, ) + trt_cuda_graph_cache[cache_key] = trt_cuda_graph + return results, trt_cuda_graph_cache - stream = trt_cuda_graph_state.cuda_stream - with torch.cuda.stream(stream): - trt_cuda_graph_state.input_buffer.copy_(pre_processed_images) - trt_cuda_graph_state.cuda_graph.replay() - results = [buf.clone() for buf in trt_cuda_graph_state.output_buffers] - stream.synchronize() - - return results, trt_cuda_graph_state - - elif use_cuda_graph: - return _capture_cuda_graph( - pre_processed_images=pre_processed_images, - engine=engine, - context=context, - device=device, - input_name=input_name, - outputs=outputs, - ) + else: + trt_cuda_graph_state = trt_cuda_graph_cache[cache_key] + stream = trt_cuda_graph_state.cuda_stream + with torch.cuda.stream(stream): + trt_cuda_graph_state.input_buffer.copy_(pre_processed_images) + trt_cuda_graph_state.cuda_graph.replay() + results = [buf.clone() for buf in trt_cuda_graph_state.output_buffers] + stream.synchronize() + return results, trt_cuda_graph_cache else: batch_size = pre_processed_images.shape[0] @@ -533,14 +556,14 @@ def _capture_cuda_graph( results = [buf.clone() for buf in output_buffers] stream.synchronize() - trt_cuda_graph_state = TRTCudaGraphState( + trt_cuda_graph_cache = TRTCudaGraphState( cuda_graph=cuda_graph, cuda_stream=stream, input_buffer=input_buffer, output_buffers=output_buffers, ) - return results, trt_cuda_graph_state + return results, trt_cuda_graph_cache def trt_dtype_to_torch(trt_dtype): diff --git 
a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py index 30d3533199..550dfa3b55 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py @@ -34,7 +34,7 @@ infer_from_trt_engine, infer_from_trt_engine_with_cudagraph, load_trt_model, - TRTCudaGraphState, + TRTCudaGraphLRUCache, ) from inference_models.models.rfdetr.class_remapping import ( ClassesReMapping, @@ -73,7 +73,6 @@ class RFDetrForInstanceSegmentationTRT( Tuple[torch.Tensor, torch.Tensor, torch.Tensor], ] ): - @classmethod def from_pretrained( cls, @@ -173,7 +172,7 @@ def __init__( self._cuda_context = cuda_context self._execution_context = execution_context self._trt_config = trt_config - self._trt_cuda_graph_state = None + self._trt_cuda_graph_cache: Optional[TRTCudaGraphLRUCache] = None self._lock = threading.Lock() @property @@ -202,7 +201,7 @@ def forward( with self._lock: with use_cuda_context(context=self._cuda_context): if use_cuda_graph: - (detections, labels, masks), self._trt_cuda_graph_state = ( + (detections, labels, masks), self._trt_cuda_graph_cache = ( infer_from_trt_engine_with_cudagraph( pre_processed_images=pre_processed_images, trt_config=self._trt_config, @@ -211,7 +210,7 @@ def forward( device=self._device, input_name=self._input_name, outputs=self._output_names, - trt_cuda_graph_state=self._trt_cuda_graph_state, + trt_cuda_graph_cache=self._trt_cuda_graph_cache, ) ) else: diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py index c5db6bbcf4..0e12ea5d33 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py +++ 
b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py @@ -37,7 +37,7 @@ infer_from_trt_engine, infer_from_trt_engine_with_cudagraph, load_trt_model, - TRTCudaGraphState, + TRTCudaGraphLRUCache, ) from inference_models.models.rfdetr.class_remapping import ( ClassesReMapping, @@ -73,7 +73,6 @@ class RFDetrForObjectDetectionTRT( ] ) ): - @classmethod def from_pretrained( cls, @@ -178,7 +177,7 @@ def __init__( self._cuda_context = cuda_context self._execution_context = execution_context self._trt_config = trt_config - self._trt_cuda_graph_state = None + self._trt_cuda_graph_cache: Optional[TRTCudaGraphLRUCache] = None self._lock = threading.Lock() @property @@ -205,7 +204,7 @@ def forward( with self._lock: with use_cuda_context(context=self._cuda_context): if use_cuda_graph: - (detections, labels), self._trt_cuda_graph_state = ( + (detections, labels), self._trt_cuda_graph_cache = ( infer_from_trt_engine_with_cudagraph( pre_processed_images=pre_processed_images, trt_config=self._trt_config, @@ -214,7 +213,7 @@ def forward( device=self._device, input_name=self._input_name, outputs=self._output_names, - trt_cuda_graph_state=self._trt_cuda_graph_state, + trt_cuda_graph_cache=self._trt_cuda_graph_cache, ) ) else: From dbd45f967e4da25a3c60743c826ad65096173d10 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Tue, 3 Feb 2026 03:03:13 +0000 Subject: [PATCH 15/50] add USE_CUDA_GRAPHS_FOR_TRT_BACKEND environment variable which defaults to True and reference in RFDETR TRT classes --- inference_models/inference_models/configuration.py | 5 +++++ .../rfdetr/rfdetr_instance_segmentation_trt.py | 13 +++++++++++-- .../models/rfdetr/rfdetr_object_detection_trt.py | 13 +++++++++++-- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/inference_models/inference_models/configuration.py b/inference_models/inference_models/configuration.py index e2bd370872..a3170b465d 100644 --- a/inference_models/inference_models/configuration.py +++ 
b/inference_models/inference_models/configuration.py @@ -83,3 +83,8 @@ ALLOW_LOCAL_STORAGE_ACCESS_FOR_REFERENCE_DATA = os.getenv( "ALLOW_LOCAL_STORAGE_ACCESS_FOR_REFERENCE_DATA" ) + +USE_CUDA_GRAPHS_FOR_TRT_BACKEND = get_boolean_from_env( + variable_name="USE_CUDA_GRAPHS_FOR_TRT_BACKEND", + default=True, +) diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py index 550dfa3b55..dba576f7b3 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py @@ -5,7 +5,10 @@ import torch from inference_models import InstanceDetections, InstanceSegmentationModel -from inference_models.configuration import DEFAULT_DEVICE +from inference_models.configuration import ( + DEFAULT_DEVICE, + USE_CUDA_GRAPHS_FOR_TRT_BACKEND, +) from inference_models.entities import ColorFormat from inference_models.errors import ( CorruptedModelPackageError, @@ -196,8 +199,14 @@ def pre_process( ) def forward( - self, pre_processed_images: torch.Tensor, use_cuda_graph: bool = False, **kwargs + self, + pre_processed_images: torch.Tensor, + use_cuda_graph: Optional[bool] = None, + **kwargs, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + if use_cuda_graph is None: + use_cuda_graph = USE_CUDA_GRAPHS_FOR_TRT_BACKEND + with self._lock: with use_cuda_context(context=self._cuda_context): if use_cuda_graph: diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py index 0e12ea5d33..4503454e00 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py @@ -5,7 +5,10 @@ import torch from inference_models import Detections, 
ObjectDetectionModel -from inference_models.configuration import DEFAULT_DEVICE +from inference_models.configuration import ( + DEFAULT_DEVICE, + USE_CUDA_GRAPHS_FOR_TRT_BACKEND, +) from inference_models.entities import ColorFormat from inference_models.errors import ( CorruptedModelPackageError, @@ -199,8 +202,14 @@ def pre_process( ) def forward( - self, pre_processed_images: torch.Tensor, use_cuda_graph: bool = False, **kwargs + self, + pre_processed_images: torch.Tensor, + use_cuda_graph: Optional[bool] = None, + **kwargs, ) -> Tuple[torch.Tensor, torch.Tensor]: + if use_cuda_graph is None: + use_cuda_graph = USE_CUDA_GRAPHS_FOR_TRT_BACKEND + with self._lock: with use_cuda_context(context=self._cuda_context): if use_cuda_graph: From 9502b8e1c7d7016c25bac98c37314768b60165c0 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Tue, 3 Feb 2026 03:10:20 +0000 Subject: [PATCH 16/50] fix bug in profiling script --- .../development/profiling/profile_rfdetr_trt_cudagraphs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py index 14044e849c..733d462216 100644 --- a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py +++ b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py @@ -10,7 +10,7 @@ IMAGE_PATH = os.environ.get("IMAGE_PATH", None) DEVICE = os.environ.get("DEVICE", "cuda:0") -CYCLES = int(os.environ.get("CYCLES", "100")) +CYCLES = int(os.environ.get("CYCLES", "10_000")) WARMUP = int(os.environ.get("WARMUP", "50")) @@ -40,10 +40,10 @@ def main() -> None: print("Timing with forced CUDA graph recapture each step...") start = time.perf_counter() for _ in range(100): # not using CYCLES here bc this is wayyyy slower than the non-graph or the replay modes - model._trt_cuda_graph_state = None + model._trt_cuda_graph_cache = None model.forward(pre_processed, 
use_cuda_graph=True) - cudagraph_recapture_fps = CYCLES / (time.perf_counter() - start) + cudagraph_recapture_fps = 100 / (time.perf_counter() - start) print("Timing with CUDA graph caching and replaying...") model.forward(pre_processed, use_cuda_graph=True) # initial capture From cb705381ee15b31bc889f9d158293e102f63b9ba Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Tue, 10 Feb 2026 00:03:12 +0000 Subject: [PATCH 17/50] use yolov8 with dynamic batch size to test shape caching for CUDA graphs --- .../inference_models/models/common/trt.py | 49 ++++++------ .../yolov8/yolov8_object_detection_trt.py | 51 +++++++++--- .../integration_tests/models/conftest.py | 9 +++ .../models/test_rfdetr_predictions_trt.py | 2 +- ...yolov8_object_detection_predictions_trt.py | 77 +++++++++++++++++++ 5 files changed, 150 insertions(+), 38 deletions(-) create mode 100644 inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py diff --git a/inference_models/inference_models/models/common/trt.py b/inference_models/inference_models/models/common/trt.py index 6683c14cb2..d7dc7e2155 100644 --- a/inference_models/inference_models/models/common/trt.py +++ b/inference_models/inference_models/models/common/trt.py @@ -461,18 +461,6 @@ def execute_trt_engine( return results, trt_cuda_graph_cache else: - batch_size = pre_processed_images.shape[0] - results = [] - for output in outputs: - output_tensor_shape = engine.get_tensor_shape(output) - output_tensor_type = trt_dtype_to_torch(engine.get_tensor_dtype(output)) - result = torch.empty( - (batch_size,) + output_tensor_shape[1:], - dtype=output_tensor_type, - device=device, - ) - context.set_tensor_address(output, result.data_ptr()) - results.append(result) status = context.set_input_shape(input_name, tuple(pre_processed_images.shape)) if not status: raise ModelRuntimeError( @@ -485,6 +473,17 @@ def execute_trt_engine( message="Failed to set input tensor data pointer during forward pass from the model.", 
help_url="https://todo", ) + results = [] + for output in outputs: + output_tensor_shape = context.get_tensor_shape(output) + output_tensor_type = trt_dtype_to_torch(engine.get_tensor_dtype(output)) + result = torch.empty( + tuple(output_tensor_shape), + dtype=output_tensor_type, + device=device, + ) + context.set_tensor_address(output, result.data_ptr()) + results.append(result) stream = torch.cuda.Stream(device=device) status = context.execute_async_v3(stream_handle=stream.cuda_stream) if not status: @@ -504,23 +503,9 @@ def _capture_cuda_graph( input_name: str, outputs: List[str], ) -> Tuple[List[torch.Tensor], TRTCudaGraphState]: - batch_size = pre_processed_images.shape[0] - input_buffer = torch.empty_like(pre_processed_images, device=device) input_buffer.copy_(pre_processed_images) - output_buffers = [] - for output in outputs: - output_tensor_shape = engine.get_tensor_shape(output) - output_tensor_type = trt_dtype_to_torch(engine.get_tensor_dtype(output)) - output_buffer = torch.empty( - (batch_size,) + output_tensor_shape[1:], - dtype=output_tensor_type, - device=device, - ) - context.set_tensor_address(output, output_buffer.data_ptr()) - output_buffers.append(output_buffer) - status = context.set_input_shape(input_name, tuple(pre_processed_images.shape)) if not status: raise ModelRuntimeError( @@ -534,6 +519,18 @@ def _capture_cuda_graph( help_url="https://todo", ) + output_buffers = [] + for output in outputs: + output_tensor_shape = context.get_tensor_shape(output) + output_tensor_type = trt_dtype_to_torch(engine.get_tensor_dtype(output)) + output_buffer = torch.empty( + tuple(output_tensor_shape), + dtype=output_tensor_type, + device=device, + ) + context.set_tensor_address(output, output_buffer.data_ptr()) + output_buffers.append(output_buffer) + stream = torch.cuda.Stream(device=device) with torch.cuda.stream(stream): status = context.execute_async_v3(stream_handle=stream.cuda_stream) diff --git 
a/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py b/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py index aef07c3fad..f76d922bae 100644 --- a/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py +++ b/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py @@ -5,7 +5,10 @@ import torch from inference_models import Detections, ObjectDetectionModel -from inference_models.configuration import DEFAULT_DEVICE +from inference_models.configuration import ( + DEFAULT_DEVICE, + USE_CUDA_GRAPHS_FOR_TRT_BACKEND, +) from inference_models.entities import ColorFormat from inference_models.errors import ( CorruptedModelPackageError, @@ -35,8 +38,10 @@ pre_process_network_input, ) from inference_models.models.common.trt import ( + TRTCudaGraphLRUCache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, + infer_from_trt_engine_with_cudagraph, load_trt_model, ) @@ -160,6 +165,7 @@ def __init__( self._device = device self._cuda_context = cuda_context self._execution_context = execution_context + self._trt_cuda_graph_cache: Optional[TRTCudaGraphLRUCache] = None self._lock = threading.Lock() @property @@ -180,18 +186,41 @@ def pre_process( input_color_format=input_color_format, ) - def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: + def forward( + self, + pre_processed_images: torch.Tensor, + use_cuda_graph: Optional[bool] = None, + **kwargs, + ) -> torch.Tensor: + if use_cuda_graph is None: + use_cuda_graph = USE_CUDA_GRAPHS_FOR_TRT_BACKEND + with self._lock: with use_cuda_context(context=self._cuda_context): - return infer_from_trt_engine( - pre_processed_images=pre_processed_images, - trt_config=self._trt_config, - engine=self._engine, - context=self._execution_context, - device=self._device, - input_name=self._input_name, - outputs=self._output_names, - )[0] + if use_cuda_graph: + results, self._trt_cuda_graph_cache = ( + 
infer_from_trt_engine_with_cudagraph( + pre_processed_images=pre_processed_images, + trt_config=self._trt_config, + engine=self._engine, + context=self._execution_context, + device=self._device, + input_name=self._input_name, + outputs=self._output_names, + trt_cuda_graph_cache=self._trt_cuda_graph_cache, + ) + ) + return results[0] + else: + return infer_from_trt_engine( + pre_processed_images=pre_processed_images, + trt_config=self._trt_config, + engine=self._engine, + context=self._execution_context, + device=self._device, + input_name=self._input_name, + outputs=self._output_names, + )[0] def post_process( self, diff --git a/inference_models/tests/integration_tests/models/conftest.py b/inference_models/tests/integration_tests/models/conftest.py index 257b506c93..86fe0533df 100644 --- a/inference_models/tests/integration_tests/models/conftest.py +++ b/inference_models/tests/integration_tests/models/conftest.py @@ -184,6 +184,7 @@ ) RFDETR_NANO_T4_TRT_PACKAGE_URL = "https://storage.googleapis.com/roboflow-tests-assets/rf-platform-models/rfdetr-nano-t4-trt.zip" +YOLOV8N_640_T4_TRT_PACKAGE_URL = "https://storage.googleapis.com/roboflow-tests-assets/rf-platform-models/yolov8n-640-t4-trt.zip" @pytest.fixture(scope="module") @@ -417,6 +418,14 @@ def rfdetr_nano_t4_trt_package() -> str: ) +@pytest.fixture(scope="module") +def yolov8n_640_t4_trt_package() -> str: + return download_model_package( + model_package_zip_url=YOLOV8N_640_T4_TRT_PACKAGE_URL, + package_name="yolov8n-640-t4-trt", + ) + + @pytest.fixture(scope="module") def og_rfdetr_base_weights() -> str: package_path = os.path.join(MODELS_DIR, "og-rfdetr-base") diff --git a/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py b/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py index e50d6cd030..4768fc9043 100644 --- a/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py +++ 
b/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py @@ -57,7 +57,7 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( @pytest.mark.slow @pytest.mark.trt_extras -def test_trt_outputs_shapes( +def test_trt_outputs_match_expected_shapes( rfdetr_nano_t4_trt_package: str, dog_image_numpy: np.ndarray, ) -> None: diff --git a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py new file mode 100644 index 0000000000..6031df5c6e --- /dev/null +++ b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py @@ -0,0 +1,77 @@ +import numpy as np +import pytest +import torch + + +@pytest.mark.slow +@pytest.mark.trt_extras +def test_trt_cudagraph_cache_reuses_previously_seen_input_shapes( + yolov8n_640_t4_trt_package: str, + dog_image_numpy: np.ndarray, +) -> None: + from inference_models import AutoModel + + device = torch.device("cuda:0") + model = AutoModel.from_pretrained( + model_id_or_path=yolov8n_640_t4_trt_package, + device=device, + ) + + pre_processed_single, _ = model.pre_process(dog_image_numpy) + model._trt_cuda_graph_cache = None + + seen_shapes = set() + capture_outputs = {} + test_sequence = [1, 2, 1, 4, 2, 1, 4, 3, 3] + + for batch_size in test_sequence: + batch = pre_processed_single.repeat(batch_size, 1, 1, 1) + cache_key = (tuple(batch.shape), batch.dtype, device) + + cache_before = model._trt_cuda_graph_cache + cache_size_before = len(cache_before.cache) if cache_before is not None else 0 + + output = model.forward(batch, use_cuda_graph=True) + + cache_after = model._trt_cuda_graph_cache + assert cache_after is not None + cache_size_after = len(cache_after.cache) + + if cache_key not in seen_shapes: + assert cache_size_after == cache_size_before + 1 + seen_shapes.add(cache_key) + capture_outputs[cache_key] = output.clone() + continue + + 
assert cache_size_after == cache_size_before + assert torch.allclose(capture_outputs[cache_key], output, atol=1e-3) + + assert set(model._trt_cuda_graph_cache.cache.keys()) == seen_shapes + + +@pytest.mark.slow +@pytest.mark.trt_extras +def test_trt_cudagraph_output_matches_non_cudagraph_output( + yolov8n_640_t4_trt_package: str, + dog_image_numpy: np.ndarray, +) -> None: + from inference_models import AutoModel + + device = torch.device("cuda:0") + model = AutoModel.from_pretrained( + model_id_or_path=yolov8n_640_t4_trt_package, + device=device, + ) + pre_processed_single, _ = model.pre_process(dog_image_numpy) + + for batch_size in [1, 4]: + batch = pre_processed_single.repeat(batch_size, 1, 1, 1) + + no_graph = model.forward(batch, use_cuda_graph=False) + + model._trt_cuda_graph_cache = None + capture_graph = model.forward(batch, use_cuda_graph=True) + replay_graph = model.forward(batch, use_cuda_graph=True) + + assert torch.allclose(no_graph, capture_graph, atol=1e-3) + assert torch.allclose(no_graph, replay_graph, atol=1e-3) From 6b1d430a1b7da1419ab6ec146e589a37fee8838c Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Tue, 10 Feb 2026 00:27:41 +0000 Subject: [PATCH 18/50] add instance seg tests --- .../integration_tests/models/conftest.py | 9 ++++ .../models/test_rfdetr_seg_predictions_trt.py | 52 +++++++++++++++++++ 2 files changed, 61 insertions(+) create mode 100644 inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py diff --git a/inference_models/tests/integration_tests/models/conftest.py b/inference_models/tests/integration_tests/models/conftest.py index 86fe0533df..452840ca1b 100644 --- a/inference_models/tests/integration_tests/models/conftest.py +++ b/inference_models/tests/integration_tests/models/conftest.py @@ -184,6 +184,7 @@ ) RFDETR_NANO_T4_TRT_PACKAGE_URL = "https://storage.googleapis.com/roboflow-tests-assets/rf-platform-models/rfdetr-nano-t4-trt.zip" +RFDETR_SEG_NANO_T4_TRT_PACKAGE_URL = 
"https://storage.googleapis.com/roboflow-tests-assets/rf-platform-models/rfdetr-nano-seg-t4-trt.zip" YOLOV8N_640_T4_TRT_PACKAGE_URL = "https://storage.googleapis.com/roboflow-tests-assets/rf-platform-models/yolov8n-640-t4-trt.zip" @@ -418,6 +419,14 @@ def rfdetr_nano_t4_trt_package() -> str: ) +@pytest.fixture(scope="module") +def rfdetr_seg_nano_t4_trt_package() -> str: + return download_model_package( + model_package_zip_url=RFDETR_SEG_NANO_T4_TRT_PACKAGE_URL, + package_name="rfdetr-seg-nano-t4-trt", + ) + + @pytest.fixture(scope="module") def yolov8n_640_t4_trt_package() -> str: return download_model_package( diff --git a/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py b/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py new file mode 100644 index 0000000000..c5591aab9e --- /dev/null +++ b/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py @@ -0,0 +1,52 @@ +import numpy as np +import pytest +import torch + + +@pytest.mark.slow +@pytest.mark.trt_extras +def test_trt_cudagraph_output_matches_non_cudagraph_output( + rfdetr_seg_nano_t4_trt_package: str, + snake_image_numpy: np.ndarray, + dog_image_numpy: np.ndarray, +) -> None: + from inference_models import AutoModel + + model = AutoModel.from_pretrained( + model_id_or_path=rfdetr_seg_nano_t4_trt_package, + device=torch.device("cuda:0"), + ) + + pre_processed_1, _ = model.pre_process(snake_image_numpy) + pre_processed_2, _ = model.pre_process(dog_image_numpy) + + outputs = [] + for pre_processed in [pre_processed_1, pre_processed_2]: + no_graph = model.forward(pre_processed, use_cuda_graph=False) + model._trt_cuda_graph_cache = None + capture_graph = model.forward(pre_processed, use_cuda_graph=True) + replay_graph = model.forward(pre_processed, use_cuda_graph=True) + + outputs.append((no_graph, capture_graph, replay_graph)) + + for image_outputs in outputs: + no_graph, capture_graph, replay_graph = image_outputs + for 
result_idx in range(3): + assert torch.allclose( + no_graph[result_idx], + capture_graph[result_idx], + atol=1e-6, + ) + assert torch.allclose( + no_graph[result_idx], + replay_graph[result_idx], + atol=1e-6, + ) + + for execution_branch_idx in range(3): + for result_idx in range(3): + assert not torch.allclose( + outputs[0][execution_branch_idx][result_idx], + outputs[1][execution_branch_idx][result_idx], + atol=1e-6, + ) From 7c233004c69e1fb03517c3413a992480a4da19c1 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Tue, 10 Feb 2026 00:32:13 +0000 Subject: [PATCH 19/50] update conftest --- inference_models/tests/integration_tests/models/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inference_models/tests/integration_tests/models/conftest.py b/inference_models/tests/integration_tests/models/conftest.py index 452840ca1b..4798ee7830 100644 --- a/inference_models/tests/integration_tests/models/conftest.py +++ b/inference_models/tests/integration_tests/models/conftest.py @@ -184,7 +184,7 @@ ) RFDETR_NANO_T4_TRT_PACKAGE_URL = "https://storage.googleapis.com/roboflow-tests-assets/rf-platform-models/rfdetr-nano-t4-trt.zip" -RFDETR_SEG_NANO_T4_TRT_PACKAGE_URL = "https://storage.googleapis.com/roboflow-tests-assets/rf-platform-models/rfdetr-nano-seg-t4-trt.zip" +RFDETR_SEG_NANO_T4_TRT_PACKAGE_URL = "https://storage.googleapis.com/roboflow-tests-assets/rf-platform-models/rfdetr-seg-nano-t4-trt.zip" YOLOV8N_640_T4_TRT_PACKAGE_URL = "https://storage.googleapis.com/roboflow-tests-assets/rf-platform-models/yolov8n-640-t4-trt.zip" From a27c80c0092cd1449a82da03674c1d36abdb8693 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Tue, 10 Feb 2026 01:07:00 +0000 Subject: [PATCH 20/50] add batch-size-cycling profiling for TRT cudagraphs with yolov8 --- .../profile_yolov8_trt_cudagraphs.py | 92 +++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 inference_models/development/profiling/profile_yolov8_trt_cudagraphs.py diff --git 
a/inference_models/development/profiling/profile_yolov8_trt_cudagraphs.py b/inference_models/development/profiling/profile_yolov8_trt_cudagraphs.py new file mode 100644 index 0000000000..9506b6b1ed --- /dev/null +++ b/inference_models/development/profiling/profile_yolov8_trt_cudagraphs.py @@ -0,0 +1,92 @@ +import os +import time + +import numpy as np +import torch +from tqdm import tqdm + +from inference_models import AutoModel + +DEVICE = os.environ.get("DEVICE", "cuda:0") +CYCLES = int(os.environ.get("CYCLES", "10_000")) +WARMUP = int(os.environ.get("WARMUP", "50")) +RECAPTURE_CYCLES = int(os.environ.get("RECAPTURE_CYCLES", "100")) + +BATCH_SIZES = [1, 2, 3] + + +def main() -> None: + + model = AutoModel.from_pretrained( + model_id_or_path="yolov8n-640", + device=torch.device(DEVICE), + backend="trt", + batch_size=(1, max(BATCH_SIZES)), + ) + + image = (np.random.rand(224, 224, 3) * 255).astype(np.uint8) + pre_processed_single, _ = model.pre_process(image) + + batches = { + bs: pre_processed_single.repeat(bs, 1, 1, 1) for bs in BATCH_SIZES + } + + # ── Warmup ────────────────────────────────────────────────────────── + for _ in range(WARMUP): + for batch in batches.values(): + model.forward(batch, use_cuda_graph=False) + model.forward(batch, use_cuda_graph=True) + + bs_label = "/".join(str(bs) for bs in BATCH_SIZES) + + # ── (1) Cycling batch sizes, no CUDA graphs ───────────────────────── + print(f"Timing without CUDA graphs, cycling bs={bs_label}...") + torch.cuda.synchronize() + start = time.perf_counter() + for i in range(CYCLES): + batch = batches[BATCH_SIZES[i % len(BATCH_SIZES)]] + model.forward(batch, use_cuda_graph=False) + torch.cuda.synchronize() + baseline_fps = CYCLES / (time.perf_counter() - start) + + # ── (2) Cycling batch sizes, CUDA graphs with forced recapture ────── + print( + f"Timing with CUDA graph recapture every iteration, cycling bs={bs_label} " + f"({RECAPTURE_CYCLES} iters)..." 
+ ) + torch.cuda.synchronize() + start = time.perf_counter() + for i in range(RECAPTURE_CYCLES): + model._trt_cuda_graph_cache = None + batch = batches[BATCH_SIZES[i % len(BATCH_SIZES)]] + model.forward(batch, use_cuda_graph=True) + torch.cuda.synchronize() + recapture_fps = RECAPTURE_CYCLES / (time.perf_counter() - start) + + # ── (3) Cycling batch sizes, CUDA graphs with normal caching ──────── + model._trt_cuda_graph_cache = None + for batch in batches.values(): + model.forward(batch, use_cuda_graph=True) + + print(f"Timing with CUDA graph cache replay, cycling bs={bs_label}...") + torch.cuda.synchronize() + start = time.perf_counter() + for i in range(CYCLES): + batch = batches[BATCH_SIZES[i % len(BATCH_SIZES)]] + model.forward(batch, use_cuda_graph=True) + torch.cuda.synchronize() + replay_fps = CYCLES / (time.perf_counter() - start) + + # ── Results ───────────────────────────────────────────────────────── + print(f"\n{'='*60}") + print(f" yolov8n-640 TRT — cycling batch sizes {BATCH_SIZES}") + print(f" {CYCLES} iterations (recapture: {RECAPTURE_CYCLES})") + print(f"{'='*60}") + print(f" No CUDA graphs: {baseline_fps:>8.1f} fwd/s") + print(f" CUDA graph recapture: {recapture_fps:>8.1f} fwd/s ({recapture_fps / baseline_fps:.2f}x)") + print(f" CUDA graph replay: {replay_fps:>8.1f} fwd/s ({replay_fps / baseline_fps:.2f}x)") + print(f"{'='*60}") + + +if __name__ == "__main__": + main() From 212b2d64231b2f4f01aa883077097037fa9afaf6 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Tue, 10 Feb 2026 01:20:41 +0000 Subject: [PATCH 21/50] fix failing test --- .../inference_models/models/common/trt.py | 31 ++++++++++--------- .../rfdetr_instance_segmentation_trt.py | 1 - .../rfdetr/rfdetr_object_detection_trt.py | 1 - .../yolov8/yolov8_object_detection_trt.py | 1 - 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/inference_models/inference_models/models/common/trt.py b/inference_models/inference_models/models/common/trt.py index d7dc7e2155..8d4f1bf260 
100644 --- a/inference_models/inference_models/models/common/trt.py +++ b/inference_models/inference_models/models/common/trt.py @@ -65,6 +65,7 @@ class TRTCudaGraphState: cuda_stream: torch.cuda.Stream input_buffer: torch.Tensor output_buffers: List[torch.Tensor] + execution_context: trt.IExecutionContext class TRTCudaGraphLRUCache: @@ -292,7 +293,6 @@ def infer_from_trt_engine_with_cudagraph( pre_processed_images: torch.Tensor, trt_config: TRTConfig, engine: trt.ICudaEngine, - context: trt.IExecutionContext, device: torch.device, input_name: str, outputs: List[str], @@ -307,7 +307,6 @@ def infer_from_trt_engine_with_cudagraph( pre_processed_images: Preprocessed input tensor on CUDA device. trt_config: TensorRT configuration object. engine: TensorRT CUDA engine (ICudaEngine). - context: TensorRT execution context (IExecutionContext). device: PyTorch CUDA device. input_name: Name of the input tensor in the TensorRT engine. outputs: List of output tensor names. @@ -321,7 +320,7 @@ def infer_from_trt_engine_with_cudagraph( return infer_from_trt_engine_with_batch_size_boundaries( pre_processed_images=pre_processed_images, engine=engine, - context=context, + context=None, # the graph cache has its own contexts device=device, input_name=input_name, outputs=outputs, @@ -333,7 +332,7 @@ def infer_from_trt_engine_with_cudagraph( return infer_from_trt_engine_with_batch_size_boundaries( pre_processed_images=pre_processed_images, engine=engine, - context=context, + context=None, # the graph cache has its own contexts device=device, input_name=input_name, outputs=outputs, @@ -442,7 +441,6 @@ def execute_trt_engine( results, trt_cuda_graph = _capture_cuda_graph( pre_processed_images=pre_processed_images, engine=engine, - context=context, device=device, input_name=input_name, outputs=outputs, @@ -498,21 +496,25 @@ def execute_trt_engine( def _capture_cuda_graph( pre_processed_images: torch.Tensor, engine: trt.ICudaEngine, - context: trt.IExecutionContext, device: torch.device, 
input_name: str, outputs: List[str], ) -> Tuple[List[torch.Tensor], TRTCudaGraphState]: + # Each CUDA graph needs its own execution context. Sharing a single context + # across graphs for different input shapes causes TRT to reallocate internal + # workspace buffers, invalidating GPU addresses baked into earlier graphs. + graph_context = engine.create_execution_context() + input_buffer = torch.empty_like(pre_processed_images, device=device) input_buffer.copy_(pre_processed_images) - status = context.set_input_shape(input_name, tuple(pre_processed_images.shape)) + status = graph_context.set_input_shape(input_name, tuple(pre_processed_images.shape)) if not status: raise ModelRuntimeError( message="Failed to set TRT model input shape during CUDA graph capture.", help_url="https://todo", ) - status = context.set_tensor_address(input_name, input_buffer.data_ptr()) + status = graph_context.set_tensor_address(input_name, input_buffer.data_ptr()) if not status: raise ModelRuntimeError( message="Failed to set input tensor data pointer during CUDA graph capture.", @@ -521,19 +523,19 @@ def _capture_cuda_graph( output_buffers = [] for output in outputs: - output_tensor_shape = context.get_tensor_shape(output) + output_tensor_shape = graph_context.get_tensor_shape(output) output_tensor_type = trt_dtype_to_torch(engine.get_tensor_dtype(output)) output_buffer = torch.empty( tuple(output_tensor_shape), dtype=output_tensor_type, device=device, ) - context.set_tensor_address(output, output_buffer.data_ptr()) + graph_context.set_tensor_address(output, output_buffer.data_ptr()) output_buffers.append(output_buffer) stream = torch.cuda.Stream(device=device) with torch.cuda.stream(stream): - status = context.execute_async_v3(stream_handle=stream.cuda_stream) + status = graph_context.execute_async_v3(stream_handle=stream.cuda_stream) if not status: raise ModelRuntimeError( message="Failed to execute TRT model warmup before CUDA graph capture.", @@ -543,7 +545,7 @@ def 
_capture_cuda_graph( cuda_graph = torch.cuda.CUDAGraph() with torch.cuda.graph(cuda_graph, stream=stream): - status = context.execute_async_v3(stream_handle=stream.cuda_stream) + status = graph_context.execute_async_v3(stream_handle=stream.cuda_stream) if not status: raise ModelRuntimeError( message="Failed to capture CUDA graph from TRT model execution.", @@ -553,14 +555,15 @@ def _capture_cuda_graph( results = [buf.clone() for buf in output_buffers] stream.synchronize() - trt_cuda_graph_cache = TRTCudaGraphState( + trt_cuda_graph_state = TRTCudaGraphState( cuda_graph=cuda_graph, cuda_stream=stream, input_buffer=input_buffer, output_buffers=output_buffers, + execution_context=graph_context, ) - return results, trt_cuda_graph_cache + return results, trt_cuda_graph_state def trt_dtype_to_torch(trt_dtype): diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py index dba576f7b3..745e2c5cd9 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py @@ -215,7 +215,6 @@ def forward( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, - context=self._execution_context, device=self._device, input_name=self._input_name, outputs=self._output_names, diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py index 4503454e00..d6ac66e84e 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py @@ -218,7 +218,6 @@ def forward( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, - context=self._execution_context, 
device=self._device, input_name=self._input_name, outputs=self._output_names, diff --git a/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py b/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py index f76d922bae..ae0cda31fa 100644 --- a/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py +++ b/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py @@ -203,7 +203,6 @@ def forward( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, - context=self._execution_context, device=self._device, input_name=self._input_name, outputs=self._output_names, From 4204f4f5c7e33399636f9ad72d9461a55516b42c Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Wed, 11 Feb 2026 00:13:08 +0000 Subject: [PATCH 22/50] first stab at responding to Pawel's feedback --- .../inference_models/models/common/trt.py | 159 +++++++----------- .../rfdetr_instance_segmentation_trt.py | 43 ++--- .../rfdetr/rfdetr_object_detection_trt.py | 43 ++--- .../yolov8/yolov8_object_detection_trt.py | 42 ++--- 4 files changed, 111 insertions(+), 176 deletions(-) diff --git a/inference_models/inference_models/models/common/trt.py b/inference_models/inference_models/models/common/trt.py index 8d4f1bf260..1e6ca5c7d8 100644 --- a/inference_models/inference_models/models/common/trt.py +++ b/inference_models/inference_models/models/common/trt.py @@ -164,16 +164,24 @@ def infer_from_trt_engine( pre_processed_images: torch.Tensor, trt_config: TRTConfig, engine: trt.ICudaEngine, - context: trt.IExecutionContext, device: torch.device, input_name: str, outputs: List[str], + context: Optional[trt.IExecutionContext] = None, + trt_cuda_graph_cache: Optional[TRTCudaGraphLRUCache] = None, ) -> List[torch.Tensor]: - """Run inference using a TensorRT engine. + """Run inference using a TensorRT engine, optionally with CUDA graph acceleration. 
+ + Executes inference on preprocessed images using a TensorRT engine. Handles both + static and dynamic batch sizes, automatically splitting large batches if needed. - Executes inference on preprocessed images using a TensorRT engine and execution - context. Handles both static and dynamic batch sizes, automatically splitting - large batches if needed. + When ``trt_cuda_graph_cache`` is provided, CUDA graphs are captured and replayed + for improved performance on repeated inference with the same input shape. Each + graph is keyed by (shape, dtype, device) and stored in the cache. The cache + itself must be created by the caller (typically in the model class). + + When ``trt_cuda_graph_cache`` is ``None``, inference runs through the standard + TRT execution path using the provided ``context``. Args: pre_processed_images: Preprocessed input tensor on CUDA device. @@ -185,6 +193,8 @@ def infer_from_trt_engine( engine: TensorRT CUDA engine (ICudaEngine) to use for inference. context: TensorRT execution context (IExecutionContext) for running inference. + Required when ``trt_cuda_graph_cache`` is ``None``. Ignored when using + CUDA graphs (each cached graph owns its own execution context). device: PyTorch CUDA device to use for inference. @@ -192,12 +202,15 @@ def infer_from_trt_engine( outputs: List of output tensor names to retrieve from the engine. + trt_cuda_graph_cache: Optional CUDA graph cache. When provided, CUDA graphs + are used for inference. When ``None``, standard TRT execution is used. + Returns: List of output tensors from the TensorRT engine, in the order specified by the outputs parameter. Examples: - Run TensorRT inference: + Run TensorRT inference (standard path): >>> from inference_models.developer_tools import ( ... load_trt_model, @@ -228,7 +241,7 @@ def infer_from_trt_engine( ... context=context, ... device=torch.device("cuda:0"), ... input_name=inputs[0], - ... outputs=outputs + ... outputs=outputs, ... 
) Handle large batches: @@ -243,10 +256,25 @@ def infer_from_trt_engine( ... context=context, ... device=torch.device("cuda:0"), ... input_name=inputs[0], - ... outputs=outputs + ... outputs=outputs, ... ) >>> # Results are automatically concatenated + Run with CUDA graph acceleration: + + >>> from inference_models.models.common.trt import TRTCudaGraphLRUCache + >>> cache = TRTCudaGraphLRUCache(capacity=16) + >>> + >>> results = infer_from_trt_engine( + ... pre_processed_images=images, + ... trt_config=trt_config, + ... engine=engine, + ... device=torch.device("cuda:0"), + ... input_name=inputs[0], + ... outputs=outputs, + ... trt_cuda_graph_cache=cache, + ... ) + Note: - Requires TensorRT and PyCUDA to be installed - Input must be on CUDA device @@ -261,100 +289,35 @@ def infer_from_trt_engine( - `get_trt_engine_inputs_and_outputs()`: Get engine tensor names """ if trt_config.static_batch_size is not None: - results, _ = infer_from_trt_engine_with_batch_size_boundaries( - pre_processed_images=pre_processed_images, - engine=engine, - context=context, - device=device, - input_name=input_name, - outputs=outputs, - min_batch_size=trt_config.static_batch_size, - max_batch_size=trt_config.static_batch_size, - use_cuda_graph=False, - trt_cuda_graph_cache=None, - ) - return results - results, _ = infer_from_trt_engine_with_batch_size_boundaries( + min_batch_size = trt_config.static_batch_size + max_batch_size = trt_config.static_batch_size + else: + min_batch_size = trt_config.dynamic_batch_size_min + max_batch_size = trt_config.dynamic_batch_size_max + return _infer_from_trt_engine_with_batch_size_boundaries( pre_processed_images=pre_processed_images, engine=engine, context=context, device=device, input_name=input_name, outputs=outputs, - min_batch_size=trt_config.dynamic_batch_size_min, - max_batch_size=trt_config.dynamic_batch_size_max, - use_cuda_graph=False, - trt_cuda_graph_cache=None, - ) - return results - - -def infer_from_trt_engine_with_cudagraph( - 
pre_processed_images: torch.Tensor, - trt_config: TRTConfig, - engine: trt.ICudaEngine, - device: torch.device, - input_name: str, - outputs: List[str], - trt_cuda_graph_cache: Optional[TRTCudaGraphLRUCache] = None, -) -> Tuple[List[torch.Tensor], Optional[TRTCudaGraphLRUCache]]: - """Run inference using a TensorRT engine with CUDA graph support. - - Similar to `infer_from_trt_engine`, but captures and replays CUDA graphs for - improved performance on repeated inference with the same input shape. - - Args: - pre_processed_images: Preprocessed input tensor on CUDA device. - trt_config: TensorRT configuration object. - engine: TensorRT CUDA engine (ICudaEngine). - device: PyTorch CUDA device. - input_name: Name of the input tensor in the TensorRT engine. - outputs: List of output tensor names. - trt_cuda_graph_cache: Optional state from a previous call for graph replay. - - Returns: - Tuple of (results, trt_cuda_graph_cache) where results is the list of - output tensors and trt_cuda_graph_cache can be passed to subsequent calls. 
- """ - if trt_config.static_batch_size is not None: - return infer_from_trt_engine_with_batch_size_boundaries( - pre_processed_images=pre_processed_images, - engine=engine, - context=None, # the graph cache has its own contexts - device=device, - input_name=input_name, - outputs=outputs, - min_batch_size=trt_config.static_batch_size, - max_batch_size=trt_config.static_batch_size, - use_cuda_graph=True, - trt_cuda_graph_cache=trt_cuda_graph_cache, - ) - return infer_from_trt_engine_with_batch_size_boundaries( - pre_processed_images=pre_processed_images, - engine=engine, - context=None, # the graph cache has its own contexts - device=device, - input_name=input_name, - outputs=outputs, - min_batch_size=trt_config.dynamic_batch_size_min, - max_batch_size=trt_config.dynamic_batch_size_max, - use_cuda_graph=True, + min_batch_size=min_batch_size, + max_batch_size=max_batch_size, trt_cuda_graph_cache=trt_cuda_graph_cache, ) -def infer_from_trt_engine_with_batch_size_boundaries( +def _infer_from_trt_engine_with_batch_size_boundaries( pre_processed_images: torch.Tensor, engine: trt.ICudaEngine, - context: trt.IExecutionContext, + context: Optional[trt.IExecutionContext], device: torch.device, input_name: str, outputs: List[str], min_batch_size: int, max_batch_size: int, - use_cuda_graph: bool = False, trt_cuda_graph_cache: Optional[TRTCudaGraphLRUCache] = None, -) -> Tuple[List[torch.Tensor], Optional[TRTCudaGraphLRUCache]]: +) -> List[torch.Tensor]: if pre_processed_images.shape[0] <= max_batch_size: reminder = min_batch_size - pre_processed_images.shape[0] if reminder > 0: @@ -369,19 +332,18 @@ def infer_from_trt_engine_with_batch_size_boundaries( ), dim=0, ) - results, trt_cuda_graph_cache = execute_trt_engine( + results = _execute_trt_engine( pre_processed_images=pre_processed_images, engine=engine, context=context, device=device, input_name=input_name, outputs=outputs, - use_cuda_graph=use_cuda_graph, trt_cuda_graph_cache=trt_cuda_graph_cache, ) if reminder > 0: 
results = [r[:-reminder] for r in results] - return results, trt_cuda_graph_cache + return results all_results = [] for _ in outputs: all_results.append([]) @@ -400,37 +362,32 @@ def infer_from_trt_engine_with_batch_size_boundaries( ), dim=0, ) - results, trt_cuda_graph_cache = execute_trt_engine( + results = _execute_trt_engine( pre_processed_images=batch, engine=engine, context=context, device=device, input_name=input_name, outputs=outputs, - use_cuda_graph=use_cuda_graph, trt_cuda_graph_cache=trt_cuda_graph_cache, ) if reminder > 0: results = [r[:-reminder] for r in results] for partial_result, all_result_element in zip(results, all_results): all_result_element.append(partial_result) - return [torch.cat(e, dim=0).contiguous() for e in all_results], trt_cuda_graph_cache + return [torch.cat(e, dim=0).contiguous() for e in all_results] -def execute_trt_engine( +def _execute_trt_engine( pre_processed_images: torch.Tensor, engine: trt.ICudaEngine, - context: trt.IExecutionContext, + context: Optional[trt.IExecutionContext], device: torch.device, input_name: str, outputs: List[str], - use_cuda_graph: bool = False, trt_cuda_graph_cache: Optional[TRTCudaGraphLRUCache] = None, -) -> Tuple[List[torch.Tensor], Optional[TRTCudaGraphLRUCache]]: - if use_cuda_graph: - if trt_cuda_graph_cache is None: - trt_cuda_graph_cache = TRTCudaGraphLRUCache(capacity=64) - +) -> List[torch.Tensor]: + if trt_cuda_graph_cache is not None: input_shape = tuple(pre_processed_images.shape) input_dtype = pre_processed_images.dtype cache_key = (input_shape, input_dtype, device) @@ -446,7 +403,7 @@ def execute_trt_engine( outputs=outputs, ) trt_cuda_graph_cache[cache_key] = trt_cuda_graph - return results, trt_cuda_graph_cache + return results else: trt_cuda_graph_state = trt_cuda_graph_cache[cache_key] @@ -456,7 +413,7 @@ def execute_trt_engine( trt_cuda_graph_state.cuda_graph.replay() results = [buf.clone() for buf in trt_cuda_graph_state.output_buffers] stream.synchronize() - return results, 
trt_cuda_graph_cache + return results else: status = context.set_input_shape(input_name, tuple(pre_processed_images.shape)) @@ -490,7 +447,7 @@ def execute_trt_engine( help_url="https://todo", ) stream.synchronize() - return results, None + return results def _capture_cuda_graph( diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py index 745e2c5cd9..22ffdc9a22 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py @@ -33,11 +33,10 @@ pre_process_network_input, ) from inference_models.models.common.trt import ( + TRTCudaGraphLRUCache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, - infer_from_trt_engine_with_cudagraph, load_trt_model, - TRTCudaGraphLRUCache, ) from inference_models.models.rfdetr.class_remapping import ( ClassesReMapping, @@ -82,6 +81,7 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, + cuda_graph_cache_capacity: int = 64, **kwargs, ) -> "RFDetrForInstanceSegmentationTRT": if device.type != "cuda": @@ -150,6 +150,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, + cuda_graph_cache_capacity=cuda_graph_cache_capacity, ) def __init__( @@ -164,6 +165,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + cuda_graph_cache_capacity: int = 64, ): self._engine = engine self._input_name = input_name @@ -175,7 +177,9 @@ def __init__( self._cuda_context = cuda_context self._execution_context = execution_context self._trt_config = trt_config - self._trt_cuda_graph_cache: Optional[TRTCudaGraphLRUCache] = None + self._trt_cuda_graph_cache = TRTCudaGraphLRUCache( + capacity=cuda_graph_cache_capacity, + ) 
self._lock = threading.Lock() @property @@ -207,30 +211,19 @@ def forward( if use_cuda_graph is None: use_cuda_graph = USE_CUDA_GRAPHS_FOR_TRT_BACKEND + cache = self._trt_cuda_graph_cache if use_cuda_graph else None with self._lock: with use_cuda_context(context=self._cuda_context): - if use_cuda_graph: - (detections, labels, masks), self._trt_cuda_graph_cache = ( - infer_from_trt_engine_with_cudagraph( - pre_processed_images=pre_processed_images, - trt_config=self._trt_config, - engine=self._engine, - device=self._device, - input_name=self._input_name, - outputs=self._output_names, - trt_cuda_graph_cache=self._trt_cuda_graph_cache, - ) - ) - else: - detections, labels, masks = infer_from_trt_engine( - pre_processed_images=pre_processed_images, - trt_config=self._trt_config, - engine=self._engine, - context=self._execution_context, - device=self._device, - input_name=self._input_name, - outputs=self._output_names, - ) + detections, labels, masks = infer_from_trt_engine( + pre_processed_images=pre_processed_images, + trt_config=self._trt_config, + engine=self._engine, + context=self._execution_context if not use_cuda_graph else None, + device=self._device, + input_name=self._input_name, + outputs=self._output_names, + trt_cuda_graph_cache=cache, + ) return detections, labels, masks def post_process( diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py index d6ac66e84e..5b163da87c 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py @@ -36,11 +36,10 @@ pre_process_network_input, ) from inference_models.models.common.trt import ( + TRTCudaGraphLRUCache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, - infer_from_trt_engine_with_cudagraph, load_trt_model, - TRTCudaGraphLRUCache, ) from 
inference_models.models.rfdetr.class_remapping import ( ClassesReMapping, @@ -82,6 +81,7 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, + cuda_graph_cache_capacity: int = 64, **kwargs, ) -> "RFDetrForObjectDetectionTRT": if device.type != "cuda": @@ -155,6 +155,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, + cuda_graph_cache_capacity=cuda_graph_cache_capacity, ) def __init__( @@ -169,6 +170,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + cuda_graph_cache_capacity: int = 64, ): self._engine = engine self._input_name = input_name @@ -180,7 +182,9 @@ def __init__( self._cuda_context = cuda_context self._execution_context = execution_context self._trt_config = trt_config - self._trt_cuda_graph_cache: Optional[TRTCudaGraphLRUCache] = None + self._trt_cuda_graph_cache = TRTCudaGraphLRUCache( + capacity=cuda_graph_cache_capacity, + ) self._lock = threading.Lock() @property @@ -210,30 +214,19 @@ def forward( if use_cuda_graph is None: use_cuda_graph = USE_CUDA_GRAPHS_FOR_TRT_BACKEND + cache = self._trt_cuda_graph_cache if use_cuda_graph else None with self._lock: with use_cuda_context(context=self._cuda_context): - if use_cuda_graph: - (detections, labels), self._trt_cuda_graph_cache = ( - infer_from_trt_engine_with_cudagraph( - pre_processed_images=pre_processed_images, - trt_config=self._trt_config, - engine=self._engine, - device=self._device, - input_name=self._input_name, - outputs=self._output_names, - trt_cuda_graph_cache=self._trt_cuda_graph_cache, - ) - ) - else: - detections, labels = infer_from_trt_engine( - pre_processed_images=pre_processed_images, - trt_config=self._trt_config, - engine=self._engine, - context=self._execution_context, - device=self._device, - input_name=self._input_name, - outputs=self._output_names, - ) + detections, labels = 
infer_from_trt_engine( + pre_processed_images=pre_processed_images, + trt_config=self._trt_config, + engine=self._engine, + context=self._execution_context if not use_cuda_graph else None, + device=self._device, + input_name=self._input_name, + outputs=self._output_names, + trt_cuda_graph_cache=cache, + ) return detections, labels def post_process( diff --git a/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py b/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py index ae0cda31fa..3794d8ee1e 100644 --- a/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py +++ b/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py @@ -41,7 +41,6 @@ TRTCudaGraphLRUCache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, - infer_from_trt_engine_with_cudagraph, load_trt_model, ) @@ -77,6 +76,7 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, + cuda_graph_cache_capacity: int = 64, **kwargs, ) -> "YOLOv8ForObjectDetectionTRT": if device.type != "cuda": @@ -142,6 +142,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, + cuda_graph_cache_capacity=cuda_graph_cache_capacity, ) def __init__( @@ -155,6 +156,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + cuda_graph_cache_capacity: int = 64, ): self._engine = engine self._input_name = input_name @@ -165,7 +167,9 @@ def __init__( self._device = device self._cuda_context = cuda_context self._execution_context = execution_context - self._trt_cuda_graph_cache: Optional[TRTCudaGraphLRUCache] = None + self._trt_cuda_graph_cache = TRTCudaGraphLRUCache( + capacity=cuda_graph_cache_capacity, + ) self._lock = threading.Lock() @property @@ -195,31 +199,19 @@ def forward( if use_cuda_graph is None: use_cuda_graph = USE_CUDA_GRAPHS_FOR_TRT_BACKEND 
+ cache = self._trt_cuda_graph_cache if use_cuda_graph else None with self._lock: with use_cuda_context(context=self._cuda_context): - if use_cuda_graph: - results, self._trt_cuda_graph_cache = ( - infer_from_trt_engine_with_cudagraph( - pre_processed_images=pre_processed_images, - trt_config=self._trt_config, - engine=self._engine, - device=self._device, - input_name=self._input_name, - outputs=self._output_names, - trt_cuda_graph_cache=self._trt_cuda_graph_cache, - ) - ) - return results[0] - else: - return infer_from_trt_engine( - pre_processed_images=pre_processed_images, - trt_config=self._trt_config, - engine=self._engine, - context=self._execution_context, - device=self._device, - input_name=self._input_name, - outputs=self._output_names, - )[0] + return infer_from_trt_engine( + pre_processed_images=pre_processed_images, + trt_config=self._trt_config, + engine=self._engine, + context=self._execution_context if not use_cuda_graph else None, + device=self._device, + input_name=self._input_name, + outputs=self._output_names, + trt_cuda_graph_cache=cache, + )[0] def post_process( self, From 51f191ced33f9d725e1d87e8bb7081ce0e9e5a10 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Wed, 11 Feb 2026 18:57:10 +0000 Subject: [PATCH 23/50] working on memory profiling for cudagraphs --- .../profiling/profile_cudagraph_vram.py | 207 ++++++++++++++++++ 1 file changed, 207 insertions(+) create mode 100644 inference_models/development/profiling/profile_cudagraph_vram.py diff --git a/inference_models/development/profiling/profile_cudagraph_vram.py b/inference_models/development/profiling/profile_cudagraph_vram.py new file mode 100644 index 0000000000..0c0b3ddfdd --- /dev/null +++ b/inference_models/development/profiling/profile_cudagraph_vram.py @@ -0,0 +1,207 @@ +"""Profile VRAM usage as the number of cached CUDA graphs grows. 
+ +Loads yolov8n-640 as a TRT model with dynamic batch size, then runs forward +passes with varying batch sizes (in shuffled order) to force new graph captures. + +Measures VRAM two ways after each capture: + - "Tensor bytes": directly summed from input_buffer + output_buffers in the cache. + - "Driver bytes": total GPU memory used, via torch.cuda.mem_get_info() which + queries the NVIDIA driver. This captures opaque allocations (TRT execution + contexts, CUDA graph objects, streams, internal workspace) that are invisible + to PyTorch's allocator. + +The difference (driver - tensor - baseline) isolates the opaque overhead. + +Example invocation: + python profile_cudagraph_vram.py --device cuda:0 --max-batch-size 32 + + python profile_cudagraph_vram.py --device cuda:0 --max-batch-size 16 --output vram.png +""" + +import argparse +import gc +import random +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np +import torch + +from inference_models import AutoModel +from inference_models.models.common.trt import TRTCudaGraphLRUCache, TRTCudaGraphState + +MODEL_ID = "yolov8n-640" + + +def graph_state_tensor_bytes(state: TRTCudaGraphState) -> int: + total = state.input_buffer.nbytes + for buf in state.output_buffers: + total += buf.nbytes + return total + + +def cache_total_tensor_bytes(cache: TRTCudaGraphLRUCache) -> int: + total = 0 + for state in cache.cache.values(): + total += graph_state_tensor_bytes(state) + return total + + +def driver_used_bytes(device: torch.device) -> int: + free, total = torch.cuda.mem_get_info(device) + return total - free + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Profile VRAM usage vs. number of cached CUDA graphs (varying batch size).", + ) + parser.add_argument( + "--device", + type=str, + default="cuda:0", + ) + parser.add_argument( + "--max-batch-size", + type=int, + default=16, + help="Largest batch size to test. 
Each batch size from 1..max creates a new graph.", + ) + parser.add_argument( + "--output", + type=str, + default=None, + help="Path to save the plot image. Defaults to 'vram_yolov8n-640.png'.", + ) + return parser.parse_args() + + +def main() -> None: + args = parse_args() + device = torch.device(args.device) + + model = AutoModel.from_pretrained( + model_id_or_path=MODEL_ID, + device=device, + backend="trt", + batch_size=(1, args.max_batch_size), + cuda_graph_cache_capacity=args.max_batch_size + 10, + ) + + image = (np.random.rand(640, 640, 3) * 255).astype(np.uint8) + single_preprocessed, _ = model.pre_process(image) + + model.forward(single_preprocessed, use_cuda_graph=False) + torch.cuda.synchronize(device) + gc.collect() + torch.cuda.empty_cache() + + baseline_driver_bytes = driver_used_bytes(device) + + model._trt_cuda_graph_cache = TRTCudaGraphLRUCache( + capacity=args.max_batch_size + 10, + ) + + batch_size_order = list(range(1, args.max_batch_size + 1)) + random.Random(42).shuffle(batch_size_order) + + batch_sizes = [] + cumulative_tensor_mb = [] + cumulative_driver_mb = [] + per_graph_tensor_mb = [] + per_graph_driver_mb = [] + + prev_tensor_bytes = 0 + prev_driver_bytes = baseline_driver_bytes + for i, bs in enumerate(batch_size_order): + batched = single_preprocessed.expand(bs, -1, -1, -1).contiguous() + model.forward(batched, use_cuda_graph=True) + torch.cuda.synchronize(device) + + tensor_bytes = cache_total_tensor_bytes(model._trt_cuda_graph_cache) + drv_bytes = driver_used_bytes(device) + + tensor_delta = tensor_bytes - prev_tensor_bytes + driver_delta = drv_bytes - prev_driver_bytes + + batch_sizes.append(bs) + cumulative_tensor_mb.append(tensor_bytes / (1024 ** 2)) + cumulative_driver_mb.append((drv_bytes - baseline_driver_bytes) / (1024 ** 2)) + per_graph_tensor_mb.append(tensor_delta / (1024 ** 2)) + per_graph_driver_mb.append(driver_delta / (1024 ** 2)) + + prev_tensor_bytes = tensor_bytes + prev_driver_bytes = drv_bytes + print( + f"[{i + 
1}/{args.max_batch_size}] " + f"bs={bs:>2d} | " + f"tensors: {tensor_bytes / (1024 ** 2):>7.1f} MB (+{tensor_delta / (1024 ** 2):>6.1f}) | " + f"driver: {(drv_bytes - baseline_driver_bytes) / (1024 ** 2):>7.1f} MB (+{driver_delta / (1024 ** 2):>6.1f})" + ) + + output_path = Path(args.output) if args.output else Path(f"vram_{MODEL_ID}.png") + + fig, axes = plt.subplots(2, 1, figsize=(14, 10)) + fig.suptitle( + f"CUDA Graph Cache VRAM (varying batch size) — {MODEL_ID}", + fontsize=14, + ) + + capture_order = list(range(1, len(batch_sizes) + 1)) + bar_width = 0.35 + + ax_cum = axes[0] + x_cum = np.arange(len(capture_order)) + ax_cum.bar( + x_cum - bar_width / 2, cumulative_driver_mb, bar_width, + color="steelblue", label="Driver-level (total GPU)", + ) + ax_cum.bar( + x_cum + bar_width / 2, cumulative_tensor_mb, bar_width, + color="darkorange", label="Cache tensors only", + ) + ax_cum.set_ylabel("Cumulative VRAM above baseline (MB)") + ax_cum.set_xlabel("Number of Cached Graphs (capture order)") + ax_cum.set_xticks(x_cum) + ax_cum.set_xticklabels( + [f"{n}\n(bs={bs})" for n, bs in zip(capture_order, batch_sizes)], + fontsize=7, + ) + ax_cum.legend() + + sorted_indices = sorted(range(len(batch_sizes)), key=lambda k: batch_sizes[k]) + sorted_bs = [batch_sizes[k] for k in sorted_indices] + sorted_driver = [per_graph_driver_mb[k] for k in sorted_indices] + sorted_tensor = [per_graph_tensor_mb[k] for k in sorted_indices] + + ax_pg = axes[1] + x_pg = np.arange(len(sorted_bs)) + ax_pg.bar( + x_pg - bar_width / 2, sorted_driver, bar_width, + color="steelblue", label="Driver-level (total GPU)", + ) + ax_pg.bar( + x_pg + bar_width / 2, sorted_tensor, bar_width, + color="darkorange", label="Cache tensors only", + ) + ax_pg.set_ylabel("Per-Graph VRAM (MB)") + ax_pg.set_xlabel("Batch Size") + ax_pg.set_xticks(x_pg) + ax_pg.set_xticklabels([str(bs) for bs in sorted_bs]) + ax_pg.legend() + + plt.tight_layout() + fig.savefig(output_path, dpi=150) + print(f"\nPlot saved to 
{output_path}") + + total_tensor = prev_tensor_bytes / (1024 ** 2) + total_driver = (prev_driver_bytes - baseline_driver_bytes) / (1024 ** 2) + n = len(batch_sizes) + print(f"\nAfter {n} graphs:") + print(f" Cache tensor VRAM: {total_tensor:.1f} MB (avg {total_tensor / n:.1f} MB/graph)") + print(f" Driver-level VRAM: {total_driver:.1f} MB (avg {total_driver / n:.1f} MB/graph)") + print(f" Opaque overhead: {total_driver - total_tensor:.1f} MB (avg {(total_driver - total_tensor) / n:.1f} MB/graph)") + + +if __name__ == "__main__": + main() From a80a5727897508a6ae34fb4ce3ecc2f2daa234f4 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Wed, 11 Feb 2026 19:29:49 +0000 Subject: [PATCH 24/50] simplify memory profiling script --- .../profiling/profile_cudagraph_vram.py | 192 +++++++----------- 1 file changed, 77 insertions(+), 115 deletions(-) diff --git a/inference_models/development/profiling/profile_cudagraph_vram.py b/inference_models/development/profiling/profile_cudagraph_vram.py index 0c0b3ddfdd..412458e114 100644 --- a/inference_models/development/profiling/profile_cudagraph_vram.py +++ b/inference_models/development/profiling/profile_cudagraph_vram.py @@ -1,25 +1,20 @@ -"""Profile VRAM usage as the number of cached CUDA graphs grows. +"""Profile GPU and CPU memory usage as CUDA graphs are cached. -Loads yolov8n-640 as a TRT model with dynamic batch size, then runs forward -passes with varying batch sizes (in shuffled order) to force new graph captures. - -Measures VRAM two ways after each capture: - - "Tensor bytes": directly summed from input_buffer + output_buffers in the cache. - - "Driver bytes": total GPU memory used, via torch.cuda.mem_get_info() which - queries the NVIDIA driver. This captures opaque allocations (TRT execution - contexts, CUDA graph objects, streams, internal workspace) that are invisible - to PyTorch's allocator. - -The difference (driver - tensor - baseline) isolates the opaque overhead. 
+Loads yolov8n-640 as a TRT model with dynamic batch size, runs forward passes +with batch sizes 1-16 in a deterministic random order, and after each capture +records both GPU VRAM (driver-level) and process CPU RSS. Produces a two-panel +plot: cumulative memory over capture order, and per-graph delta sorted by batch +size. Example invocation: - python profile_cudagraph_vram.py --device cuda:0 --max-batch-size 32 + python profile_cudagraph_vram.py --device cuda:0 - python profile_cudagraph_vram.py --device cuda:0 --max-batch-size 16 --output vram.png + python profile_cudagraph_vram.py --device cuda:0 --shuffle --max-batch-size 32 --output mem.png """ import argparse import gc +import os import random from pathlib import Path @@ -28,51 +23,31 @@ import torch from inference_models import AutoModel -from inference_models.models.common.trt import TRTCudaGraphLRUCache, TRTCudaGraphState +from inference_models.models.common.trt import TRTCudaGraphLRUCache MODEL_ID = "yolov8n-640" +MB = 1024 ** 2 -def graph_state_tensor_bytes(state: TRTCudaGraphState) -> int: - total = state.input_buffer.nbytes - for buf in state.output_buffers: - total += buf.nbytes - return total - - -def cache_total_tensor_bytes(cache: TRTCudaGraphLRUCache) -> int: - total = 0 - for state in cache.cache.values(): - total += graph_state_tensor_bytes(state) - return total - - -def driver_used_bytes(device: torch.device) -> int: +def gpu_used_bytes(device: torch.device) -> int: free, total = torch.cuda.mem_get_info(device) return total - free +def cpu_rss_bytes() -> int: + with open(f"/proc/{os.getpid()}/statm") as f: + pages = int(f.read().split()[1]) + return pages * os.sysconf("SC_PAGE_SIZE") + + def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( - description="Profile VRAM usage vs. 
number of cached CUDA graphs (varying batch size).", - ) - parser.add_argument( - "--device", - type=str, - default="cuda:0", - ) - parser.add_argument( - "--max-batch-size", - type=int, - default=16, - help="Largest batch size to test. Each batch size from 1..max creates a new graph.", - ) - parser.add_argument( - "--output", - type=str, - default=None, - help="Path to save the plot image. Defaults to 'vram_yolov8n-640.png'.", + description="Profile GPU + CPU memory vs. number of cached CUDA graphs.", ) + parser.add_argument("--device", type=str, default="cuda:0") + parser.add_argument("--max-batch-size", type=int, default=16) + parser.add_argument("--shuffle", action="store_true", help="Randomize batch size order (deterministic seed).") + parser.add_argument("--output", type=str, default=None) return parser.parse_args() @@ -92,115 +67,102 @@ def main() -> None: single_preprocessed, _ = model.pre_process(image) model.forward(single_preprocessed, use_cuda_graph=False) - torch.cuda.synchronize(device) gc.collect() + torch.cuda.synchronize(device) torch.cuda.empty_cache() - baseline_driver_bytes = driver_used_bytes(device) + baseline_gpu = gpu_used_bytes(device) + baseline_cpu = cpu_rss_bytes() model._trt_cuda_graph_cache = TRTCudaGraphLRUCache( capacity=args.max_batch_size + 10, ) batch_size_order = list(range(1, args.max_batch_size + 1)) - random.Random(42).shuffle(batch_size_order) + if args.shuffle: + random.Random(42).shuffle(batch_size_order) batch_sizes = [] - cumulative_tensor_mb = [] - cumulative_driver_mb = [] - per_graph_tensor_mb = [] - per_graph_driver_mb = [] + cumulative_gpu_mb = [] + cumulative_cpu_mb = [] + delta_gpu_mb = [] + delta_cpu_mb = [] + + prev_gpu = baseline_gpu + prev_cpu = baseline_cpu - prev_tensor_bytes = 0 - prev_driver_bytes = baseline_driver_bytes for i, bs in enumerate(batch_size_order): batched = single_preprocessed.expand(bs, -1, -1, -1).contiguous() - model.forward(batched, use_cuda_graph=True) + output = model.forward(batched, 
use_cuda_graph=True) + del output + gc.collect() torch.cuda.synchronize(device) - tensor_bytes = cache_total_tensor_bytes(model._trt_cuda_graph_cache) - drv_bytes = driver_used_bytes(device) - - tensor_delta = tensor_bytes - prev_tensor_bytes - driver_delta = drv_bytes - prev_driver_bytes + gpu = gpu_used_bytes(device) + cpu = cpu_rss_bytes() batch_sizes.append(bs) - cumulative_tensor_mb.append(tensor_bytes / (1024 ** 2)) - cumulative_driver_mb.append((drv_bytes - baseline_driver_bytes) / (1024 ** 2)) - per_graph_tensor_mb.append(tensor_delta / (1024 ** 2)) - per_graph_driver_mb.append(driver_delta / (1024 ** 2)) + cumulative_gpu_mb.append((gpu - baseline_gpu) / MB) + cumulative_cpu_mb.append((cpu - baseline_cpu) / MB) + delta_gpu_mb.append((gpu - prev_gpu) / MB) + delta_cpu_mb.append((cpu - prev_cpu) / MB) - prev_tensor_bytes = tensor_bytes - prev_driver_bytes = drv_bytes print( - f"[{i + 1}/{args.max_batch_size}] " - f"bs={bs:>2d} | " - f"tensors: {tensor_bytes / (1024 ** 2):>7.1f} MB (+{tensor_delta / (1024 ** 2):>6.1f}) | " - f"driver: {(drv_bytes - baseline_driver_bytes) / (1024 ** 2):>7.1f} MB (+{driver_delta / (1024 ** 2):>6.1f})" + f"[{i + 1}/{args.max_batch_size}] bs={bs:>2d} | " + f"GPU: {cumulative_gpu_mb[-1]:>7.1f} MB (+{delta_gpu_mb[-1]:>6.1f}) | " + f"CPU: {cumulative_cpu_mb[-1]:>7.1f} MB (+{delta_cpu_mb[-1]:>6.1f})" ) + prev_gpu = gpu + prev_cpu = cpu - output_path = Path(args.output) if args.output else Path(f"vram_{MODEL_ID}.png") + autogenerated_name = f"vram_{MODEL_ID}_{'shuffle' if args.shuffle else 'sequential'}.png" + output_path = Path(args.output) if args.output else Path(autogenerated_name) - fig, axes = plt.subplots(2, 1, figsize=(14, 10)) + fig, (ax_cum, ax_delta) = plt.subplots(2, 1, figsize=(14, 10)) fig.suptitle( - f"CUDA Graph Cache VRAM (varying batch size) — {MODEL_ID}", + f"Memory vs. 
CUDA Graph Count (varying batch size) — {MODEL_ID}", fontsize=14, ) capture_order = list(range(1, len(batch_sizes) + 1)) - bar_width = 0.35 - - ax_cum = axes[0] - x_cum = np.arange(len(capture_order)) - ax_cum.bar( - x_cum - bar_width / 2, cumulative_driver_mb, bar_width, - color="steelblue", label="Driver-level (total GPU)", - ) - ax_cum.bar( - x_cum + bar_width / 2, cumulative_tensor_mb, bar_width, - color="darkorange", label="Cache tensors only", - ) - ax_cum.set_ylabel("Cumulative VRAM above baseline (MB)") - ax_cum.set_xlabel("Number of Cached Graphs (capture order)") - ax_cum.set_xticks(x_cum) + x = np.arange(len(capture_order)) + w = 0.35 + + ax_cum.bar(x - w / 2, cumulative_gpu_mb, w, color="steelblue", label="GPU VRAM") + ax_cum.bar(x + w / 2, cumulative_cpu_mb, w, color="seagreen", label="CPU RSS") + ax_cum.set_ylabel("Memory above baseline (MB)") + ax_cum.set_xlabel("Capture order") + ax_cum.set_xticks(x) ax_cum.set_xticklabels( [f"{n}\n(bs={bs})" for n, bs in zip(capture_order, batch_sizes)], fontsize=7, ) ax_cum.legend() - sorted_indices = sorted(range(len(batch_sizes)), key=lambda k: batch_sizes[k]) - sorted_bs = [batch_sizes[k] for k in sorted_indices] - sorted_driver = [per_graph_driver_mb[k] for k in sorted_indices] - sorted_tensor = [per_graph_tensor_mb[k] for k in sorted_indices] + sorted_idx = sorted(range(len(batch_sizes)), key=lambda k: batch_sizes[k]) + s_bs = [batch_sizes[k] for k in sorted_idx] + s_gpu = [delta_gpu_mb[k] for k in sorted_idx] + s_cpu = [delta_cpu_mb[k] for k in sorted_idx] - ax_pg = axes[1] - x_pg = np.arange(len(sorted_bs)) - ax_pg.bar( - x_pg - bar_width / 2, sorted_driver, bar_width, - color="steelblue", label="Driver-level (total GPU)", - ) - ax_pg.bar( - x_pg + bar_width / 2, sorted_tensor, bar_width, - color="darkorange", label="Cache tensors only", - ) - ax_pg.set_ylabel("Per-Graph VRAM (MB)") - ax_pg.set_xlabel("Batch Size") - ax_pg.set_xticks(x_pg) - ax_pg.set_xticklabels([str(bs) for bs in sorted_bs]) - 
ax_pg.legend() + x2 = np.arange(len(s_bs)) + ax_delta.bar(x2 - w / 2, s_gpu, w, color="steelblue", label="GPU VRAM") + ax_delta.bar(x2 + w / 2, s_cpu, w, color="seagreen", label="CPU RSS") + ax_delta.set_ylabel("Per-graph memory delta (MB)") + ax_delta.set_xlabel("Batch size") + ax_delta.set_xticks(x2) + ax_delta.set_xticklabels([str(bs) for bs in s_bs]) + ax_delta.legend() plt.tight_layout() fig.savefig(output_path, dpi=150) print(f"\nPlot saved to {output_path}") - total_tensor = prev_tensor_bytes / (1024 ** 2) - total_driver = (prev_driver_bytes - baseline_driver_bytes) / (1024 ** 2) + final_gpu = (prev_gpu - baseline_gpu) / MB + final_cpu = (prev_cpu - baseline_cpu) / MB n = len(batch_sizes) print(f"\nAfter {n} graphs:") - print(f" Cache tensor VRAM: {total_tensor:.1f} MB (avg {total_tensor / n:.1f} MB/graph)") - print(f" Driver-level VRAM: {total_driver:.1f} MB (avg {total_driver / n:.1f} MB/graph)") - print(f" Opaque overhead: {total_driver - total_tensor:.1f} MB (avg {(total_driver - total_tensor) / n:.1f} MB/graph)") + print(f" GPU VRAM: +{final_gpu:.1f} MB total ({final_gpu / n:.1f} MB/graph avg)") + print(f" CPU RSS: +{final_cpu:.1f} MB total ({final_cpu / n:.1f} MB/graph avg)") if __name__ == "__main__": From 845fabd0b601be2cb1c603c596cf6493449b27cc Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Wed, 11 Feb 2026 20:42:49 +0000 Subject: [PATCH 25/50] tweaks --- inference_models/inference_models/models/common/trt.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/inference_models/inference_models/models/common/trt.py b/inference_models/inference_models/models/common/trt.py index 1e6ca5c7d8..c618aadbef 100644 --- a/inference_models/inference_models/models/common/trt.py +++ b/inference_models/inference_models/models/common/trt.py @@ -294,7 +294,7 @@ def infer_from_trt_engine( else: min_batch_size = trt_config.dynamic_batch_size_min max_batch_size = trt_config.dynamic_batch_size_max - return 
_infer_from_trt_engine_with_batch_size_boundaries( + return infer_from_trt_engine_with_batch_size_boundaries( pre_processed_images=pre_processed_images, engine=engine, context=context, @@ -307,7 +307,7 @@ def infer_from_trt_engine( ) -def _infer_from_trt_engine_with_batch_size_boundaries( +def infer_from_trt_engine_with_batch_size_boundaries( pre_processed_images: torch.Tensor, engine: trt.ICudaEngine, context: Optional[trt.IExecutionContext], @@ -332,7 +332,7 @@ def _infer_from_trt_engine_with_batch_size_boundaries( ), dim=0, ) - results = _execute_trt_engine( + results = execute_trt_engine( pre_processed_images=pre_processed_images, engine=engine, context=context, @@ -362,7 +362,7 @@ def _infer_from_trt_engine_with_batch_size_boundaries( ), dim=0, ) - results = _execute_trt_engine( + results = execute_trt_engine( pre_processed_images=batch, engine=engine, context=context, @@ -378,7 +378,7 @@ def _infer_from_trt_engine_with_batch_size_boundaries( return [torch.cat(e, dim=0).contiguous() for e in all_results] -def _execute_trt_engine( +def execute_trt_engine( pre_processed_images: torch.Tensor, engine: trt.ICudaEngine, context: Optional[trt.IExecutionContext], From 3294cae83ea59edcd82ca416c9d54bd19c18d20f Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Wed, 11 Feb 2026 20:51:18 +0000 Subject: [PATCH 26/50] update tests to work with the new cache --- .../integration_tests/models/test_rfdetr_predictions_trt.py | 4 +++- .../models/test_rfdetr_seg_predictions_trt.py | 4 +++- .../models/test_yolov8_object_detection_predictions_trt.py | 6 ++++-- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py b/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py index 4768fc9043..44ac3dec22 100644 --- a/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py 
@@ -4,6 +4,8 @@ import pytest import torch +from inference_models.models.common.trt import TRTCudaGraphLRUCache + @pytest.mark.slow @pytest.mark.trt_extras @@ -25,7 +27,7 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( outputs = [] for pre_processed in [pre_processed_1, pre_processed_2]: no_graph = model.forward(pre_processed, use_cuda_graph=False) - model._trt_cuda_graph_state = None + model._trt_cuda_graph_cache = TRTCudaGraphLRUCache() capture_graph = model.forward(pre_processed, use_cuda_graph=True) replay_graph = model.forward(pre_processed, use_cuda_graph=True) diff --git a/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py b/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py index c5591aab9e..2e8c9759fe 100644 --- a/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py @@ -2,6 +2,8 @@ import pytest import torch +from inference_models.models.common.trt import TRTCudaGraphLRUCache + @pytest.mark.slow @pytest.mark.trt_extras @@ -23,7 +25,7 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( outputs = [] for pre_processed in [pre_processed_1, pre_processed_2]: no_graph = model.forward(pre_processed, use_cuda_graph=False) - model._trt_cuda_graph_cache = None + model._trt_cuda_graph_cache = TRTCudaGraphLRUCache() capture_graph = model.forward(pre_processed, use_cuda_graph=True) replay_graph = model.forward(pre_processed, use_cuda_graph=True) diff --git a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py index 6031df5c6e..f5e8e19001 100644 --- a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py +++ 
b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py @@ -2,6 +2,8 @@ import pytest import torch +from inference_models.models.common.trt import TRTCudaGraphLRUCache + @pytest.mark.slow @pytest.mark.trt_extras @@ -18,7 +20,7 @@ def test_trt_cudagraph_cache_reuses_previously_seen_input_shapes( ) pre_processed_single, _ = model.pre_process(dog_image_numpy) - model._trt_cuda_graph_cache = None + model._trt_cuda_graph_cache = TRTCudaGraphLRUCache() seen_shapes = set() capture_outputs = {} @@ -69,7 +71,7 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( no_graph = model.forward(batch, use_cuda_graph=False) - model._trt_cuda_graph_cache = None + model._trt_cuda_graph_cache = TRTCudaGraphLRUCache() capture_graph = model.forward(batch, use_cuda_graph=True) replay_graph = model.forward(batch, use_cuda_graph=True) From bbb25405c9a3662a0dde20aa9977db09f713c0c2 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Wed, 11 Feb 2026 21:01:26 +0000 Subject: [PATCH 27/50] thanks for the PR review, Claude --- .../inference_models/models/common/trt.py | 13 +++++++++++-- .../test_yolov8_object_detection_predictions_trt.py | 6 +++--- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/inference_models/inference_models/models/common/trt.py b/inference_models/inference_models/models/common/trt.py index c618aadbef..19553cb48f 100644 --- a/inference_models/inference_models/models/common/trt.py +++ b/inference_models/inference_models/models/common/trt.py @@ -95,7 +95,11 @@ def __setitem__( self.cache[key] = value self.cache.move_to_end(key) if len(self.cache) > self.capacity: - self.cache.popitem(last=False) + _, evicted = self.cache.popitem(last=False) + del evicted.cuda_graph + del evicted.input_buffer + del evicted.output_buffers + del evicted.execution_context def get_trt_engine_inputs_and_outputs( @@ -393,7 +397,7 @@ def execute_trt_engine( cache_key = (input_shape, input_dtype, device) if cache_key not in 
trt_cuda_graph_cache: - LOGGER.debug(f"Capturing CUDA graph for shape {input_shape}") + LOGGER.debug("Capturing CUDA graph for shape %s", input_shape) results, trt_cuda_graph = _capture_cuda_graph( pre_processed_images=pre_processed_images, @@ -416,6 +420,11 @@ def execute_trt_engine( return results else: + if context is None: + raise ModelRuntimeError( + message="An execution context is required when not using CUDA graphs.", + help_url="https://todo", + ) status = context.set_input_shape(input_name, tuple(pre_processed_images.shape)) if not status: raise ModelRuntimeError( diff --git a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py index f5e8e19001..cb1b5bc238 100644 --- a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py @@ -46,7 +46,7 @@ def test_trt_cudagraph_cache_reuses_previously_seen_input_shapes( continue assert cache_size_after == cache_size_before - assert torch.allclose(capture_outputs[cache_key], output, atol=1e-3) + assert torch.allclose(capture_outputs[cache_key], output, atol=1e-6) assert set(model._trt_cuda_graph_cache.cache.keys()) == seen_shapes @@ -75,5 +75,5 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( capture_graph = model.forward(batch, use_cuda_graph=True) replay_graph = model.forward(batch, use_cuda_graph=True) - assert torch.allclose(no_graph, capture_graph, atol=1e-3) - assert torch.allclose(no_graph, replay_graph, atol=1e-3) + assert torch.allclose(no_graph, capture_graph, atol=1e-6) + assert torch.allclose(no_graph, replay_graph, atol=1e-6) From 4eb23fce0754379fac6f61ce4c3f92c8cbbcac2d Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Wed, 11 Feb 2026 21:28:30 +0000 Subject: [PATCH 28/50] see effect of cache size on vram 
profile script --- .../profiling/profile_cudagraph_vram.py | 134 +++++++++++------- 1 file changed, 85 insertions(+), 49 deletions(-) diff --git a/inference_models/development/profiling/profile_cudagraph_vram.py b/inference_models/development/profiling/profile_cudagraph_vram.py index 412458e114..1d2d9a0964 100644 --- a/inference_models/development/profiling/profile_cudagraph_vram.py +++ b/inference_models/development/profiling/profile_cudagraph_vram.py @@ -1,15 +1,34 @@ -"""Profile GPU and CPU memory usage as CUDA graphs are cached. +"""Profile GPU and CPU memory usage as CUDA graphs are cached and evicted. Loads yolov8n-640 as a TRT model with dynamic batch size, runs forward passes -with batch sizes 1-16 in a deterministic random order, and after each capture -records both GPU VRAM (driver-level) and process CPU RSS. Produces a two-panel -plot: cumulative memory over capture order, and per-graph delta sorted by batch -size. +with random batch sizes, and after each step records both GPU VRAM +(driver-level) and process CPU RSS. The cache capacity is smaller than the +number of distinct batch sizes, so eviction is exercised and memory usage +should plateau. 
Example invocation: - python profile_cudagraph_vram.py --device cuda:0 - - python profile_cudagraph_vram.py --device cuda:0 --shuffle --max-batch-size 32 --output mem.png + python profile_cudagraph_vram.py \ + --device cuda:0 \ + --num-steps 64 \ + --max-batch-size 16 \ + --cache-capacity 16 \ + --output vram_sequential.png + + python profile_cudagraph_vram.py \ + --device cuda:0 \ + --num-steps 64 \ + --max-batch-size 16 \ + --cache-capacity 16 \ + --shuffle \ + --output vram_shuffle.png + + python profile_cudagraph_vram.py \ + --device cuda:0 \ + --shuffle \ + --num-steps 64 \ + --max-batch-size 16 \ + --cache-capacity 8 \ + --output vram_shuffle_eviction.png """ import argparse @@ -46,7 +65,10 @@ def parse_args() -> argparse.Namespace: ) parser.add_argument("--device", type=str, default="cuda:0") parser.add_argument("--max-batch-size", type=int, default=16) - parser.add_argument("--shuffle", action="store_true", help="Randomize batch size order (deterministic seed).") + parser.add_argument("--cache-capacity", type=int, default=8) + parser.add_argument("--num-steps", type=int, default=32) + parser.add_argument("--shuffle", action="store_true", help="Randomize batch size order instead of sequential cycling.") + parser.add_argument("--seed", type=int, default=42) parser.add_argument("--output", type=str, default=None) return parser.parse_args() @@ -55,12 +77,14 @@ def main() -> None: args = parse_args() device = torch.device(args.device) + rng = random.Random(args.seed) + model = AutoModel.from_pretrained( model_id_or_path=MODEL_ID, device=device, backend="trt", batch_size=(1, args.max_batch_size), - cuda_graph_cache_capacity=args.max_batch_size + 10, + cuda_graph_cache_capacity=args.cache_capacity, ) image = (np.random.rand(640, 640, 3) * 255).astype(np.uint8) @@ -75,23 +99,31 @@ def main() -> None: baseline_cpu = cpu_rss_bytes() model._trt_cuda_graph_cache = TRTCudaGraphLRUCache( - capacity=args.max_batch_size + 10, + capacity=args.cache_capacity, ) - 
batch_size_order = list(range(1, args.max_batch_size + 1)) if args.shuffle: - random.Random(42).shuffle(batch_size_order) + batch_size_sequence = [ + rng.randint(1, args.max_batch_size) for _ in range(args.num_steps) + ] + else: + all_sizes = list(range(1, args.max_batch_size + 1)) + batch_size_sequence = [ + all_sizes[i % len(all_sizes)] for i in range(args.num_steps) + ] + + from collections import defaultdict batch_sizes = [] cumulative_gpu_mb = [] cumulative_cpu_mb = [] - delta_gpu_mb = [] - delta_cpu_mb = [] + gpu_deltas_by_bs: dict[int, list[float]] = defaultdict(list) + cpu_deltas_by_bs: dict[int, list[float]] = defaultdict(list) prev_gpu = baseline_gpu prev_cpu = baseline_cpu - for i, bs in enumerate(batch_size_order): + for i, bs in enumerate(batch_size_sequence): batched = single_preprocessed.expand(bs, -1, -1, -1).contiguous() output = model.forward(batched, use_cuda_graph=True) del output @@ -100,69 +132,73 @@ def main() -> None: gpu = gpu_used_bytes(device) cpu = cpu_rss_bytes() + cache_size = len(model._trt_cuda_graph_cache.cache) + + gpu_delta = (gpu - prev_gpu) / MB + cpu_delta = (cpu - prev_cpu) / MB batch_sizes.append(bs) cumulative_gpu_mb.append((gpu - baseline_gpu) / MB) cumulative_cpu_mb.append((cpu - baseline_cpu) / MB) - delta_gpu_mb.append((gpu - prev_gpu) / MB) - delta_cpu_mb.append((cpu - prev_cpu) / MB) + gpu_deltas_by_bs[bs].append(gpu_delta) + cpu_deltas_by_bs[bs].append(cpu_delta) print( - f"[{i + 1}/{args.max_batch_size}] bs={bs:>2d} | " - f"GPU: {cumulative_gpu_mb[-1]:>7.1f} MB (+{delta_gpu_mb[-1]:>6.1f}) | " - f"CPU: {cumulative_cpu_mb[-1]:>7.1f} MB (+{delta_cpu_mb[-1]:>6.1f})" + f"[{i + 1}/{args.num_steps}] bs={bs:>2d} | " + f"cache: {cache_size}/{args.cache_capacity} | " + f"GPU: {cumulative_gpu_mb[-1]:>7.1f} MB (+{gpu_delta:>6.1f}) | " + f"CPU: {cumulative_cpu_mb[-1]:>7.1f} MB (+{cpu_delta:>6.1f})" ) prev_gpu = gpu prev_cpu = cpu - autogenerated_name = f"vram_{MODEL_ID}_{'shuffle' if args.shuffle else 'sequential'}.png" + mode = 
"shuffle" if args.shuffle else "sequential" + autogenerated_name = f"vram_{MODEL_ID}_cap{args.cache_capacity}_{mode}.png" output_path = Path(args.output) if args.output else Path(autogenerated_name) fig, (ax_cum, ax_delta) = plt.subplots(2, 1, figsize=(14, 10)) fig.suptitle( - f"Memory vs. CUDA Graph Count (varying batch size) — {MODEL_ID}", + f"Memory vs. Step (cache capacity={args.cache_capacity}, " + f"batch sizes 1–{args.max_batch_size}) — {MODEL_ID}", fontsize=14, ) - capture_order = list(range(1, len(batch_sizes) + 1)) - x = np.arange(len(capture_order)) - w = 0.35 + steps = np.arange(len(batch_sizes)) - ax_cum.bar(x - w / 2, cumulative_gpu_mb, w, color="steelblue", label="GPU VRAM") - ax_cum.bar(x + w / 2, cumulative_cpu_mb, w, color="seagreen", label="CPU RSS") + ax_cum.plot(steps, cumulative_gpu_mb, color="steelblue", marker=".", label="GPU VRAM") + ax_cum.plot(steps, cumulative_cpu_mb, color="seagreen", marker=".", label="CPU RSS") ax_cum.set_ylabel("Memory above baseline (MB)") - ax_cum.set_xlabel("Capture order") - ax_cum.set_xticks(x) - ax_cum.set_xticklabels( - [f"{n}\n(bs={bs})" for n, bs in zip(capture_order, batch_sizes)], - fontsize=7, - ) + ax_cum.set_xlabel("Step") + for i, bs in enumerate(batch_sizes): + ax_cum.annotate( + str(bs), (i, cumulative_gpu_mb[i]), + textcoords="offset points", xytext=(0, 6), + fontsize=6, ha="center", color="steelblue", + ) ax_cum.legend() - sorted_idx = sorted(range(len(batch_sizes)), key=lambda k: batch_sizes[k]) - s_bs = [batch_sizes[k] for k in sorted_idx] - s_gpu = [delta_gpu_mb[k] for k in sorted_idx] - s_cpu = [delta_cpu_mb[k] for k in sorted_idx] + sorted_bs = sorted(gpu_deltas_by_bs.keys()) + avg_gpu = [np.mean(gpu_deltas_by_bs[bs]) for bs in sorted_bs] + avg_cpu = [np.mean(cpu_deltas_by_bs[bs]) for bs in sorted_bs] - x2 = np.arange(len(s_bs)) - ax_delta.bar(x2 - w / 2, s_gpu, w, color="steelblue", label="GPU VRAM") - ax_delta.bar(x2 + w / 2, s_cpu, w, color="seagreen", label="CPU RSS") - 
ax_delta.set_ylabel("Per-graph memory delta (MB)") + x2 = np.arange(len(sorted_bs)) + w = 0.35 + ax_delta.bar(x2 - w / 2, avg_gpu, w, color="steelblue", label="GPU VRAM") + ax_delta.bar(x2 + w / 2, avg_cpu, w, color="seagreen", label="CPU RSS") + ax_delta.set_ylabel("Mean per-step memory delta (MB)") ax_delta.set_xlabel("Batch size") ax_delta.set_xticks(x2) - ax_delta.set_xticklabels([str(bs) for bs in s_bs]) + ax_delta.set_xticklabels([str(bs) for bs in sorted_bs]) ax_delta.legend() plt.tight_layout() fig.savefig(output_path, dpi=150) print(f"\nPlot saved to {output_path}") - final_gpu = (prev_gpu - baseline_gpu) / MB - final_cpu = (prev_cpu - baseline_cpu) / MB - n = len(batch_sizes) - print(f"\nAfter {n} graphs:") - print(f" GPU VRAM: +{final_gpu:.1f} MB total ({final_gpu / n:.1f} MB/graph avg)") - print(f" CPU RSS: +{final_cpu:.1f} MB total ({final_cpu / n:.1f} MB/graph avg)") + print(f"\nFinal GPU VRAM above baseline: {cumulative_gpu_mb[-1]:.1f} MB") + print(f"Final CPU RSS above baseline: {cumulative_cpu_mb[-1]:.1f} MB") + print(f"Peak GPU VRAM above baseline: {max(cumulative_gpu_mb):.1f} MB") + print(f"Cache entries at end: {cache_size}/{args.cache_capacity}") if __name__ == "__main__": From aa87393f0b29b4a2d5bc63be418d07568cb18ed1 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Wed, 11 Feb 2026 21:40:34 +0000 Subject: [PATCH 29/50] reduce default cache size to 16 after seeing memory usage --- .../profiling/profile_cudagraph_vram.py | 56 ++++++------------ .../rfdetr_instance_segmentation_trt.py | 2 +- .../rfdetr/rfdetr_object_detection_trt.py | 2 +- .../yolov8/yolov8_object_detection_trt.py | 2 +- ...yolov8_object_detection_predictions_trt.py | 57 +++++++++++++++++++ 5 files changed, 78 insertions(+), 41 deletions(-) diff --git a/inference_models/development/profiling/profile_cudagraph_vram.py b/inference_models/development/profiling/profile_cudagraph_vram.py index 1d2d9a0964..6996c3b98b 100644 --- 
a/inference_models/development/profiling/profile_cudagraph_vram.py +++ b/inference_models/development/profiling/profile_cudagraph_vram.py @@ -29,6 +29,14 @@ --max-batch-size 16 \ --cache-capacity 8 \ --output vram_shuffle_eviction.png + + python profile_cudagraph_vram.py \ + --device cuda:0 \ + --shuffle \ + --num-steps 64 \ + --max-batch-size 2 \ + --cache-capacity 2 \ + --output vram_two_batch_sizes.png """ import argparse @@ -112,16 +120,9 @@ def main() -> None: all_sizes[i % len(all_sizes)] for i in range(args.num_steps) ] - from collections import defaultdict - batch_sizes = [] cumulative_gpu_mb = [] cumulative_cpu_mb = [] - gpu_deltas_by_bs: dict[int, list[float]] = defaultdict(list) - cpu_deltas_by_bs: dict[int, list[float]] = defaultdict(list) - - prev_gpu = baseline_gpu - prev_cpu = baseline_cpu for i, bs in enumerate(batch_size_sequence): batched = single_preprocessed.expand(bs, -1, -1, -1).contiguous() @@ -134,62 +135,41 @@ def main() -> None: cpu = cpu_rss_bytes() cache_size = len(model._trt_cuda_graph_cache.cache) - gpu_delta = (gpu - prev_gpu) / MB - cpu_delta = (cpu - prev_cpu) / MB - batch_sizes.append(bs) cumulative_gpu_mb.append((gpu - baseline_gpu) / MB) cumulative_cpu_mb.append((cpu - baseline_cpu) / MB) - gpu_deltas_by_bs[bs].append(gpu_delta) - cpu_deltas_by_bs[bs].append(cpu_delta) print( f"[{i + 1}/{args.num_steps}] bs={bs:>2d} | " f"cache: {cache_size}/{args.cache_capacity} | " - f"GPU: {cumulative_gpu_mb[-1]:>7.1f} MB (+{gpu_delta:>6.1f}) | " - f"CPU: {cumulative_cpu_mb[-1]:>7.1f} MB (+{cpu_delta:>6.1f})" + f"GPU: {cumulative_gpu_mb[-1]:>7.1f} MB | " + f"CPU: {cumulative_cpu_mb[-1]:>7.1f} MB" ) - prev_gpu = gpu - prev_cpu = cpu mode = "shuffle" if args.shuffle else "sequential" autogenerated_name = f"vram_{MODEL_ID}_cap{args.cache_capacity}_{mode}.png" output_path = Path(args.output) if args.output else Path(autogenerated_name) - fig, (ax_cum, ax_delta) = plt.subplots(2, 1, figsize=(14, 10)) + fig, ax = plt.subplots(figsize=(14, 6)) 
fig.suptitle( f"Memory vs. Step (cache capacity={args.cache_capacity}, " - f"batch sizes 1–{args.max_batch_size}) — {MODEL_ID}", + f"batch sizes 1-{args.max_batch_size}) -- {MODEL_ID}", fontsize=14, ) steps = np.arange(len(batch_sizes)) - ax_cum.plot(steps, cumulative_gpu_mb, color="steelblue", marker=".", label="GPU VRAM") - ax_cum.plot(steps, cumulative_cpu_mb, color="seagreen", marker=".", label="CPU RSS") - ax_cum.set_ylabel("Memory above baseline (MB)") - ax_cum.set_xlabel("Step") + ax.plot(steps, cumulative_gpu_mb, color="steelblue", marker=".", label="GPU VRAM") + ax.plot(steps, cumulative_cpu_mb, color="seagreen", marker=".", label="CPU RSS") + ax.set_ylabel("Memory above baseline (MB)") + ax.set_xlabel("Step") for i, bs in enumerate(batch_sizes): - ax_cum.annotate( + ax.annotate( str(bs), (i, cumulative_gpu_mb[i]), textcoords="offset points", xytext=(0, 6), fontsize=6, ha="center", color="steelblue", ) - ax_cum.legend() - - sorted_bs = sorted(gpu_deltas_by_bs.keys()) - avg_gpu = [np.mean(gpu_deltas_by_bs[bs]) for bs in sorted_bs] - avg_cpu = [np.mean(cpu_deltas_by_bs[bs]) for bs in sorted_bs] - - x2 = np.arange(len(sorted_bs)) - w = 0.35 - ax_delta.bar(x2 - w / 2, avg_gpu, w, color="steelblue", label="GPU VRAM") - ax_delta.bar(x2 + w / 2, avg_cpu, w, color="seagreen", label="CPU RSS") - ax_delta.set_ylabel("Mean per-step memory delta (MB)") - ax_delta.set_xlabel("Batch size") - ax_delta.set_xticks(x2) - ax_delta.set_xticklabels([str(bs) for bs in sorted_bs]) - ax_delta.legend() + ax.legend() plt.tight_layout() fig.savefig(output_path, dpi=150) diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py index 22ffdc9a22..1aa3e3ed9d 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py @@ -81,7 +81,7 @@ def 
from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, - cuda_graph_cache_capacity: int = 64, + cuda_graph_cache_capacity: int = 16, **kwargs, ) -> "RFDetrForInstanceSegmentationTRT": if device.type != "cuda": diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py index 5b163da87c..29be76b5c1 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py @@ -81,7 +81,7 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, - cuda_graph_cache_capacity: int = 64, + cuda_graph_cache_capacity: int = 16, **kwargs, ) -> "RFDetrForObjectDetectionTRT": if device.type != "cuda": diff --git a/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py b/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py index 3794d8ee1e..89b067ffbe 100644 --- a/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py +++ b/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py @@ -156,7 +156,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, - cuda_graph_cache_capacity: int = 64, + cuda_graph_cache_capacity: int = 16, ): self._engine = engine self._input_name = input_name diff --git a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py index cb1b5bc238..bfd9061a3c 100644 --- a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py +++ 
b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py @@ -77,3 +77,60 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( assert torch.allclose(no_graph, capture_graph, atol=1e-6) assert torch.allclose(no_graph, replay_graph, atol=1e-6) + + +@pytest.mark.slow +@pytest.mark.trt_extras +def test_trt_cudagraph_cache_eviction( + yolov8n_640_t4_trt_package: str, + dog_image_numpy: np.ndarray, +) -> None: + from inference_models import AutoModel + + device = torch.device("cuda:0") + model = AutoModel.from_pretrained( + model_id_or_path=yolov8n_640_t4_trt_package, + device=device, + ) + + pre_processed_single, _ = model.pre_process(dog_image_numpy) + capacity = 3 + model._trt_cuda_graph_cache = TRTCudaGraphLRUCache(capacity=capacity) + cache = model._trt_cuda_graph_cache + + batch_sizes = [1, 2, 3] + for bs in batch_sizes: + batch = pre_processed_single.repeat(bs, 1, 1, 1) + model.forward(batch, use_cuda_graph=True) + + assert len(cache.cache) == capacity + keys_before = list(cache.cache.keys()) + + batch_4 = pre_processed_single.repeat(4, 1, 1, 1) + model.forward(batch_4, use_cuda_graph=True) + + assert len(cache.cache) == capacity + assert keys_before[0] not in cache.cache + for key in keys_before[1:]: + assert key in cache.cache + key_4 = (tuple(batch_4.shape), batch_4.dtype, device) + assert key_4 in cache.cache + + batch_2 = pre_processed_single.repeat(2, 1, 1, 1) + model.forward(batch_2, use_cuda_graph=True) + + batch_5 = pre_processed_single.repeat(5, 1, 1, 1) + model.forward(batch_5, use_cuda_graph=True) + + assert len(cache.cache) == capacity + key_3 = (tuple(pre_processed_single.repeat(3, 1, 1, 1).shape), batch_2.dtype, device) + assert key_3 not in cache.cache + + remaining_keys = list(cache.cache.keys()) + key_2 = (tuple(batch_2.shape), batch_2.dtype, device) + key_5 = (tuple(batch_5.shape), batch_5.dtype, device) + assert remaining_keys == [key_4, key_2, key_5] + + no_graph = model.forward(batch_5, 
use_cuda_graph=False) + replay = model.forward(batch_5, use_cuda_graph=True) + assert torch.allclose(no_graph, replay, atol=1e-6) From b5c1f6b2e8c1d9250f267350fcb44a34d7dcf6ee Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Wed, 11 Feb 2026 21:41:36 +0000 Subject: [PATCH 30/50] make style --- inference/core/workflows/core_steps/analytics/overlap/v1.py | 2 +- .../core/workflows/core_steps/sinks/onvif_movement/v1.py | 4 ++-- inference/core/workflows/core_steps/sinks/twilio/sms/v2.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/inference/core/workflows/core_steps/analytics/overlap/v1.py b/inference/core/workflows/core_steps/analytics/overlap/v1.py index 8404e0681a..7bc73b9d39 100644 --- a/inference/core/workflows/core_steps/analytics/overlap/v1.py +++ b/inference/core/workflows/core_steps/analytics/overlap/v1.py @@ -132,7 +132,7 @@ def coords_overlap( # coords are [x1, y1, x2, y2] if overlap_type == "Center Overlap": size = [other[2] - other[0], other[3] - other[1]] - (x, y) = [other[0] + size[0] / 2, other[1] + size[1] / 2] + x, y = [other[0] + size[0] / 2, other[1] + size[1] / 2] return ( x > overlap[0] and x < overlap[2] and y > overlap[1] and y < overlap[3] ) diff --git a/inference/core/workflows/core_steps/sinks/onvif_movement/v1.py b/inference/core/workflows/core_steps/sinks/onvif_movement/v1.py index a26f792e73..8351486ad7 100644 --- a/inference/core/workflows/core_steps/sinks/onvif_movement/v1.py +++ b/inference/core/workflows/core_steps/sinks/onvif_movement/v1.py @@ -874,8 +874,8 @@ def move_camera( xyxy = prediction.xyxy # calculate centers - (x1, y1, x2, y2) = tuple(xyxy[0]) - (image_height, image_width) = tuple(image_dimensions[0]) + x1, y1, x2, y2 = tuple(xyxy[0]) + image_height, image_width = tuple(image_dimensions[0]) center_point = (x1 + (x2 - x1) / 2, y1 + (y2 - y1) / 2) # calculate deltas from center and edge diff --git a/inference/core/workflows/core_steps/sinks/twilio/sms/v2.py 
b/inference/core/workflows/core_steps/sinks/twilio/sms/v2.py index 26c3539881..4bb7493851 100644 --- a/inference/core/workflows/core_steps/sinks/twilio/sms/v2.py +++ b/inference/core/workflows/core_steps/sinks/twilio/sms/v2.py @@ -518,7 +518,7 @@ def format_message( def process_media_urls_for_twilio( - media_url: Union[str, List[Union[str, WorkflowImageData]], WorkflowImageData] + media_url: Union[str, List[Union[str, WorkflowImageData]], WorkflowImageData], ) -> Optional[List[str]]: """ Process media URLs for Twilio MMS. @@ -609,7 +609,7 @@ def _get_mms_placeholder_image_url() -> Optional[str]: def serialize_media_for_api( - media_url: Union[str, List[str], WorkflowImageData, None] + media_url: Union[str, List[str], WorkflowImageData, None], ) -> Tuple[Optional[List[str]], Optional[List[Dict[str, str]]]]: """ Serialize media for API transmission. From a386f3ba6ebf08ef40938ab94cb43d7c80256ede Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Wed, 11 Feb 2026 21:50:38 +0000 Subject: [PATCH 31/50] update default and fix profiling script --- .../development/profiling/profile_rfdetr_trt_cudagraphs.py | 3 ++- inference_models/inference_models/models/common/trt.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py index 733d462216..fe43027db7 100644 --- a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py +++ b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py @@ -7,6 +7,7 @@ from tqdm import tqdm from inference_models import AutoModel +from inference_models.models.common.trt import TRTCudaGraphLRUCache IMAGE_PATH = os.environ.get("IMAGE_PATH", None) DEVICE = os.environ.get("DEVICE", "cuda:0") @@ -40,7 +41,7 @@ def main() -> None: print("Timing with forced CUDA graph recapture each step...") start = time.perf_counter() for _ in range(100): # not using CYCLES here 
bc this is wayyyy slower than the non-graph or the replay modes - model._trt_cuda_graph_cache = None + model._trt_cuda_graph_cache = TRTCudaGraphLRUCache(capacity=16) model.forward(pre_processed, use_cuda_graph=True) cudagraph_recapture_fps = 100 / (time.perf_counter() - start) diff --git a/inference_models/inference_models/models/common/trt.py b/inference_models/inference_models/models/common/trt.py index 19553cb48f..486312bd1b 100644 --- a/inference_models/inference_models/models/common/trt.py +++ b/inference_models/inference_models/models/common/trt.py @@ -69,7 +69,7 @@ class TRTCudaGraphState: class TRTCudaGraphLRUCache: - def __init__(self, capacity: int = 64): + def __init__(self, capacity: int = 16): self.cache: OrderedDict[ Tuple[Tuple[int, ...], torch.dtype, torch.device], TRTCudaGraphState ] = OrderedDict() From 5f4d3ead36de1cf146e92b361c43f9fce0008b27 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Wed, 11 Feb 2026 21:54:54 +0000 Subject: [PATCH 32/50] fix imports in trt tests --- .../models/test_rfdetr_predictions_trt.py | 6 +++--- .../models/test_rfdetr_seg_predictions_trt.py | 5 ++--- .../test_yolov8_object_detection_predictions_trt.py | 9 +++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py b/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py index 44ac3dec22..e84dd1bca5 100644 --- a/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py @@ -4,8 +4,6 @@ import pytest import torch -from inference_models.models.common.trt import TRTCudaGraphLRUCache - @pytest.mark.slow @pytest.mark.trt_extras @@ -15,6 +13,7 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( bike_image_numpy: np.ndarray, ) -> None: from inference_models import AutoModel + from inference_models.models.common.trt import 
TRTCudaGraphLRUCache model = AutoModel.from_pretrained( model_id_or_path=rfdetr_nano_t4_trt_package, @@ -27,7 +26,7 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( outputs = [] for pre_processed in [pre_processed_1, pre_processed_2]: no_graph = model.forward(pre_processed, use_cuda_graph=False) - model._trt_cuda_graph_cache = TRTCudaGraphLRUCache() + model._trt_cuda_graph_cache = TRTCudaGraphLRUCache(capacity=16) capture_graph = model.forward(pre_processed, use_cuda_graph=True) replay_graph = model.forward(pre_processed, use_cuda_graph=True) @@ -64,6 +63,7 @@ def test_trt_outputs_match_expected_shapes( dog_image_numpy: np.ndarray, ) -> None: from inference_models import AutoModel + from inference_models.models.common.trt import TRTCudaGraphLRUCache model = AutoModel.from_pretrained( model_id_or_path=rfdetr_nano_t4_trt_package, diff --git a/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py b/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py index 2e8c9759fe..16cf30512d 100644 --- a/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py @@ -2,8 +2,6 @@ import pytest import torch -from inference_models.models.common.trt import TRTCudaGraphLRUCache - @pytest.mark.slow @pytest.mark.trt_extras @@ -13,6 +11,7 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( dog_image_numpy: np.ndarray, ) -> None: from inference_models import AutoModel + from inference_models.models.common.trt import TRTCudaGraphLRUCache model = AutoModel.from_pretrained( model_id_or_path=rfdetr_seg_nano_t4_trt_package, @@ -25,7 +24,7 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( outputs = [] for pre_processed in [pre_processed_1, pre_processed_2]: no_graph = model.forward(pre_processed, use_cuda_graph=False) - model._trt_cuda_graph_cache = TRTCudaGraphLRUCache() + 
model._trt_cuda_graph_cache = TRTCudaGraphLRUCache(capacity=16) capture_graph = model.forward(pre_processed, use_cuda_graph=True) replay_graph = model.forward(pre_processed, use_cuda_graph=True) diff --git a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py index bfd9061a3c..35752e2abc 100644 --- a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py @@ -2,8 +2,6 @@ import pytest import torch -from inference_models.models.common.trt import TRTCudaGraphLRUCache - @pytest.mark.slow @pytest.mark.trt_extras @@ -12,6 +10,7 @@ def test_trt_cudagraph_cache_reuses_previously_seen_input_shapes( dog_image_numpy: np.ndarray, ) -> None: from inference_models import AutoModel + from inference_models.models.common.trt import TRTCudaGraphLRUCache device = torch.device("cuda:0") model = AutoModel.from_pretrained( @@ -20,7 +19,7 @@ def test_trt_cudagraph_cache_reuses_previously_seen_input_shapes( ) pre_processed_single, _ = model.pre_process(dog_image_numpy) - model._trt_cuda_graph_cache = TRTCudaGraphLRUCache() + model._trt_cuda_graph_cache = TRTCudaGraphLRUCache(capacity=16) seen_shapes = set() capture_outputs = {} @@ -58,6 +57,7 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( dog_image_numpy: np.ndarray, ) -> None: from inference_models import AutoModel + from inference_models.models.common.trt import TRTCudaGraphLRUCache device = torch.device("cuda:0") model = AutoModel.from_pretrained( @@ -71,7 +71,7 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( no_graph = model.forward(batch, use_cuda_graph=False) - model._trt_cuda_graph_cache = TRTCudaGraphLRUCache() + model._trt_cuda_graph_cache = TRTCudaGraphLRUCache(capacity=16) capture_graph = model.forward(batch, 
use_cuda_graph=True) replay_graph = model.forward(batch, use_cuda_graph=True) @@ -86,6 +86,7 @@ def test_trt_cudagraph_cache_eviction( dog_image_numpy: np.ndarray, ) -> None: from inference_models import AutoModel + from inference_models.models.common.trt import TRTCudaGraphLRUCache device = torch.device("cuda:0") model = AutoModel.from_pretrained( From 3f3be28b886856c4d053d13b64ae58278eeee019 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Tue, 3 Mar 2026 18:02:34 +0000 Subject: [PATCH 33/50] further merge conflict resolution --- .../models/rfdetr/rfdetr_instance_segmentation_trt.py | 1 - .../models/yolov8/yolov8_object_detection_trt.py | 1 - 2 files changed, 2 deletions(-) diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py index 6112881c78..6c056b78f9 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py @@ -6,7 +6,6 @@ from inference_models import InstanceDetections, InstanceSegmentationModel from inference_models.configuration import ( - ( DEFAULT_DEVICE, INFERENCE_MODELS_RFDETR_DEFAULT_CONFIDENCE, USE_CUDA_GRAPHS_FOR_TRT_BACKEND, diff --git a/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py b/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py index 47f89bfa7a..c46ea7578c 100644 --- a/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py +++ b/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py @@ -6,7 +6,6 @@ from inference_models import Detections, ObjectDetectionModel from inference_models.configuration import ( - ( DEFAULT_DEVICE, INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_CLASS_AGNOSTIC_NMS, INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_CONFIDENCE, From 
24b8ed48ef223c8430a7064a61f5a80e90bf0c16 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Tue, 3 Mar 2026 18:07:15 +0000 Subject: [PATCH 34/50] Revert accidental formatting changes unrelated to branch --- inference/core/workflows/core_steps/analytics/overlap/v1.py | 2 +- .../core/workflows/core_steps/sinks/onvif_movement/v1.py | 4 ++-- inference/core/workflows/core_steps/sinks/twilio/sms/v2.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/inference/core/workflows/core_steps/analytics/overlap/v1.py b/inference/core/workflows/core_steps/analytics/overlap/v1.py index 7bc73b9d39..8404e0681a 100644 --- a/inference/core/workflows/core_steps/analytics/overlap/v1.py +++ b/inference/core/workflows/core_steps/analytics/overlap/v1.py @@ -132,7 +132,7 @@ def coords_overlap( # coords are [x1, y1, x2, y2] if overlap_type == "Center Overlap": size = [other[2] - other[0], other[3] - other[1]] - x, y = [other[0] + size[0] / 2, other[1] + size[1] / 2] + (x, y) = [other[0] + size[0] / 2, other[1] + size[1] / 2] return ( x > overlap[0] and x < overlap[2] and y > overlap[1] and y < overlap[3] ) diff --git a/inference/core/workflows/core_steps/sinks/onvif_movement/v1.py b/inference/core/workflows/core_steps/sinks/onvif_movement/v1.py index 8351486ad7..a26f792e73 100644 --- a/inference/core/workflows/core_steps/sinks/onvif_movement/v1.py +++ b/inference/core/workflows/core_steps/sinks/onvif_movement/v1.py @@ -874,8 +874,8 @@ def move_camera( xyxy = prediction.xyxy # calculate centers - x1, y1, x2, y2 = tuple(xyxy[0]) - image_height, image_width = tuple(image_dimensions[0]) + (x1, y1, x2, y2) = tuple(xyxy[0]) + (image_height, image_width) = tuple(image_dimensions[0]) center_point = (x1 + (x2 - x1) / 2, y1 + (y2 - y1) / 2) # calculate deltas from center and edge diff --git a/inference/core/workflows/core_steps/sinks/twilio/sms/v2.py b/inference/core/workflows/core_steps/sinks/twilio/sms/v2.py index 4bb7493851..26c3539881 100644 --- 
a/inference/core/workflows/core_steps/sinks/twilio/sms/v2.py +++ b/inference/core/workflows/core_steps/sinks/twilio/sms/v2.py @@ -518,7 +518,7 @@ def format_message( def process_media_urls_for_twilio( - media_url: Union[str, List[Union[str, WorkflowImageData]], WorkflowImageData], + media_url: Union[str, List[Union[str, WorkflowImageData]], WorkflowImageData] ) -> Optional[List[str]]: """ Process media URLs for Twilio MMS. @@ -609,7 +609,7 @@ def _get_mms_placeholder_image_url() -> Optional[str]: def serialize_media_for_api( - media_url: Union[str, List[str], WorkflowImageData, None], + media_url: Union[str, List[str], WorkflowImageData, None] ) -> Tuple[Optional[List[str]], Optional[List[Dict[str, str]]]]: """ Serialize media for API transmission. From 574e684880b0d4cca83620b5ee38104dbde856c5 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Tue, 3 Mar 2026 18:10:53 +0000 Subject: [PATCH 35/50] set this feature flag to false by default --- inference_models/inference_models/configuration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inference_models/inference_models/configuration.py b/inference_models/inference_models/configuration.py index e31f4859c6..dc84aae176 100644 --- a/inference_models/inference_models/configuration.py +++ b/inference_models/inference_models/configuration.py @@ -87,7 +87,7 @@ USE_CUDA_GRAPHS_FOR_TRT_BACKEND = get_boolean_from_env( variable_name="USE_CUDA_GRAPHS_FOR_TRT_BACKEND", - default=True, + default=False, ) # General model parameters defaults From 077732d8db31cd5734b17e8d2a175ea4c33ce2f1 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Tue, 10 Mar 2026 21:40:31 +0000 Subject: [PATCH 36/50] fix cache profiling script --- .../development/profiling/profile_yolov8_trt_cudagraphs.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/inference_models/development/profiling/profile_yolov8_trt_cudagraphs.py b/inference_models/development/profiling/profile_yolov8_trt_cudagraphs.py index 
9506b6b1ed..ebbe543a70 100644 --- a/inference_models/development/profiling/profile_yolov8_trt_cudagraphs.py +++ b/inference_models/development/profiling/profile_yolov8_trt_cudagraphs.py @@ -12,6 +12,8 @@ WARMUP = int(os.environ.get("WARMUP", "50")) RECAPTURE_CYCLES = int(os.environ.get("RECAPTURE_CYCLES", "100")) +os.environ["USE_TRT_CUDA_GRAPHS"] = "True" + BATCH_SIZES = [1, 2, 3] @@ -57,14 +59,14 @@ def main() -> None: torch.cuda.synchronize() start = time.perf_counter() for i in range(RECAPTURE_CYCLES): - model._trt_cuda_graph_cache = None + model._trt_cuda_graph_cache.cache.clear() batch = batches[BATCH_SIZES[i % len(BATCH_SIZES)]] model.forward(batch, use_cuda_graph=True) torch.cuda.synchronize() recapture_fps = RECAPTURE_CYCLES / (time.perf_counter() - start) # ── (3) Cycling batch sizes, CUDA graphs with normal caching ──────── - model._trt_cuda_graph_cache = None + model._trt_cuda_graph_cache.cache.clear() for batch in batches.values(): model.forward(batch, use_cuda_graph=True) From b7ea2a09cc6fe2a0bee5475336fab3c6f22af3d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20P=C4=99czek?= Date: Wed, 18 Mar 2026 18:04:11 +0100 Subject: [PATCH 37/50] Add changes to TRT CUDA Graphs cache --- .../profiling/profile_cudagraph_vram.py | 4 +- .../profile_rfdetr_trt_cudagraphs.py | 4 +- .../inference_models/configuration.py | 10 +- .../inference_models/developer_tools.py | 14 +- .../inference_models/models/common/trt.py | 120 +++++++++++++----- .../deep_lab_v3_plus_segmentation_trt.py | 14 ++ .../resnet/resnet_classification_trt.py | 26 ++++ .../rfdetr_instance_segmentation_trt.py | 28 ++-- .../rfdetr/rfdetr_object_detection_trt.py | 28 ++-- .../models/vit/vit_classification_trt.py | 26 ++++ .../yolact_instance_segmentation_trt.py | 14 ++ .../yolo26_instance_segmentation_trt.py | 14 ++ .../yolo26/yolo26_key_points_detection_trt.py | 15 ++- .../yolo26/yolo26_object_detection_trt.py | 14 ++ .../yolonas/yolonas_object_detection_trt.py | 20 ++- 
.../yolov10/yolov10_object_detection_trt.py | 14 ++ .../yolov5_instance_segmentation_trt.py | 18 ++- .../yolov5/yolov5_object_detection_trt.py | 20 ++- .../yolov7_instance_segmentation_trt.py | 18 ++- .../yolov8_instance_segmentation_trt.py | 14 ++ .../yolov8/yolov8_key_points_detection_trt.py | 15 ++- .../yolov8/yolov8_object_detection_trt.py | 28 ++-- .../models/test_rfdetr_predictions_trt.py | 13 +- .../models/test_rfdetr_seg_predictions_trt.py | 5 +- ...yolov8_object_detection_predictions_trt.py | 19 +-- 25 files changed, 404 insertions(+), 111 deletions(-) diff --git a/inference_models/development/profiling/profile_cudagraph_vram.py b/inference_models/development/profiling/profile_cudagraph_vram.py index 6996c3b98b..d129fc38c1 100644 --- a/inference_models/development/profiling/profile_cudagraph_vram.py +++ b/inference_models/development/profiling/profile_cudagraph_vram.py @@ -50,7 +50,7 @@ import torch from inference_models import AutoModel -from inference_models.models.common.trt import TRTCudaGraphLRUCache +from inference_models.models.common.trt import TRTCudaGraphCache MODEL_ID = "yolov8n-640" MB = 1024 ** 2 @@ -106,7 +106,7 @@ def main() -> None: baseline_gpu = gpu_used_bytes(device) baseline_cpu = cpu_rss_bytes() - model._trt_cuda_graph_cache = TRTCudaGraphLRUCache( + model._trt_cuda_graph_cache = TRTCudaGraphCache( capacity=args.cache_capacity, ) diff --git a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py index fe43027db7..2791e24e3e 100644 --- a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py +++ b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py @@ -7,7 +7,7 @@ from tqdm import tqdm from inference_models import AutoModel -from inference_models.models.common.trt import TRTCudaGraphLRUCache +from inference_models.models.common.trt import TRTCudaGraphCache IMAGE_PATH = os.environ.get("IMAGE_PATH", None) DEVICE 
= os.environ.get("DEVICE", "cuda:0") @@ -41,7 +41,7 @@ def main() -> None: print("Timing with forced CUDA graph recapture each step...") start = time.perf_counter() for _ in range(100): # not using CYCLES here bc this is wayyyy slower than the non-graph or the replay modes - model._trt_cuda_graph_cache = TRTCudaGraphLRUCache(capacity=16) + model._trt_cuda_graph_cache = TRTCudaGraphCache(capacity=16) model.forward(pre_processed, use_cuda_graph=True) cudagraph_recapture_fps = 100 / (time.perf_counter() - start) diff --git a/inference_models/inference_models/configuration.py b/inference_models/inference_models/configuration.py index 5d732c2060..30e34a0c67 100644 --- a/inference_models/inference_models/configuration.py +++ b/inference_models/inference_models/configuration.py @@ -88,11 +88,6 @@ "ALLOW_LOCAL_STORAGE_ACCESS_FOR_REFERENCE_DATA" ) -USE_CUDA_GRAPHS_FOR_TRT_BACKEND = get_boolean_from_env( - variable_name="USE_CUDA_GRAPHS_FOR_TRT_BACKEND", - default=False, -) - # General model parameters defaults INFERENCE_MODELS_DEFAULT_CONFIDENCE = get_float_from_env( @@ -382,3 +377,8 @@ variable_name="INFERENCE_MODELS_YOLOLITE_DEFAULT_CLASS_AGNOSTIC_NMS", default=INFERENCE_MODELS_DEFAULT_CLASS_AGNOSTIC_NMS, ) + +ENABLE_AUTO_CUDA_GRAPHS_FOR_TRT_BACKEND_ENV_NAME = ( + "ENABLE_AUTO_CUDA_GRAPHS_FOR_TRT_BACKEND" +) +DEFAULT_ENABLE_AUTO_CUDA_GRAPHS_FOR_TRT_BACKEND = False diff --git a/inference_models/inference_models/developer_tools.py b/inference_models/inference_models/developer_tools.py index bee4b5ca75..44cbd90da7 100644 --- a/inference_models/inference_models/developer_tools.py +++ b/inference_models/inference_models/developer_tools.py @@ -13,7 +13,7 @@ along with library. Utilities depending on optional dependencies are exposed as lazy imports. 
""" -from typing import Any, Dict +from typing import Any, Dict, Union from inference_models.models.common.model_packages import get_model_package_contents from inference_models.runtime_introspection.core import ( @@ -21,7 +21,7 @@ x_ray_runtime_environment, ) from inference_models.utils.download import download_files_to_directory -from inference_models.utils.imports import LazyFunction +from inference_models.utils.imports import LazyClass, LazyFunction from inference_models.utils.onnx_introspection import ( get_selected_onnx_execution_providers, ) @@ -42,7 +42,7 @@ TRTPackageDetails, ) -OPTIONAL_IMPORTS: Dict[str, LazyFunction] = { +OPTIONAL_IMPORTS: Dict[str, Union[LazyFunction, LazyClass]] = { "use_primary_cuda_context": LazyFunction( module_name="inference_models.models.common.cuda", function_name="use_primary_cuda_context", @@ -79,6 +79,14 @@ module_name="inference_models.models.common.trt", function_name="load_trt_model", ), + "establish_trt_cuda_graph_cache": LazyFunction( + module_name="inference_models.models.common.trt", + function_name="establish_trt_cuda_graph_cache", + ), + "TRTCudaGraphCache": LazyClass( + module_name="inference_models.models.common.trt", + class_name="TRTCudaGraphCache", + ), } diff --git a/inference_models/inference_models/models/common/trt.py b/inference_models/inference_models/models/common/trt.py index 0ca05fab04..accc1cb0ee 100644 --- a/inference_models/inference_models/models/common/trt.py +++ b/inference_models/inference_models/models/common/trt.py @@ -1,9 +1,14 @@ -from typing import List, Optional, Tuple -from dataclasses import dataclass +import threading from collections import OrderedDict +from dataclasses import dataclass +from typing import List, Optional, Tuple import torch +from inference_models.configuration import ( + DEFAULT_ENABLE_AUTO_CUDA_GRAPHS_FOR_TRT_BACKEND, + ENABLE_AUTO_CUDA_GRAPHS_FOR_TRT_BACKEND_ENV_NAME, +) from inference_models.errors import ( CorruptedModelPackageError, MissingDependencyError, @@ 
-11,6 +16,7 @@ ) from inference_models.logger import LOGGER from inference_models.models.common.roboflow.model_packages import TRTConfig +from inference_models.utils.environment import get_boolean_from_env try: import tensorrt as trt @@ -74,38 +80,87 @@ class TRTCudaGraphState: execution_context: trt.IExecutionContext -class TRTCudaGraphLRUCache: - def __init__(self, capacity: int = 16): - self.cache: OrderedDict[ +class TRTCudaGraphCache: + def __init__(self, capacity: int): + self._cache: OrderedDict[ Tuple[Tuple[int, ...], torch.dtype, torch.device], TRTCudaGraphState ] = OrderedDict() - self.capacity = capacity + self._capacity = capacity + self._state_lock = threading.RLock() + + def get_current_size(self) -> int: + return len(self._cache) + + def list_keys(self) -> List[Tuple[Tuple[int, ...], torch.dtype, torch.device]]: + return list(self._cache.keys()) + + def safe_remove( + self, key: Tuple[Tuple[int, ...], torch.dtype, torch.device] + ) -> None: + with self._state_lock: + if key not in self._cache: + return None + evicted = self._cache.pop(key) + self._evict(evicted=evicted) + return None + + def purge(self, n_oldest: Optional[int] = None) -> None: + with self._state_lock: + if n_oldest is None: + n_oldest = len(self._cache) + to_evict = min(len(self._cache), n_oldest) + for _ in range(to_evict): + _, evicted = self._cache.popitem(last=False) + self._evict(evicted=evicted, empty_cuda_cache=False) + torch.cuda.empty_cache() def __contains__( self, key: Tuple[Tuple[int, ...], torch.dtype, torch.device] ) -> bool: - return key in self.cache + return key in self._cache def __getitem__( self, key: Tuple[Tuple[int, ...], torch.dtype, torch.device] ) -> TRTCudaGraphState: - value = self.cache[key] - self.cache.move_to_end(key) - return value + with self._state_lock: + value = self._cache[key] + self._cache.move_to_end(key) + return value def __setitem__( self, key: Tuple[Tuple[int, ...], torch.dtype, torch.device], value: TRTCudaGraphState, ): - self.cache[key] 
= value - self.cache.move_to_end(key) - if len(self.cache) > self.capacity: - _, evicted = self.cache.popitem(last=False) - del evicted.cuda_graph - del evicted.input_buffer - del evicted.output_buffers - del evicted.execution_context + with self._state_lock: + self._cache[key] = value + self._cache.move_to_end(key) + if len(self._cache) > self._capacity: + _, evicted = self._cache.popitem(last=False) + self._evict(evicted=evicted) + + def _evict(self, evicted: TRTCudaGraphState, empty_cuda_cache: bool = True) -> None: + del evicted.cuda_graph + del evicted.input_buffer + del evicted.output_buffers + del evicted.execution_context + if empty_cuda_cache: + torch.cuda.empty_cache() + + +def establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size: int, + cuda_graph_cache: Optional[TRTCudaGraphCache] = None, +) -> Optional[TRTCudaGraphCache]: + if cuda_graph_cache is not None: + return cuda_graph_cache + auto_cuda_graphs_enabled = get_boolean_from_env( + variable_name=ENABLE_AUTO_CUDA_GRAPHS_FOR_TRT_BACKEND_ENV_NAME, + default=DEFAULT_ENABLE_AUTO_CUDA_GRAPHS_FOR_TRT_BACKEND, + ) + if not auto_cuda_graphs_enabled: + return None + return TRTCudaGraphCache(capacity=default_cuda_graph_cache_size) def get_trt_engine_inputs_and_outputs( @@ -174,12 +229,12 @@ def infer_from_trt_engine( pre_processed_images: torch.Tensor, trt_config: TRTConfig, engine: trt.ICudaEngine, + context: trt.IExecutionContext, device: torch.device, input_name: str, outputs: List[str], - context: Optional[trt.IExecutionContext] = None, - trt_cuda_graph_cache: Optional[TRTCudaGraphLRUCache] = None, stream: Optional[torch.cuda.Stream] = None, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, ) -> List[torch.Tensor]: """Run inference using a TensorRT engine, optionally with CUDA graph acceleration. 
@@ -276,8 +331,8 @@ def infer_from_trt_engine( Run with CUDA graph acceleration: - >>> from inference_models.models.common.trt import TRTCudaGraphLRUCache - >>> cache = TRTCudaGraphLRUCache(capacity=16) + >>> from inference_models.models.common.trt import TRTCudaGraphCache + >>> cache = TRTCudaGraphCache(capacity=16) >>> >>> results = infer_from_trt_engine( ... pre_processed_images=images, @@ -324,11 +379,11 @@ def _infer_from_trt_engine( pre_processed_images: torch.Tensor, trt_config: TRTConfig, engine: trt.ICudaEngine, - context: Optional[trt.IExecutionContext], + context: trt.IExecutionContext, device: torch.device, input_name: str, outputs: List[str], - trt_cuda_graph_cache: Optional[TRTCudaGraphLRUCache] = None, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, ) -> List[torch.Tensor]: if trt_config.static_batch_size is not None: min_batch_size = trt_config.static_batch_size @@ -352,13 +407,13 @@ def _infer_from_trt_engine( def _infer_from_trt_engine_with_batch_size_boundaries( pre_processed_images: torch.Tensor, engine: trt.ICudaEngine, - context: Optional[trt.IExecutionContext], + context: trt.IExecutionContext, device: torch.device, input_name: str, outputs: List[str], min_batch_size: int, max_batch_size: int, - trt_cuda_graph_cache: Optional[TRTCudaGraphLRUCache] = None, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, ) -> List[torch.Tensor]: if pre_processed_images.shape[0] <= max_batch_size: reminder = min_batch_size - pre_processed_images.shape[0] @@ -423,11 +478,11 @@ def _infer_from_trt_engine_with_batch_size_boundaries( def _execute_trt_engine( pre_processed_images: torch.Tensor, engine: trt.ICudaEngine, - context: Optional[trt.IExecutionContext], + context: trt.IExecutionContext, device: torch.device, input_name: str, outputs: List[str], - trt_cuda_graph_cache: Optional[TRTCudaGraphLRUCache] = None, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, ) -> List[torch.Tensor]: if trt_cuda_graph_cache is not None: 
input_shape = tuple(pre_processed_images.shape) @@ -458,11 +513,6 @@ def _execute_trt_engine( return results else: - if context is None: - raise ModelRuntimeError( - message="An execution context is required when not using CUDA graphs.", - help_url="https://inference-models.roboflow.com/errors/models-runtime/#modelruntimeerror", - ) status = context.set_input_shape(input_name, tuple(pre_processed_images.shape)) if not status: raise ModelRuntimeError( @@ -511,7 +561,9 @@ def _capture_cuda_graph( input_buffer = torch.empty_like(pre_processed_images, device=device) input_buffer.copy_(pre_processed_images) - status = graph_context.set_input_shape(input_name, tuple(pre_processed_images.shape)) + status = graph_context.set_input_shape( + input_name, tuple(pre_processed_images.shape) + ) if not status: raise ModelRuntimeError( message="Failed to set TRT model input shape during CUDA graph capture.", diff --git a/inference_models/inference_models/models/deep_lab_v3_plus/deep_lab_v3_plus_segmentation_trt.py b/inference_models/inference_models/models/deep_lab_v3_plus/deep_lab_v3_plus_segmentation_trt.py index c807f4a641..2bc949760a 100644 --- a/inference_models/inference_models/models/deep_lab_v3_plus/deep_lab_v3_plus_segmentation_trt.py +++ b/inference_models/inference_models/models/deep_lab_v3_plus/deep_lab_v3_plus_segmentation_trt.py @@ -38,6 +38,8 @@ pre_process_network_input, ) from inference_models.models.common.trt import ( + TRTCudaGraphCache, + establish_trt_cuda_graph_cache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, @@ -81,6 +83,8 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "DeepLabV3PlusForSemanticSegmentationTRT": if device.type != "cuda": @@ -146,6 +150,10 @@ def from_pretrained( message=f"Implementation assume single model 
output, found: {len(outputs)}.", help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, input_name=inputs[0], @@ -157,6 +165,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -171,6 +180,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine self._input_name = input_name @@ -182,6 +192,7 @@ def __init__( self._device = device self._cuda_context = cuda_context self._execution_context = execution_context + self._trt_cuda_graph_cache = trt_cuda_graph_cache self._lock = Lock() self._inference_stream = torch.cuda.Stream(device=self._device) self._thread_local_storage = threading.local() @@ -212,8 +223,10 @@ def pre_process( def forward( self, pre_processed_images: PreprocessedInputs, + disable_cuda_graphs: bool = False, **kwargs, ) -> torch.Tensor: + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with self._lock: with use_cuda_context(context=self._cuda_context): return infer_from_trt_engine( @@ -225,6 +238,7 @@ def forward( input_name=self._input_name, outputs=self._output_names, stream=self._inference_stream, + trt_cuda_graph_cache=cache, )[0] def post_process( diff --git a/inference_models/inference_models/models/resnet/resnet_classification_trt.py b/inference_models/inference_models/models/resnet/resnet_classification_trt.py index e55a999515..8bad13c294 100644 --- a/inference_models/inference_models/models/resnet/resnet_classification_trt.py +++ b/inference_models/inference_models/models/resnet/resnet_classification_trt.py @@ -40,6 +40,8 @@ 
pre_process_network_input, ) from inference_models.models.common.trt import ( + TRTCudaGraphCache, + establish_trt_cuda_graph_cache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, @@ -81,6 +83,8 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "ResNetForClassificationTRT": if device.type != "cuda": @@ -147,6 +151,10 @@ def from_pretrained( message=f"Implementation assume single model output, found: {len(outputs)}.", help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, input_name=inputs[0], @@ -157,6 +165,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -170,6 +179,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine self._input_name = input_name @@ -180,6 +190,7 @@ def __init__( self._device = device self._cuda_context = cuda_context self._execution_context = execution_context + self._trt_cuda_graph_cache = trt_cuda_graph_cache self._lock = Lock() self._inference_stream = torch.cuda.Stream(device=self._device) self._thread_local_storage = threading.local() @@ -212,8 +223,10 @@ def pre_process( def forward( self, pre_processed_images: PreprocessedInputs, + disable_cuda_graphs: bool = False, **kwargs, ) -> torch.Tensor: + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with self._lock: with 
use_cuda_context(context=self._cuda_context): return infer_from_trt_engine( @@ -225,6 +238,7 @@ def forward( input_name=self._input_name, outputs=self._output_names, stream=self._inference_stream, + trt_cuda_graph_cache=cache, )[0] def post_process( @@ -271,6 +285,8 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "ResNetForMultiLabelClassificationTRT": if device.type != "cuda": @@ -337,6 +353,10 @@ def from_pretrained( message=f"Implementation assume single model output, found: {len(outputs)}.", help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, input_name=inputs[0], @@ -347,6 +367,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -360,6 +381,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine self._input_name = input_name @@ -370,6 +392,7 @@ def __init__( self._device = device self._cuda_context = cuda_context self._execution_context = execution_context + self._trt_cuda_graph_cache = trt_cuda_graph_cache self._lock = Lock() self._inference_stream = torch.cuda.Stream(device=self._device) self._thread_local_storage = threading.local() @@ -402,8 +425,10 @@ def pre_process( def forward( self, pre_processed_images: PreprocessedInputs, + disable_cuda_graphs: bool = False, **kwargs, ) -> torch.Tensor: + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with 
self._lock: with use_cuda_context(context=self._cuda_context): return infer_from_trt_engine( @@ -415,6 +440,7 @@ def forward( input_name=self._input_name, outputs=self._output_names, stream=self._inference_stream, + trt_cuda_graph_cache=cache, )[0] def post_process( diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py index 097d374465..ebc59bfdf9 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py @@ -12,7 +12,6 @@ from inference_models.configuration import ( DEFAULT_DEVICE, INFERENCE_MODELS_RFDETR_DEFAULT_CONFIDENCE, - USE_CUDA_GRAPHS_FOR_TRT_BACKEND, ) from inference_models.entities import ColorFormat from inference_models.errors import ( @@ -35,7 +34,8 @@ parse_trt_config, ) from inference_models.models.common.trt import ( - TRTCudaGraphLRUCache, + TRTCudaGraphCache, + establish_trt_cuda_graph_cache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, @@ -93,7 +93,8 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, - cuda_graph_cache_capacity: int = 16, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "RFDetrForInstanceSegmentationTRT": if device.type != "cuda": @@ -162,6 +163,10 @@ def from_pretrained( message=f"Implementation assume 3 model outputs, found: {len(outputs)}.", help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, input_name=inputs[0], @@ -173,7 +178,7 @@ def from_pretrained( 
device=device, cuda_context=cuda_context, execution_context=execution_context, - cuda_graph_cache_capacity=cuda_graph_cache_capacity, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -188,7 +193,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, - cuda_graph_cache_capacity: int = 64, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine self._input_name = input_name @@ -200,9 +205,7 @@ def __init__( self._cuda_context = cuda_context self._execution_context = execution_context self._trt_config = trt_config - self._trt_cuda_graph_cache = TRTCudaGraphLRUCache( - capacity=cuda_graph_cache_capacity, - ) + self._trt_cuda_graph_cache = trt_cuda_graph_cache self._lock = threading.Lock() self._inference_stream = torch.cuda.Stream(device=self._device) self._thread_local_storage = threading.local() @@ -235,20 +238,17 @@ def pre_process( def forward( self, pre_processed_images: torch.Tensor, - use_cuda_graph: Optional[bool] = None, + disable_cuda_graphs: bool = False, **kwargs, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - if use_cuda_graph is None: - use_cuda_graph = USE_CUDA_GRAPHS_FOR_TRT_BACKEND - - cache = self._trt_cuda_graph_cache if use_cuda_graph else None + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with self._lock: with use_cuda_context(context=self._cuda_context): detections, labels, masks = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, - context=self._execution_context if not use_cuda_graph else None, + context=self._execution_context, device=self._device, input_name=self._input_name, outputs=self._output_names, diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py index dfe587d095..cecc0e4c9d 100644 --- 
a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py @@ -8,7 +8,6 @@ from inference_models.configuration import ( DEFAULT_DEVICE, INFERENCE_MODELS_RFDETR_DEFAULT_CONFIDENCE, - USE_CUDA_GRAPHS_FOR_TRT_BACKEND, ) from inference_models.entities import ColorFormat from inference_models.errors import ( @@ -34,7 +33,8 @@ rescale_image_detections, ) from inference_models.models.common.trt import ( - TRTCudaGraphLRUCache, + TRTCudaGraphCache, + establish_trt_cuda_graph_cache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, @@ -86,7 +86,8 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, - cuda_graph_cache_capacity: int = 16, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "RFDetrForObjectDetectionTRT": if device.type != "cuda": @@ -160,6 +161,10 @@ def from_pretrained( message=f"Expected model outputs to be named `output0` and `output1`, but found: {outputs}.", help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, input_name=inputs[0], @@ -171,7 +176,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, - cuda_graph_cache_capacity=cuda_graph_cache_capacity, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -186,7 +191,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, - cuda_graph_cache_capacity: int = 64, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine self._input_name = input_name @@ -198,9 
+203,7 @@ def __init__( self._cuda_context = cuda_context self._execution_context = execution_context self._trt_config = trt_config - self._trt_cuda_graph_cache = TRTCudaGraphLRUCache( - capacity=cuda_graph_cache_capacity, - ) + self._trt_cuda_graph_cache = trt_cuda_graph_cache self._lock = threading.Lock() self._inference_stream = torch.cuda.Stream(device=self._device) self._thread_local_storage = threading.local() @@ -231,20 +234,17 @@ def pre_process( def forward( self, pre_processed_images: torch.Tensor, - use_cuda_graph: Optional[bool] = None, + disable_cuda_graphs: bool = False, **kwargs, ) -> Tuple[torch.Tensor, torch.Tensor]: - if use_cuda_graph is None: - use_cuda_graph = USE_CUDA_GRAPHS_FOR_TRT_BACKEND - - cache = self._trt_cuda_graph_cache if use_cuda_graph else None + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with self._lock: with use_cuda_context(context=self._cuda_context): detections, labels = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, - context=self._execution_context if not use_cuda_graph else None, + context=self._execution_context, device=self._device, input_name=self._input_name, outputs=self._output_names, diff --git a/inference_models/inference_models/models/vit/vit_classification_trt.py b/inference_models/inference_models/models/vit/vit_classification_trt.py index 948d544d56..0ed60ff0f0 100644 --- a/inference_models/inference_models/models/vit/vit_classification_trt.py +++ b/inference_models/inference_models/models/vit/vit_classification_trt.py @@ -40,6 +40,8 @@ pre_process_network_input, ) from inference_models.models.common.trt import ( + TRTCudaGraphCache, + establish_trt_cuda_graph_cache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, @@ -81,6 +83,8 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, + trt_cuda_graph_cache: 
Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "VITForClassificationTRT": if device.type != "cuda": @@ -147,6 +151,10 @@ def from_pretrained( message=f"Implementation assume single model output, found: {len(outputs)}.", help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, input_name=inputs[0], @@ -157,6 +165,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -170,6 +179,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine self._input_name = input_name @@ -180,6 +190,7 @@ def __init__( self._device = device self._cuda_context = cuda_context self._execution_context = execution_context + self._trt_cuda_graph_cache = trt_cuda_graph_cache self._lock = Lock() self._inference_stream = torch.cuda.Stream(device=self._device) self._thread_local_storage = threading.local() @@ -210,8 +221,10 @@ def pre_process( def forward( self, pre_processed_images: PreprocessedInputs, + disable_cuda_graphs: bool = False, **kwargs, ) -> torch.Tensor: + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with self._lock: with use_cuda_context(context=self._cuda_context): return infer_from_trt_engine( @@ -223,6 +236,7 @@ def forward( input_name=self._input_name, outputs=self._output_names, stream=self._inference_stream, + trt_cuda_graph_cache=cache, )[0] def post_process( @@ -270,6 +284,8 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, + 
trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "VITForMultiLabelClassificationTRT": if device.type != "cuda": @@ -336,6 +352,10 @@ def from_pretrained( message=f"Implementation assume single model output, found: {len(outputs)}.", help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, input_name=inputs[0], @@ -346,6 +366,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -359,6 +380,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine self._input_name = input_name @@ -369,6 +391,7 @@ def __init__( self._device = device self._cuda_context = cuda_context self._execution_context = execution_context + self._trt_cuda_graph_cache = trt_cuda_graph_cache self._lock = Lock() self._inference_stream = torch.cuda.Stream(device=self._device) self._thread_local_storage = threading.local() @@ -399,8 +422,10 @@ def pre_process( def forward( self, pre_processed_images: PreprocessedInputs, + disable_cuda_graphs: bool = False, **kwargs, ) -> torch.Tensor: + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with self._lock: with use_cuda_context(context=self._cuda_context): return infer_from_trt_engine( @@ -412,6 +437,7 @@ def forward( input_name=self._input_name, outputs=self._output_names, stream=self._inference_stream, + trt_cuda_graph_cache=cache, )[0] def post_process( diff --git a/inference_models/inference_models/models/yolact/yolact_instance_segmentation_trt.py 
b/inference_models/inference_models/models/yolact/yolact_instance_segmentation_trt.py index ab2f2648b4..dfdfaf4a29 100644 --- a/inference_models/inference_models/models/yolact/yolact_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/yolact/yolact_instance_segmentation_trt.py @@ -46,6 +46,8 @@ pre_process_network_input, ) from inference_models.models.common.trt import ( + TRTCudaGraphCache, + establish_trt_cuda_graph_cache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, @@ -93,6 +95,8 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "YOLOACTForInstanceSegmentationTRT": if device.type != "cuda": @@ -154,6 +158,10 @@ def from_pretrained( message=f"Implementation assume 5 model outputs, found: {len(outputs)}.", help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, input_name=inputs[0], @@ -164,6 +172,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -177,6 +186,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine self._input_name = input_name @@ -187,6 +197,7 @@ def __init__( self._device = device self._cuda_context = cuda_context self._execution_context = execution_context + self._trt_cuda_graph_cache = trt_cuda_graph_cache self._lock = Lock() self._inference_stream = torch.cuda.Stream(device=self._device) 
self._thread_local_storage = threading.local() @@ -217,8 +228,10 @@ def pre_process( def forward( self, pre_processed_images: torch.Tensor, + disable_cuda_graphs: bool = False, **kwargs, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with self._lock: with use_cuda_context(context=self._cuda_context): ( @@ -239,6 +252,7 @@ def forward( input_name=self._input_name, outputs=self._output_names, stream=self._inference_stream, + trt_cuda_graph_cache=cache, ) ) all_loc_data.append(loc_data) diff --git a/inference_models/inference_models/models/yolo26/yolo26_instance_segmentation_trt.py b/inference_models/inference_models/models/yolo26/yolo26_instance_segmentation_trt.py index ca2cbf454f..cf26334653 100644 --- a/inference_models/inference_models/models/yolo26/yolo26_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/yolo26/yolo26_instance_segmentation_trt.py @@ -44,6 +44,8 @@ pre_process_network_input, ) from inference_models.models.common.trt import ( + TRTCudaGraphCache, + establish_trt_cuda_graph_cache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, @@ -89,6 +91,8 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "YOLO26ForInstanceSegmentationTRT": if device.type != "cuda": @@ -155,6 +159,10 @@ def from_pretrained( message=f"Expected model outputs to be named `output0` and `output1`, but found: {outputs}.", help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, 
input_name=inputs[0], @@ -165,6 +173,7 @@ def from_pretrained( device=device, execution_context=execution_context, cuda_context=cuda_context, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -178,6 +187,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine self._input_name = input_name @@ -188,6 +198,7 @@ def __init__( self._device = device self._cuda_context = cuda_context self._execution_context = execution_context + self._trt_cuda_graph_cache = trt_cuda_graph_cache self._session_thread_lock = Lock() self._inference_stream = torch.cuda.Stream(device=self._device) self._thread_local_storage = threading.local() @@ -218,8 +229,10 @@ def pre_process( def forward( self, pre_processed_images: torch.Tensor, + disable_cuda_graphs: bool = False, **kwargs, ) -> Tuple[torch.Tensor, torch.Tensor]: + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): instances, protos = infer_from_trt_engine( @@ -231,6 +244,7 @@ def forward( input_name=self._input_name, outputs=self._output_names, stream=self._inference_stream, + trt_cuda_graph_cache=cache, ) return instances, protos diff --git a/inference_models/inference_models/models/yolo26/yolo26_key_points_detection_trt.py b/inference_models/inference_models/models/yolo26/yolo26_key_points_detection_trt.py index 5dd7bdc141..ee944775cc 100644 --- a/inference_models/inference_models/models/yolo26/yolo26_key_points_detection_trt.py +++ b/inference_models/inference_models/models/yolo26/yolo26_key_points_detection_trt.py @@ -45,6 +45,8 @@ pre_process_network_input, ) from inference_models.models.common.trt import ( + TRTCudaGraphCache, + establish_trt_cuda_graph_cache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, @@ -88,6 +90,8 @@ def from_pretrained( 
model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "YOLO26ForKeyPointsDetectionTRT": if device.type != "cuda": @@ -153,6 +157,10 @@ def from_pretrained( message=f"Implementation assume single model output, found: {len(outputs)}.", help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, input_name=inputs[0], @@ -165,6 +173,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -180,12 +189,14 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine self._input_name = input_name self._output_names = [output_name] self._cuda_context = cuda_context self._execution_context = execution_context + self._trt_cuda_graph_cache = trt_cuda_graph_cache self._class_names = class_names self._skeletons = skeletons self._inference_config = inference_config @@ -193,7 +204,7 @@ def __init__( self._trt_config = trt_config self._device = device self._session_thread_lock = Lock() self._parsed_key_points_metadata = parsed_key_points_metadata self._key_points_classes_for_instances = torch.tensor( [len(e) for e in self._parsed_key_points_metadata], device=device ) @@ -237,8 +247,10 @@ def pre_process( def forward( self, pre_processed_images: torch.Tensor, + disable_cuda_graphs: bool = False, **kwargs, ) -> torch.Tensor: + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with self._session_thread_lock: with
use_cuda_context(context=self._cuda_context): return infer_from_trt_engine( @@ -250,6 +262,7 @@ def forward( input_name=self._input_name, outputs=self._output_names, stream=self._inference_stream, + trt_cuda_graph_cache=cache, )[0] def post_process( diff --git a/inference_models/inference_models/models/yolo26/yolo26_object_detection_trt.py b/inference_models/inference_models/models/yolo26/yolo26_object_detection_trt.py index f7c299aa9a..b87666d40d 100644 --- a/inference_models/inference_models/models/yolo26/yolo26_object_detection_trt.py +++ b/inference_models/inference_models/models/yolo26/yolo26_object_detection_trt.py @@ -37,6 +37,8 @@ pre_process_network_input, ) from inference_models.models.common.trt import ( + TRTCudaGraphCache, + establish_trt_cuda_graph_cache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, @@ -80,6 +82,8 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "YOLO26ForObjectDetectionTRT": if device.type != "cuda": @@ -141,6 +145,10 @@ def from_pretrained( message=f"Implementation assume single model output, found: {len(outputs)}.", help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, input_name=inputs[0], @@ -151,6 +159,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -164,6 +173,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine 
self._input_name = input_name @@ -174,6 +184,7 @@ def __init__( self._device = device self._cuda_context = cuda_context self._execution_context = execution_context + self._trt_cuda_graph_cache = trt_cuda_graph_cache self._lock = threading.Lock() self._inference_stream = torch.cuda.Stream(device=self._device) self._thread_local_storage = threading.local() @@ -204,8 +215,10 @@ def pre_process( def forward( self, pre_processed_images: torch.Tensor, + disable_cuda_graphs: bool = False, **kwargs, ) -> torch.Tensor: + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with self._lock: with use_cuda_context(context=self._cuda_context): return infer_from_trt_engine( @@ -217,6 +230,7 @@ def forward( input_name=self._input_name, outputs=self._output_names, stream=self._inference_stream, + trt_cuda_graph_cache=cache, )[0] def post_process( diff --git a/inference_models/inference_models/models/yolonas/yolonas_object_detection_trt.py b/inference_models/inference_models/models/yolonas/yolonas_object_detection_trt.py index fd8c3c1c59..d74bcc3cb5 100644 --- a/inference_models/inference_models/models/yolonas/yolonas_object_detection_trt.py +++ b/inference_models/inference_models/models/yolonas/yolonas_object_detection_trt.py @@ -38,6 +38,8 @@ pre_process_network_input, ) from inference_models.models.common.trt import ( + TRTCudaGraphCache, + establish_trt_cuda_graph_cache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, @@ -83,6 +85,8 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "YOLONasForObjectDetectionTRT": if device.type != "cuda": @@ -155,6 +159,10 @@ def from_pretrained( help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) # git rid of outputs order and names verification, as 
YOLO-NAS clearly produces different outputs + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, input_name=inputs[0], @@ -165,6 +173,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -178,6 +187,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine self._input_name = input_name @@ -188,6 +198,7 @@ def __init__( self._device = device self._cuda_context = cuda_context self._execution_context = execution_context + self._trt_cuda_graph_cache = trt_cuda_graph_cache self._session_thread_lock = Lock() self._inference_stream = torch.cuda.Stream(device=self._device) self._thread_local_storage = threading.local() @@ -215,7 +226,13 @@ def pre_process( self._pre_process_stream.synchronize() return pre_processed_images, pre_processing_meta - def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: + def forward( + self, + pre_processed_images: torch.Tensor, + disable_cuda_graphs: bool = False, + **kwargs, + ) -> torch.Tensor: + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): results = infer_from_trt_engine( @@ -227,6 +244,7 @@ def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: input_name=self._input_name, outputs=self._output_names, stream=self._inference_stream, + trt_cuda_graph_cache=cache, ) return torch.cat(results, dim=-1) diff --git a/inference_models/inference_models/models/yolov10/yolov10_object_detection_trt.py b/inference_models/inference_models/models/yolov10/yolov10_object_detection_trt.py index fb1ec11c73..0950f3fd5a 
100644 --- a/inference_models/inference_models/models/yolov10/yolov10_object_detection_trt.py +++ b/inference_models/inference_models/models/yolov10/yolov10_object_detection_trt.py @@ -38,6 +38,8 @@ pre_process_network_input, ) from inference_models.models.common.trt import ( + TRTCudaGraphCache, + establish_trt_cuda_graph_cache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, @@ -80,6 +82,8 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "YOLOv10ForObjectDetectionTRT": if device.type != "cuda": @@ -141,6 +145,10 @@ def from_pretrained( message=f"Implementation assume single model output, found: {len(outputs)}.", help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, input_name=inputs[0], @@ -151,6 +159,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -164,6 +173,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine self._input_name = input_name @@ -174,6 +184,7 @@ def __init__( self._device = device self._cuda_context = cuda_context self._execution_context = execution_context + self._trt_cuda_graph_cache = trt_cuda_graph_cache self._session_thread_lock = Lock() self._inference_stream = torch.cuda.Stream(device=self._device) self._thread_local_storage = threading.local() @@ -204,8 +215,10 @@ def pre_process( def forward( self, pre_processed_images: 
torch.Tensor, + disable_cuda_graphs: bool = False, **kwargs, ) -> torch.Tensor: + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): return infer_from_trt_engine( @@ -217,6 +230,7 @@ def forward( input_name=self._input_name, outputs=self._output_names, stream=self._inference_stream, + trt_cuda_graph_cache=cache, )[0] def post_process( diff --git a/inference_models/inference_models/models/yolov5/yolov5_instance_segmentation_trt.py b/inference_models/inference_models/models/yolov5/yolov5_instance_segmentation_trt.py index 71da6f20d7..f3b7af3559 100644 --- a/inference_models/inference_models/models/yolov5/yolov5_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/yolov5/yolov5_instance_segmentation_trt.py @@ -46,6 +46,8 @@ pre_process_network_input, ) from inference_models.models.common.trt import ( + TRTCudaGraphCache, + establish_trt_cuda_graph_cache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, @@ -92,6 +94,8 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "YOLOv5ForInstanceSegmentationTRT": if device.type != "cuda": @@ -158,6 +162,10 @@ def from_pretrained( message=f"Expected model outputs to be named `output0` and `output1`, but found: {outputs}.", help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, input_name=inputs[0], @@ -168,6 +176,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, + 
trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -181,6 +190,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine self._input_name = input_name @@ -191,6 +201,7 @@ def __init__( self._device = device self._cuda_context = cuda_context self._execution_context = execution_context + self._trt_cuda_graph_cache = trt_cuda_graph_cache self._session_thread_lock = Lock() self._inference_stream = torch.cuda.Stream(device=self._device) self._thread_local_storage = threading.local() @@ -219,8 +230,12 @@ def pre_process( return pre_processed_images, pre_processing_meta def forward( - self, pre_processed_images: torch.Tensor, **kwargs + self, + pre_processed_images: torch.Tensor, + disable_cuda_graphs: bool = False, + **kwargs, ) -> Tuple[torch.Tensor, torch.Tensor]: + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): instances, protos = infer_from_trt_engine( @@ -232,6 +247,7 @@ def forward( input_name=self._input_name, outputs=self._output_names, stream=self._inference_stream, + trt_cuda_graph_cache=cache, ) return instances, protos diff --git a/inference_models/inference_models/models/yolov5/yolov5_object_detection_trt.py b/inference_models/inference_models/models/yolov5/yolov5_object_detection_trt.py index d7f671afd1..c61078e3f9 100644 --- a/inference_models/inference_models/models/yolov5/yolov5_object_detection_trt.py +++ b/inference_models/inference_models/models/yolov5/yolov5_object_detection_trt.py @@ -38,6 +38,8 @@ pre_process_network_input, ) from inference_models.models.common.trt import ( + TRTCudaGraphCache, + establish_trt_cuda_graph_cache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, @@ -82,6 +84,8 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, 
engine_host_code_allowed: bool = False, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "YOLOv5ForObjectDetectionTRT": if device.type != "cuda": @@ -143,6 +147,10 @@ def from_pretrained( message=f"Implementation assume single model output, found: {len(outputs)}.", help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, input_name=inputs[0], @@ -153,6 +161,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -166,6 +175,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine self._input_name = input_name @@ -176,6 +186,7 @@ def __init__( self._device = device self._cuda_context = cuda_context self._execution_context = execution_context + self._trt_cuda_graph_cache = trt_cuda_graph_cache self._session_thread_lock = Lock() self._inference_stream = torch.cuda.Stream(device=self._device) self._thread_local_storage = threading.local() @@ -203,7 +214,13 @@ def pre_process( self._pre_process_stream.synchronize() return pre_processed_images, pre_processing_meta - def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: + def forward( + self, + pre_processed_images: torch.Tensor, + disable_cuda_graphs: bool = False, + **kwargs, + ) -> torch.Tensor: + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): return infer_from_trt_engine( @@ -215,6 +232,7 @@ def forward(self, pre_processed_images: 
torch.Tensor, **kwargs) -> torch.Tensor: input_name=self._input_name, outputs=self._output_names, stream=self._inference_stream, + trt_cuda_graph_cache=cache, )[0] def post_process( diff --git a/inference_models/inference_models/models/yolov7/yolov7_instance_segmentation_trt.py b/inference_models/inference_models/models/yolov7/yolov7_instance_segmentation_trt.py index 9d8090b34e..044295646f 100644 --- a/inference_models/inference_models/models/yolov7/yolov7_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/yolov7/yolov7_instance_segmentation_trt.py @@ -47,6 +47,8 @@ pre_process_network_input, ) from inference_models.models.common.trt import ( + TRTCudaGraphCache, + establish_trt_cuda_graph_cache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, @@ -92,6 +94,8 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "YOLOv7ForInstanceSegmentationTRT": if device.type != "cuda": @@ -154,6 +158,10 @@ def from_pretrained( help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) output_tensors = [outputs[0], outputs[4]] + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, input_name=inputs[0], @@ -164,6 +172,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -177,6 +186,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine self._input_name = input_name @@ -189,6 +199,7 @@ def __init__( 
self._execution_context = execution_context self._session_thread_lock = Lock() self._inference_stream = torch.cuda.Stream(device=self._device) + self._trt_cuda_graph_cache = trt_cuda_graph_cache self._thread_local_storage = threading.local() @property @@ -215,8 +226,12 @@ def pre_process( return pre_processed_images, pre_processing_meta def forward( - self, pre_processed_images: torch.Tensor, **kwargs + self, + pre_processed_images: torch.Tensor, + disable_cuda_graphs: bool = False, + **kwargs, ) -> Tuple[torch.Tensor, torch.Tensor]: + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): instances, protos = infer_from_trt_engine( @@ -228,6 +243,7 @@ def forward( input_name=self._input_name, outputs=self._output_tensors, stream=self._inference_stream, + trt_cuda_graph_cache=cache, ) return instances, protos diff --git a/inference_models/inference_models/models/yolov8/yolov8_instance_segmentation_trt.py b/inference_models/inference_models/models/yolov8/yolov8_instance_segmentation_trt.py index 56c430ccf5..28aa7f5b39 100644 --- a/inference_models/inference_models/models/yolov8/yolov8_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/yolov8/yolov8_instance_segmentation_trt.py @@ -48,6 +48,8 @@ pre_process_network_input, ) from inference_models.models.common.trt import ( + TRTCudaGraphCache, + establish_trt_cuda_graph_cache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, @@ -93,6 +95,8 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "YOLOv8ForInstanceSegmentationTRT": if device.type != "cuda": @@ -164,6 +168,10 @@ def from_pretrained( message=f"Expected model outputs to be named `output0` and `output1`, but found: 
{outputs}.", help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, input_name=inputs[0], @@ -174,6 +182,7 @@ def from_pretrained( device=device, execution_context=execution_context, cuda_context=cuda_context, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -187,6 +196,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine self._input_name = input_name @@ -200,6 +210,7 @@ def __init__( self._session_thread_lock = Lock() self._inference_stream = torch.cuda.Stream(device=self._device) self._thread_local_storage = threading.local() + self._trt_cuda_graph_cache = trt_cuda_graph_cache @property def class_names(self) -> List[str]: @@ -227,8 +238,10 @@ def pre_process( def forward( self, pre_processed_images: torch.Tensor, + disable_cuda_graphs: bool = False, **kwargs, ) -> Tuple[torch.Tensor, torch.Tensor]: + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): instances, protos = infer_from_trt_engine( @@ -240,6 +253,7 @@ def forward( input_name=self._input_name, outputs=self._output_names, stream=self._inference_stream, + trt_cuda_graph_cache=cache, ) return instances, protos diff --git a/inference_models/inference_models/models/yolov8/yolov8_key_points_detection_trt.py b/inference_models/inference_models/models/yolov8/yolov8_key_points_detection_trt.py index 4adf21965b..cb98489a0c 100644 --- a/inference_models/inference_models/models/yolov8/yolov8_key_points_detection_trt.py +++ b/inference_models/inference_models/models/yolov8/yolov8_key_points_detection_trt.py @@ -49,6 +49,8 @@ 
pre_process_network_input, ) from inference_models.models.common.trt import ( + TRTCudaGraphCache, + establish_trt_cuda_graph_cache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, @@ -92,6 +94,8 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "YOLOv8ForKeyPointsDetectionTRT": if device.type != "cuda": @@ -162,6 +166,10 @@ def from_pretrained( message=f"Implementation assume single model output, found: {len(outputs)}.", help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, input_name=inputs[0], @@ -174,6 +182,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -189,12 +198,14 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine self._input_name = input_name self._output_names = [output_name] self._cuda_context = cuda_context self._execution_context = execution_context + self._trt_cuda_graph_cache = trt_cuda_graph_cache self._class_names = class_names self._skeletons = skeletons self._inference_config = inference_config @@ -202,7 +213,6 @@ def __init__( self._trt_config = trt_config self._device = device self._session_thread_lock = Lock() - self._parsed_key_points_metadata = parsed_key_points_metadata self._key_points_classes_for_instances = torch.tensor( [len(e) for e in self._parsed_key_points_metadata], device=device ) @@ -246,8 +256,10 @@ def pre_process( 
def forward( self, pre_processed_images: torch.Tensor, + disable_cuda_graphs: bool = False, **kwargs, ) -> torch.Tensor: + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): return infer_from_trt_engine( @@ -259,6 +271,7 @@ def forward( input_name=self._input_name, outputs=self._output_names, stream=self._inference_stream, + trt_cuda_graph_cache=cache, )[0] def post_process( diff --git a/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py b/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py index 1e5827b827..1c099fd67d 100644 --- a/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py +++ b/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py @@ -11,7 +11,6 @@ INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_CONFIDENCE, INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IOU_THRESHOLD, INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_MAX_DETECTIONS, - USE_CUDA_GRAPHS_FOR_TRT_BACKEND, ) from inference_models.entities import ColorFormat from inference_models.errors import ( @@ -42,7 +41,8 @@ pre_process_network_input, ) from inference_models.models.common.trt import ( - TRTCudaGraphLRUCache, + TRTCudaGraphCache, + establish_trt_cuda_graph_cache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, @@ -86,7 +86,8 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, - cuda_graph_cache_capacity: int = 64, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "YOLOv8ForObjectDetectionTRT": if device.type != "cuda": @@ -153,6 +154,10 @@ def from_pretrained( message=f"Implementation assume single model output, found: {len(outputs)}.", 
help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, input_name=inputs[0], @@ -163,7 +168,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, - cuda_graph_cache_capacity=cuda_graph_cache_capacity, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -177,7 +182,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, - cuda_graph_cache_capacity: int = 16, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine self._input_name = input_name @@ -188,9 +193,7 @@ def __init__( self._device = device self._cuda_context = cuda_context self._execution_context = execution_context - self._trt_cuda_graph_cache = TRTCudaGraphLRUCache( - capacity=cuda_graph_cache_capacity, - ) + self._trt_cuda_graph_cache = trt_cuda_graph_cache self._lock = threading.Lock() self._inference_stream = torch.cuda.Stream(device=self._device) self._thread_local_storage = threading.local() @@ -221,20 +224,17 @@ def pre_process( def forward( self, pre_processed_images: torch.Tensor, - use_cuda_graph: Optional[bool] = None, + disable_cuda_graphs: bool = False, **kwargs, ) -> torch.Tensor: - if use_cuda_graph is None: - use_cuda_graph = USE_CUDA_GRAPHS_FOR_TRT_BACKEND - - cache = self._trt_cuda_graph_cache if use_cuda_graph else None + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with self._lock: with use_cuda_context(context=self._cuda_context): return infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, - context=self._execution_context if not use_cuda_graph else None, + context=self._execution_context, device=self._device, 
input_name=self._input_name, outputs=self._output_names, diff --git a/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py b/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py index aee7dd7d1c..519bff72f4 100644 --- a/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py @@ -465,7 +465,7 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( bike_image_numpy: np.ndarray, ) -> None: from inference_models import AutoModel - from inference_models.models.common.trt import TRTCudaGraphLRUCache + from inference_models.models.common.trt import TRTCudaGraphCache model = AutoModel.from_pretrained( model_id_or_path=rfdetr_nano_t4_trt_package, @@ -478,7 +478,7 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( outputs = [] for pre_processed in [pre_processed_1, pre_processed_2]: no_graph = model.forward(pre_processed, use_cuda_graph=False) - model._trt_cuda_graph_cache = TRTCudaGraphLRUCache(capacity=16) + model._trt_cuda_graph_cache = TRTCudaGraphCache(capacity=16) capture_graph = model.forward(pre_processed, use_cuda_graph=True) replay_graph = model.forward(pre_processed, use_cuda_graph=True) @@ -508,6 +508,7 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( atol=1e-6, ) + @pytest.mark.slow @pytest.mark.trt_extras def test_trt_outputs_match_expected_shapes( @@ -515,7 +516,7 @@ def test_trt_outputs_match_expected_shapes( dog_image_numpy: np.ndarray, ) -> None: from inference_models import AutoModel - from inference_models.models.common.trt import TRTCudaGraphLRUCache + from inference_models.models.common.trt import TRTCudaGraphCache model = AutoModel.from_pretrained( model_id_or_path=rfdetr_nano_t4_trt_package, @@ -529,12 +530,12 @@ def test_trt_outputs_match_expected_shapes( assert output[0].shape == (1, 300, 4) assert output[1].shape == (1, 300, 91) - output = 
model.forward(pre_processed, use_cuda_graph=True) # capture + output = model.forward(pre_processed, use_cuda_graph=True) # capture assert output[0].shape == (1, 300, 4) assert output[1].shape == (1, 300, 91) - output = model.forward(pre_processed, use_cuda_graph=True) # replay + output = model.forward(pre_processed, use_cuda_graph=True) # replay assert output[0].shape == (1, 300, 4) - assert output[1].shape == (1, 300, 91) \ No newline at end of file + assert output[1].shape == (1, 300, 91) diff --git a/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py b/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py index ab6972febc..3dbbd8bb38 100644 --- a/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py @@ -265,7 +265,6 @@ def test_trt_package_torch_batch( assert 16050 <= predictions[1].mask.cpu().sum().item() <= 16100 - @pytest.mark.slow @pytest.mark.trt_extras def test_trt_cudagraph_output_matches_non_cudagraph_output( @@ -274,7 +273,7 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( dog_image_numpy: np.ndarray, ) -> None: from inference_models import AutoModel - from inference_models.models.common.trt import TRTCudaGraphLRUCache + from inference_models.models.common.trt import TRTCudaGraphCache model = AutoModel.from_pretrained( model_id_or_path=rfdetr_seg_nano_t4_trt_package, @@ -287,7 +286,7 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( outputs = [] for pre_processed in [pre_processed_1, pre_processed_2]: no_graph = model.forward(pre_processed, use_cuda_graph=False) - model._trt_cuda_graph_cache = TRTCudaGraphLRUCache(capacity=16) + model._trt_cuda_graph_cache = TRTCudaGraphCache(capacity=16) capture_graph = model.forward(pre_processed, use_cuda_graph=True) replay_graph = model.forward(pre_processed, use_cuda_graph=True) diff --git 
a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py index 2ba7b0d8d4..b03f3954fa 100644 --- a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py @@ -451,7 +451,6 @@ def test_trt_package_torch_batch( ) - @pytest.mark.slow @pytest.mark.trt_extras def test_trt_cudagraph_cache_reuses_previously_seen_input_shapes( @@ -459,7 +458,7 @@ def test_trt_cudagraph_cache_reuses_previously_seen_input_shapes( dog_image_numpy: np.ndarray, ) -> None: from inference_models import AutoModel - from inference_models.models.common.trt import TRTCudaGraphLRUCache + from inference_models.models.common.trt import TRTCudaGraphCache device = torch.device("cuda:0") model = AutoModel.from_pretrained( @@ -468,7 +467,7 @@ def test_trt_cudagraph_cache_reuses_previously_seen_input_shapes( ) pre_processed_single, _ = model.pre_process(dog_image_numpy) - model._trt_cuda_graph_cache = TRTCudaGraphLRUCache(capacity=16) + model._trt_cuda_graph_cache = TRTCudaGraphCache(capacity=16) seen_shapes = set() capture_outputs = {} @@ -506,7 +505,7 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( dog_image_numpy: np.ndarray, ) -> None: from inference_models import AutoModel - from inference_models.models.common.trt import TRTCudaGraphLRUCache + from inference_models.models.common.trt import TRTCudaGraphCache device = torch.device("cuda:0") model = AutoModel.from_pretrained( @@ -520,7 +519,7 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( no_graph = model.forward(batch, use_cuda_graph=False) - model._trt_cuda_graph_cache = TRTCudaGraphLRUCache(capacity=16) + model._trt_cuda_graph_cache = TRTCudaGraphCache(capacity=16) capture_graph = model.forward(batch, use_cuda_graph=True) replay_graph = 
model.forward(batch, use_cuda_graph=True) @@ -535,7 +534,7 @@ def test_trt_cudagraph_cache_eviction( dog_image_numpy: np.ndarray, ) -> None: from inference_models import AutoModel - from inference_models.models.common.trt import TRTCudaGraphLRUCache + from inference_models.models.common.trt import TRTCudaGraphCache device = torch.device("cuda:0") model = AutoModel.from_pretrained( @@ -545,7 +544,7 @@ def test_trt_cudagraph_cache_eviction( pre_processed_single, _ = model.pre_process(dog_image_numpy) capacity = 3 - model._trt_cuda_graph_cache = TRTCudaGraphLRUCache(capacity=capacity) + model._trt_cuda_graph_cache = TRTCudaGraphCache(capacity=capacity) cache = model._trt_cuda_graph_cache batch_sizes = [1, 2, 3] @@ -573,7 +572,11 @@ def test_trt_cudagraph_cache_eviction( model.forward(batch_5, use_cuda_graph=True) assert len(cache.cache) == capacity - key_3 = (tuple(pre_processed_single.repeat(3, 1, 1, 1).shape), batch_2.dtype, device) + key_3 = ( + tuple(pre_processed_single.repeat(3, 1, 1, 1).shape), + batch_2.dtype, + device, + ) assert key_3 not in cache.cache remaining_keys = list(cache.cache.keys()) From 44030c2b69371b87edf28f4046d024ba2c32e83d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20P=C4=99czek?= Date: Wed, 18 Mar 2026 18:17:52 +0100 Subject: [PATCH 38/50] Fix baseline TRT tests --- .../models/test_rfdetr_predictions_trt.py | 16 ++--- .../models/test_rfdetr_seg_predictions_trt.py | 10 ++-- ...yolov8_object_detection_predictions_trt.py | 59 +++++++++---------- 3 files changed, 43 insertions(+), 42 deletions(-) diff --git a/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py b/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py index 519bff72f4..d05af338e8 100644 --- a/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py @@ -467,9 +467,11 @@ def 
test_trt_cudagraph_output_matches_non_cudagraph_output( from inference_models import AutoModel from inference_models.models.common.trt import TRTCudaGraphCache + trt_cuda_graph_cache = TRTCudaGraphCache(capacity=16) model = AutoModel.from_pretrained( model_id_or_path=rfdetr_nano_t4_trt_package, device=torch.device("cuda:0"), + trt_cuda_graph_cache=trt_cuda_graph_cache, ) pre_processed_1, _ = model.pre_process(dog_image_numpy) @@ -477,10 +479,9 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( outputs = [] for pre_processed in [pre_processed_1, pre_processed_2]: - no_graph = model.forward(pre_processed, use_cuda_graph=False) - model._trt_cuda_graph_cache = TRTCudaGraphCache(capacity=16) - capture_graph = model.forward(pre_processed, use_cuda_graph=True) - replay_graph = model.forward(pre_processed, use_cuda_graph=True) + no_graph = model.forward(pre_processed, disable_cuda_graphs=True) + capture_graph = model.forward(pre_processed) + replay_graph = model.forward(pre_processed) outputs.append((no_graph, capture_graph, replay_graph)) @@ -518,6 +519,7 @@ def test_trt_outputs_match_expected_shapes( from inference_models import AutoModel from inference_models.models.common.trt import TRTCudaGraphCache + trt_cuda_graph_cache = TRTCudaGraphCache(capacity=16) model = AutoModel.from_pretrained( model_id_or_path=rfdetr_nano_t4_trt_package, device=torch.device("cuda:0"), @@ -525,17 +527,17 @@ def test_trt_outputs_match_expected_shapes( pre_processed, _ = model.pre_process(dog_image_numpy) - output = model.forward(pre_processed, use_cuda_graph=False) + output = model.forward(pre_processed, disable_cuda_graphs=True) assert output[0].shape == (1, 300, 4) assert output[1].shape == (1, 300, 91) - output = model.forward(pre_processed, use_cuda_graph=True) # capture + output = model.forward(pre_processed) # capture assert output[0].shape == (1, 300, 4) assert output[1].shape == (1, 300, 91) - output = model.forward(pre_processed, use_cuda_graph=True) # replay + output = 
model.forward(pre_processed) # replay assert output[0].shape == (1, 300, 4) assert output[1].shape == (1, 300, 91) diff --git a/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py b/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py index 3dbbd8bb38..52644815d6 100644 --- a/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py @@ -275,9 +275,11 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( from inference_models import AutoModel from inference_models.models.common.trt import TRTCudaGraphCache + trt_cuda_graph_cache = TRTCudaGraphCache(capacity=16) model = AutoModel.from_pretrained( model_id_or_path=rfdetr_seg_nano_t4_trt_package, device=torch.device("cuda:0"), + trt_cuda_graph_cache=trt_cuda_graph_cache, ) pre_processed_1, _ = model.pre_process(snake_image_numpy) @@ -285,11 +287,9 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( outputs = [] for pre_processed in [pre_processed_1, pre_processed_2]: - no_graph = model.forward(pre_processed, use_cuda_graph=False) - model._trt_cuda_graph_cache = TRTCudaGraphCache(capacity=16) - capture_graph = model.forward(pre_processed, use_cuda_graph=True) - replay_graph = model.forward(pre_processed, use_cuda_graph=True) - + no_graph = model.forward(pre_processed, disable_cuda_graphs=True) + capture_graph = model.forward(pre_processed) + replay_graph = model.forward(pre_processed) outputs.append((no_graph, capture_graph, replay_graph)) for image_outputs in outputs: diff --git a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py index b03f3954fa..51248d45cd 100644 --- a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py +++ 
b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py @@ -461,13 +461,14 @@ def test_trt_cudagraph_cache_reuses_previously_seen_input_shapes( from inference_models.models.common.trt import TRTCudaGraphCache device = torch.device("cuda:0") + trt_cuda_graph_cache = TRTCudaGraphCache(capacity=16) model = AutoModel.from_pretrained( model_id_or_path=yolov8n_640_t4_trt_package, device=device, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) pre_processed_single, _ = model.pre_process(dog_image_numpy) - model._trt_cuda_graph_cache = TRTCudaGraphCache(capacity=16) seen_shapes = set() capture_outputs = {} @@ -477,14 +478,11 @@ def test_trt_cudagraph_cache_reuses_previously_seen_input_shapes( batch = pre_processed_single.repeat(batch_size, 1, 1, 1) cache_key = (tuple(batch.shape), batch.dtype, device) - cache_before = model._trt_cuda_graph_cache - cache_size_before = len(cache_before.cache) if cache_before is not None else 0 + cache_size_before = len(trt_cuda_graph_cache.cache) - output = model.forward(batch, use_cuda_graph=True) + output = model.forward(batch) - cache_after = model._trt_cuda_graph_cache - assert cache_after is not None - cache_size_after = len(cache_after.cache) + cache_size_after = len(trt_cuda_graph_cache.cache) if cache_key not in seen_shapes: assert cache_size_after == cache_size_before + 1 @@ -495,7 +493,7 @@ def test_trt_cudagraph_cache_reuses_previously_seen_input_shapes( assert cache_size_after == cache_size_before assert torch.allclose(capture_outputs[cache_key], output, atol=1e-6) - assert set(model._trt_cuda_graph_cache.cache.keys()) == seen_shapes + assert set(trt_cuda_graph_cache.cache.keys()) == seen_shapes @pytest.mark.slow @@ -508,20 +506,21 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( from inference_models.models.common.trt import TRTCudaGraphCache device = torch.device("cuda:0") + trt_cuda_graph_cache = TRTCudaGraphCache(capacity=16) model = AutoModel.from_pretrained( 
model_id_or_path=yolov8n_640_t4_trt_package, device=device, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) pre_processed_single, _ = model.pre_process(dog_image_numpy) for batch_size in [1, 4]: batch = pre_processed_single.repeat(batch_size, 1, 1, 1) - no_graph = model.forward(batch, use_cuda_graph=False) + no_graph = model.forward(batch, disable_cuda_graphs=True) - model._trt_cuda_graph_cache = TRTCudaGraphCache(capacity=16) - capture_graph = model.forward(batch, use_cuda_graph=True) - replay_graph = model.forward(batch, use_cuda_graph=True) + capture_graph = model.forward(batch) + replay_graph = model.forward(batch) assert torch.allclose(no_graph, capture_graph, atol=1e-6) assert torch.allclose(no_graph, replay_graph, atol=1e-6) @@ -537,53 +536,53 @@ def test_trt_cudagraph_cache_eviction( from inference_models.models.common.trt import TRTCudaGraphCache device = torch.device("cuda:0") + trt_cuda_graph_cache = TRTCudaGraphCache(capacity=3) model = AutoModel.from_pretrained( model_id_or_path=yolov8n_640_t4_trt_package, device=device, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) pre_processed_single, _ = model.pre_process(dog_image_numpy) - capacity = 3 - model._trt_cuda_graph_cache = TRTCudaGraphCache(capacity=capacity) - cache = model._trt_cuda_graph_cache batch_sizes = [1, 2, 3] for bs in batch_sizes: batch = pre_processed_single.repeat(bs, 1, 1, 1) - model.forward(batch, use_cuda_graph=True) + model.forward(batch) - assert len(cache.cache) == capacity - keys_before = list(cache.cache.keys()) + assert len(trt_cuda_graph_cache.cache) == 3 + keys_before = list(trt_cuda_graph_cache.list_keys()) batch_4 = pre_processed_single.repeat(4, 1, 1, 1) - model.forward(batch_4, use_cuda_graph=True) + model.forward(batch_4) - assert len(cache.cache) == capacity - assert keys_before[0] not in cache.cache + assert len(trt_cuda_graph_cache.cache) == 3 + keys_after = trt_cuda_graph_cache.list_keys() + assert keys_before[0] not in keys_after for key in keys_before[1:]: - assert 
key in cache.cache + assert key in keys_after key_4 = (tuple(batch_4.shape), batch_4.dtype, device) - assert key_4 in cache.cache + assert key_4 in trt_cuda_graph_cache batch_2 = pre_processed_single.repeat(2, 1, 1, 1) - model.forward(batch_2, use_cuda_graph=True) + model.forward(batch_2) batch_5 = pre_processed_single.repeat(5, 1, 1, 1) - model.forward(batch_5, use_cuda_graph=True) + model.forward(batch_5) - assert len(cache.cache) == capacity + assert trt_cuda_graph_cache.get_current_size() == 3 key_3 = ( tuple(pre_processed_single.repeat(3, 1, 1, 1).shape), batch_2.dtype, device, ) - assert key_3 not in cache.cache + remaining_keys = trt_cuda_graph_cache.list_keys() + assert key_3 not in remaining_keys - remaining_keys = list(cache.cache.keys()) key_2 = (tuple(batch_2.shape), batch_2.dtype, device) key_5 = (tuple(batch_5.shape), batch_5.dtype, device) assert remaining_keys == [key_4, key_2, key_5] - no_graph = model.forward(batch_5, use_cuda_graph=False) - replay = model.forward(batch_5, use_cuda_graph=True) + no_graph = model.forward(batch_5, disable_cuda_graphs=True) + replay = model.forward(batch_5) assert torch.allclose(no_graph, replay, atol=1e-6) From 917def0f05674e7b90356c2d22525ed4a357f7a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20P=C4=99czek?= Date: Wed, 18 Mar 2026 18:18:47 +0100 Subject: [PATCH 39/50] Bump version --- inference_models/pyproject.toml | 2 +- inference_models/uv.lock | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/inference_models/pyproject.toml b/inference_models/pyproject.toml index 37aedc9c11..e90907440c 100644 --- a/inference_models/pyproject.toml +++ b/inference_models/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "inference-models" -version = "0.20.2" +version = "0.21.0rc1" description = "The new inference engine for Computer Vision models" readme = "README.md" requires-python = ">=3.10,<3.13" diff --git a/inference_models/uv.lock b/inference_models/uv.lock index f539a595de..3470775db6 100644 --- 
a/inference_models/uv.lock +++ b/inference_models/uv.lock @@ -916,7 +916,7 @@ wheels = [ [[package]] name = "inference-models" -version = "0.20.2" +version = "0.21.0rc1" source = { virtual = "." } dependencies = [ { name = "accelerate" }, From 002a4e451de532e64f741f288e31a4f8f48f994e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20P=C4=99czek?= Date: Wed, 18 Mar 2026 18:34:50 +0100 Subject: [PATCH 40/50] Extend tests with multi-forward-pass cases to see if predictions matches with cuda graphs used --- ...ation_tests_inference_experimental_gpu.yml | 12 ++-- .../test_resnet_classifier_predictions_trt.py | 48 +++++++++++++ .../models/test_rfdetr_predictions_trt.py | 65 +++++++++++++++++ .../models/test_rfdetr_seg_predictions_trt.py | 42 +++++++++++ .../test_vit_classifier_predictions_trt.py | 48 +++++++++++++ ...6_instance_segmentation_predictions_trt.py | 42 +++++++++++ ...o26_keypoints_detection_predictions_trt.py | 44 ++++++++++++ ...yolo26_object_detection_predictions_trt.py | 69 +++++++++++++++++++ .../models/test_yolonas_predictions_trt.py | 69 +++++++++++++++++++ ...olov10_object_detection_predictions_trt.py | 41 +++++++++++ ...8_instance_segmentation_predictions_trt.py | 42 +++++++++++ ...ov8_keypoints_detection_predictions_trt.py | 42 +++++++++++ ...yolov8_object_detection_predictions_trt.py | 69 ++++++++++++++++++- 13 files changed, 626 insertions(+), 7 deletions(-) diff --git a/.github/workflows/integration_tests_inference_experimental_gpu.yml b/.github/workflows/integration_tests_inference_experimental_gpu.yml index af092f41f8..12b9f240dc 100644 --- a/.github/workflows/integration_tests_inference_experimental_gpu.yml +++ b/.github/workflows/integration_tests_inference_experimental_gpu.yml @@ -15,6 +15,7 @@ on: - '' - onnx_extras - trt_extras + - trt_extras_with_cuda_graphs - torch_models - hf_vlm_models python_version: @@ -34,10 +35,11 @@ jobs: matrix: python-version: ["3.12"] extras: - - { install: "onnx-cu12,mediapipe", marker: "onnx_extras", 
workers: "auto" } - - { install: "trt10", marker: "trt_extras", workers: "auto" } - - { install: "torch-cu124,mediapipe", marker: "torch_models", workers: "1" } - - { install: "torch-cu124", marker: "hf_vlm_models", workers: "1" } + - { install: "onnx-cu12,mediapipe", marker: "onnx_extras", workers: "auto", enable_auto_cuda_graphs_for_trt: "false" } + - { install: "trt10", marker: "trt_extras", workers: "auto", enable_auto_cuda_graphs_for_trt: "false" } + - { install: "trt10", marker: "trt_extras_with_cuda_graphs", workers: "auto", enable_auto_cuda_graphs_for_trt: "true" } + - { install: "torch-cu124,mediapipe", marker: "torch_models", workers: "1", "enable_auto_cuda_graphs_for_trt": "false" } + - { install: "torch-cu124", marker: "hf_vlm_models", workers: "1", "enable_auto_cuda_graphs_for_trt": "false" } steps: - name: 🛎️ Checkout if: ${{ (github.event.inputs.extras == '' || github.event.inputs.extras == matrix.extras.marker) && (github.event.inputs.python_version == '' || github.event.inputs.python_version == matrix.python-version) }} @@ -107,4 +109,4 @@ jobs: timeout-minutes: 25 run: | source .venv/bin/activate - python -m pytest -n ${{ matrix.extras.workers }} -m "${{ matrix.extras.marker }} and not cpu_only" tests/integration_tests + ENABLE_AUTO_CUDA_GRAPHS_FOR_TRT_BACKEND=${{ matrix.extras.enable_auto_cuda_graphs_for_trt }} python -m pytest -n ${{ matrix.extras.workers }} -m "${{ matrix.extras.marker }} and not cpu_only" tests/integration_tests diff --git a/inference_models/tests/integration_tests/models/test_resnet_classifier_predictions_trt.py b/inference_models/tests/integration_tests/models/test_resnet_classifier_predictions_trt.py index 528135627d..9c0b059f6e 100644 --- a/inference_models/tests/integration_tests/models/test_resnet_classifier_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_resnet_classifier_predictions_trt.py @@ -73,6 +73,30 @@ def test_single_label_trt_package_torch( assert abs(predictions.confidence[0, 
2].item() - 0.9999516010284424) < 1e-3 +@pytest.mark.slow +@pytest.mark.trt_extras +def test_single_label_trt_package_torch_multiple_predictions_in_row( + resnet_single_label_cls_trt_package: str, + bike_image_torch: np.ndarray, +) -> None: + # given + from inference_models.models.resnet.resnet_classification_trt import ( + ResNetForClassificationTRT, + ) + + model = ResNetForClassificationTRT.from_pretrained( + model_name_or_path=resnet_single_label_cls_trt_package, + engine_host_code_allowed=True, + ) + + for _ in range(8): + # when + predictions = model(bike_image_torch) + + # then + assert abs(predictions.confidence[0, 2].item() - 0.9999516010284424) < 1e-3 + + @pytest.mark.slow @pytest.mark.trt_extras def test_single_label_trt_package_torch_list( @@ -191,6 +215,30 @@ def test_multi_label_trt_package_torch( assert abs(predictions[0].confidence[2].item() - 0.99951171875) < 1e-3 +@pytest.mark.slow +@pytest.mark.trt_extras +def test_multi_label_trt_package_torch_multiple_predictions_in_row( + resnet_multi_label_cls_trt_package: str, + dog_image_torch: torch.Tensor, +) -> None: + # given + from inference_models.models.resnet.resnet_classification_trt import ( + ResNetForMultiLabelClassificationTRT, + ) + + model = ResNetForMultiLabelClassificationTRT.from_pretrained( + model_name_or_path=resnet_multi_label_cls_trt_package, + engine_host_code_allowed=True, + ) + + for _ in range(8): + # when + predictions = model(dog_image_torch) + + # then + assert abs(predictions[0].confidence[2].item() - 0.99951171875) < 1e-3 + + @pytest.mark.slow @pytest.mark.trt_extras def test_multi_label_trt_package_torch_list( diff --git a/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py b/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py index d05af338e8..feb75b1507 100644 --- a/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py +++ 
b/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py @@ -243,6 +243,71 @@ def test_trt_package_torch( ) +@pytest.mark.slow +@pytest.mark.trt_extras +def test_trt_package_torch_multiple_predictions_in_row( + rfdetr_coin_counting_trt_package: str, + coins_counting_image_torch: torch.Tensor, +) -> None: + # given + from inference_models.models.rfdetr.rfdetr_object_detection_trt import ( + RFDetrForObjectDetectionTRT, + ) + + model = RFDetrForObjectDetectionTRT.from_pretrained( + model_name_or_path=rfdetr_coin_counting_trt_package, + engine_host_code_allowed=True, + ) + + for _ in range(8): + # when + predictions = model(coins_counting_image_torch) + + # then + assert torch.allclose( + predictions[0].confidence.cpu(), + torch.tensor( + [ + 0.9815, + 0.9674, + 0.9638, + 0.9620, + 0.9584, + 0.9565, + 0.9560, + 0.9543, + 0.9520, + 0.9491, + ] + ).cpu(), + atol=0.01, + ) + assert torch.allclose( + predictions[0].class_id.cpu(), + torch.tensor([4, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=torch.int32).cpu(), + ) + expected_xyxy = torch.tensor( + [ + [1323, 533, 3071, 1970], + [1708, 2572, 1887, 2760], + [1172, 2635, 1372, 2850], + [1744, 2296, 1914, 2472], + [1464, 2305, 1627, 2475], + [1255, 2063, 1423, 2233], + [1091, 2354, 1253, 2524], + [1508, 1884, 1721, 2093], + [929, 1843, 1091, 2004], + [2681, 802, 2867, 976], + ], + dtype=torch.int32, + ) + assert torch.allclose( + predictions[0].xyxy.cpu(), + expected_xyxy.cpu(), + atol=5, + ) + + @pytest.mark.slow @pytest.mark.trt_extras def test_trt_package_torch_list( diff --git a/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py b/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py index 52644815d6..04befce4c7 100644 --- a/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py @@ -145,6 +145,48 @@ def test_trt_package_torch( 
assert 16050 <= predictions[0].mask.cpu().sum().item() <= 16100 +@pytest.mark.slow +@pytest.mark.trt_extras +def test_trt_package_torch_multiple_predictions_in_row( + rfdetr_seg_asl_trt_package: str, + asl_image_torch: torch.Tensor, +) -> None: + # given + from inference_models.models.rfdetr.rfdetr_instance_segmentation_trt import ( + RFDetrForInstanceSegmentationTRT, + ) + + model = RFDetrForInstanceSegmentationTRT.from_pretrained( + model_name_or_path=rfdetr_seg_asl_trt_package, + engine_host_code_allowed=True, + ) + + for _ in range(8): + # when + predictions = model(asl_image_torch) + + # then + assert torch.allclose( + predictions[0].confidence.cpu(), + torch.tensor([0.9527]).cpu(), + atol=0.01, + ) + assert torch.allclose( + predictions[0].class_id.cpu(), + torch.tensor([20], dtype=torch.int32).cpu(), + ) + expected_xyxy = torch.tensor( + [[63, 173, 187, 374]], + dtype=torch.int32, + ) + assert torch.allclose( + predictions[0].xyxy.cpu(), + expected_xyxy.cpu(), + atol=5, + ) + assert 16050 <= predictions[0].mask.cpu().sum().item() <= 16100 + + @pytest.mark.slow @pytest.mark.trt_extras def test_trt_package_torch_list( diff --git a/inference_models/tests/integration_tests/models/test_vit_classifier_predictions_trt.py b/inference_models/tests/integration_tests/models/test_vit_classifier_predictions_trt.py index 5ea6481333..70b6985ae3 100644 --- a/inference_models/tests/integration_tests/models/test_vit_classifier_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_vit_classifier_predictions_trt.py @@ -73,6 +73,30 @@ def test_single_label_trt_package_torch( assert abs(predictions.confidence[0, 2].item() - 0.7300973534584045) < 2e-2 +@pytest.mark.slow +@pytest.mark.trt_extras +def test_single_label_trt_package_torch_multiple_predictions_in_row( + vit_single_label_cls_trt_package: str, + bike_image_torch: np.ndarray, +) -> None: + # given + from inference_models.models.vit.vit_classification_trt import ( + VITForClassificationTRT, + ) + + 
model = VITForClassificationTRT.from_pretrained( + model_name_or_path=vit_single_label_cls_trt_package, + engine_host_code_allowed=True, + ) + + for _ in range(8): + # when + predictions = model(bike_image_torch) + + # then + assert abs(predictions.confidence[0, 2].item() - 0.7300973534584045) < 2e-2 + + @pytest.mark.slow @pytest.mark.trt_extras def test_single_label_trt_package_torch_list( @@ -191,6 +215,30 @@ def test_multi_label_trt_package_torch( assert abs(predictions[0].confidence[2].item() - 0.833984375) < 1e-3 +@pytest.mark.slow +@pytest.mark.trt_extras +def test_multi_label_trt_package_torch_multiple_predictions_in_row( + vit_multi_label_cls_trt_package: str, + dog_image_torch: torch.Tensor, +) -> None: + # given + from inference_models.models.vit.vit_classification_trt import ( + VITForMultiLabelClassificationTRT, + ) + + model = VITForMultiLabelClassificationTRT.from_pretrained( + model_name_or_path=vit_multi_label_cls_trt_package, + engine_host_code_allowed=True, + ) + + for _ in range(8): + # when + predictions = model(dog_image_torch) + + # then + assert abs(predictions[0].confidence[2].item() - 0.833984375) < 1e-3 + + @pytest.mark.slow @pytest.mark.trt_extras def test_multi_label_trt_package_torch_list( diff --git a/inference_models/tests/integration_tests/models/test_yolo26_instance_segmentation_predictions_trt.py b/inference_models/tests/integration_tests/models/test_yolo26_instance_segmentation_predictions_trt.py index 65873c080a..14f32ad0a9 100644 --- a/inference_models/tests/integration_tests/models/test_yolo26_instance_segmentation_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_yolo26_instance_segmentation_predictions_trt.py @@ -145,6 +145,48 @@ def test_trt_package_torch( assert 16500 <= predictions[0].mask.cpu().sum().item() <= 16600 +@pytest.mark.slow +@pytest.mark.trt_extras +def test_trt_package_torch_multiple_predictions_in_row( + yolo26_seg_asl_trt_package: str, + asl_image_torch: torch.Tensor, +) -> None: 
+ # given + from inference_models.models.yolo26.yolo26_instance_segmentation_trt import ( + YOLO26ForInstanceSegmentationTRT, + ) + + model = YOLO26ForInstanceSegmentationTRT.from_pretrained( + model_name_or_path=yolo26_seg_asl_trt_package, + engine_host_code_allowed=True, + ) + + # when + for _ in range(8): + predictions = model(asl_image_torch) + + # then + assert torch.allclose( + predictions[0].confidence.cpu(), + torch.tensor([0.9671]).cpu(), + atol=0.01, + ) + assert torch.allclose( + predictions[0].class_id.cpu(), + torch.tensor([20], dtype=torch.int32).cpu(), + ) + expected_xyxy = torch.tensor( + [[63, 174, 186, 368]], + dtype=torch.int32, + ) + assert torch.allclose( + predictions[0].xyxy.cpu(), + expected_xyxy.cpu(), + atol=5, + ) + assert 16500 <= predictions[0].mask.cpu().sum().item() <= 16600 + + @pytest.mark.slow @pytest.mark.trt_extras def test_trt_package_torch_list( diff --git a/inference_models/tests/integration_tests/models/test_yolo26_keypoints_detection_predictions_trt.py b/inference_models/tests/integration_tests/models/test_yolo26_keypoints_detection_predictions_trt.py index ce74b631f6..c4d8083077 100644 --- a/inference_models/tests/integration_tests/models/test_yolo26_keypoints_detection_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_yolo26_keypoints_detection_predictions_trt.py @@ -144,6 +144,50 @@ def test_trt_package_torch( assert abs(predictions[0][0].confidence.sum().item() - 26.268831253051758) < 1e-2 +@pytest.mark.slow +@pytest.mark.trt_extras +def test_trt_package_torch_multiple_predictions_in_row( + yolo26_pose_trt_package: str, + people_walking_image_torch: torch.Tensor, +) -> None: + # given + from inference_models.models.yolo26.yolo26_key_points_detection_trt import ( + YOLO26ForKeyPointsDetectionTRT, + ) + + model = YOLO26ForKeyPointsDetectionTRT.from_pretrained( + model_name_or_path=yolo26_pose_trt_package, + engine_host_code_allowed=True, + ) + + for _ in range(8): + # when + predictions = 
model(people_walking_image_torch) + + # then + assert torch.allclose( + predictions[1][0].confidence.cpu(), + torch.tensor([0.9271, 0.9230]).cpu(), + atol=0.01, + ) + assert torch.allclose( + predictions[1][0].class_id.cpu(), + torch.tensor([0, 0], dtype=torch.int32).cpu(), + ) + expected_xyxy = torch.tensor( + [[353, 129, 539, 758], [618, 123, 822, 771]], + dtype=torch.int32, + ) + assert torch.allclose( + predictions[1][0].xyxy.cpu(), + expected_xyxy.cpu(), + atol=5, + ) + assert ( + abs(predictions[0][0].confidence.sum().item() - 26.268831253051758) < 1e-2 + ) + + @pytest.mark.slow @pytest.mark.trt_extras def test_trt_package_torch_list( diff --git a/inference_models/tests/integration_tests/models/test_yolo26_object_detection_predictions_trt.py b/inference_models/tests/integration_tests/models/test_yolo26_object_detection_predictions_trt.py index ddd5823858..811f32f9cb 100644 --- a/inference_models/tests/integration_tests/models/test_yolo26_object_detection_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_yolo26_object_detection_predictions_trt.py @@ -247,6 +247,75 @@ def test_trt_package_torch( ) +@pytest.mark.slow +@pytest.mark.trt_extras +def test_trt_package_torch_multiple_predictions_in_row( + yolo26_object_detections_coin_counting_trt_package: str, + coins_counting_image_torch: torch.Tensor, +) -> None: + # given + from inference_models.models.yolo26.yolo26_object_detection_trt import ( + YOLO26ForObjectDetectionTRT, + ) + + model = YOLO26ForObjectDetectionTRT.from_pretrained( + model_name_or_path=yolo26_object_detections_coin_counting_trt_package, + engine_host_code_allowed=True, + ) + + # when + for _ in range(8): + predictions = model(coins_counting_image_torch) + + # then + assert torch.allclose( + predictions[0].confidence.cpu(), + torch.tensor( + [ + 0.9837, + 0.9707, + 0.9196, + 0.8495, + 0.8418, + 0.8408, + 0.5737, + 0.4922, + 0.4282, + 0.4273, + 0.2606, + ] + ).cpu(), + atol=0.01, + ) + assert torch.allclose( + 
predictions[0].class_id.cpu(), + torch.tensor([2, 2, 2, 1, 3, 0, 0, 0, 3, 1, 3], dtype=torch.int32).cpu(), + ) + expected_xyxy = torch.tensor( + [ + [ + [1252, 2049, 1431, 2241], + [1741, 2286, 1921, 2480], + [1707, 2565, 1896, 2770], + [1164, 2624, 1382, 2856], + [1502, 1867, 1728, 2096], + [1459, 2296, 1633, 2476], + [923, 1836, 1100, 2009], + [1090, 2346, 1268, 2525], + [1256, 2059, 1425, 2234], + [1164, 2626, 1381, 2857], + [2671, 792, 2875, 979], + ] + ], + dtype=torch.int32, + ) + assert torch.allclose( + predictions[0].xyxy.cpu(), + expected_xyxy.cpu(), + atol=5, + ) + + @pytest.mark.slow @pytest.mark.trt_extras def test_trt_package_torch_list( diff --git a/inference_models/tests/integration_tests/models/test_yolonas_predictions_trt.py b/inference_models/tests/integration_tests/models/test_yolonas_predictions_trt.py index 21dadc4b49..ce7a2c2a2e 100644 --- a/inference_models/tests/integration_tests/models/test_yolonas_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_yolonas_predictions_trt.py @@ -253,6 +253,75 @@ def test_trt_package_torch( ) +@pytest.mark.slow +@pytest.mark.trt_extras +def test_trt_package_torch_multiple_predictions_in_row( + yolo_nas_coin_counting_trt_package: str, + coins_counting_image_torch: torch.Tensor, +) -> None: + # given + from inference_models.models.yolonas.yolonas_object_detection_trt import ( + YOLONasForObjectDetectionTRT, + ) + + model = YOLONasForObjectDetectionTRT.from_pretrained( + model_name_or_path=yolo_nas_coin_counting_trt_package, + engine_host_code_allowed=True, + ) + + # when + for _ in range(8): + predictions = model(coins_counting_image_torch) + + # then + assert torch.allclose( + predictions[0].confidence.cpu(), + torch.tensor( + [ + 0.8929, + 0.8762, + 0.8625, + 0.8573, + 0.8434, + 0.7718, + 0.7705, + 0.7628, + 0.6723, + 0.6343, + 0.4533, + 0.4388, + ] + ).cpu(), + atol=0.01, + ) + assert torch.allclose( + predictions[0].class_id.cpu(), + torch.tensor([2, 1, 0, 0, 0, 0, 3, 3, 2, 2, 
0, 1], dtype=torch.int32).cpu(), + ) + expected_xyxy = torch.tensor( + [ + [1693, 2548, 1910, 2774], + [1161, 2618, 1389, 2868], + [1445, 2291, 1641, 2483], + [913, 1823, 1110, 2017], + [1080, 2334, 1275, 2537], + [1727, 2285, 1931, 2482], + [2664, 763, 2887, 1001], + [1491, 1862, 1740, 2101], + [1727, 2283, 1932, 2487], + [1238, 2041, 1438, 2243], + [1485, 1864, 1743, 2106], + [1236, 2040, 1439, 2245], + ], + dtype=torch.int32, + ) + assert torch.allclose( + predictions[0].xyxy.cpu(), + expected_xyxy.cpu(), + atol=5, + ) + + @pytest.mark.slow @pytest.mark.trt_extras def test_trt_package_torch_list( diff --git a/inference_models/tests/integration_tests/models/test_yolov10_object_detection_predictions_trt.py b/inference_models/tests/integration_tests/models/test_yolov10_object_detection_predictions_trt.py index 00159c653c..e35b16c3a5 100644 --- a/inference_models/tests/integration_tests/models/test_yolov10_object_detection_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_yolov10_object_detection_predictions_trt.py @@ -255,3 +255,44 @@ def test_trt_package_torch_batch( expected_xyxy.cpu(), atol=5, ) + + +@pytest.mark.slow +@pytest.mark.trt_extras +def test_trt_package_torch_multiple_predictions_in_row( + yolov10_object_detection_trt_package: str, + dog_image_torch: torch.Tensor, +) -> None: + # given + from inference_models.models.yolov10.yolov10_object_detection_trt import ( + YOLOv10ForObjectDetectionTRT, + ) + + model = YOLOv10ForObjectDetectionTRT.from_pretrained( + model_name_or_path=yolov10_object_detection_trt_package, + engine_host_code_allowed=True, + ) + + # when + for _ in range(8): + predictions = model(dog_image_torch) + + # then + assert torch.allclose( + predictions[0].confidence.cpu(), + torch.tensor([0.5039]).cpu(), + atol=0.01, + ) + assert torch.allclose( + predictions[0].class_id.cpu(), + torch.tensor([16], dtype=torch.int32).cpu(), + ) + expected_xyxy = torch.tensor( + [[71, 253, 646, 970]], + dtype=torch.int32, + ) + 
assert torch.allclose( + predictions[0].xyxy.cpu(), + expected_xyxy.cpu(), + atol=5, + ) diff --git a/inference_models/tests/integration_tests/models/test_yolov8_instance_segmentation_predictions_trt.py b/inference_models/tests/integration_tests/models/test_yolov8_instance_segmentation_predictions_trt.py index 39a27c75df..01c6bd6ee7 100644 --- a/inference_models/tests/integration_tests/models/test_yolov8_instance_segmentation_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_yolov8_instance_segmentation_predictions_trt.py @@ -145,6 +145,48 @@ def test_trt_package_torch( assert 16100 <= predictions[0].mask.cpu().sum().item() <= 16200 +@pytest.mark.slow +@pytest.mark.trt_extras +def test_trt_package_torch_multiple_predictions_in_row( + yolov8_seg_asl_trt_package: str, + asl_image_torch: torch.Tensor, +) -> None: + # given + from inference_models.models.yolov8.yolov8_instance_segmentation_trt import ( + YOLOv8ForInstanceSegmentationTRT, + ) + + model = YOLOv8ForInstanceSegmentationTRT.from_pretrained( + model_name_or_path=yolov8_seg_asl_trt_package, + engine_host_code_allowed=True, + ) + + # when + for _ in range(8): + predictions = model(asl_image_torch) + + # then + assert torch.allclose( + predictions[0].confidence.cpu(), + torch.tensor([0.9795]).cpu(), + atol=0.01, + ) + assert torch.allclose( + predictions[0].class_id.cpu(), + torch.tensor([20], dtype=torch.int32).cpu(), + ) + expected_xyxy = torch.tensor( + [[63, 174, 187, 368]], + dtype=torch.int32, + ) + assert torch.allclose( + predictions[0].xyxy.cpu(), + expected_xyxy.cpu(), + atol=5, + ) + assert 16100 <= predictions[0].mask.cpu().sum().item() <= 16200 + + @pytest.mark.slow @pytest.mark.trt_extras def test_trt_package_torch_list( diff --git a/inference_models/tests/integration_tests/models/test_yolov8_keypoints_detection_predictions_trt.py b/inference_models/tests/integration_tests/models/test_yolov8_keypoints_detection_predictions_trt.py index 03f6e40db0..a6e60b8bd1 100644 --- 
a/inference_models/tests/integration_tests/models/test_yolov8_keypoints_detection_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_yolov8_keypoints_detection_predictions_trt.py @@ -144,6 +144,48 @@ def test_trt_package_torch( assert abs(predictions[0][0].confidence.sum().item() - 26.07147979736328) < 1e-2 +@pytest.mark.slow +@pytest.mark.trt_extras +def test_trt_package_torch_multiple_predictions_in_row( + yolov8_pose_trt_package: str, + people_walking_image_torch: torch.Tensor, +) -> None: + # given + from inference_models.models.yolov8.yolov8_key_points_detection_trt import ( + YOLOv8ForKeyPointsDetectionTRT, + ) + + model = YOLOv8ForKeyPointsDetectionTRT.from_pretrained( + model_name_or_path=yolov8_pose_trt_package, + engine_host_code_allowed=True, + ) + + for _ in range(8): + # when + predictions = model(people_walking_image_torch) + + # then + assert torch.allclose( + predictions[1][0].confidence.cpu(), + torch.tensor([0.8783, 0.8719]).cpu(), + atol=0.01, + ) + assert torch.allclose( + predictions[1][0].class_id.cpu(), + torch.tensor([0, 0], dtype=torch.int32).cpu(), + ) + expected_xyxy = torch.tensor( + [[351, 124, 540, 756], [619, 120, 824, 767]], + dtype=torch.int32, + ) + assert torch.allclose( + predictions[1][0].xyxy.cpu(), + expected_xyxy.cpu(), + atol=5, + ) + assert abs(predictions[0][0].confidence.sum().item() - 26.07147979736328) < 1e-2 + + @pytest.mark.slow @pytest.mark.trt_extras def test_trt_package_torch_list( diff --git a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py index 51248d45cd..ba580ed21e 100644 --- a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py @@ -237,6 +237,71 @@ def test_trt_package_torch( ) +@pytest.mark.slow 
+@pytest.mark.trt_extras +def test_trt_package_torch_multiple_predictions_in_row( + yolov8_coin_counting_trt_package: str, + coins_counting_image_torch: torch.Tensor, +) -> None: + # given + from inference_models.models.yolov8.yolov8_object_detection_trt import ( + YOLOv8ForObjectDetectionTRT, + ) + + model = YOLOv8ForObjectDetectionTRT.from_pretrained( + model_name_or_path=yolov8_coin_counting_trt_package, + engine_host_code_allowed=True, + ) + + # when + for _ in range(8): + predictions = model(coins_counting_image_torch) + + # then + assert torch.allclose( + predictions[0].confidence.cpu(), + torch.tensor( + [ + 0.9956, + 0.9727, + 0.9653, + 0.9468, + 0.9448, + 0.9390, + 0.9302, + 0.9287, + 0.9155, + 0.9019, + ] + ).cpu(), + atol=0.01, + ) + assert torch.allclose( + predictions[0].class_id.cpu(), + torch.tensor([4, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=torch.int32).cpu(), + ) + expected_xyxy = torch.tensor( + [ + [1304, 614, 3024, 1918], + [1714, 2571, 1884, 2759], + [2678, 806, 2866, 974], + [1744, 2294, 1914, 2469], + [1260, 2058, 1424, 2233], + [1469, 2302, 1624, 2467], + [929, 1843, 1091, 1997], + [1514, 1880, 1718, 2089], + [1177, 2632, 1374, 2846], + [1099, 2348, 1260, 2522], + ], + dtype=torch.int32, + ) + assert torch.allclose( + predictions[0].xyxy.cpu(), + expected_xyxy.cpu(), + atol=5, + ) + + @pytest.mark.slow @pytest.mark.trt_extras def test_trt_package_torch_list( @@ -478,7 +543,7 @@ def test_trt_cudagraph_cache_reuses_previously_seen_input_shapes( batch = pre_processed_single.repeat(batch_size, 1, 1, 1) cache_key = (tuple(batch.shape), batch.dtype, device) - cache_size_before = len(trt_cuda_graph_cache.cache) + cache_size_before = trt_cuda_graph_cache.get_current_size() output = model.forward(batch) @@ -550,7 +615,7 @@ def test_trt_cudagraph_cache_eviction( batch = pre_processed_single.repeat(bs, 1, 1, 1) model.forward(batch) - assert len(trt_cuda_graph_cache.cache) == 3 + assert trt_cuda_graph_cache.get_current_size() == 3 keys_before = 
list(trt_cuda_graph_cache.list_keys()) batch_4 = pre_processed_single.repeat(4, 1, 1, 1) From 6648c5cdb873851c71ee8f0cbd179b1d0bba7df9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20P=C4=99czek?= Date: Wed, 18 Mar 2026 19:08:03 +0100 Subject: [PATCH 41/50] Adjust tests and add docs --- ...ation_tests_inference_experimental_gpu.yml | 4 +- inference_models/docs/changelog.md | 9 + .../docs/how-to/use-cuda-graphs.md | 210 ++++++++++++++++++ inference_models/mkdocs.yml | 1 + .../models/test_rfdetr_predictions_trt.py | 1 + ...yolov8_object_detection_predictions_trt.py | 4 +- 6 files changed, 225 insertions(+), 4 deletions(-) create mode 100644 inference_models/docs/how-to/use-cuda-graphs.md diff --git a/.github/workflows/integration_tests_inference_experimental_gpu.yml b/.github/workflows/integration_tests_inference_experimental_gpu.yml index 12b9f240dc..60fbde84a2 100644 --- a/.github/workflows/integration_tests_inference_experimental_gpu.yml +++ b/.github/workflows/integration_tests_inference_experimental_gpu.yml @@ -28,7 +28,7 @@ on: jobs: integration-tests-inference-models-gpu: - name: ${{ matrix.extras.marker }}:${{ matrix.python-version }} + name: ${{ matrix.extras.marker }}:${{ matrix.python-version }}:cuda-graphs:${{ matrix.extras.enable_auto_cuda_graphs_for_trt }} runs-on: Roboflow-GPU-VM-Runner timeout-minutes: 30 strategy: @@ -37,7 +37,7 @@ jobs: extras: - { install: "onnx-cu12,mediapipe", marker: "onnx_extras", workers: "auto", enable_auto_cuda_graphs_for_trt: "false" } - { install: "trt10", marker: "trt_extras", workers: "auto", enable_auto_cuda_graphs_for_trt: "false" } - - { install: "trt10", marker: "trt_extras_with_cuda_graphs", workers: "auto", enable_auto_cuda_graphs_for_trt: "true" } + - { install: "trt10", marker: "trt_extras", workers: "auto", enable_auto_cuda_graphs_for_trt: "true" } - { install: "torch-cu124,mediapipe", marker: "torch_models", workers: "1", "enable_auto_cuda_graphs_for_trt": "false" } - { install: "torch-cu124", marker: 
"hf_vlm_models", workers: "1", "enable_auto_cuda_graphs_for_trt": "false" } steps: diff --git a/inference_models/docs/changelog.md b/inference_models/docs/changelog.md index 26180bc47f..0ca5d597b7 100644 --- a/inference_models/docs/changelog.md +++ b/inference_models/docs/changelog.md @@ -1,5 +1,14 @@ # Changelog +## `0.21.0` +### Added + +- Support for CUDA Graphs in TRT backend - all TRT models got upgraded - added ability to run with CUDA graphs, at +the expense of additional VRAM allocation, but with caller control on how many execution contexts for different +input shapes should be allowed. + +--- + ## `0.20.2` ### Added diff --git a/inference_models/docs/how-to/use-cuda-graphs.md b/inference_models/docs/how-to/use-cuda-graphs.md new file mode 100644 index 0000000000..b484343d57 --- /dev/null +++ b/inference_models/docs/how-to/use-cuda-graphs.md @@ -0,0 +1,210 @@ +# Using CUDA Graphs with TensorRT Models + +CUDA graphs capture a sequence of GPU operations and replay them as a single unit, eliminating per-call +CPU overhead. For TensorRT models in `inference_models`, this translates to a **7–12% FPS improvement** +on repeated inference with the same input shape. + +## Overview + +When CUDA graphs are enabled, the first `forward()` call for a given input shape captures the TensorRT +execution into a CUDA graph. Subsequent calls with the same shape replay the captured graph instead of +re-launching individual GPU kernels. Captured graphs are stored in an LRU cache keyed by +`(shape, dtype, device)`. + +CUDA graphs work with all TRT model classes that use `infer_from_trt_engine` — including object detection, +instance segmentation, keypoint detection, classification, and semantic segmentation models. 
+ +## Prerequisites + +- A CUDA-capable GPU +- TensorRT installed (brought in by `trt-*` extras of `inference-models`) +- A TRT model package (`.plan` engine file) + +## Quick Start + +The simplest way to enable CUDA graphs is through the `USE_CUDA_GRAPHS_FOR_TRT_BACKEND` environment +variable: + +```bash +export USE_CUDA_GRAPHS_FOR_TRT_BACKEND=True +``` + +With this set, all TRT models loaded via `AutoModel.from_pretrained` will automatically create a CUDA +graph cache and use it during inference. No code changes required. + +```python +import torch +from inference_models import AutoModel + +model = AutoModel.from_pretrained( + model_id_or_path="rfdetr-nano", + device=torch.device("cuda:0"), + backend="trt", +) + +# First call captures the CUDA graph for this input shape +results = model.predict(image) + +# Subsequent calls replay the captured graph — faster +results = model.predict(image) +``` + +## Manual Cache Control + +For more control over cache behavior, create a `TRTCudaGraphCache` explicitly and pass it +to `AutoModel.from_pretrained`: + +```python +import torch +from inference_models import AutoModel +from inference_models.models.common.trt import TRTCudaGraphCache + +cache = TRTCudaGraphCache(capacity=16) + +model = AutoModel.from_pretrained( + model_id_or_path="rfdetr-nano", + device=torch.device("cuda:0"), + backend="trt", + trt_cuda_graph_cache=cache, +) +``` + +The `capacity` parameter controls how many distinct input shapes can be cached simultaneously. +When the cache is full, the least recently used graph is evicted automatically. + +### Inspecting the Cache + +You can query the cache at any time to see what's been captured: + +```python +# Check how many graphs are currently cached +print(cache.get_current_size()) # e.g. 
 3

# List all cached keys — each key is a (shape, dtype, device) tuple
for key in cache.list_keys():
    shape, dtype, device = key
    print(f"  shape={shape}, dtype={dtype}, device={device}")

# Check if a specific shape is cached
key = ((1, 3, 384, 384), torch.float16, torch.device("cuda:0"))
if key in cache:
    print("Graph is cached for this shape")
```

### Removing Specific Entries

Use `safe_remove()` to evict a single cached graph by its key. This releases the associated
CUDA graph, execution context, and GPU buffers immediately. If the key doesn't exist, the
call is a no-op:

```python
key = ((1, 3, 384, 384), torch.float16, torch.device("cuda:0"))
cache.safe_remove(key)
```

### Purging the Cache

Use `purge()` to evict multiple entries at once. When called without arguments, it clears the
entire cache. You can also pass `n_oldest` to evict only the N least recently used entries:

```python
# Evict the 4 oldest (least recently used) entries
cache.purge(n_oldest=4)

# Clear the entire cache
cache.purge()
```

`purge()` is more efficient than calling `safe_remove()` in a loop because it batches the
GPU memory cleanup — `torch.cuda.empty_cache()` is called once at the end rather than after
each individual eviction.

!!! tip "When to purge manually"
    Manual purging is useful when you know the workload is about to change — for example,
    switching from processing video at one resolution to another. Purging stale entries
    frees VRAM for the new shapes before they're captured.

### Sharing a Cache Across Models

Please **do not share a single instance of `TRTCudaGraphCache`** across multiple models — the cache object is
bound to a specific model instance.

### Choosing Cache Capacity

Each cached graph holds its own TensorRT execution context and GPU memory buffers. A reasonable
default is **8–16 entries**. Consider:

- **Fixed input shape** (e.g. always 1×3×640×640): `capacity=1` is sufficient. 
+- **Variable batch sizes** (e.g. batch 1–16): set capacity to the number of distinct batch sizes + you expect, or quantize to powers of two and set `capacity=4–5`. +- **Memory-constrained environments**: lower the capacity to reduce VRAM usage. + +## Disabling CUDA Graphs Per Call + +Even with a cache configured, you can bypass CUDA graphs for individual forward passes using the +`disable_cuda_graphs` flag: + +```python +pre_processed, meta = model.pre_process(image) + +# Standard path — uses CUDA graphs if cache is configured +output = model.forward(pre_processed) + +# Bypass CUDA graphs for this specific call +output = model.forward(pre_processed, disable_cuda_graphs=True) +``` + +This is useful for debugging, benchmarking, or when you need to compare graph vs. non-graph outputs. + + +## How It Works + +The lifecycle of a CUDA graph in `inference_models`: + +1. **Cache miss** — `infer_from_trt_engine` detects that no cached graph exists for the current + `(shape, dtype, device)` key. It creates a dedicated TensorRT execution context, allocates + input/output buffers, runs a warmup pass, then captures the execution into a `torch.cuda.CUDAGraph`. + The graph and its associated state are stored in the cache. + +2. **Cache hit** — On subsequent calls with the same key, the cached graph's input buffer is updated + via `copy_()`, the graph is replayed, and output buffers are cloned and returned. No TensorRT + context setup or kernel launches happen on the CPU side. + +3. **Eviction** — When the cache exceeds its capacity, the least recently used entry is evicted. + The associated CUDA graph, execution context, and GPU buffers are released, and + `torch.cuda.empty_cache()` is called to return memory to the CUDA driver. + + +## Important Considerations + +### VRAM Usage + +Each cache entry consumes GPU memory for input buffers, output buffers, and the TensorRT execution +context's internal workspace. With large models or high cache capacities, this can be significant. 
+Monitor VRAM usage when tuning `capacity`.
+
+### Thread Safety
+
+Cache entries may be managed and evicted from a different thread than the one running the forward pass.
+The cache state is synchronized with a thread lock.
+
+### Dynamic Batch Sizes
+
+CUDA graphs are shape-specific — a graph captured for batch size 4 cannot be replayed for batch size 8.
+If your application uses variable batch sizes, each distinct size will trigger a separate graph capture.
+The LRU cache handles this transparently, but be aware that frequent shape changes will cause cache
+churn and recapture overhead.
+
+!!! tip "Quantize batch sizes for better cache utilization"
+
+    If you control the batching logic, round batch sizes up to the nearest power of two
+    (1, 2, 4, 8, 16). This reduces the number of distinct shapes and keeps the cache small.
+
+### When CUDA Graphs Won't Help
+
+- **Cold start / single inference**: The first call for each shape pays the capture cost, which is
+  slower than a normal forward pass. CUDA graphs only pay off on subsequent replays.
+- **Highly variable input shapes**: If every call has a unique shape, graphs are captured but
+  never replayed.
+- **CPU-bound pipelines**: If your bottleneck is preprocessing or postprocessing, the GPU-side
+  speedup from graph replay won't be visible end-to-end. 
diff --git a/inference_models/mkdocs.yml b/inference_models/mkdocs.yml index b68348e4e8..56b32f565c 100644 --- a/inference_models/mkdocs.yml +++ b/inference_models/mkdocs.yml @@ -103,6 +103,7 @@ nav: - Load Models Locally: how-to/local-packages.md - Understand Roboflow Model Packages: how-to/roboflow-model-packages.md - Manage Cache: how-to/cache-management.md + - Use CUDA Graphs: how-to/use-cuda-graphs.md - Contributors: - Development Environment: contributors/dev-environment.md - Core Architecture: contributors/core-architecture.md diff --git a/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py b/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py index feb75b1507..b067349920 100644 --- a/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py @@ -588,6 +588,7 @@ def test_trt_outputs_match_expected_shapes( model = AutoModel.from_pretrained( model_id_or_path=rfdetr_nano_t4_trt_package, device=torch.device("cuda:0"), + trt_cuda_graph_cache=trt_cuda_graph_cache, ) pre_processed, _ = model.pre_process(dog_image_numpy) diff --git a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py index ba580ed21e..3c3853987f 100644 --- a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py @@ -547,7 +547,7 @@ def test_trt_cudagraph_cache_reuses_previously_seen_input_shapes( output = model.forward(batch) - cache_size_after = len(trt_cuda_graph_cache.cache) + cache_size_after = trt_cuda_graph_cache.get_current_size() if cache_key not in seen_shapes: assert cache_size_after == cache_size_before + 1 @@ -621,7 +621,7 @@ def 
test_trt_cudagraph_cache_eviction( batch_4 = pre_processed_single.repeat(4, 1, 1, 1) model.forward(batch_4) - assert len(trt_cuda_graph_cache.cache) == 3 + assert trt_cuda_graph_cache.get_current_size() == 3 keys_after = trt_cuda_graph_cache.list_keys() assert keys_before[0] not in keys_after for key in keys_before[1:]: From f4a2788e0ddb6869785e1a2b6ffd213e6ad90383 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20P=C4=99czek?= Date: Wed, 18 Mar 2026 19:09:45 +0100 Subject: [PATCH 42/50] Adjust docs --- inference_models/docs/how-to/use-cuda-graphs.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inference_models/docs/how-to/use-cuda-graphs.md b/inference_models/docs/how-to/use-cuda-graphs.md index b484343d57..fa86b5d54d 100644 --- a/inference_models/docs/how-to/use-cuda-graphs.md +++ b/inference_models/docs/how-to/use-cuda-graphs.md @@ -22,11 +22,11 @@ instance segmentation, keypoint detection, classification, and semantic segmenta ## Quick Start -The simplest way to enable CUDA graphs is through the `USE_CUDA_GRAPHS_FOR_TRT_BACKEND` environment +The simplest way to enable CUDA graphs is through the `ENABLE_AUTO_CUDA_GRAPHS_FOR_TRT_BACKEND` environment variable: ```bash -export USE_CUDA_GRAPHS_FOR_TRT_BACKEND=True +export ENABLE_AUTO_CUDA_GRAPHS_FOR_TRT_BACKEND=True ``` With this set, all TRT models loaded via `AutoModel.from_pretrained` will automatically create a CUDA From a820aae23dcfa751946fc3b189ce3b0e4d6b77f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20P=C4=99czek?= Date: Wed, 18 Mar 2026 19:11:33 +0100 Subject: [PATCH 43/50] Adjust docs --- inference_models/docs/how-to/use-cuda-graphs.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inference_models/docs/how-to/use-cuda-graphs.md b/inference_models/docs/how-to/use-cuda-graphs.md index fa86b5d54d..58e1eacd8e 100644 --- a/inference_models/docs/how-to/use-cuda-graphs.md +++ b/inference_models/docs/how-to/use-cuda-graphs.md @@ -57,7 +57,7 @@ to 
`AutoModel.from_pretrained`: ```python import torch from inference_models import AutoModel -from inference_models.models.common.trt import TRTCudaGraphCache +from inference_models.developer_tools import TRTCudaGraphCache cache = TRTCudaGraphCache(capacity=16) From 4a9b62b7a922c635af0e5fd6d55980d98244fafc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20P=C4=99czek?= Date: Wed, 18 Mar 2026 19:25:59 +0100 Subject: [PATCH 44/50] Add more docs --- .../trt/establish-trt-cuda-graph-cache.md | 6 + .../trt/get-trt-engine-inputs-and-outputs.md | 2 +- .../trt/trt-cuda-graph-cache.md | 6 + .../inference_models/models/common/trt.py | 207 +++++++++++++++++- inference_models/mkdocs.yml | 2 + ...yolov8_object_detection_predictions_trt.py | 2 +- 6 files changed, 220 insertions(+), 5 deletions(-) create mode 100644 inference_models/docs/api-reference/developer-tools/trt/establish-trt-cuda-graph-cache.md create mode 100644 inference_models/docs/api-reference/developer-tools/trt/trt-cuda-graph-cache.md diff --git a/inference_models/docs/api-reference/developer-tools/trt/establish-trt-cuda-graph-cache.md b/inference_models/docs/api-reference/developer-tools/trt/establish-trt-cuda-graph-cache.md new file mode 100644 index 0000000000..3442d233ac --- /dev/null +++ b/inference_models/docs/api-reference/developer-tools/trt/establish-trt-cuda-graph-cache.md @@ -0,0 +1,6 @@ +# establish_trt_cuda_graph_cache + +::: inference_models.models.common.trt.establish_trt_cuda_graph_cache + options: + show_root_heading: true + show_source: false diff --git a/inference_models/docs/api-reference/developer-tools/trt/get-trt-engine-inputs-and-outputs.md b/inference_models/docs/api-reference/developer-tools/trt/get-trt-engine-inputs-and-outputs.md index 98179cf56c..301102ca68 100644 --- a/inference_models/docs/api-reference/developer-tools/trt/get-trt-engine-inputs-and-outputs.md +++ b/inference_models/docs/api-reference/developer-tools/trt/get-trt-engine-inputs-and-outputs.md @@ -1,4 +1,4 @@ -2# 
get_trt_engine_inputs_and_outputs +# get_trt_engine_inputs_and_outputs ::: inference_models.models.common.trt.get_trt_engine_inputs_and_outputs options: diff --git a/inference_models/docs/api-reference/developer-tools/trt/trt-cuda-graph-cache.md b/inference_models/docs/api-reference/developer-tools/trt/trt-cuda-graph-cache.md new file mode 100644 index 0000000000..e074a3c063 --- /dev/null +++ b/inference_models/docs/api-reference/developer-tools/trt/trt-cuda-graph-cache.md @@ -0,0 +1,6 @@ +# TRTCudaGraphCache + +::: inference_models.models.common.trt.TRTCudaGraphCache + options: + show_root_heading: true + show_source: false diff --git a/inference_models/inference_models/models/common/trt.py b/inference_models/inference_models/models/common/trt.py index accc1cb0ee..982f9644e5 100644 --- a/inference_models/inference_models/models/common/trt.py +++ b/inference_models/inference_models/models/common/trt.py @@ -81,6 +81,42 @@ class TRTCudaGraphState: class TRTCudaGraphCache: + + """LRU cache for captured CUDA graphs used in TensorRT inference. + + Stores captured ``torch.cuda.CUDAGraph`` objects keyed by input + ``(shape, dtype, device)`` tuples. When the cache exceeds its capacity, + the least recently used entry is evicted and its GPU resources are released. + + The cache is thread-safe — all mutating operations acquire an internal + ``threading.RLock``. + + Args: + capacity: Maximum number of CUDA graphs to store. Each entry holds + a dedicated TensorRT execution context and GPU memory buffers, + so higher values increase VRAM usage. + + Examples: + Create a cache and pass it to a model: + + >>> from inference_models.models.common.trt import TRTCudaGraphCache + >>> from inference_models import AutoModel + >>> import torch + >>> + >>> cache = TRTCudaGraphCache(capacity=16) + >>> model = AutoModel.from_pretrained( + ... model_id_or_path="rfdetr-nano", + ... device=torch.device("cuda:0"), + ... backend="trt", + ... trt_cuda_graph_cache=cache, + ... 
) + + See Also: + - ``establish_trt_cuda_graph_cache()``: Factory that creates a cache + based on environment configuration + - ``infer_from_trt_engine()``: Uses the cache during TRT inference + """ + def __init__(self, capacity: int): self._cache: OrderedDict[ Tuple[Tuple[int, ...], torch.dtype, torch.device], TRTCudaGraphState @@ -89,14 +125,67 @@ def __init__(self, capacity: int): self._state_lock = threading.RLock() def get_current_size(self) -> int: - return len(self._cache) + """Return the number of CUDA graphs currently stored in the cache. + + Returns: + Number of cached entries. + + Examples: + >>> cache = TRTCudaGraphCache(capacity=16) + >>> cache.get_current_size() + 0 + """ + with self._state_lock: + return len(self._cache) def list_keys(self) -> List[Tuple[Tuple[int, ...], torch.dtype, torch.device]]: - return list(self._cache.keys()) + """Return a list of all keys currently in the cache. + + Each key is a ``(shape, dtype, device)`` tuple representing a cached + CUDA graph. Keys are returned in insertion order (oldest first), which + reflects eviction priority. + + Returns: + List of ``(shape, dtype, device)`` tuples for all cached entries. + + Examples: + >>> cache = TRTCudaGraphCache(capacity=16) + >>> # ... after some forward passes ... + >>> for shape, dtype, device in cache.list_keys(): + ... print(f"Cached: shape={shape}, dtype={dtype}") + """ + with self._state_lock: + return list(self._cache.keys()) def safe_remove( self, key: Tuple[Tuple[int, ...], torch.dtype, torch.device] ) -> None: + """Remove a single entry from the cache by its key. + + If the key exists, the associated CUDA graph, execution context, and + GPU buffers are released and ``torch.cuda.empty_cache()`` is called. + If the key does not exist, this method is a no-op. + + Args: + key: A ``(shape, dtype, device)`` tuple identifying the entry + to remove. 
+ + Examples: + Remove a cached graph for a specific input shape: + + >>> import torch + >>> key = ((1, 3, 384, 384), torch.float16, torch.device("cuda:0")) + >>> cache.safe_remove(key) + + Safe to call with a non-existent key: + + >>> cache.safe_remove(((99, 99), torch.float32, torch.device("cuda:0"))) + >>> # no error raised + + See Also: + - ``purge()``: Remove multiple entries at once with batched + GPU memory cleanup + """ with self._state_lock: if key not in self._cache: return None @@ -105,6 +194,40 @@ def safe_remove( return None def purge(self, n_oldest: Optional[int] = None) -> None: + """Remove entries from the cache, starting with the least recently used. + + When called without arguments, clears the entire cache. When + ``n_oldest`` is specified, only that many entries are evicted + (or all entries if the cache contains fewer). + + GPU memory cleanup (``torch.cuda.empty_cache()``) is called once + after all evictions, making this more efficient than calling + ``safe_remove()`` in a loop. + + Args: + n_oldest: Number of least recently used entries to evict. + When ``None`` (default), all entries are removed. 
+ + Examples: + Evict the 4 oldest entries: + + >>> cache.purge(n_oldest=4) + + Clear the entire cache: + + >>> cache.purge() + >>> cache.get_current_size() + 0 + + Note: + - Eviction order follows LRU policy — entries that haven't been + accessed recently are removed first + - Each evicted entry's CUDA graph, execution context, and GPU + buffers are released + + See Also: + - ``safe_remove()``: Remove a single entry by key + """ with self._state_lock: if n_oldest is None: n_oldest = len(self._cache) @@ -117,7 +240,8 @@ def purge(self, n_oldest: Optional[int] = None) -> None: def __contains__( self, key: Tuple[Tuple[int, ...], torch.dtype, torch.device] ) -> bool: - return key in self._cache + with self._state_lock: + return key in self._cache def __getitem__( self, key: Tuple[Tuple[int, ...], torch.dtype, torch.device] @@ -152,6 +276,83 @@ def establish_trt_cuda_graph_cache( default_cuda_graph_cache_size: int, cuda_graph_cache: Optional[TRTCudaGraphCache] = None, ) -> Optional[TRTCudaGraphCache]: + """Establish a CUDA graph cache for TensorRT inference acceleration. + + Resolves which CUDA graph cache to use for a TRT model. If the caller + provides a cache instance, it is returned as-is. Otherwise, the function + checks the ``ENABLE_AUTO_CUDA_GRAPHS_FOR_TRT_BACKEND`` environment variable + to decide whether to create a new cache automatically. When the environment + variable is disabled (the default), no cache is created and CUDA graphs + are not used. + + This function is typically called inside ``from_pretrained()`` of TRT model + classes. End users who want explicit control should create a + ``TRTCudaGraphCache`` themselves and pass it to ``AutoModel.from_pretrained``. + + Args: + default_cuda_graph_cache_size: Maximum number of CUDA graphs to cache + when a new cache is created automatically. Each entry holds a + dedicated TensorRT execution context and GPU memory buffers, so + higher values increase VRAM usage. 
+ + cuda_graph_cache: Optional pre-existing cache instance. When provided, + it is returned directly and the environment variable is ignored. + This allows callers to share a single cache across multiple models + or to configure capacity explicitly. + + Returns: + A ``TRTCudaGraphCache`` instance if CUDA graphs should be used, or + ``None`` if they are disabled. When ``None`` is returned, the model + falls back to standard TensorRT execution without graph capture. + + Examples: + Automatic cache creation via environment variable: + + >>> import os + >>> os.environ["ENABLE_AUTO_CUDA_GRAPHS_FOR_TRT_BACKEND"] = "True" + >>> + >>> from inference_models.models.common.trt import ( + ... establish_trt_cuda_graph_cache, + ... ) + >>> + >>> cache = establish_trt_cuda_graph_cache(default_cuda_graph_cache_size=8) + >>> print(type(cache)) # + + Caller-provided cache takes priority: + + >>> from inference_models.models.common.trt import ( + ... TRTCudaGraphCache, + ... establish_trt_cuda_graph_cache, + ... ) + >>> + >>> my_cache = TRTCudaGraphCache(capacity=32) + >>> result = establish_trt_cuda_graph_cache( + ... default_cuda_graph_cache_size=8, + ... cuda_graph_cache=my_cache, + ... ) + >>> assert result is my_cache # returned as-is + + Typical usage inside a model's from_pretrained: + + >>> cache = establish_trt_cuda_graph_cache( + ... default_cuda_graph_cache_size=8, + ... cuda_graph_cache=None, # let env var decide + ... 
) + >>> # cache is None when env var is disabled (default) + + Note: + - The environment variable ``ENABLE_AUTO_CUDA_GRAPHS_FOR_TRT_BACKEND`` + defaults to ``False`` + - When a caller-provided cache is given, the environment variable + is not checked + - CUDA graphs require TensorRT and a CUDA-capable GPU + - Each cached graph consumes VRAM proportional to the model's + execution context size + + See Also: + - ``TRTCudaGraphCache``: The LRU cache class for CUDA graph state + - ``infer_from_trt_engine()``: Uses the cache during TRT inference + """ if cuda_graph_cache is not None: return cuda_graph_cache auto_cuda_graphs_enabled = get_boolean_from_env( diff --git a/inference_models/mkdocs.yml b/inference_models/mkdocs.yml index 56b32f565c..983cc25323 100644 --- a/inference_models/mkdocs.yml +++ b/inference_models/mkdocs.yml @@ -148,6 +148,8 @@ nav: - get_trt_engine_inputs_and_outputs: api-reference/developer-tools/trt/get-trt-engine-inputs-and-outputs.md - infer_from_trt_engine: api-reference/developer-tools/trt/infer-from-trt-engine.md - load_trt_model: api-reference/developer-tools/trt/load-trt-model.md + - establish_trt_cuda_graph_cache: api-reference/developer-tools/trt/establish-trt-cuda-graph-cache.md + - TRTCudaGraphCache: api-reference/developer-tools/trt/trt-cuda-graph-cache.md - Entities: - RuntimeXRayResult: api-reference/developer-tools/runtime-xray-result.md - ModelMetadata: api-reference/developer-tools/model-metadata.md diff --git a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py index 3c3853987f..1648beac82 100644 --- a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py @@ -558,7 +558,7 @@ def test_trt_cudagraph_cache_reuses_previously_seen_input_shapes( assert 
cache_size_after == cache_size_before assert torch.allclose(capture_outputs[cache_key], output, atol=1e-6) - assert set(trt_cuda_graph_cache.cache.keys()) == seen_shapes + assert set(trt_cuda_graph_cache.list_keys()) == seen_shapes @pytest.mark.slow From f9aeec80a496d2c746b0c32de162fc48a8c29cb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20P=C4=99czek?= Date: Wed, 18 Mar 2026 19:26:40 +0100 Subject: [PATCH 45/50] Fix GH workflow --- .../workflows/integration_tests_inference_experimental_gpu.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/integration_tests_inference_experimental_gpu.yml b/.github/workflows/integration_tests_inference_experimental_gpu.yml index 60fbde84a2..328b583061 100644 --- a/.github/workflows/integration_tests_inference_experimental_gpu.yml +++ b/.github/workflows/integration_tests_inference_experimental_gpu.yml @@ -15,7 +15,6 @@ on: - '' - onnx_extras - trt_extras - - trt_extras_with_cuda_graphs - torch_models - hf_vlm_models python_version: From 3e6dd5cb14b039b128a676db34e5d03843e86ce2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20P=C4=99czek?= Date: Wed, 18 Mar 2026 19:47:33 +0100 Subject: [PATCH 46/50] Enforce replay after cuda graph is recorded to get actual results --- inference_models/inference_models/models/common/trt.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/inference_models/inference_models/models/common/trt.py b/inference_models/inference_models/models/common/trt.py index 982f9644e5..3f70639923 100644 --- a/inference_models/inference_models/models/common/trt.py +++ b/inference_models/inference_models/models/common/trt.py @@ -811,6 +811,12 @@ def _capture_cuda_graph( results = [buf.clone() for buf in output_buffers] stream.synchronize() + # in order to avoid drift of results - it's better to replay to get the results + with torch.cuda.stream(stream): + cuda_graph.replay() + results = [buf.clone() for buf in output_buffers] + stream.synchronize() + trt_cuda_graph_state = 
TRTCudaGraphState( cuda_graph=cuda_graph, cuda_stream=stream, From ba4f5f246e9683a1031c40b5bae01077ada151f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20P=C4=99czek?= Date: Wed, 18 Mar 2026 19:57:42 +0100 Subject: [PATCH 47/50] Alter YOLONAS tests to ensure repeatable predictions with warmup --- .../models/test_yolonas_predictions_trt.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/inference_models/tests/integration_tests/models/test_yolonas_predictions_trt.py b/inference_models/tests/integration_tests/models/test_yolonas_predictions_trt.py index ce7a2c2a2e..3ee3ab4535 100644 --- a/inference_models/tests/integration_tests/models/test_yolonas_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_yolonas_predictions_trt.py @@ -20,6 +20,9 @@ def test_trt_package_numpy( ) # when + # warmup + for _ in range(5): + _ = model(coins_counting_image_numpy) predictions = model(coins_counting_image_numpy) # then @@ -88,6 +91,9 @@ def test_trt_package_batch_numpy( ) # when + # warmup + for _ in range(5): + _ = model([coins_counting_image_numpy, coins_counting_image_numpy]) predictions = model([coins_counting_image_numpy, coins_counting_image_numpy]) # then @@ -202,6 +208,9 @@ def test_trt_package_torch( ) # when + # warmup + for _ in range(5): + _ = model(coins_counting_image_torch) predictions = model(coins_counting_image_torch) # then @@ -270,6 +279,9 @@ def test_trt_package_torch_multiple_predictions_in_row( ) # when + # warmup + for _ in range(5): + _ = model(coins_counting_image_torch) for _ in range(8): predictions = model(coins_counting_image_torch) @@ -339,6 +351,9 @@ def test_trt_package_torch_list( ) # when + # warmup + for _ in range(5): + _ = model([coins_counting_image_torch, coins_counting_image_torch]) predictions = model([coins_counting_image_torch, coins_counting_image_torch]) # then @@ -453,6 +468,9 @@ def test_trt_package_torch_batch( ) # when + # warmup + for _ in range(5): + _ = 
model(torch.stack([coins_counting_image_torch, coins_counting_image_torch], dim=0)) predictions = model( torch.stack([coins_counting_image_torch, coins_counting_image_torch], dim=0) ) From d10ecfb73b31f6a2df933c9cd2e34ca6e6d6d245 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20P=C4=99czek?= Date: Wed, 18 Mar 2026 20:37:21 +0100 Subject: [PATCH 48/50] Fix imports in docscrings --- inference_models/inference_models/models/common/trt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inference_models/inference_models/models/common/trt.py b/inference_models/inference_models/models/common/trt.py index 3f70639923..4a07f4c144 100644 --- a/inference_models/inference_models/models/common/trt.py +++ b/inference_models/inference_models/models/common/trt.py @@ -99,7 +99,7 @@ class TRTCudaGraphCache: Examples: Create a cache and pass it to a model: - >>> from inference_models.models.common.trt import TRTCudaGraphCache + >>> from inference_models.developer_tools import TRTCudaGraphCache >>> from inference_models import AutoModel >>> import torch >>> @@ -311,7 +311,7 @@ def establish_trt_cuda_graph_cache( >>> import os >>> os.environ["ENABLE_AUTO_CUDA_GRAPHS_FOR_TRT_BACKEND"] = "True" >>> - >>> from inference_models.models.common.trt import ( + >>> from inference_models.developer_tools import ( ... establish_trt_cuda_graph_cache, ... 
) >>> From 4e89392d0dad4d5d21e587755d8d8f7b07c8fa71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20P=C4=99czek?= Date: Wed, 18 Mar 2026 20:48:15 +0100 Subject: [PATCH 49/50] Bump version --- inference_models/pyproject.toml | 2 +- inference_models/uv.lock | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/inference_models/pyproject.toml b/inference_models/pyproject.toml index e90907440c..b601421715 100644 --- a/inference_models/pyproject.toml +++ b/inference_models/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "inference-models" -version = "0.21.0rc1" +version = "0.21.0" description = "The new inference engine for Computer Vision models" readme = "README.md" requires-python = ">=3.10,<3.13" diff --git a/inference_models/uv.lock b/inference_models/uv.lock index 3470775db6..4708931782 100644 --- a/inference_models/uv.lock +++ b/inference_models/uv.lock @@ -916,7 +916,7 @@ wheels = [ [[package]] name = "inference-models" -version = "0.21.0rc1" +version = "0.21.0" source = { virtual = "." } dependencies = [ { name = "accelerate" }, From 1d4e9614fa166692b026e640eed177a2854ab0fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20P=C4=99czek?= Date: Wed, 18 Mar 2026 20:49:54 +0100 Subject: [PATCH 50/50] Bump version of inference-models in inference requirements --- requirements/_requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/_requirements.txt b/requirements/_requirements.txt index 6c47cf8365..18df5cb1cf 100644 --- a/requirements/_requirements.txt +++ b/requirements/_requirements.txt @@ -50,4 +50,4 @@ filelock>=3.12.0,<=3.17.0 onvif-zeep-async==2.0.0 # versions > 2.0.0 will not work with Python 3.9 despite docs simple-pid~=2.0.1 qrcode~=8.0.0 -inference-models~=0.20.2 +inference-models~=0.21.0