From 6f6c44e07bd0eecc401ce14eccf64234681fd440 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Fri, 23 Jan 2026 01:14:41 +0000 Subject: [PATCH 01/50] pass TRT graph state up and down call stack and cache it in RFDetrObjDetTRT class --- .../inference_models/models/common/trt.py | 9 +++++ .../rfdetr/rfdetr_object_detection_trt.py | 35 +++++++++++++------ 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/inference_models/inference_models/models/common/trt.py b/inference_models/inference_models/models/common/trt.py index d692f567a9..3b86dfef86 100644 --- a/inference_models/inference_models/models/common/trt.py +++ b/inference_models/inference_models/models/common/trt.py @@ -1,4 +1,5 @@ from typing import List, Tuple +from dataclasses import dataclass import torch @@ -57,6 +58,14 @@ def log(self, severity: trt.ILogger.Severity, msg: str) -> None: def get_memory(self) -> List[Tuple[trt.ILogger.Severity, str]]: return self._memory +import pycuda.driver as cuda +@dataclass +class TRTCudaGraphState: + cuda_graph: cuda.GraphExec + cuda_stream: torch.cuda.Stream + input_pointer: int + output_pointers: List[int] + def get_trt_engine_inputs_and_outputs( engine: trt.ICudaEngine, diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py index b3833fc4f8..a250bed25f 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py @@ -36,6 +36,7 @@ get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, + TRTCudaGraphState, ) from inference_models.models.rfdetr.class_remapping import ( ClassesReMapping, @@ -197,19 +198,33 @@ def pre_process( ) def forward( - self, pre_processed_images: torch.Tensor, **kwargs + self, pre_processed_images: torch.Tensor, use_cuda_graph: bool = False, **kwargs ) -> Tuple[torch.Tensor, 
torch.Tensor]: with self._lock: with use_cuda_context(context=self._cuda_context): - detections, labels = infer_from_trt_engine( - pre_processed_images=pre_processed_images, - trt_config=self._trt_config, - engine=self._engine, - context=self._execution_context, - device=self._device, - input_name=self._input_name, - outputs=self._output_names, - ) + if use_cuda_graph: + detections, labels, trt_cuda_graph_state = infer_from_trt_engine( + pre_processed_images=pre_processed_images, + trt_config=self._trt_config, + engine=self._engine, + context=self._execution_context, + device=self._device, + input_name=self._input_name, + outputs=self._output_names, + use_cuda_graph=True, + ) + self._trt_cuda_graph_state = trt_cuda_graph_state + else: + detections, labels = infer_from_trt_engine( + pre_processed_images=pre_processed_images, + trt_config=self._trt_config, + engine=self._engine, + context=self._execution_context, + device=self._device, + input_name=self._input_name, + outputs=self._output_names, + use_cuda_graph=False, + ) return detections, labels def post_process( From 549ca10bc2789424de5671bd0b2e0f24a04e1aa1 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Fri, 23 Jan 2026 01:34:53 +0000 Subject: [PATCH 02/50] actually passing it up and down the stack --- .../inference_models/models/common/trt.py | 118 ++++++++++++++++-- .../rfdetr/rfdetr_object_detection_trt.py | 4 +- 2 files changed, 112 insertions(+), 10 deletions(-) diff --git a/inference_models/inference_models/models/common/trt.py b/inference_models/inference_models/models/common/trt.py index 3b86dfef86..fead8056af 100644 --- a/inference_models/inference_models/models/common/trt.py +++ b/inference_models/inference_models/models/common/trt.py @@ -1,4 +1,4 @@ -from typing import List, Tuple +from typing import List, Tuple, Optional from dataclasses import dataclass import torch @@ -64,8 +64,12 @@ class TRTCudaGraphState: cuda_graph: cuda.GraphExec cuda_stream: torch.cuda.Stream input_pointer: int + 
input_shape: Tuple[int, ...] output_pointers: List[int] + output_shapes: List[Tuple[int, ...]] + def has_changed_shape(self, input_shape: Tuple[int, ...], output_shapes: List[Tuple[int, ...]]) -> bool: + return self.input_shape != input_shape or self.output_shapes != output_shapes def get_trt_engine_inputs_and_outputs( engine: trt.ICudaEngine, @@ -137,7 +141,9 @@ def infer_from_trt_engine( device: torch.device, input_name: str, outputs: List[str], -) -> List[torch.Tensor]: + use_cuda_graph: bool = False, + trt_cuda_graph_state: Optional[TRTCudaGraphState] = None, +) -> Tuple[List[torch.Tensor], List[torch.Tensor], TRTCudaGraphState]: """Run inference using a TensorRT engine. Executes inference on preprocessed images using a TensorRT engine and execution @@ -239,6 +245,8 @@ def infer_from_trt_engine( outputs=outputs, min_batch_size=trt_config.static_batch_size, max_batch_size=trt_config.static_batch_size, + use_cuda_graph=use_cuda_graph, + trt_cuda_graph_state=trt_cuda_graph_state, ) return infer_from_trt_engine_with_batch_size_boundaries( pre_processed_images=pre_processed_images, @@ -249,6 +257,8 @@ def infer_from_trt_engine( outputs=outputs, min_batch_size=trt_config.dynamic_batch_size_min, max_batch_size=trt_config.dynamic_batch_size_max, + use_cuda_graph=use_cuda_graph, + trt_cuda_graph_state=trt_cuda_graph_state, ) @@ -261,7 +271,9 @@ def infer_from_trt_engine_with_batch_size_boundaries( outputs: List[str], min_batch_size: int, max_batch_size: int, -) -> List[torch.Tensor]: + use_cuda_graph: bool = False, + trt_cuda_graph_state: Optional[TRTCudaGraphState] = None, +) -> Tuple[List[torch.Tensor], TRTCudaGraphState]: if pre_processed_images.shape[0] <= max_batch_size: reminder = min_batch_size - pre_processed_images.shape[0] if reminder > 0: @@ -276,17 +288,19 @@ def infer_from_trt_engine_with_batch_size_boundaries( ), dim=0, ) - results = execute_trt_engine( + results, trt_cuda_graph_state = execute_trt_engine( pre_processed_images=pre_processed_images, 
engine=engine, context=context, device=device, input_name=input_name, outputs=outputs, + use_cuda_graph=use_cuda_graph, + trt_cuda_graph_state=trt_cuda_graph_state, ) if reminder > 0: results = [r[:-reminder] for r in results] - return results + return results, trt_cuda_graph_state all_results = [] for _ in outputs: all_results.append([]) @@ -305,19 +319,21 @@ def infer_from_trt_engine_with_batch_size_boundaries( ), dim=0, ) - results = execute_trt_engine( + results, trt_cuda_graph_state = execute_trt_engine( pre_processed_images=batch, engine=engine, context=context, device=device, input_name=input_name, outputs=outputs, + use_cuda_graph=use_cuda_graph, + trt_cuda_graph_state=trt_cuda_graph_state, ) if reminder > 0: results = [r[:-reminder] for r in results] for partial_result, all_result_element in zip(results, all_results): all_result_element.append(partial_result) - return [torch.cat(e, dim=0).contiguous() for e in all_results] + return [torch.cat(e, dim=0).contiguous() for e in all_results], trt_cuda_graph_state def execute_trt_engine( @@ -327,7 +343,91 @@ def execute_trt_engine( device: torch.device, input_name: str, outputs: List[str], -) -> List[torch.Tensor]: + use_cuda_graph: bool = False, + trt_cuda_graph_state: Optional[TRTCudaGraphState] = None, +) -> Tuple[List[torch.Tensor], Optional[TRTCudaGraphState]]: + + if use_cuda_graph: + + batch_size = pre_processed_images.shape[0] + results = [] + for output in outputs: + output_tensor_shape = engine.get_tensor_shape(output) + output_tensor_type = trt_dtype_to_torch(engine.get_tensor_dtype(output)) + result = torch.empty( + (batch_size,) + output_tensor_shape[1:], + dtype=output_tensor_type, + device=device, + ) + context.set_tensor_address(output, result.data_ptr()) + results.append(result) + status = context.set_input_shape(input_name, tuple(pre_processed_images.shape)) + if not status: + raise ModelRuntimeError( + message="Failed to set TRT model input shape during forward pass from the model.", + 
help_url="https://todo", + ) + status = context.set_tensor_address(input_name, pre_processed_images.data_ptr()) + if not status: + raise ModelRuntimeError( + message="Failed to set input tensor data pointer during forward pass from the model.", + help_url="https://todo", + ) + stream = torch.cuda.Stream(device=device) + status = context.execute_async_v3(stream_handle=stream.cuda_stream) + if not status: + raise ModelRuntimeError( + message="Failed to complete inference from TRT model", + help_url="https://todo", + ) + stream.synchronize() + return results, None + + else: + + batch_size = pre_processed_images.shape[0] + results = [] + for output in outputs: + output_tensor_shape = engine.get_tensor_shape(output) + output_tensor_type = trt_dtype_to_torch(engine.get_tensor_dtype(output)) + result = torch.empty( + (batch_size,) + output_tensor_shape[1:], + dtype=output_tensor_type, + device=device, + ) + context.set_tensor_address(output, result.data_ptr()) + results.append(result) + status = context.set_input_shape(input_name, tuple(pre_processed_images.shape)) + if not status: + raise ModelRuntimeError( + message="Failed to set TRT model input shape during forward pass from the model.", + help_url="https://todo", + ) + status = context.set_tensor_address(input_name, pre_processed_images.data_ptr()) + if not status: + raise ModelRuntimeError( + message="Failed to set input tensor data pointer during forward pass from the model.", + help_url="https://todo", + ) + stream = torch.cuda.Stream(device=device) + status = context.execute_async_v3(stream_handle=stream.cuda_stream) + if not status: + raise ModelRuntimeError( + message="Failed to complete inference from TRT model", + help_url="https://todo", + ) + stream.synchronize() + return results, None + + +def execute_trt_engine_with_cuda_graph( + pre_processed_images: torch.Tensor, + engine: trt.ICudaEngine, + context: trt.IExecutionContext, + device: torch.device, + input_name: str, + outputs: List[str], +) -> 
Tuple[List[torch.Tensor], TRTCudaGraphState]: batch_size = pre_processed_images.shape[0] results = [] for output in outputs: @@ -360,7 +460,7 @@ def execute_trt_engine( help_url="https://todo", ) stream.synchronize() - return results + return results, None def trt_dtype_to_torch(trt_dtype): diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py index a250bed25f..469ea0fd3c 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py @@ -177,6 +177,7 @@ def __init__( self._cuda_context = cuda_context self._execution_context = execution_context self._trt_config = trt_config + self._trt_cuda_graph_state = None self._lock = threading.Lock() @property @@ -212,10 +213,11 @@ def forward( input_name=self._input_name, outputs=self._output_names, use_cuda_graph=True, + trt_cuda_graph_state=self._trt_cuda_graph_state, ) self._trt_cuda_graph_state = trt_cuda_graph_state else: - detections, labels = infer_from_trt_engine( + detections, labels, _ = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, From 6412efef3c55b22b743498cce77da494a63d464d Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Fri, 23 Jan 2026 02:14:29 +0000 Subject: [PATCH 03/50] three-branch solution --- .../profile_rfdetr_trt_cudagraphs.py | 70 +++++++++ .../inference_models/models/common/trt.py | 135 ++++++++++-------- .../rfdetr/rfdetr_object_detection_trt.py | 4 +- 3 files changed, 150 insertions(+), 59 deletions(-) create mode 100644 inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py diff --git a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py new file mode 100644 index 0000000000..89fce9a0da 
--- /dev/null +++ b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py @@ -0,0 +1,70 @@ +import os +import time + +import cv2 +import torch +from tqdm import tqdm + +from inference_models import AutoModel + +IMAGE_PATH_WARMUP = "/home/mkaic/inference/tests/inference/unit_tests/core/utils/assets/1.jpg" +# IMAGE_PATH_PROFILING = IMAGE_PATH_WARMUP +IMAGE_PATH_PROFILING = "/home/mkaic/inference/tests/workflows/integration_tests/execution/assets/car.jpg" +DEVICE = os.environ.get("DEVICE", "cuda:0") +CYCLES = 500 +WARMUP = 50 + + +def main() -> None: + image = cv2.imread(IMAGE_PATH_WARMUP) + model = AutoModel.from_pretrained( + model_id_or_path="rfdetr-nano", device=torch.device(DEVICE), backend="trt" + ) + + image_warmup = cv2.imread(IMAGE_PATH_WARMUP) + pre_processed_warmup, metadata = model.pre_process(image_warmup) + print(f"Pre-processed image shape: {pre_processed_warmup.shape}") + + print(f"Warming up ({WARMUP} iterations each)...") + for _ in range(WARMUP): + model.forward(pre_processed_warmup, use_cuda_graph=False) + model.forward(pre_processed_warmup, use_cuda_graph=True) + # torch.cuda.synchronize() + + print(f"Profiling ({CYCLES} iterations each)...") + image_profiling = cv2.imread(IMAGE_PATH_PROFILING) + pre_processed_profiling, metadata = model.pre_process(image_profiling) + print(f"Pre-processed image shape: {pre_processed_profiling.shape}") + + start = time.perf_counter() + for _ in tqdm(range(CYCLES), desc="Without CUDA graphs"): + model.forward(pre_processed_profiling, use_cuda_graph=False) + # torch.cuda.synchronize() + baseline_fps = CYCLES / (time.perf_counter() - start) + + start = time.perf_counter() + for _ in tqdm(range(CYCLES), desc="With CUDA graphs"): + model.forward(pre_processed_profiling, use_cuda_graph=True) + # torch.cuda.synchronize() + cudagraph_fps = CYCLES / (time.perf_counter() - start) + + result_baseline = model.forward(pre_processed_profiling, use_cuda_graph=False) + result_cudagraph = 
model.forward(pre_processed_profiling, use_cuda_graph=True) + # torch.cuda.synchronize() + + print(f"Result baseline: {result_baseline}") + print(f"Result cudagraph: {result_cudagraph}") + + dets_match = torch.allclose(result_baseline[0], result_cudagraph[0], atol=1e-4) + labels_match = torch.allclose(result_baseline[1], result_cudagraph[1], atol=1e-4) + + print(f"\n{'='*50}") + print(f"Forward pass FPS (no CUDA graphs): {baseline_fps:.1f}") + print(f"Forward pass FPS (CUDA graphs): {cudagraph_fps:.1f}") + print(f"Speedup: {cudagraph_fps / baseline_fps:.2f}x") + print(f"Outputs match: dets={dets_match}, labels={labels_match}") + print(f"{'='*50}") + + +if __name__ == "__main__": + main() diff --git a/inference_models/inference_models/models/common/trt.py b/inference_models/inference_models/models/common/trt.py index fead8056af..96924da147 100644 --- a/inference_models/inference_models/models/common/trt.py +++ b/inference_models/inference_models/models/common/trt.py @@ -58,18 +58,17 @@ def log(self, severity: trt.ILogger.Severity, msg: str) -> None: def get_memory(self) -> List[Tuple[trt.ILogger.Severity, str]]: return self._memory -import pycuda.driver as cuda + @dataclass class TRTCudaGraphState: - cuda_graph: cuda.GraphExec + cuda_graph: torch.cuda.CUDAGraph cuda_stream: torch.cuda.Stream - input_pointer: int - input_shape: Tuple[int, ...] 
- output_pointers: List[int] - output_shapes: List[Tuple[int, ...]] + input_buffer: torch.Tensor + output_buffers: List[torch.Tensor] + + def has_changed_shape(self, input_shape: Tuple[int, ...]) -> bool: + return tuple(self.input_buffer.shape) != input_shape - def has_changed_shape(self, input_shape: Tuple[int, ...], output_shapes: List[Tuple[int, ...]]) -> bool: - return self.input_shape != input_shape or self.output_shapes != output_shapes def get_trt_engine_inputs_and_outputs( engine: trt.ICudaEngine, @@ -143,7 +142,7 @@ def infer_from_trt_engine( outputs: List[str], use_cuda_graph: bool = False, trt_cuda_graph_state: Optional[TRTCudaGraphState] = None, -) -> Tuple[List[torch.Tensor], List[torch.Tensor], TRTCudaGraphState]: +) -> Tuple[List[torch.Tensor], TRTCudaGraphState]: """Run inference using a TensorRT engine. Executes inference on preprocessed images using a TensorRT engine and execution @@ -347,44 +346,40 @@ def execute_trt_engine( trt_cuda_graph_state: Optional[TRTCudaGraphState] = None, ) -> Tuple[List[torch.Tensor], Optional[TRTCudaGraphState]]: - if use_cuda_graph: - - batch_size = pre_processed_images.shape[0] - results = [] - for output in outputs: - output_tensor_shape = engine.get_tensor_shape(output) - output_tensor_type = trt_dtype_to_torch(engine.get_tensor_dtype(output)) - result = torch.empty( - (batch_size,) + output_tensor_shape[1:], - dtype=output_tensor_type, - device=device, - ) - context.set_tensor_address(output, result.data_ptr()) - results.append(result) - status = context.set_input_shape(input_name, tuple(pre_processed_images.shape)) - if not status: - raise ModelRuntimeError( - message="Failed to set TRT model input shape during forward pass from the model.", - help_url="https://todo", + if trt_cuda_graph_state is not None: + input_shape = tuple(pre_processed_images.shape) + if trt_cuda_graph_state.has_changed_shape(input_shape): + LOGGER.warning( + f"Input shape changed from {tuple(trt_cuda_graph_state.input_buffer.shape)} " + 
f"to {input_shape}. Recapturing CUDA graph." ) - status = context.set_tensor_address(input_name, pre_processed_images.data_ptr()) - if not status: - raise ModelRuntimeError( - message="Failed to set input tensor data pointer during forward pass from the model.", - help_url="https://todo", - ) - stream = torch.cuda.Stream(device=device) - status = context.execute_async_v3(stream_handle=stream.cuda_stream) - if not status: - raise ModelRuntimeError( - message="Failed to complete inference from TRT model", - help_url="https://todo", + return _capture_cuda_graph( + pre_processed_images=pre_processed_images, + engine=engine, + context=context, + device=device, + input_name=input_name, + outputs=outputs, ) + + stream = trt_cuda_graph_state.cuda_stream + trt_cuda_graph_state.input_buffer.copy_(pre_processed_images) + trt_cuda_graph_state.cuda_graph.replay() stream.synchronize() - return results, None + results = [buf.clone() for buf in trt_cuda_graph_state.output_buffers] + return results, trt_cuda_graph_state - else: + elif use_cuda_graph: + return _capture_cuda_graph( + pre_processed_images=pre_processed_images, + engine=engine, + context=context, + device=device, + input_name=input_name, + outputs=outputs, + ) + else: batch_size = pre_processed_images.shape[0] results = [] for output in outputs: @@ -420,7 +415,7 @@ def execute_trt_engine( return results, None -def execute_trt_engine_with_cuda_graph( +def _capture_cuda_graph( pre_processed_images: torch.Tensor, engine: trt.ICudaEngine, context: trt.IExecutionContext, @@ -429,38 +424,64 @@ def execute_trt_engine_with_cuda_graph( outputs: List[str], ) -> Tuple[List[torch.Tensor], TRTCudaGraphState]: batch_size = pre_processed_images.shape[0] - results = [] + + input_buffer = torch.empty_like(pre_processed_images, device=device) + input_buffer.copy_(pre_processed_images) + + output_buffers = [] for output in outputs: output_tensor_shape = engine.get_tensor_shape(output) output_tensor_type = 
trt_dtype_to_torch(engine.get_tensor_dtype(output)) - result = torch.empty( + output_buffer = torch.empty( (batch_size,) + output_tensor_shape[1:], dtype=output_tensor_type, device=device, ) - context.set_tensor_address(output, result.data_ptr()) - results.append(result) + context.set_tensor_address(output, output_buffer.data_ptr()) + output_buffers.append(output_buffer) + status = context.set_input_shape(input_name, tuple(pre_processed_images.shape)) if not status: raise ModelRuntimeError( - message="Failed to set TRT model input shape during forward pass from the model.", + message="Failed to set TRT model input shape during CUDA graph capture.", help_url="https://todo", ) - status = context.set_tensor_address(input_name, pre_processed_images.data_ptr()) + status = context.set_tensor_address(input_name, input_buffer.data_ptr()) if not status: raise ModelRuntimeError( - message="Failed to set input tensor data pointer during forward pass from the model.", + message="Failed to set input tensor data pointer during CUDA graph capture.", help_url="https://todo", ) + stream = torch.cuda.Stream(device=device) - status = context.execute_async_v3(stream_handle=stream.cuda_stream) - if not status: - raise ModelRuntimeError( - message="Failed to complete inference from TRT model", - help_url="https://todo", - ) + with torch.cuda.stream(stream): + status = context.execute_async_v3(stream_handle=stream.cuda_stream) + if not status: + raise ModelRuntimeError( + message="Failed to execute TRT model warmup before CUDA graph capture.", + help_url="https://todo", + ) stream.synchronize() - return results, None + + cuda_graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(cuda_graph, stream=stream): + status = context.execute_async_v3(stream_handle=stream.cuda_stream) + if not status: + raise ModelRuntimeError( + message="Failed to capture CUDA graph from TRT model execution.", + help_url="https://todo", + ) + + + trt_cuda_graph_state = TRTCudaGraphState( + cuda_graph=cuda_graph, 
+ cuda_stream=stream, + input_buffer=input_buffer, + output_buffers=output_buffers, + ) + + results = [buf.clone() for buf in output_buffers] + return results, trt_cuda_graph_state def trt_dtype_to_torch(trt_dtype): diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py index 469ea0fd3c..ec5dc4cd21 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py @@ -204,7 +204,7 @@ def forward( with self._lock: with use_cuda_context(context=self._cuda_context): if use_cuda_graph: - detections, labels, trt_cuda_graph_state = infer_from_trt_engine( + (detections, labels), trt_cuda_graph_state = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -217,7 +217,7 @@ def forward( ) self._trt_cuda_graph_state = trt_cuda_graph_state else: - detections, labels, _ = infer_from_trt_engine( + (detections, labels), _ = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, From 08888e396682c7e24e29b007c1355f670c6b2c5b Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Fri, 23 Jan 2026 04:00:34 +0000 Subject: [PATCH 04/50] avoid breaking things due to change in infer_with_trt_engine API --- .../inference_models/models/common/trt.py | 17 ++++++---- .../rfdetr_instance_segmentation_trt.py | 3 +- .../rfdetr/rfdetr_object_detection_trt.py | 33 +++++++------------ .../yolact_instance_segmentation_trt.py | 2 +- .../yolonas/yolonas_object_detection_trt.py | 2 +- .../yolov5_instance_segmentation_trt.py | 2 +- .../yolov7_instance_segmentation_trt.py | 2 +- .../yolov8_instance_segmentation_trt.py | 2 +- 8 files changed, 28 insertions(+), 35 deletions(-) diff --git a/inference_models/inference_models/models/common/trt.py 
b/inference_models/inference_models/models/common/trt.py index 96924da147..19767e89c0 100644 --- a/inference_models/inference_models/models/common/trt.py +++ b/inference_models/inference_models/models/common/trt.py @@ -142,7 +142,7 @@ def infer_from_trt_engine( outputs: List[str], use_cuda_graph: bool = False, trt_cuda_graph_state: Optional[TRTCudaGraphState] = None, -) -> Tuple[List[torch.Tensor], TRTCudaGraphState]: +) -> Tuple[List[torch.Tensor], Optional[TRTCudaGraphState]]: """Run inference using a TensorRT engine. Executes inference on preprocessed images using a TensorRT engine and execution @@ -272,7 +272,7 @@ def infer_from_trt_engine_with_batch_size_boundaries( max_batch_size: int, use_cuda_graph: bool = False, trt_cuda_graph_state: Optional[TRTCudaGraphState] = None, -) -> Tuple[List[torch.Tensor], TRTCudaGraphState]: +) -> Tuple[List[torch.Tensor], Optional[TRTCudaGraphState]]: if pre_processed_images.shape[0] <= max_batch_size: reminder = min_batch_size - pre_processed_images.shape[0] if reminder > 0: @@ -363,10 +363,12 @@ def execute_trt_engine( ) stream = trt_cuda_graph_state.cuda_stream - trt_cuda_graph_state.input_buffer.copy_(pre_processed_images) - trt_cuda_graph_state.cuda_graph.replay() + with torch.cuda.stream(stream): + trt_cuda_graph_state.input_buffer.copy_(pre_processed_images) + trt_cuda_graph_state.cuda_graph.replay() + results = [buf.clone() for buf in trt_cuda_graph_state.output_buffers] stream.synchronize() - results = [buf.clone() for buf in trt_cuda_graph_state.output_buffers] + return results, trt_cuda_graph_state elif use_cuda_graph: @@ -471,8 +473,10 @@ def _capture_cuda_graph( message="Failed to capture CUDA graph from TRT model execution.", help_url="https://todo", ) + with torch.cuda.stream(stream): + results = [buf.clone() for buf in output_buffers] + stream.synchronize() - trt_cuda_graph_state = TRTCudaGraphState( cuda_graph=cuda_graph, cuda_stream=stream, @@ -480,7 +484,6 @@ def _capture_cuda_graph( 
output_buffers=output_buffers, ) - results = [buf.clone() for buf in output_buffers] return results, trt_cuda_graph_state diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py index c4e9223023..78e2be9d50 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py @@ -33,6 +33,7 @@ get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, + TRTCudaGraphState, ) from inference_models.models.rfdetr.class_remapping import ( ClassesReMapping, @@ -198,7 +199,7 @@ def forward( ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: with self._lock: with use_cuda_context(context=self._cuda_context): - detections, labels, masks = infer_from_trt_engine( + (detections, labels, masks), _ = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py index ec5dc4cd21..b65d03348d 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py @@ -203,30 +203,19 @@ def forward( ) -> Tuple[torch.Tensor, torch.Tensor]: with self._lock: with use_cuda_context(context=self._cuda_context): + (detections, labels), trt_cuda_graph_state = infer_from_trt_engine( + pre_processed_images=pre_processed_images, + trt_config=self._trt_config, + engine=self._engine, + context=self._execution_context, + device=self._device, + input_name=self._input_name, + outputs=self._output_names, + use_cuda_graph=use_cuda_graph, + trt_cuda_graph_state=self._trt_cuda_graph_state if use_cuda_graph else None, + ) 
if use_cuda_graph: - (detections, labels), trt_cuda_graph_state = infer_from_trt_engine( - pre_processed_images=pre_processed_images, - trt_config=self._trt_config, - engine=self._engine, - context=self._execution_context, - device=self._device, - input_name=self._input_name, - outputs=self._output_names, - use_cuda_graph=True, - trt_cuda_graph_state=self._trt_cuda_graph_state, - ) self._trt_cuda_graph_state = trt_cuda_graph_state - else: - (detections, labels), _ = infer_from_trt_engine( - pre_processed_images=pre_processed_images, - trt_config=self._trt_config, - engine=self._engine, - context=self._execution_context, - device=self._device, - input_name=self._input_name, - outputs=self._output_names, - use_cuda_graph=False, - ) return detections, labels def post_process( diff --git a/inference_models/inference_models/models/yolact/yolact_instance_segmentation_trt.py b/inference_models/inference_models/models/yolact/yolact_instance_segmentation_trt.py index cc3cbeedaf..ea6ebe6cf0 100644 --- a/inference_models/inference_models/models/yolact/yolact_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/yolact/yolact_instance_segmentation_trt.py @@ -192,7 +192,7 @@ def forward( all_proto_data, ) = ([], [], [], [], []) for image in pre_processed_images: - loc_data, conf_data, mask_data, prior_data, proto_data = ( + (loc_data, conf_data, mask_data, prior_data, proto_data), _ = ( infer_from_trt_engine( pre_processed_images=image.unsqueeze(0).contiguous(), trt_config=self._trt_config, diff --git a/inference_models/inference_models/models/yolonas/yolonas_object_detection_trt.py b/inference_models/inference_models/models/yolonas/yolonas_object_detection_trt.py index 6f561d58e4..39822ff34b 100644 --- a/inference_models/inference_models/models/yolonas/yolonas_object_detection_trt.py +++ b/inference_models/inference_models/models/yolonas/yolonas_object_detection_trt.py @@ -187,7 +187,7 @@ def pre_process( def forward(self, pre_processed_images: 
torch.Tensor, **kwargs) -> torch.Tensor: with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): - results = infer_from_trt_engine( + results, _ = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, diff --git a/inference_models/inference_models/models/yolov5/yolov5_instance_segmentation_trt.py b/inference_models/inference_models/models/yolov5/yolov5_instance_segmentation_trt.py index ee7180e10d..a18b743b90 100644 --- a/inference_models/inference_models/models/yolov5/yolov5_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/yolov5/yolov5_instance_segmentation_trt.py @@ -187,7 +187,7 @@ def forward( ) -> Tuple[torch.Tensor, torch.Tensor]: with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): - instances, protos = infer_from_trt_engine( + (instances, protos), _ = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, diff --git a/inference_models/inference_models/models/yolov7/yolov7_instance_segmentation_trt.py b/inference_models/inference_models/models/yolov7/yolov7_instance_segmentation_trt.py index 15ef8a13ee..abcc82a78c 100644 --- a/inference_models/inference_models/models/yolov7/yolov7_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/yolov7/yolov7_instance_segmentation_trt.py @@ -185,7 +185,7 @@ def forward( ) -> Tuple[torch.Tensor, torch.Tensor]: with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): - instances, protos = infer_from_trt_engine( + (instances, protos), _ = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, diff --git a/inference_models/inference_models/models/yolov8/yolov8_instance_segmentation_trt.py b/inference_models/inference_models/models/yolov8/yolov8_instance_segmentation_trt.py index 8a32a117ae..f2cf1d7953 100644 --- 
a/inference_models/inference_models/models/yolov8/yolov8_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/yolov8/yolov8_instance_segmentation_trt.py @@ -195,7 +195,7 @@ def forward( ) -> Tuple[torch.Tensor, torch.Tensor]: with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): - instances, protos = infer_from_trt_engine( + (instances, protos), _ = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, From adda4aa825d8d5443dac365357e251b7b6d5776f Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Fri, 23 Jan 2026 04:33:48 +0000 Subject: [PATCH 05/50] update unpacking in the rest of the TRT.py files --- .../profile_rfdetr_trt_cudagraphs.py | 38 +++++++++++++------ .../deep_lab_v3_plus_segmentation_trt.py | 5 ++- .../resnet/resnet_classification_trt.py | 10 +++-- .../models/vit/vit_classification_trt.py | 10 +++-- .../yolov10/yolov10_object_detection_trt.py | 5 ++- .../yolov5/yolov5_object_detection_trt.py | 5 ++- .../yolov8/yolov8_key_points_detection_trt.py | 5 ++- .../yolov8/yolov8_object_detection_trt.py | 5 ++- 8 files changed, 53 insertions(+), 30 deletions(-) diff --git a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py index 89fce9a0da..27218b0c08 100644 --- a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py +++ b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py @@ -11,14 +11,14 @@ # IMAGE_PATH_PROFILING = IMAGE_PATH_WARMUP IMAGE_PATH_PROFILING = "/home/mkaic/inference/tests/workflows/integration_tests/execution/assets/car.jpg" DEVICE = os.environ.get("DEVICE", "cuda:0") -CYCLES = 500 +CYCLES = 10_000 WARMUP = 50 def main() -> None: image = cv2.imread(IMAGE_PATH_WARMUP) model = AutoModel.from_pretrained( - model_id_or_path="rfdetr-nano", device=torch.device(DEVICE), backend="trt" + 
model_id_or_path="rfdetr-seg-preview", device=torch.device(DEVICE), backend="trt" ) image_warmup = cv2.imread(IMAGE_PATH_WARMUP) @@ -48,21 +48,35 @@ def main() -> None: # torch.cuda.synchronize() cudagraph_fps = CYCLES / (time.perf_counter() - start) - result_baseline = model.forward(pre_processed_profiling, use_cuda_graph=False) - result_cudagraph = model.forward(pre_processed_profiling, use_cuda_graph=True) - # torch.cuda.synchronize() - - print(f"Result baseline: {result_baseline}") - print(f"Result cudagraph: {result_cudagraph}") - - dets_match = torch.allclose(result_baseline[0], result_cudagraph[0], atol=1e-4) - labels_match = torch.allclose(result_baseline[1], result_cudagraph[1], atol=1e-4) + expected_warmup = model.forward(pre_processed_warmup, use_cuda_graph=False) + expected_profiling = model.forward(pre_processed_profiling, use_cuda_graph=False) + + print("Testing for race conditions (alternating inputs 20 times)...") + all_match = True + for i in range(20): + if i % 2 == 0: + result = model.forward(pre_processed_warmup, use_cuda_graph=True) + expected = expected_warmup + img_name = "warmup" + else: + result = model.forward(pre_processed_profiling, use_cuda_graph=True) + expected = expected_profiling + img_name = "profiling" + + dets_match = torch.allclose(result[0], expected[0], atol=1e-6) + labels_match = torch.allclose(result[1], expected[1], atol=1e-6) + if not (dets_match and labels_match): + print(f" MISMATCH at iteration {i} ({img_name}): dets={dets_match}, labels={labels_match}") + all_match = False + + if all_match: + print(" All 20 iterations matched expected outputs.") print(f"\n{'='*50}") print(f"Forward pass FPS (no CUDA graphs): {baseline_fps:.1f}") print(f"Forward pass FPS (CUDA graphs): {cudagraph_fps:.1f}") print(f"Speedup: {cudagraph_fps / baseline_fps:.2f}x") - print(f"Outputs match: dets={dets_match}, labels={labels_match}") + print(f"Race condition test: {'PASSED' if all_match else 'FAILED'}") print(f"{'='*50}") diff --git 
a/inference_models/inference_models/models/deep_lab_v3_plus/deep_lab_v3_plus_segmentation_trt.py b/inference_models/inference_models/models/deep_lab_v3_plus/deep_lab_v3_plus_segmentation_trt.py index 9976a86761..050908b1fe 100644 --- a/inference_models/inference_models/models/deep_lab_v3_plus/deep_lab_v3_plus_segmentation_trt.py +++ b/inference_models/inference_models/models/deep_lab_v3_plus/deep_lab_v3_plus_segmentation_trt.py @@ -185,7 +185,7 @@ def forward( ) -> torch.Tensor: with self._lock: with use_cuda_context(context=self._cuda_context): - return infer_from_trt_engine( + results, _ = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -193,7 +193,8 @@ def forward( device=self._device, input_name=self._input_name, outputs=self._output_names, - )[0] + ) + return results[0] def post_process( self, diff --git a/inference_models/inference_models/models/resnet/resnet_classification_trt.py b/inference_models/inference_models/models/resnet/resnet_classification_trt.py index 34de7058e3..e0b2621a55 100644 --- a/inference_models/inference_models/models/resnet/resnet_classification_trt.py +++ b/inference_models/inference_models/models/resnet/resnet_classification_trt.py @@ -185,7 +185,7 @@ def forward( ) -> torch.Tensor: with self._lock: with use_cuda_context(context=self._cuda_context): - return infer_from_trt_engine( + results, _ = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -193,7 +193,8 @@ def forward( device=self._device, input_name=self._input_name, outputs=self._output_names, - )[0] + ) + return results[0] def post_process( self, @@ -335,7 +336,7 @@ def forward( ) -> torch.Tensor: with self._lock: with use_cuda_context(context=self._cuda_context): - return infer_from_trt_engine( + results, _ = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -343,7 
+344,8 @@ def forward( device=self._device, input_name=self._input_name, outputs=self._output_names, - )[0] + ) + return results[0] def post_process( self, diff --git a/inference_models/inference_models/models/vit/vit_classification_trt.py b/inference_models/inference_models/models/vit/vit_classification_trt.py index d04a90607e..3a0892a8c5 100644 --- a/inference_models/inference_models/models/vit/vit_classification_trt.py +++ b/inference_models/inference_models/models/vit/vit_classification_trt.py @@ -183,7 +183,7 @@ def forward( ) -> torch.Tensor: with self._lock: with use_cuda_context(context=self._cuda_context): - return infer_from_trt_engine( + results, _ = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -191,7 +191,8 @@ def forward( device=self._device, input_name=self._input_name, outputs=self._output_names, - )[0] + ) + return results[0] def post_process( self, @@ -331,7 +332,7 @@ def forward( ) -> torch.Tensor: with self._lock: with use_cuda_context(context=self._cuda_context): - return infer_from_trt_engine( + results, _ = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -339,7 +340,8 @@ def forward( device=self._device, input_name=self._input_name, outputs=self._output_names, - )[0] + ) + return results[0] def post_process( self, diff --git a/inference_models/inference_models/models/yolov10/yolov10_object_detection_trt.py b/inference_models/inference_models/models/yolov10/yolov10_object_detection_trt.py index ec2dcf3cdb..ff25e019c2 100644 --- a/inference_models/inference_models/models/yolov10/yolov10_object_detection_trt.py +++ b/inference_models/inference_models/models/yolov10/yolov10_object_detection_trt.py @@ -177,7 +177,7 @@ def pre_process( def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): - return 
infer_from_trt_engine( + results, _ = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -185,7 +185,8 @@ def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: device=self._device, input_name=self._input_name, outputs=self._output_names, - )[0] + ) + return results[0] def post_process( self, diff --git a/inference_models/inference_models/models/yolov5/yolov5_object_detection_trt.py b/inference_models/inference_models/models/yolov5/yolov5_object_detection_trt.py index ff7d376e07..a423033cba 100644 --- a/inference_models/inference_models/models/yolov5/yolov5_object_detection_trt.py +++ b/inference_models/inference_models/models/yolov5/yolov5_object_detection_trt.py @@ -175,7 +175,7 @@ def pre_process( def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): - return infer_from_trt_engine( + results, _ = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -183,7 +183,8 @@ def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: device=self._device, input_name=self._input_name, outputs=self._output_names, - )[0] + ) + return results[0] def post_process( self, diff --git a/inference_models/inference_models/models/yolov8/yolov8_key_points_detection_trt.py b/inference_models/inference_models/models/yolov8/yolov8_key_points_detection_trt.py index 898beebb04..12e8630c1f 100644 --- a/inference_models/inference_models/models/yolov8/yolov8_key_points_detection_trt.py +++ b/inference_models/inference_models/models/yolov8/yolov8_key_points_detection_trt.py @@ -210,7 +210,7 @@ def pre_process( def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): - return infer_from_trt_engine( + 
results, _ = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -218,7 +218,8 @@ def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: device=self._device, input_name=self._input_name, outputs=self._output_names, - )[0] + ) + return results[0] def post_process( self, diff --git a/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py b/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py index aef07c3fad..29b9d7bfe8 100644 --- a/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py +++ b/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py @@ -183,7 +183,7 @@ def pre_process( def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: with self._lock: with use_cuda_context(context=self._cuda_context): - return infer_from_trt_engine( + results, _ = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -191,7 +191,8 @@ def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: device=self._device, input_name=self._input_name, outputs=self._output_names, - )[0] + ) + return results[0] def post_process( self, From 97fdcf0e54d4541e640110e6fbbd024a7296bfc6 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Fri, 23 Jan 2026 04:51:47 +0000 Subject: [PATCH 06/50] clean up profiling script --- .../profile_rfdetr_trt_cudagraphs.py | 92 ++++++++++--------- 1 file changed, 49 insertions(+), 43 deletions(-) diff --git a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py index 27218b0c08..c60ecea6c5 100644 --- a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py +++ b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py @@ -2,81 +2,87 @@ import 
time import cv2 +import numpy as np import torch from tqdm import tqdm from inference_models import AutoModel -IMAGE_PATH_WARMUP = "/home/mkaic/inference/tests/inference/unit_tests/core/utils/assets/1.jpg" -# IMAGE_PATH_PROFILING = IMAGE_PATH_WARMUP -IMAGE_PATH_PROFILING = "/home/mkaic/inference/tests/workflows/integration_tests/execution/assets/car.jpg" +IMAGE_PATH_WARMUP = os.environ.get("IMAGE_PATH_WARMUP", None) +IMAGE_PATH_PROFILING = os.environ.get("IMAGE_PATH_PROFILING", None) DEVICE = os.environ.get("DEVICE", "cuda:0") -CYCLES = 10_000 -WARMUP = 50 +CYCLES = int(os.environ.get("CYCLES", "10_000")) +WARMUP = int(os.environ.get("WARMUP", "50")) def main() -> None: - image = cv2.imread(IMAGE_PATH_WARMUP) + model = AutoModel.from_pretrained( - model_id_or_path="rfdetr-seg-preview", device=torch.device(DEVICE), backend="trt" + model_id_or_path="rfdetr-nano", device=torch.device(DEVICE), backend="trt" ) - image_warmup = cv2.imread(IMAGE_PATH_WARMUP) - pre_processed_warmup, metadata = model.pre_process(image_warmup) - print(f"Pre-processed image shape: {pre_processed_warmup.shape}") + if IMAGE_PATH_WARMUP is not None: + image_warmup = cv2.imread(IMAGE_PATH_WARMUP) + else: + image_warmup = (np.random.rand(224, 224, 3) * 255).astype(np.uint8) + + if IMAGE_PATH_PROFILING is not None: + image_profiling = cv2.imread(IMAGE_PATH_PROFILING) + else: + image_profiling = (np.random.rand(224, 224, 3) * 255).astype(np.uint8) + + pre_processed_warmup, _ = model.pre_process(image_warmup) + pre_processed_profiling, _ = model.pre_process(image_profiling) - print(f"Warming up ({WARMUP} iterations each)...") for _ in range(WARMUP): model.forward(pre_processed_warmup, use_cuda_graph=False) model.forward(pre_processed_warmup, use_cuda_graph=True) - # torch.cuda.synchronize() - print(f"Profiling ({CYCLES} iterations each)...") - image_profiling = cv2.imread(IMAGE_PATH_PROFILING) - pre_processed_profiling, metadata = model.pre_process(image_profiling) - print(f"Pre-processed image 
shape: {pre_processed_profiling.shape}") + expected_output_warmup_image = model.forward( + pre_processed_warmup, use_cuda_graph=False + ) + expected_output_profiling_image = model.forward( + pre_processed_profiling, use_cuda_graph=False + ) + + cudagraph_output_warmup_image = model.forward( + pre_processed_warmup, use_cuda_graph=True + ) + cudagraph_output_profiling_image = model.forward( + pre_processed_profiling, use_cuda_graph=True + ) + + assert torch.allclose( + expected_output_warmup_image[0], cudagraph_output_warmup_image[0], atol=1e-6 + ) + assert torch.allclose( + expected_output_profiling_image[0], + cudagraph_output_profiling_image[0], + atol=1e-6, + ) + assert torch.allclose( + expected_output_warmup_image[1], cudagraph_output_warmup_image[1], atol=1e-6 + ) + assert torch.allclose( + expected_output_profiling_image[1], + cudagraph_output_profiling_image[1], + atol=1e-6, + ) start = time.perf_counter() for _ in tqdm(range(CYCLES), desc="Without CUDA graphs"): model.forward(pre_processed_profiling, use_cuda_graph=False) - # torch.cuda.synchronize() baseline_fps = CYCLES / (time.perf_counter() - start) start = time.perf_counter() for _ in tqdm(range(CYCLES), desc="With CUDA graphs"): model.forward(pre_processed_profiling, use_cuda_graph=True) - # torch.cuda.synchronize() cudagraph_fps = CYCLES / (time.perf_counter() - start) - expected_warmup = model.forward(pre_processed_warmup, use_cuda_graph=False) - expected_profiling = model.forward(pre_processed_profiling, use_cuda_graph=False) - - print("Testing for race conditions (alternating inputs 20 times)...") - all_match = True - for i in range(20): - if i % 2 == 0: - result = model.forward(pre_processed_warmup, use_cuda_graph=True) - expected = expected_warmup - img_name = "warmup" - else: - result = model.forward(pre_processed_profiling, use_cuda_graph=True) - expected = expected_profiling - img_name = "profiling" - - dets_match = torch.allclose(result[0], expected[0], atol=1e-6) - labels_match = 
torch.allclose(result[1], expected[1], atol=1e-6) - if not (dets_match and labels_match): - print(f" MISMATCH at iteration {i} ({img_name}): dets={dets_match}, labels={labels_match}") - all_match = False - - if all_match: - print(" All 20 iterations matched expected outputs.") - print(f"\n{'='*50}") print(f"Forward pass FPS (no CUDA graphs): {baseline_fps:.1f}") print(f"Forward pass FPS (CUDA graphs): {cudagraph_fps:.1f}") print(f"Speedup: {cudagraph_fps / baseline_fps:.2f}x") - print(f"Race condition test: {'PASSED' if all_match else 'FAILED'}") print(f"{'='*50}") From 470addb4816a2b1e938b7922f07ac3f87e8a716a Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Fri, 23 Jan 2026 05:19:53 +0000 Subject: [PATCH 07/50] remove tqdm from profiling script --- .../profile_rfdetr_trt_cudagraphs.py | 74 +++++++------------ 1 file changed, 28 insertions(+), 46 deletions(-) diff --git a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py index c60ecea6c5..5728b6eb4a 100644 --- a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py +++ b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py @@ -4,12 +4,11 @@ import cv2 import numpy as np import torch -from tqdm import tqdm from inference_models import AutoModel -IMAGE_PATH_WARMUP = os.environ.get("IMAGE_PATH_WARMUP", None) -IMAGE_PATH_PROFILING = os.environ.get("IMAGE_PATH_PROFILING", None) +IMAGE_1 = os.environ.get("IMAGE_PATH_WARMUP", None) +IMAGE_2 = os.environ.get("IMAGE_PATH_PROFILING", None) DEVICE = os.environ.get("DEVICE", "cuda:0") CYCLES = int(os.environ.get("CYCLES", "10_000")) WARMUP = int(os.environ.get("WARMUP", "50")) @@ -21,62 +20,45 @@ def main() -> None: model_id_or_path="rfdetr-nano", device=torch.device(DEVICE), backend="trt" ) - if IMAGE_PATH_WARMUP is not None: - image_warmup = cv2.imread(IMAGE_PATH_WARMUP) + if IMAGE_1 is not None: + image_1 = cv2.imread(IMAGE_1) else: - 
image_warmup = (np.random.rand(224, 224, 3) * 255).astype(np.uint8) + image_1 = (np.random.rand(224, 224, 3) * 255).astype(np.uint8) - if IMAGE_PATH_PROFILING is not None: - image_profiling = cv2.imread(IMAGE_PATH_PROFILING) + if IMAGE_2 is not None: + image_2 = cv2.imread(IMAGE_2) else: - image_profiling = (np.random.rand(224, 224, 3) * 255).astype(np.uint8) + image_2 = (np.random.rand(224, 224, 3) * 255).astype(np.uint8) - pre_processed_warmup, _ = model.pre_process(image_warmup) - pre_processed_profiling, _ = model.pre_process(image_profiling) + pre_processed_1, _ = model.pre_process(image_1) + pre_processed_2, _ = model.pre_process(image_2) - for _ in range(WARMUP): - model.forward(pre_processed_warmup, use_cuda_graph=False) - model.forward(pre_processed_warmup, use_cuda_graph=True) - expected_output_warmup_image = model.forward( - pre_processed_warmup, use_cuda_graph=False - ) - expected_output_profiling_image = model.forward( - pre_processed_profiling, use_cuda_graph=False - ) + expected_output_1_no_cuda_graph = model.forward(pre_processed_1, use_cuda_graph=False) + expected_output_2_no_cuda_graph = model.forward(pre_processed_2, use_cuda_graph=False) - cudagraph_output_warmup_image = model.forward( - pre_processed_warmup, use_cuda_graph=True - ) - cudagraph_output_profiling_image = model.forward( - pre_processed_profiling, use_cuda_graph=True - ) + expected_output_1_capture_cuda_graph = model.forward(pre_processed_1, use_cuda_graph=True) + expected_output_2_capture_cudagraph = model.forward(pre_processed_2, use_cuda_graph=True) - assert torch.allclose( - expected_output_warmup_image[0], cudagraph_output_warmup_image[0], atol=1e-6 - ) - assert torch.allclose( - expected_output_profiling_image[0], - cudagraph_output_profiling_image[0], - atol=1e-6, - ) - assert torch.allclose( - expected_output_warmup_image[1], cudagraph_output_warmup_image[1], atol=1e-6 - ) - assert torch.allclose( - expected_output_profiling_image[1], - cudagraph_output_profiling_image[1], - 
atol=1e-6, - ) + expected_output_1_replayed_cudagraph = model.forward(pre_processed_1, use_cuda_graph=True) + expected_output_2_replayed_cudagraph = model.forward(pre_processed_2, use_cuda_graph=True) + + for i in [0, 1]: + assert torch.allclose(expected_output_1_no_cuda_graph[i], expected_output_1_capture_cuda_graph[i], atol=1e-6) + assert torch.allclose(expected_output_2_no_cuda_graph[i], expected_output_2_capture_cudagraph[i], atol=1e-6) + assert torch.allclose(expected_output_1_no_cuda_graph[i], expected_output_1_replayed_cudagraph[i], atol=1e-6) + assert torch.allclose(expected_output_2_no_cuda_graph[i], expected_output_2_replayed_cudagraph[i], atol=1e-6) + print("Timing without CUDA graphs...") start = time.perf_counter() - for _ in tqdm(range(CYCLES), desc="Without CUDA graphs"): - model.forward(pre_processed_profiling, use_cuda_graph=False) + for _ in range(CYCLES): + model.forward(pre_processed_2, use_cuda_graph=False) baseline_fps = CYCLES / (time.perf_counter() - start) + print("Timing with CUDA graphs...") start = time.perf_counter() - for _ in tqdm(range(CYCLES), desc="With CUDA graphs"): - model.forward(pre_processed_profiling, use_cuda_graph=True) + for _ in range(CYCLES): + model.forward(pre_processed_2, use_cuda_graph=True) cudagraph_fps = CYCLES / (time.perf_counter() - start) print(f"\n{'='*50}") From 8cca2648109878d43376aff05e05bddcb3494c78 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Fri, 23 Jan 2026 05:24:33 +0000 Subject: [PATCH 08/50] format --- .../profile_rfdetr_trt_cudagraphs.py | 49 ++++++++++++++----- 1 file changed, 38 insertions(+), 11 deletions(-) diff --git a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py index 5728b6eb4a..9f74e75ef2 100644 --- a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py +++ b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py @@ -33,21 +33,48 @@ def main() -> 
None: pre_processed_1, _ = model.pre_process(image_1) pre_processed_2, _ = model.pre_process(image_2) + expected_output_1_no_cuda_graph = model.forward( + pre_processed_1, use_cuda_graph=False + ) + expected_output_2_no_cuda_graph = model.forward( + pre_processed_2, use_cuda_graph=False + ) - expected_output_1_no_cuda_graph = model.forward(pre_processed_1, use_cuda_graph=False) - expected_output_2_no_cuda_graph = model.forward(pre_processed_2, use_cuda_graph=False) - - expected_output_1_capture_cuda_graph = model.forward(pre_processed_1, use_cuda_graph=True) - expected_output_2_capture_cudagraph = model.forward(pre_processed_2, use_cuda_graph=True) + expected_output_1_capture_cuda_graph = model.forward( + pre_processed_1, use_cuda_graph=True + ) + expected_output_2_capture_cudagraph = model.forward( + pre_processed_2, use_cuda_graph=True + ) - expected_output_1_replayed_cudagraph = model.forward(pre_processed_1, use_cuda_graph=True) - expected_output_2_replayed_cudagraph = model.forward(pre_processed_2, use_cuda_graph=True) + expected_output_1_replayed_cudagraph = model.forward( + pre_processed_1, use_cuda_graph=True + ) + expected_output_2_replayed_cudagraph = model.forward( + pre_processed_2, use_cuda_graph=True + ) for i in [0, 1]: - assert torch.allclose(expected_output_1_no_cuda_graph[i], expected_output_1_capture_cuda_graph[i], atol=1e-6) - assert torch.allclose(expected_output_2_no_cuda_graph[i], expected_output_2_capture_cudagraph[i], atol=1e-6) - assert torch.allclose(expected_output_1_no_cuda_graph[i], expected_output_1_replayed_cudagraph[i], atol=1e-6) - assert torch.allclose(expected_output_2_no_cuda_graph[i], expected_output_2_replayed_cudagraph[i], atol=1e-6) + assert torch.allclose( + expected_output_1_no_cuda_graph[i], + expected_output_1_capture_cuda_graph[i], + atol=1e-6, + ) + assert torch.allclose( + expected_output_2_no_cuda_graph[i], + expected_output_2_capture_cudagraph[i], + atol=1e-6, + ) + assert torch.allclose( + 
expected_output_1_no_cuda_graph[i], + expected_output_1_replayed_cudagraph[i], + atol=1e-6, + ) + assert torch.allclose( + expected_output_2_no_cuda_graph[i], + expected_output_2_replayed_cudagraph[i], + atol=1e-6, + ) print("Timing without CUDA graphs...") start = time.perf_counter() From 5b7d0a56e336b9c71b99355014a526b87dca472f Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Fri, 23 Jan 2026 06:11:07 +0000 Subject: [PATCH 09/50] allow flag to be passed to rfdetr-seg models even though there don't seem to be TRT packages for them yet. --- .../models/rfdetr/rfdetr_instance_segmentation_trt.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py index 78e2be9d50..752fe0ad82 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py @@ -172,6 +172,7 @@ def __init__( self._cuda_context = cuda_context self._execution_context = execution_context self._trt_config = trt_config + self._trt_cuda_graph_state = None self._lock = threading.Lock() @property @@ -195,11 +196,11 @@ def pre_process( ) def forward( - self, pre_processed_images: torch.Tensor, **kwargs + self, pre_processed_images: torch.Tensor, use_cuda_graph: bool = False, **kwargs ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: with self._lock: with use_cuda_context(context=self._cuda_context): - (detections, labels, masks), _ = infer_from_trt_engine( + (detections, labels, masks), trt_cuda_graph_state = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -207,7 +208,11 @@ def forward( device=self._device, input_name=self._input_name, outputs=self._output_names, + use_cuda_graph=use_cuda_graph, + 
trt_cuda_graph_state=self._trt_cuda_graph_state if use_cuda_graph else None, ) + if use_cuda_graph: + self._trt_cuda_graph_state = trt_cuda_graph_state return detections, labels, masks def post_process( From a27ae376909abb3df1b3443ddf579d72ff5b4596 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Fri, 23 Jan 2026 16:41:01 +0000 Subject: [PATCH 10/50] reduce number of diffed files --- .../inference_models/models/common/trt.py | 72 ++++++++++++++++--- .../deep_lab_v3_plus_segmentation_trt.py | 5 +- .../resnet/resnet_classification_trt.py | 10 ++- .../rfdetr_instance_segmentation_trt.py | 35 +++++---- .../rfdetr/rfdetr_object_detection_trt.py | 35 +++++---- .../models/vit/vit_classification_trt.py | 10 ++- .../yolact_instance_segmentation_trt.py | 2 +- .../yolonas/yolonas_object_detection_trt.py | 2 +- .../yolov10/yolov10_object_detection_trt.py | 5 +- .../yolov5_instance_segmentation_trt.py | 2 +- .../yolov5/yolov5_object_detection_trt.py | 5 +- .../yolov7_instance_segmentation_trt.py | 2 +- .../yolov8_instance_segmentation_trt.py | 2 +- .../yolov8/yolov8_key_points_detection_trt.py | 5 +- .../yolov8/yolov8_object_detection_trt.py | 5 +- 15 files changed, 133 insertions(+), 64 deletions(-) diff --git a/inference_models/inference_models/models/common/trt.py b/inference_models/inference_models/models/common/trt.py index 19767e89c0..6301a68a8c 100644 --- a/inference_models/inference_models/models/common/trt.py +++ b/inference_models/inference_models/models/common/trt.py @@ -140,9 +140,7 @@ def infer_from_trt_engine( device: torch.device, input_name: str, outputs: List[str], - use_cuda_graph: bool = False, - trt_cuda_graph_state: Optional[TRTCudaGraphState] = None, -) -> Tuple[List[torch.Tensor], Optional[TRTCudaGraphState]]: +) -> List[torch.Tensor]: """Run inference using a TensorRT engine. 
Executes inference on preprocessed images using a TensorRT engine and execution @@ -235,7 +233,7 @@ def infer_from_trt_engine( - `get_trt_engine_inputs_and_outputs()`: Get engine tensor names """ if trt_config.static_batch_size is not None: - return infer_from_trt_engine_with_batch_size_boundaries( + results, _ = _infer_from_trt_engine_with_batch_size_boundaries( pre_processed_images=pre_processed_images, engine=engine, context=context, @@ -244,10 +242,68 @@ def infer_from_trt_engine( outputs=outputs, min_batch_size=trt_config.static_batch_size, max_batch_size=trt_config.static_batch_size, - use_cuda_graph=use_cuda_graph, + use_cuda_graph=False, + trt_cuda_graph_state=None, + ) + return results + results, _ = _infer_from_trt_engine_with_batch_size_boundaries( + pre_processed_images=pre_processed_images, + engine=engine, + context=context, + device=device, + input_name=input_name, + outputs=outputs, + min_batch_size=trt_config.dynamic_batch_size_min, + max_batch_size=trt_config.dynamic_batch_size_max, + use_cuda_graph=False, + trt_cuda_graph_state=None, + ) + return results + + +def infer_from_trt_engine_with_cudagraph( + pre_processed_images: torch.Tensor, + trt_config: TRTConfig, + engine: trt.ICudaEngine, + context: trt.IExecutionContext, + device: torch.device, + input_name: str, + outputs: List[str], + trt_cuda_graph_state: Optional[TRTCudaGraphState] = None, +) -> Tuple[List[torch.Tensor], Optional[TRTCudaGraphState]]: + """Run inference using a TensorRT engine with CUDA graph support. + + Similar to `infer_from_trt_engine`, but captures and replays CUDA graphs for + improved performance on repeated inference with the same input shape. + + Args: + pre_processed_images: Preprocessed input tensor on CUDA device. + trt_config: TensorRT configuration object. + engine: TensorRT CUDA engine (ICudaEngine). + context: TensorRT execution context (IExecutionContext). + device: PyTorch CUDA device. + input_name: Name of the input tensor in the TensorRT engine. 
+ outputs: List of output tensor names. + trt_cuda_graph_state: Optional state from a previous call for graph replay. + + Returns: + Tuple of (results, trt_cuda_graph_state) where results is the list of + output tensors and trt_cuda_graph_state can be passed to subsequent calls. + """ + if trt_config.static_batch_size is not None: + return _infer_from_trt_engine_with_batch_size_boundaries( + pre_processed_images=pre_processed_images, + engine=engine, + context=context, + device=device, + input_name=input_name, + outputs=outputs, + min_batch_size=trt_config.static_batch_size, + max_batch_size=trt_config.static_batch_size, + use_cuda_graph=True, trt_cuda_graph_state=trt_cuda_graph_state, ) - return infer_from_trt_engine_with_batch_size_boundaries( + return _infer_from_trt_engine_with_batch_size_boundaries( pre_processed_images=pre_processed_images, engine=engine, context=context, @@ -256,12 +312,12 @@ def infer_from_trt_engine( outputs=outputs, min_batch_size=trt_config.dynamic_batch_size_min, max_batch_size=trt_config.dynamic_batch_size_max, - use_cuda_graph=use_cuda_graph, + use_cuda_graph=True, trt_cuda_graph_state=trt_cuda_graph_state, ) -def infer_from_trt_engine_with_batch_size_boundaries( +def _infer_from_trt_engine_with_batch_size_boundaries( pre_processed_images: torch.Tensor, engine: trt.ICudaEngine, context: trt.IExecutionContext, diff --git a/inference_models/inference_models/models/deep_lab_v3_plus/deep_lab_v3_plus_segmentation_trt.py b/inference_models/inference_models/models/deep_lab_v3_plus/deep_lab_v3_plus_segmentation_trt.py index 050908b1fe..9976a86761 100644 --- a/inference_models/inference_models/models/deep_lab_v3_plus/deep_lab_v3_plus_segmentation_trt.py +++ b/inference_models/inference_models/models/deep_lab_v3_plus/deep_lab_v3_plus_segmentation_trt.py @@ -185,7 +185,7 @@ def forward( ) -> torch.Tensor: with self._lock: with use_cuda_context(context=self._cuda_context): - results, _ = infer_from_trt_engine( + return infer_from_trt_engine( 
pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -193,8 +193,7 @@ def forward( device=self._device, input_name=self._input_name, outputs=self._output_names, - ) - return results[0] + )[0] def post_process( self, diff --git a/inference_models/inference_models/models/resnet/resnet_classification_trt.py b/inference_models/inference_models/models/resnet/resnet_classification_trt.py index e0b2621a55..34de7058e3 100644 --- a/inference_models/inference_models/models/resnet/resnet_classification_trt.py +++ b/inference_models/inference_models/models/resnet/resnet_classification_trt.py @@ -185,7 +185,7 @@ def forward( ) -> torch.Tensor: with self._lock: with use_cuda_context(context=self._cuda_context): - results, _ = infer_from_trt_engine( + return infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -193,8 +193,7 @@ def forward( device=self._device, input_name=self._input_name, outputs=self._output_names, - ) - return results[0] + )[0] def post_process( self, @@ -336,7 +335,7 @@ def forward( ) -> torch.Tensor: with self._lock: with use_cuda_context(context=self._cuda_context): - results, _ = infer_from_trt_engine( + return infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -344,8 +343,7 @@ def forward( device=self._device, input_name=self._input_name, outputs=self._output_names, - ) - return results[0] + )[0] def post_process( self, diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py index 752fe0ad82..30d3533199 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py @@ -32,6 +32,7 @@ from inference_models.models.common.trt import ( 
get_trt_engine_inputs_and_outputs, infer_from_trt_engine, + infer_from_trt_engine_with_cudagraph, load_trt_model, TRTCudaGraphState, ) @@ -200,19 +201,29 @@ def forward( ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: with self._lock: with use_cuda_context(context=self._cuda_context): - (detections, labels, masks), trt_cuda_graph_state = infer_from_trt_engine( - pre_processed_images=pre_processed_images, - trt_config=self._trt_config, - engine=self._engine, - context=self._execution_context, - device=self._device, - input_name=self._input_name, - outputs=self._output_names, - use_cuda_graph=use_cuda_graph, - trt_cuda_graph_state=self._trt_cuda_graph_state if use_cuda_graph else None, - ) if use_cuda_graph: - self._trt_cuda_graph_state = trt_cuda_graph_state + (detections, labels, masks), self._trt_cuda_graph_state = ( + infer_from_trt_engine_with_cudagraph( + pre_processed_images=pre_processed_images, + trt_config=self._trt_config, + engine=self._engine, + context=self._execution_context, + device=self._device, + input_name=self._input_name, + outputs=self._output_names, + trt_cuda_graph_state=self._trt_cuda_graph_state, + ) + ) + else: + detections, labels, masks = infer_from_trt_engine( + pre_processed_images=pre_processed_images, + trt_config=self._trt_config, + engine=self._engine, + context=self._execution_context, + device=self._device, + input_name=self._input_name, + outputs=self._output_names, + ) return detections, labels, masks def post_process( diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py index b65d03348d..c5db6bbcf4 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py @@ -35,6 +35,7 @@ from inference_models.models.common.trt import ( get_trt_engine_inputs_and_outputs, infer_from_trt_engine, + 
infer_from_trt_engine_with_cudagraph, load_trt_model, TRTCudaGraphState, ) @@ -203,19 +204,29 @@ def forward( ) -> Tuple[torch.Tensor, torch.Tensor]: with self._lock: with use_cuda_context(context=self._cuda_context): - (detections, labels), trt_cuda_graph_state = infer_from_trt_engine( - pre_processed_images=pre_processed_images, - trt_config=self._trt_config, - engine=self._engine, - context=self._execution_context, - device=self._device, - input_name=self._input_name, - outputs=self._output_names, - use_cuda_graph=use_cuda_graph, - trt_cuda_graph_state=self._trt_cuda_graph_state if use_cuda_graph else None, - ) if use_cuda_graph: - self._trt_cuda_graph_state = trt_cuda_graph_state + (detections, labels), self._trt_cuda_graph_state = ( + infer_from_trt_engine_with_cudagraph( + pre_processed_images=pre_processed_images, + trt_config=self._trt_config, + engine=self._engine, + context=self._execution_context, + device=self._device, + input_name=self._input_name, + outputs=self._output_names, + trt_cuda_graph_state=self._trt_cuda_graph_state, + ) + ) + else: + detections, labels = infer_from_trt_engine( + pre_processed_images=pre_processed_images, + trt_config=self._trt_config, + engine=self._engine, + context=self._execution_context, + device=self._device, + input_name=self._input_name, + outputs=self._output_names, + ) return detections, labels def post_process( diff --git a/inference_models/inference_models/models/vit/vit_classification_trt.py b/inference_models/inference_models/models/vit/vit_classification_trt.py index 3a0892a8c5..d04a90607e 100644 --- a/inference_models/inference_models/models/vit/vit_classification_trt.py +++ b/inference_models/inference_models/models/vit/vit_classification_trt.py @@ -183,7 +183,7 @@ def forward( ) -> torch.Tensor: with self._lock: with use_cuda_context(context=self._cuda_context): - results, _ = infer_from_trt_engine( + return infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, 
engine=self._engine, @@ -191,8 +191,7 @@ def forward( device=self._device, input_name=self._input_name, outputs=self._output_names, - ) - return results[0] + )[0] def post_process( self, @@ -332,7 +331,7 @@ def forward( ) -> torch.Tensor: with self._lock: with use_cuda_context(context=self._cuda_context): - results, _ = infer_from_trt_engine( + return infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -340,8 +339,7 @@ def forward( device=self._device, input_name=self._input_name, outputs=self._output_names, - ) - return results[0] + )[0] def post_process( self, diff --git a/inference_models/inference_models/models/yolact/yolact_instance_segmentation_trt.py b/inference_models/inference_models/models/yolact/yolact_instance_segmentation_trt.py index ea6ebe6cf0..cc3cbeedaf 100644 --- a/inference_models/inference_models/models/yolact/yolact_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/yolact/yolact_instance_segmentation_trt.py @@ -192,7 +192,7 @@ def forward( all_proto_data, ) = ([], [], [], [], []) for image in pre_processed_images: - (loc_data, conf_data, mask_data, prior_data, proto_data), _ = ( + loc_data, conf_data, mask_data, prior_data, proto_data = ( infer_from_trt_engine( pre_processed_images=image.unsqueeze(0).contiguous(), trt_config=self._trt_config, diff --git a/inference_models/inference_models/models/yolonas/yolonas_object_detection_trt.py b/inference_models/inference_models/models/yolonas/yolonas_object_detection_trt.py index 39822ff34b..6f561d58e4 100644 --- a/inference_models/inference_models/models/yolonas/yolonas_object_detection_trt.py +++ b/inference_models/inference_models/models/yolonas/yolonas_object_detection_trt.py @@ -187,7 +187,7 @@ def pre_process( def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): - results, _ = 
infer_from_trt_engine( + results = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, diff --git a/inference_models/inference_models/models/yolov10/yolov10_object_detection_trt.py b/inference_models/inference_models/models/yolov10/yolov10_object_detection_trt.py index ff25e019c2..ec2dcf3cdb 100644 --- a/inference_models/inference_models/models/yolov10/yolov10_object_detection_trt.py +++ b/inference_models/inference_models/models/yolov10/yolov10_object_detection_trt.py @@ -177,7 +177,7 @@ def pre_process( def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): - results, _ = infer_from_trt_engine( + return infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -185,8 +185,7 @@ def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: device=self._device, input_name=self._input_name, outputs=self._output_names, - ) - return results[0] + )[0] def post_process( self, diff --git a/inference_models/inference_models/models/yolov5/yolov5_instance_segmentation_trt.py b/inference_models/inference_models/models/yolov5/yolov5_instance_segmentation_trt.py index a18b743b90..ee7180e10d 100644 --- a/inference_models/inference_models/models/yolov5/yolov5_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/yolov5/yolov5_instance_segmentation_trt.py @@ -187,7 +187,7 @@ def forward( ) -> Tuple[torch.Tensor, torch.Tensor]: with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): - (instances, protos), _ = infer_from_trt_engine( + instances, protos = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, diff --git a/inference_models/inference_models/models/yolov5/yolov5_object_detection_trt.py 
b/inference_models/inference_models/models/yolov5/yolov5_object_detection_trt.py index a423033cba..ff7d376e07 100644 --- a/inference_models/inference_models/models/yolov5/yolov5_object_detection_trt.py +++ b/inference_models/inference_models/models/yolov5/yolov5_object_detection_trt.py @@ -175,7 +175,7 @@ def pre_process( def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): - results, _ = infer_from_trt_engine( + return infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -183,8 +183,7 @@ def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: device=self._device, input_name=self._input_name, outputs=self._output_names, - ) - return results[0] + )[0] def post_process( self, diff --git a/inference_models/inference_models/models/yolov7/yolov7_instance_segmentation_trt.py b/inference_models/inference_models/models/yolov7/yolov7_instance_segmentation_trt.py index abcc82a78c..15ef8a13ee 100644 --- a/inference_models/inference_models/models/yolov7/yolov7_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/yolov7/yolov7_instance_segmentation_trt.py @@ -185,7 +185,7 @@ def forward( ) -> Tuple[torch.Tensor, torch.Tensor]: with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): - (instances, protos), _ = infer_from_trt_engine( + instances, protos = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, diff --git a/inference_models/inference_models/models/yolov8/yolov8_instance_segmentation_trt.py b/inference_models/inference_models/models/yolov8/yolov8_instance_segmentation_trt.py index f2cf1d7953..8a32a117ae 100644 --- a/inference_models/inference_models/models/yolov8/yolov8_instance_segmentation_trt.py +++ 
b/inference_models/inference_models/models/yolov8/yolov8_instance_segmentation_trt.py @@ -195,7 +195,7 @@ def forward( ) -> Tuple[torch.Tensor, torch.Tensor]: with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): - (instances, protos), _ = infer_from_trt_engine( + instances, protos = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, diff --git a/inference_models/inference_models/models/yolov8/yolov8_key_points_detection_trt.py b/inference_models/inference_models/models/yolov8/yolov8_key_points_detection_trt.py index 12e8630c1f..898beebb04 100644 --- a/inference_models/inference_models/models/yolov8/yolov8_key_points_detection_trt.py +++ b/inference_models/inference_models/models/yolov8/yolov8_key_points_detection_trt.py @@ -210,7 +210,7 @@ def pre_process( def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): - results, _ = infer_from_trt_engine( + return infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -218,8 +218,7 @@ def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: device=self._device, input_name=self._input_name, outputs=self._output_names, - ) - return results[0] + )[0] def post_process( self, diff --git a/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py b/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py index 29b9d7bfe8..aef07c3fad 100644 --- a/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py +++ b/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py @@ -183,7 +183,7 @@ def pre_process( def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: with self._lock: with use_cuda_context(context=self._cuda_context): - results, 
_ = infer_from_trt_engine( + return infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, @@ -191,8 +191,7 @@ def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: device=self._device, input_name=self._input_name, outputs=self._output_names, - ) - return results[0] + )[0] def post_process( self, From 04c015aef0adf53c9779424eae078ccd73f218f9 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Fri, 23 Jan 2026 16:53:14 +0000 Subject: [PATCH 11/50] don't rename existing function --- inference_models/inference_models/models/common/trt.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/inference_models/inference_models/models/common/trt.py b/inference_models/inference_models/models/common/trt.py index 6301a68a8c..b836131abe 100644 --- a/inference_models/inference_models/models/common/trt.py +++ b/inference_models/inference_models/models/common/trt.py @@ -233,7 +233,7 @@ def infer_from_trt_engine( - `get_trt_engine_inputs_and_outputs()`: Get engine tensor names """ if trt_config.static_batch_size is not None: - results, _ = _infer_from_trt_engine_with_batch_size_boundaries( + results, _ = infer_from_trt_engine_with_batch_size_boundaries( pre_processed_images=pre_processed_images, engine=engine, context=context, @@ -246,7 +246,7 @@ def infer_from_trt_engine( trt_cuda_graph_state=None, ) return results - results, _ = _infer_from_trt_engine_with_batch_size_boundaries( + results, _ = infer_from_trt_engine_with_batch_size_boundaries( pre_processed_images=pre_processed_images, engine=engine, context=context, @@ -291,7 +291,7 @@ def infer_from_trt_engine_with_cudagraph( output tensors and trt_cuda_graph_state can be passed to subsequent calls. 
""" if trt_config.static_batch_size is not None: - return _infer_from_trt_engine_with_batch_size_boundaries( + return infer_from_trt_engine_with_batch_size_boundaries( pre_processed_images=pre_processed_images, engine=engine, context=context, @@ -303,7 +303,7 @@ def infer_from_trt_engine_with_cudagraph( use_cuda_graph=True, trt_cuda_graph_state=trt_cuda_graph_state, ) - return _infer_from_trt_engine_with_batch_size_boundaries( + return infer_from_trt_engine_with_batch_size_boundaries( pre_processed_images=pre_processed_images, engine=engine, context=context, @@ -317,7 +317,7 @@ def infer_from_trt_engine_with_cudagraph( ) -def _infer_from_trt_engine_with_batch_size_boundaries( +def infer_from_trt_engine_with_batch_size_boundaries( pre_processed_images: torch.Tensor, engine: trt.ICudaEngine, context: trt.IExecutionContext, From ac50a1a8044fd600d9c331064d6d21487bd9054b Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Fri, 23 Jan 2026 20:56:44 +0000 Subject: [PATCH 12/50] add proper integration test and simplify profiling script --- .../profile_rfdetr_trt_cudagraphs.py | 66 +++----------- .../integration_tests/models/conftest.py | 10 +++ .../models/test_rfdetr_predictions_trt.py | 86 +++++++++++++++++++ 3 files changed, 106 insertions(+), 56 deletions(-) create mode 100644 inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py diff --git a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py index 9f74e75ef2..e63f5afd8b 100644 --- a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py +++ b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py @@ -7,8 +7,7 @@ from inference_models import AutoModel -IMAGE_1 = os.environ.get("IMAGE_PATH_WARMUP", None) -IMAGE_2 = os.environ.get("IMAGE_PATH_PROFILING", None) +IMAGE_PATH = os.environ.get("IMAGE_PATH", None) DEVICE = os.environ.get("DEVICE", "cuda:0") CYCLES = 
int(os.environ.get("CYCLES", "10_000")) WARMUP = int(os.environ.get("WARMUP", "50")) @@ -20,72 +19,27 @@ def main() -> None: model_id_or_path="rfdetr-nano", device=torch.device(DEVICE), backend="trt" ) - if IMAGE_1 is not None: - image_1 = cv2.imread(IMAGE_1) + if IMAGE_PATH is not None: + image = cv2.imread(IMAGE_PATH) else: - image_1 = (np.random.rand(224, 224, 3) * 255).astype(np.uint8) + image = (np.random.rand(224, 224, 3) * 255).astype(np.uint8) - if IMAGE_2 is not None: - image_2 = cv2.imread(IMAGE_2) - else: - image_2 = (np.random.rand(224, 224, 3) * 255).astype(np.uint8) - - pre_processed_1, _ = model.pre_process(image_1) - pre_processed_2, _ = model.pre_process(image_2) - - expected_output_1_no_cuda_graph = model.forward( - pre_processed_1, use_cuda_graph=False - ) - expected_output_2_no_cuda_graph = model.forward( - pre_processed_2, use_cuda_graph=False - ) - - expected_output_1_capture_cuda_graph = model.forward( - pre_processed_1, use_cuda_graph=True - ) - expected_output_2_capture_cudagraph = model.forward( - pre_processed_2, use_cuda_graph=True - ) - - expected_output_1_replayed_cudagraph = model.forward( - pre_processed_1, use_cuda_graph=True - ) - expected_output_2_replayed_cudagraph = model.forward( - pre_processed_2, use_cuda_graph=True - ) + pre_processed, _ = model.pre_process(image) - for i in [0, 1]: - assert torch.allclose( - expected_output_1_no_cuda_graph[i], - expected_output_1_capture_cuda_graph[i], - atol=1e-6, - ) - assert torch.allclose( - expected_output_2_no_cuda_graph[i], - expected_output_2_capture_cudagraph[i], - atol=1e-6, - ) - assert torch.allclose( - expected_output_1_no_cuda_graph[i], - expected_output_1_replayed_cudagraph[i], - atol=1e-6, - ) - assert torch.allclose( - expected_output_2_no_cuda_graph[i], - expected_output_2_replayed_cudagraph[i], - atol=1e-6, - ) + for _ in range(WARMUP): + model.forward(pre_processed, use_cuda_graph=False) + model.forward(pre_processed, use_cuda_graph=True) print("Timing without CUDA 
graphs...") start = time.perf_counter() for _ in range(CYCLES): - model.forward(pre_processed_2, use_cuda_graph=False) + model.forward(pre_processed, use_cuda_graph=False) baseline_fps = CYCLES / (time.perf_counter() - start) print("Timing with CUDA graphs...") start = time.perf_counter() for _ in range(CYCLES): - model.forward(pre_processed_2, use_cuda_graph=True) + model.forward(pre_processed, use_cuda_graph=True) cudagraph_fps = CYCLES / (time.perf_counter() - start) print(f"\n{'='*50}") diff --git a/inference_models/tests/integration_tests/models/conftest.py b/inference_models/tests/integration_tests/models/conftest.py index 5d7bbcddf4..71ca66f6db 100644 --- a/inference_models/tests/integration_tests/models/conftest.py +++ b/inference_models/tests/integration_tests/models/conftest.py @@ -164,6 +164,8 @@ "https://storage.googleapis.com/roboflow-tests-assets/rf-platform-models/sam2.zip" ) +RFDETR_NANO_T4_TRT_PACKAGE_URL = "https://storage.googleapis.com/roboflow-tests-assets/rf-platform-models/rfdetr-nano-t4-trt.zip" + @pytest.fixture(scope="module") def original_clip_download_dir() -> str: @@ -388,6 +390,14 @@ def coin_counting_rfdetr_nano_torch_static_crop_center_crop_package() -> str: ) +@pytest.fixture(scope="module") +def rfdetr_nano_t4_trt_package() -> str: + return download_model_package( + model_package_zip_url=RFDETR_NANO_T4_TRT_PACKAGE_URL, + package_name="rfdetr-nano-t4-trt", + ) + + @pytest.fixture(scope="module") def og_rfdetr_base_weights() -> str: package_path = os.path.join(MODELS_DIR, "og-rfdetr-base") diff --git a/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py b/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py new file mode 100644 index 0000000000..e50d6cd030 --- /dev/null +++ b/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py @@ -0,0 +1,86 @@ +import logging + +import numpy as np +import pytest +import torch + + +@pytest.mark.slow 
+@pytest.mark.trt_extras +def test_trt_cudagraph_output_matches_non_cudagraph_output( + rfdetr_nano_t4_trt_package: str, + dog_image_numpy: np.ndarray, + bike_image_numpy: np.ndarray, +) -> None: + from inference_models import AutoModel + + model = AutoModel.from_pretrained( + model_id_or_path=rfdetr_nano_t4_trt_package, + device=torch.device("cuda:0"), + ) + + pre_processed_1, _ = model.pre_process(dog_image_numpy) + pre_processed_2, _ = model.pre_process(bike_image_numpy) + + outputs = [] + for pre_processed in [pre_processed_1, pre_processed_2]: + no_graph = model.forward(pre_processed, use_cuda_graph=False) + model._trt_cuda_graph_state = None + capture_graph = model.forward(pre_processed, use_cuda_graph=True) + replay_graph = model.forward(pre_processed, use_cuda_graph=True) + + outputs.append((no_graph, capture_graph, replay_graph)) + + for image_outputs in outputs: + no_graph, capture_graph, replay_graph = image_outputs + for result_idx in range(2): + assert torch.allclose( + no_graph[result_idx], + capture_graph[result_idx], + atol=1e-6, + ) + assert torch.allclose( + no_graph[result_idx], + replay_graph[result_idx], + atol=1e-6, + ) + + # make sure that the allcloses aren't true because of buffer aliasing or something weird + # outputs should be different between images and the same between execution branches. 
+ for execution_branch_idx in range(3): + for result_idx in range(2): + assert not torch.allclose( + outputs[0][execution_branch_idx][result_idx], + outputs[1][execution_branch_idx][result_idx], + atol=1e-6, + ) + +@pytest.mark.slow +@pytest.mark.trt_extras +def test_trt_outputs_shapes( + rfdetr_nano_t4_trt_package: str, + dog_image_numpy: np.ndarray, +) -> None: + from inference_models import AutoModel + + model = AutoModel.from_pretrained( + model_id_or_path=rfdetr_nano_t4_trt_package, + device=torch.device("cuda:0"), + ) + + pre_processed, _ = model.pre_process(dog_image_numpy) + + output = model.forward(pre_processed, use_cuda_graph=False) + + assert output[0].shape == (1, 300, 4) + assert output[1].shape == (1, 300, 91) + + output = model.forward(pre_processed, use_cuda_graph=True) # capture + + assert output[0].shape == (1, 300, 4) + assert output[1].shape == (1, 300, 91) + + output = model.forward(pre_processed, use_cuda_graph=True) # replay + + assert output[0].shape == (1, 300, 4) + assert output[1].shape == (1, 300, 91) \ No newline at end of file From 951222950742ba91f1856285a980031b7858e4bb Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Fri, 23 Jan 2026 22:36:43 +0000 Subject: [PATCH 13/50] profile how long it takes to capture cuda graph --- .../profile_rfdetr_trt_cudagraphs.py | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py index e63f5afd8b..14044e849c 100644 --- a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py +++ b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py @@ -4,12 +4,13 @@ import cv2 import numpy as np import torch +from tqdm import tqdm from inference_models import AutoModel IMAGE_PATH = os.environ.get("IMAGE_PATH", None) DEVICE = os.environ.get("DEVICE", "cuda:0") -CYCLES = int(os.environ.get("CYCLES", 
"10_000")) +CYCLES = int(os.environ.get("CYCLES", "100")) WARMUP = int(os.environ.get("WARMUP", "50")) @@ -36,16 +37,27 @@ def main() -> None: model.forward(pre_processed, use_cuda_graph=False) baseline_fps = CYCLES / (time.perf_counter() - start) - print("Timing with CUDA graphs...") + print("Timing with forced CUDA graph recapture each step...") + start = time.perf_counter() + for _ in range(100): # not using CYCLES here bc this is wayyyy slower than the non-graph or the replay modes + model._trt_cuda_graph_state = None + model.forward(pre_processed, use_cuda_graph=True) + + cudagraph_recapture_fps = CYCLES / (time.perf_counter() - start) + + print("Timing with CUDA graph caching and replaying...") + model.forward(pre_processed, use_cuda_graph=True) # initial capture start = time.perf_counter() for _ in range(CYCLES): model.forward(pre_processed, use_cuda_graph=True) - cudagraph_fps = CYCLES / (time.perf_counter() - start) + cudagraph_replay_fps = CYCLES / (time.perf_counter() - start) print(f"\n{'='*50}") print(f"Forward pass FPS (no CUDA graphs): {baseline_fps:.1f}") - print(f"Forward pass FPS (CUDA graphs): {cudagraph_fps:.1f}") - print(f"Speedup: {cudagraph_fps / baseline_fps:.2f}x") + print(f"Forward pass FPS (CUDA graphs recapture): {cudagraph_recapture_fps:.1f}") + print(f"Speed factor (recapture): {cudagraph_recapture_fps / baseline_fps:.2f}x") + print(f"Forward pass FPS (CUDA graphs replay): {cudagraph_replay_fps:.1f}") + print(f"Speed factor (replay): {cudagraph_replay_fps / baseline_fps:.2f}x") print(f"{'='*50}") From d5b51f91ff3be38fead38631e899ce2dbd423fc2 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Tue, 3 Feb 2026 02:35:49 +0000 Subject: [PATCH 14/50] add LRU (shape, device, dtype) caching for CUDA graphs --- .../inference_models/models/common/trt.py | 121 +++++++++++------- .../rfdetr_instance_segmentation_trt.py | 9 +- .../rfdetr/rfdetr_object_detection_trt.py | 9 +- 3 files changed, 80 insertions(+), 59 deletions(-) diff --git 
a/inference_models/inference_models/models/common/trt.py b/inference_models/inference_models/models/common/trt.py index b836131abe..6683c14cb2 100644 --- a/inference_models/inference_models/models/common/trt.py +++ b/inference_models/inference_models/models/common/trt.py @@ -1,5 +1,6 @@ from typing import List, Tuple, Optional from dataclasses import dataclass +from collections import OrderedDict import torch @@ -35,7 +36,6 @@ class InferenceTRTLogger(trt.ILogger): - def __init__(self, with_memory: bool = False): super().__init__() self._memory: List[Tuple[trt.ILogger.Severity, str]] = [] @@ -66,8 +66,35 @@ class TRTCudaGraphState: input_buffer: torch.Tensor output_buffers: List[torch.Tensor] - def has_changed_shape(self, input_shape: Tuple[int, ...]) -> bool: - return tuple(self.input_buffer.shape) != input_shape + +class TRTCudaGraphLRUCache: + def __init__(self, capacity: int = 64): + self.cache: OrderedDict[ + Tuple[Tuple[int, ...], torch.dtype, torch.device], TRTCudaGraphState + ] = OrderedDict() + self.capacity = capacity + + def __contains__( + self, key: Tuple[Tuple[int, ...], torch.dtype, torch.device] + ) -> bool: + return key in self.cache + + def __getitem__( + self, key: Tuple[Tuple[int, ...], torch.dtype, torch.device] + ) -> TRTCudaGraphState: + value = self.cache[key] + self.cache.move_to_end(key) + return value + + def __setitem__( + self, + key: Tuple[Tuple[int, ...], torch.dtype, torch.device], + value: TRTCudaGraphState, + ): + self.cache[key] = value + self.cache.move_to_end(key) + if len(self.cache) > self.capacity: + self.cache.popitem(last=False) def get_trt_engine_inputs_and_outputs( @@ -243,7 +270,7 @@ def infer_from_trt_engine( min_batch_size=trt_config.static_batch_size, max_batch_size=trt_config.static_batch_size, use_cuda_graph=False, - trt_cuda_graph_state=None, + trt_cuda_graph_cache=None, ) return results results, _ = infer_from_trt_engine_with_batch_size_boundaries( @@ -256,7 +283,7 @@ def infer_from_trt_engine( 
min_batch_size=trt_config.dynamic_batch_size_min, max_batch_size=trt_config.dynamic_batch_size_max, use_cuda_graph=False, - trt_cuda_graph_state=None, + trt_cuda_graph_cache=None, ) return results @@ -269,8 +296,8 @@ def infer_from_trt_engine_with_cudagraph( device: torch.device, input_name: str, outputs: List[str], - trt_cuda_graph_state: Optional[TRTCudaGraphState] = None, -) -> Tuple[List[torch.Tensor], Optional[TRTCudaGraphState]]: + trt_cuda_graph_cache: Optional[TRTCudaGraphLRUCache] = None, +) -> Tuple[List[torch.Tensor], Optional[TRTCudaGraphLRUCache]]: """Run inference using a TensorRT engine with CUDA graph support. Similar to `infer_from_trt_engine`, but captures and replays CUDA graphs for @@ -284,11 +311,11 @@ def infer_from_trt_engine_with_cudagraph( device: PyTorch CUDA device. input_name: Name of the input tensor in the TensorRT engine. outputs: List of output tensor names. - trt_cuda_graph_state: Optional state from a previous call for graph replay. + trt_cuda_graph_cache: Optional state from a previous call for graph replay. Returns: - Tuple of (results, trt_cuda_graph_state) where results is the list of - output tensors and trt_cuda_graph_state can be passed to subsequent calls. + Tuple of (results, trt_cuda_graph_cache) where results is the list of + output tensors and trt_cuda_graph_cache can be passed to subsequent calls. 
""" if trt_config.static_batch_size is not None: return infer_from_trt_engine_with_batch_size_boundaries( @@ -301,7 +328,7 @@ def infer_from_trt_engine_with_cudagraph( min_batch_size=trt_config.static_batch_size, max_batch_size=trt_config.static_batch_size, use_cuda_graph=True, - trt_cuda_graph_state=trt_cuda_graph_state, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) return infer_from_trt_engine_with_batch_size_boundaries( pre_processed_images=pre_processed_images, @@ -313,7 +340,7 @@ def infer_from_trt_engine_with_cudagraph( min_batch_size=trt_config.dynamic_batch_size_min, max_batch_size=trt_config.dynamic_batch_size_max, use_cuda_graph=True, - trt_cuda_graph_state=trt_cuda_graph_state, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) @@ -327,8 +354,8 @@ def infer_from_trt_engine_with_batch_size_boundaries( min_batch_size: int, max_batch_size: int, use_cuda_graph: bool = False, - trt_cuda_graph_state: Optional[TRTCudaGraphState] = None, -) -> Tuple[List[torch.Tensor], Optional[TRTCudaGraphState]]: + trt_cuda_graph_cache: Optional[TRTCudaGraphLRUCache] = None, +) -> Tuple[List[torch.Tensor], Optional[TRTCudaGraphLRUCache]]: if pre_processed_images.shape[0] <= max_batch_size: reminder = min_batch_size - pre_processed_images.shape[0] if reminder > 0: @@ -343,7 +370,7 @@ def infer_from_trt_engine_with_batch_size_boundaries( ), dim=0, ) - results, trt_cuda_graph_state = execute_trt_engine( + results, trt_cuda_graph_cache = execute_trt_engine( pre_processed_images=pre_processed_images, engine=engine, context=context, @@ -351,11 +378,11 @@ def infer_from_trt_engine_with_batch_size_boundaries( input_name=input_name, outputs=outputs, use_cuda_graph=use_cuda_graph, - trt_cuda_graph_state=trt_cuda_graph_state, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) if reminder > 0: results = [r[:-reminder] for r in results] - return results, trt_cuda_graph_state + return results, trt_cuda_graph_cache all_results = [] for _ in outputs: all_results.append([]) @@ -374,7 +401,7 @@ def 
infer_from_trt_engine_with_batch_size_boundaries( ), dim=0, ) - results, trt_cuda_graph_state = execute_trt_engine( + results, trt_cuda_graph_cache = execute_trt_engine( pre_processed_images=batch, engine=engine, context=context, @@ -382,13 +409,13 @@ def infer_from_trt_engine_with_batch_size_boundaries( input_name=input_name, outputs=outputs, use_cuda_graph=use_cuda_graph, - trt_cuda_graph_state=trt_cuda_graph_state, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) if reminder > 0: results = [r[:-reminder] for r in results] for partial_result, all_result_element in zip(results, all_results): all_result_element.append(partial_result) - return [torch.cat(e, dim=0).contiguous() for e in all_results], trt_cuda_graph_state + return [torch.cat(e, dim=0).contiguous() for e in all_results], trt_cuda_graph_cache def execute_trt_engine( @@ -399,17 +426,20 @@ def execute_trt_engine( input_name: str, outputs: List[str], use_cuda_graph: bool = False, - trt_cuda_graph_state: Optional[TRTCudaGraphState] = None, -) -> Tuple[List[torch.Tensor], Optional[TRTCudaGraphState]]: + trt_cuda_graph_cache: Optional[TRTCudaGraphLRUCache] = None, +) -> Tuple[List[torch.Tensor], Optional[TRTCudaGraphLRUCache]]: + if use_cuda_graph: + if trt_cuda_graph_cache is None: + trt_cuda_graph_cache = TRTCudaGraphLRUCache(capacity=64) - if trt_cuda_graph_state is not None: input_shape = tuple(pre_processed_images.shape) - if trt_cuda_graph_state.has_changed_shape(input_shape): - LOGGER.warning( - f"Input shape changed from {tuple(trt_cuda_graph_state.input_buffer.shape)} " - f"to {input_shape}. Recapturing CUDA graph." 
- ) - return _capture_cuda_graph( + input_dtype = pre_processed_images.dtype + cache_key = (input_shape, input_dtype, device) + + if cache_key not in trt_cuda_graph_cache: + LOGGER.debug(f"Capturing CUDA graph for shape {input_shape}") + + results, trt_cuda_graph = _capture_cuda_graph( pre_processed_images=pre_processed_images, engine=engine, context=context, @@ -417,25 +447,18 @@ def execute_trt_engine( input_name=input_name, outputs=outputs, ) + trt_cuda_graph_cache[cache_key] = trt_cuda_graph + return results, trt_cuda_graph_cache - stream = trt_cuda_graph_state.cuda_stream - with torch.cuda.stream(stream): - trt_cuda_graph_state.input_buffer.copy_(pre_processed_images) - trt_cuda_graph_state.cuda_graph.replay() - results = [buf.clone() for buf in trt_cuda_graph_state.output_buffers] - stream.synchronize() - - return results, trt_cuda_graph_state - - elif use_cuda_graph: - return _capture_cuda_graph( - pre_processed_images=pre_processed_images, - engine=engine, - context=context, - device=device, - input_name=input_name, - outputs=outputs, - ) + else: + trt_cuda_graph_state = trt_cuda_graph_cache[cache_key] + stream = trt_cuda_graph_state.cuda_stream + with torch.cuda.stream(stream): + trt_cuda_graph_state.input_buffer.copy_(pre_processed_images) + trt_cuda_graph_state.cuda_graph.replay() + results = [buf.clone() for buf in trt_cuda_graph_state.output_buffers] + stream.synchronize() + return results, trt_cuda_graph_cache else: batch_size = pre_processed_images.shape[0] @@ -533,14 +556,14 @@ def _capture_cuda_graph( results = [buf.clone() for buf in output_buffers] stream.synchronize() - trt_cuda_graph_state = TRTCudaGraphState( + trt_cuda_graph_cache = TRTCudaGraphState( cuda_graph=cuda_graph, cuda_stream=stream, input_buffer=input_buffer, output_buffers=output_buffers, ) - return results, trt_cuda_graph_state + return results, trt_cuda_graph_cache def trt_dtype_to_torch(trt_dtype): diff --git 
a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py index 30d3533199..550dfa3b55 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py @@ -34,7 +34,7 @@ infer_from_trt_engine, infer_from_trt_engine_with_cudagraph, load_trt_model, - TRTCudaGraphState, + TRTCudaGraphLRUCache, ) from inference_models.models.rfdetr.class_remapping import ( ClassesReMapping, @@ -73,7 +73,6 @@ class RFDetrForInstanceSegmentationTRT( Tuple[torch.Tensor, torch.Tensor, torch.Tensor], ] ): - @classmethod def from_pretrained( cls, @@ -173,7 +172,7 @@ def __init__( self._cuda_context = cuda_context self._execution_context = execution_context self._trt_config = trt_config - self._trt_cuda_graph_state = None + self._trt_cuda_graph_cache: Optional[TRTCudaGraphLRUCache] = None self._lock = threading.Lock() @property @@ -202,7 +201,7 @@ def forward( with self._lock: with use_cuda_context(context=self._cuda_context): if use_cuda_graph: - (detections, labels, masks), self._trt_cuda_graph_state = ( + (detections, labels, masks), self._trt_cuda_graph_cache = ( infer_from_trt_engine_with_cudagraph( pre_processed_images=pre_processed_images, trt_config=self._trt_config, @@ -211,7 +210,7 @@ def forward( device=self._device, input_name=self._input_name, outputs=self._output_names, - trt_cuda_graph_state=self._trt_cuda_graph_state, + trt_cuda_graph_cache=self._trt_cuda_graph_cache, ) ) else: diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py index c5db6bbcf4..0e12ea5d33 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py +++ 
b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py @@ -37,7 +37,7 @@ infer_from_trt_engine, infer_from_trt_engine_with_cudagraph, load_trt_model, - TRTCudaGraphState, + TRTCudaGraphLRUCache, ) from inference_models.models.rfdetr.class_remapping import ( ClassesReMapping, @@ -73,7 +73,6 @@ class RFDetrForObjectDetectionTRT( ] ) ): - @classmethod def from_pretrained( cls, @@ -178,7 +177,7 @@ def __init__( self._cuda_context = cuda_context self._execution_context = execution_context self._trt_config = trt_config - self._trt_cuda_graph_state = None + self._trt_cuda_graph_cache: Optional[TRTCudaGraphLRUCache] = None self._lock = threading.Lock() @property @@ -205,7 +204,7 @@ def forward( with self._lock: with use_cuda_context(context=self._cuda_context): if use_cuda_graph: - (detections, labels), self._trt_cuda_graph_state = ( + (detections, labels), self._trt_cuda_graph_cache = ( infer_from_trt_engine_with_cudagraph( pre_processed_images=pre_processed_images, trt_config=self._trt_config, @@ -214,7 +213,7 @@ def forward( device=self._device, input_name=self._input_name, outputs=self._output_names, - trt_cuda_graph_state=self._trt_cuda_graph_state, + trt_cuda_graph_cache=self._trt_cuda_graph_cache, ) ) else: From dbd45f967e4da25a3c60743c826ad65096173d10 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Tue, 3 Feb 2026 03:03:13 +0000 Subject: [PATCH 15/50] add USE_CUDA_GRAPHS_FOR_TRT_BACKEND environment variable which defaults to True and reference in RFDETR TRT classes --- inference_models/inference_models/configuration.py | 5 +++++ .../rfdetr/rfdetr_instance_segmentation_trt.py | 13 +++++++++++-- .../models/rfdetr/rfdetr_object_detection_trt.py | 13 +++++++++++-- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/inference_models/inference_models/configuration.py b/inference_models/inference_models/configuration.py index e2bd370872..a3170b465d 100644 --- a/inference_models/inference_models/configuration.py +++ 
b/inference_models/inference_models/configuration.py @@ -83,3 +83,8 @@ ALLOW_LOCAL_STORAGE_ACCESS_FOR_REFERENCE_DATA = os.getenv( "ALLOW_LOCAL_STORAGE_ACCESS_FOR_REFERENCE_DATA" ) + +USE_CUDA_GRAPHS_FOR_TRT_BACKEND = get_boolean_from_env( + variable_name="USE_CUDA_GRAPHS_FOR_TRT_BACKEND", + default=True, +) diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py index 550dfa3b55..dba576f7b3 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py @@ -5,7 +5,10 @@ import torch from inference_models import InstanceDetections, InstanceSegmentationModel -from inference_models.configuration import DEFAULT_DEVICE +from inference_models.configuration import ( + DEFAULT_DEVICE, + USE_CUDA_GRAPHS_FOR_TRT_BACKEND, +) from inference_models.entities import ColorFormat from inference_models.errors import ( CorruptedModelPackageError, @@ -196,8 +199,14 @@ def pre_process( ) def forward( - self, pre_processed_images: torch.Tensor, use_cuda_graph: bool = False, **kwargs + self, + pre_processed_images: torch.Tensor, + use_cuda_graph: Optional[bool] = None, + **kwargs, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + if use_cuda_graph is None: + use_cuda_graph = USE_CUDA_GRAPHS_FOR_TRT_BACKEND + with self._lock: with use_cuda_context(context=self._cuda_context): if use_cuda_graph: diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py index 0e12ea5d33..4503454e00 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py @@ -5,7 +5,10 @@ import torch from inference_models import Detections, 
ObjectDetectionModel -from inference_models.configuration import DEFAULT_DEVICE +from inference_models.configuration import ( + DEFAULT_DEVICE, + USE_CUDA_GRAPHS_FOR_TRT_BACKEND, +) from inference_models.entities import ColorFormat from inference_models.errors import ( CorruptedModelPackageError, @@ -199,8 +202,14 @@ def pre_process( ) def forward( - self, pre_processed_images: torch.Tensor, use_cuda_graph: bool = False, **kwargs + self, + pre_processed_images: torch.Tensor, + use_cuda_graph: Optional[bool] = None, + **kwargs, ) -> Tuple[torch.Tensor, torch.Tensor]: + if use_cuda_graph is None: + use_cuda_graph = USE_CUDA_GRAPHS_FOR_TRT_BACKEND + with self._lock: with use_cuda_context(context=self._cuda_context): if use_cuda_graph: From 9502b8e1c7d7016c25bac98c37314768b60165c0 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Tue, 3 Feb 2026 03:10:20 +0000 Subject: [PATCH 16/50] fix bug in profiling script --- .../development/profiling/profile_rfdetr_trt_cudagraphs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py index 14044e849c..733d462216 100644 --- a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py +++ b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py @@ -10,7 +10,7 @@ IMAGE_PATH = os.environ.get("IMAGE_PATH", None) DEVICE = os.environ.get("DEVICE", "cuda:0") -CYCLES = int(os.environ.get("CYCLES", "100")) +CYCLES = int(os.environ.get("CYCLES", "10_000")) WARMUP = int(os.environ.get("WARMUP", "50")) @@ -40,10 +40,10 @@ def main() -> None: print("Timing with forced CUDA graph recapture each step...") start = time.perf_counter() for _ in range(100): # not using CYCLES here bc this is wayyyy slower than the non-graph or the replay modes - model._trt_cuda_graph_state = None + model._trt_cuda_graph_cache = None model.forward(pre_processed, 
use_cuda_graph=True) - cudagraph_recapture_fps = CYCLES / (time.perf_counter() - start) + cudagraph_recapture_fps = 100 / (time.perf_counter() - start) print("Timing with CUDA graph caching and replaying...") model.forward(pre_processed, use_cuda_graph=True) # initial capture From cb705381ee15b31bc889f9d158293e102f63b9ba Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Tue, 10 Feb 2026 00:03:12 +0000 Subject: [PATCH 17/50] use yolov8 with dynamic batch size to test shape caching for CUDA graphs --- .../inference_models/models/common/trt.py | 49 ++++++------ .../yolov8/yolov8_object_detection_trt.py | 51 +++++++++--- .../integration_tests/models/conftest.py | 9 +++ .../models/test_rfdetr_predictions_trt.py | 2 +- ...yolov8_object_detection_predictions_trt.py | 77 +++++++++++++++++++ 5 files changed, 150 insertions(+), 38 deletions(-) create mode 100644 inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py diff --git a/inference_models/inference_models/models/common/trt.py b/inference_models/inference_models/models/common/trt.py index 6683c14cb2..d7dc7e2155 100644 --- a/inference_models/inference_models/models/common/trt.py +++ b/inference_models/inference_models/models/common/trt.py @@ -461,18 +461,6 @@ def execute_trt_engine( return results, trt_cuda_graph_cache else: - batch_size = pre_processed_images.shape[0] - results = [] - for output in outputs: - output_tensor_shape = engine.get_tensor_shape(output) - output_tensor_type = trt_dtype_to_torch(engine.get_tensor_dtype(output)) - result = torch.empty( - (batch_size,) + output_tensor_shape[1:], - dtype=output_tensor_type, - device=device, - ) - context.set_tensor_address(output, result.data_ptr()) - results.append(result) status = context.set_input_shape(input_name, tuple(pre_processed_images.shape)) if not status: raise ModelRuntimeError( @@ -485,6 +473,17 @@ def execute_trt_engine( message="Failed to set input tensor data pointer during forward pass from the model.", 
help_url="https://todo", ) + results = [] + for output in outputs: + output_tensor_shape = context.get_tensor_shape(output) + output_tensor_type = trt_dtype_to_torch(engine.get_tensor_dtype(output)) + result = torch.empty( + tuple(output_tensor_shape), + dtype=output_tensor_type, + device=device, + ) + context.set_tensor_address(output, result.data_ptr()) + results.append(result) stream = torch.cuda.Stream(device=device) status = context.execute_async_v3(stream_handle=stream.cuda_stream) if not status: @@ -504,23 +503,9 @@ def _capture_cuda_graph( input_name: str, outputs: List[str], ) -> Tuple[List[torch.Tensor], TRTCudaGraphState]: - batch_size = pre_processed_images.shape[0] - input_buffer = torch.empty_like(pre_processed_images, device=device) input_buffer.copy_(pre_processed_images) - output_buffers = [] - for output in outputs: - output_tensor_shape = engine.get_tensor_shape(output) - output_tensor_type = trt_dtype_to_torch(engine.get_tensor_dtype(output)) - output_buffer = torch.empty( - (batch_size,) + output_tensor_shape[1:], - dtype=output_tensor_type, - device=device, - ) - context.set_tensor_address(output, output_buffer.data_ptr()) - output_buffers.append(output_buffer) - status = context.set_input_shape(input_name, tuple(pre_processed_images.shape)) if not status: raise ModelRuntimeError( @@ -534,6 +519,18 @@ def _capture_cuda_graph( help_url="https://todo", ) + output_buffers = [] + for output in outputs: + output_tensor_shape = context.get_tensor_shape(output) + output_tensor_type = trt_dtype_to_torch(engine.get_tensor_dtype(output)) + output_buffer = torch.empty( + tuple(output_tensor_shape), + dtype=output_tensor_type, + device=device, + ) + context.set_tensor_address(output, output_buffer.data_ptr()) + output_buffers.append(output_buffer) + stream = torch.cuda.Stream(device=device) with torch.cuda.stream(stream): status = context.execute_async_v3(stream_handle=stream.cuda_stream) diff --git 
a/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py b/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py index aef07c3fad..f76d922bae 100644 --- a/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py +++ b/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py @@ -5,7 +5,10 @@ import torch from inference_models import Detections, ObjectDetectionModel -from inference_models.configuration import DEFAULT_DEVICE +from inference_models.configuration import ( + DEFAULT_DEVICE, + USE_CUDA_GRAPHS_FOR_TRT_BACKEND, +) from inference_models.entities import ColorFormat from inference_models.errors import ( CorruptedModelPackageError, @@ -35,8 +38,10 @@ pre_process_network_input, ) from inference_models.models.common.trt import ( + TRTCudaGraphLRUCache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, + infer_from_trt_engine_with_cudagraph, load_trt_model, ) @@ -160,6 +165,7 @@ def __init__( self._device = device self._cuda_context = cuda_context self._execution_context = execution_context + self._trt_cuda_graph_cache: Optional[TRTCudaGraphLRUCache] = None self._lock = threading.Lock() @property @@ -180,18 +186,41 @@ def pre_process( input_color_format=input_color_format, ) - def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: + def forward( + self, + pre_processed_images: torch.Tensor, + use_cuda_graph: Optional[bool] = None, + **kwargs, + ) -> torch.Tensor: + if use_cuda_graph is None: + use_cuda_graph = USE_CUDA_GRAPHS_FOR_TRT_BACKEND + with self._lock: with use_cuda_context(context=self._cuda_context): - return infer_from_trt_engine( - pre_processed_images=pre_processed_images, - trt_config=self._trt_config, - engine=self._engine, - context=self._execution_context, - device=self._device, - input_name=self._input_name, - outputs=self._output_names, - )[0] + if use_cuda_graph: + results, self._trt_cuda_graph_cache = ( + 
infer_from_trt_engine_with_cudagraph( + pre_processed_images=pre_processed_images, + trt_config=self._trt_config, + engine=self._engine, + context=self._execution_context, + device=self._device, + input_name=self._input_name, + outputs=self._output_names, + trt_cuda_graph_cache=self._trt_cuda_graph_cache, + ) + ) + return results[0] + else: + return infer_from_trt_engine( + pre_processed_images=pre_processed_images, + trt_config=self._trt_config, + engine=self._engine, + context=self._execution_context, + device=self._device, + input_name=self._input_name, + outputs=self._output_names, + )[0] def post_process( self, diff --git a/inference_models/tests/integration_tests/models/conftest.py b/inference_models/tests/integration_tests/models/conftest.py index 257b506c93..86fe0533df 100644 --- a/inference_models/tests/integration_tests/models/conftest.py +++ b/inference_models/tests/integration_tests/models/conftest.py @@ -184,6 +184,7 @@ ) RFDETR_NANO_T4_TRT_PACKAGE_URL = "https://storage.googleapis.com/roboflow-tests-assets/rf-platform-models/rfdetr-nano-t4-trt.zip" +YOLOV8N_640_T4_TRT_PACKAGE_URL = "https://storage.googleapis.com/roboflow-tests-assets/rf-platform-models/yolov8n-640-t4-trt.zip" @pytest.fixture(scope="module") @@ -417,6 +418,14 @@ def rfdetr_nano_t4_trt_package() -> str: ) +@pytest.fixture(scope="module") +def yolov8n_640_t4_trt_package() -> str: + return download_model_package( + model_package_zip_url=YOLOV8N_640_T4_TRT_PACKAGE_URL, + package_name="yolov8n-640-t4-trt", + ) + + @pytest.fixture(scope="module") def og_rfdetr_base_weights() -> str: package_path = os.path.join(MODELS_DIR, "og-rfdetr-base") diff --git a/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py b/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py index e50d6cd030..4768fc9043 100644 --- a/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py +++ 
b/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py @@ -57,7 +57,7 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( @pytest.mark.slow @pytest.mark.trt_extras -def test_trt_outputs_shapes( +def test_trt_outputs_match_expected_shapes( rfdetr_nano_t4_trt_package: str, dog_image_numpy: np.ndarray, ) -> None: diff --git a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py new file mode 100644 index 0000000000..6031df5c6e --- /dev/null +++ b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py @@ -0,0 +1,77 @@ +import numpy as np +import pytest +import torch + + +@pytest.mark.slow +@pytest.mark.trt_extras +def test_trt_cudagraph_cache_reuses_previously_seen_input_shapes( + yolov8n_640_t4_trt_package: str, + dog_image_numpy: np.ndarray, +) -> None: + from inference_models import AutoModel + + device = torch.device("cuda:0") + model = AutoModel.from_pretrained( + model_id_or_path=yolov8n_640_t4_trt_package, + device=device, + ) + + pre_processed_single, _ = model.pre_process(dog_image_numpy) + model._trt_cuda_graph_cache = None + + seen_shapes = set() + capture_outputs = {} + test_sequence = [1, 2, 1, 4, 2, 1, 4, 3, 3] + + for batch_size in test_sequence: + batch = pre_processed_single.repeat(batch_size, 1, 1, 1) + cache_key = (tuple(batch.shape), batch.dtype, device) + + cache_before = model._trt_cuda_graph_cache + cache_size_before = len(cache_before.cache) if cache_before is not None else 0 + + output = model.forward(batch, use_cuda_graph=True) + + cache_after = model._trt_cuda_graph_cache + assert cache_after is not None + cache_size_after = len(cache_after.cache) + + if cache_key not in seen_shapes: + assert cache_size_after == cache_size_before + 1 + seen_shapes.add(cache_key) + capture_outputs[cache_key] = output.clone() + continue + + 
assert cache_size_after == cache_size_before + assert torch.allclose(capture_outputs[cache_key], output, atol=1e-3) + + assert set(model._trt_cuda_graph_cache.cache.keys()) == seen_shapes + + +@pytest.mark.slow +@pytest.mark.trt_extras +def test_trt_cudagraph_output_matches_non_cudagraph_output( + yolov8n_640_t4_trt_package: str, + dog_image_numpy: np.ndarray, +) -> None: + from inference_models import AutoModel + + device = torch.device("cuda:0") + model = AutoModel.from_pretrained( + model_id_or_path=yolov8n_640_t4_trt_package, + device=device, + ) + pre_processed_single, _ = model.pre_process(dog_image_numpy) + + for batch_size in [1, 4]: + batch = pre_processed_single.repeat(batch_size, 1, 1, 1) + + no_graph = model.forward(batch, use_cuda_graph=False) + + model._trt_cuda_graph_cache = None + capture_graph = model.forward(batch, use_cuda_graph=True) + replay_graph = model.forward(batch, use_cuda_graph=True) + + assert torch.allclose(no_graph, capture_graph, atol=1e-3) + assert torch.allclose(no_graph, replay_graph, atol=1e-3) From 6b1d430a1b7da1419ab6ec146e589a37fee8838c Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Tue, 10 Feb 2026 00:27:41 +0000 Subject: [PATCH 18/50] add instance seg tests --- .../integration_tests/models/conftest.py | 9 ++++ .../models/test_rfdetr_seg_predictions_trt.py | 52 +++++++++++++++++++ 2 files changed, 61 insertions(+) create mode 100644 inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py diff --git a/inference_models/tests/integration_tests/models/conftest.py b/inference_models/tests/integration_tests/models/conftest.py index 86fe0533df..452840ca1b 100644 --- a/inference_models/tests/integration_tests/models/conftest.py +++ b/inference_models/tests/integration_tests/models/conftest.py @@ -184,6 +184,7 @@ ) RFDETR_NANO_T4_TRT_PACKAGE_URL = "https://storage.googleapis.com/roboflow-tests-assets/rf-platform-models/rfdetr-nano-t4-trt.zip" +RFDETR_SEG_NANO_T4_TRT_PACKAGE_URL = 
"https://storage.googleapis.com/roboflow-tests-assets/rf-platform-models/rfdetr-nano-seg-t4-trt.zip" YOLOV8N_640_T4_TRT_PACKAGE_URL = "https://storage.googleapis.com/roboflow-tests-assets/rf-platform-models/yolov8n-640-t4-trt.zip" @@ -418,6 +419,14 @@ def rfdetr_nano_t4_trt_package() -> str: ) +@pytest.fixture(scope="module") +def rfdetr_seg_nano_t4_trt_package() -> str: + return download_model_package( + model_package_zip_url=RFDETR_SEG_NANO_T4_TRT_PACKAGE_URL, + package_name="rfdetr-seg-nano-t4-trt", + ) + + @pytest.fixture(scope="module") def yolov8n_640_t4_trt_package() -> str: return download_model_package( diff --git a/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py b/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py new file mode 100644 index 0000000000..c5591aab9e --- /dev/null +++ b/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py @@ -0,0 +1,52 @@ +import numpy as np +import pytest +import torch + + +@pytest.mark.slow +@pytest.mark.trt_extras +def test_trt_cudagraph_output_matches_non_cudagraph_output( + rfdetr_seg_nano_t4_trt_package: str, + snake_image_numpy: np.ndarray, + dog_image_numpy: np.ndarray, +) -> None: + from inference_models import AutoModel + + model = AutoModel.from_pretrained( + model_id_or_path=rfdetr_seg_nano_t4_trt_package, + device=torch.device("cuda:0"), + ) + + pre_processed_1, _ = model.pre_process(snake_image_numpy) + pre_processed_2, _ = model.pre_process(dog_image_numpy) + + outputs = [] + for pre_processed in [pre_processed_1, pre_processed_2]: + no_graph = model.forward(pre_processed, use_cuda_graph=False) + model._trt_cuda_graph_cache = None + capture_graph = model.forward(pre_processed, use_cuda_graph=True) + replay_graph = model.forward(pre_processed, use_cuda_graph=True) + + outputs.append((no_graph, capture_graph, replay_graph)) + + for image_outputs in outputs: + no_graph, capture_graph, replay_graph = image_outputs + for 
result_idx in range(3): + assert torch.allclose( + no_graph[result_idx], + capture_graph[result_idx], + atol=1e-6, + ) + assert torch.allclose( + no_graph[result_idx], + replay_graph[result_idx], + atol=1e-6, + ) + + for execution_branch_idx in range(3): + for result_idx in range(3): + assert not torch.allclose( + outputs[0][execution_branch_idx][result_idx], + outputs[1][execution_branch_idx][result_idx], + atol=1e-6, + ) From 7c233004c69e1fb03517c3413a992480a4da19c1 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Tue, 10 Feb 2026 00:32:13 +0000 Subject: [PATCH 19/50] update conftest --- inference_models/tests/integration_tests/models/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inference_models/tests/integration_tests/models/conftest.py b/inference_models/tests/integration_tests/models/conftest.py index 452840ca1b..4798ee7830 100644 --- a/inference_models/tests/integration_tests/models/conftest.py +++ b/inference_models/tests/integration_tests/models/conftest.py @@ -184,7 +184,7 @@ ) RFDETR_NANO_T4_TRT_PACKAGE_URL = "https://storage.googleapis.com/roboflow-tests-assets/rf-platform-models/rfdetr-nano-t4-trt.zip" -RFDETR_SEG_NANO_T4_TRT_PACKAGE_URL = "https://storage.googleapis.com/roboflow-tests-assets/rf-platform-models/rfdetr-nano-seg-t4-trt.zip" +RFDETR_SEG_NANO_T4_TRT_PACKAGE_URL = "https://storage.googleapis.com/roboflow-tests-assets/rf-platform-models/rfdetr-seg-nano-t4-trt.zip" YOLOV8N_640_T4_TRT_PACKAGE_URL = "https://storage.googleapis.com/roboflow-tests-assets/rf-platform-models/yolov8n-640-t4-trt.zip" From a27c80c0092cd1449a82da03674c1d36abdb8693 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Tue, 10 Feb 2026 01:07:00 +0000 Subject: [PATCH 20/50] add batch-size-cycling profiling for TRT cudagraphs with yolov8 --- .../profile_yolov8_trt_cudagraphs.py | 92 +++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 inference_models/development/profiling/profile_yolov8_trt_cudagraphs.py diff --git 
a/inference_models/development/profiling/profile_yolov8_trt_cudagraphs.py b/inference_models/development/profiling/profile_yolov8_trt_cudagraphs.py new file mode 100644 index 0000000000..9506b6b1ed --- /dev/null +++ b/inference_models/development/profiling/profile_yolov8_trt_cudagraphs.py @@ -0,0 +1,92 @@ +import os +import time + +import numpy as np +import torch +from tqdm import tqdm + +from inference_models import AutoModel + +DEVICE = os.environ.get("DEVICE", "cuda:0") +CYCLES = int(os.environ.get("CYCLES", "10_000")) +WARMUP = int(os.environ.get("WARMUP", "50")) +RECAPTURE_CYCLES = int(os.environ.get("RECAPTURE_CYCLES", "100")) + +BATCH_SIZES = [1, 2, 3] + + +def main() -> None: + + model = AutoModel.from_pretrained( + model_id_or_path="yolov8n-640", + device=torch.device(DEVICE), + backend="trt", + batch_size=(1, max(BATCH_SIZES)), + ) + + image = (np.random.rand(224, 224, 3) * 255).astype(np.uint8) + pre_processed_single, _ = model.pre_process(image) + + batches = { + bs: pre_processed_single.repeat(bs, 1, 1, 1) for bs in BATCH_SIZES + } + + # ── Warmup ────────────────────────────────────────────────────────── + for _ in range(WARMUP): + for batch in batches.values(): + model.forward(batch, use_cuda_graph=False) + model.forward(batch, use_cuda_graph=True) + + bs_label = "/".join(str(bs) for bs in BATCH_SIZES) + + # ── (1) Cycling batch sizes, no CUDA graphs ───────────────────────── + print(f"Timing without CUDA graphs, cycling bs={bs_label}...") + torch.cuda.synchronize() + start = time.perf_counter() + for i in range(CYCLES): + batch = batches[BATCH_SIZES[i % len(BATCH_SIZES)]] + model.forward(batch, use_cuda_graph=False) + torch.cuda.synchronize() + baseline_fps = CYCLES / (time.perf_counter() - start) + + # ── (2) Cycling batch sizes, CUDA graphs with forced recapture ────── + print( + f"Timing with CUDA graph recapture every iteration, cycling bs={bs_label} " + f"({RECAPTURE_CYCLES} iters)..." 
+ ) + torch.cuda.synchronize() + start = time.perf_counter() + for i in range(RECAPTURE_CYCLES): + model._trt_cuda_graph_cache = None + batch = batches[BATCH_SIZES[i % len(BATCH_SIZES)]] + model.forward(batch, use_cuda_graph=True) + torch.cuda.synchronize() + recapture_fps = RECAPTURE_CYCLES / (time.perf_counter() - start) + + # ── (3) Cycling batch sizes, CUDA graphs with normal caching ──────── + model._trt_cuda_graph_cache = None + for batch in batches.values(): + model.forward(batch, use_cuda_graph=True) + + print(f"Timing with CUDA graph cache replay, cycling bs={bs_label}...") + torch.cuda.synchronize() + start = time.perf_counter() + for i in range(CYCLES): + batch = batches[BATCH_SIZES[i % len(BATCH_SIZES)]] + model.forward(batch, use_cuda_graph=True) + torch.cuda.synchronize() + replay_fps = CYCLES / (time.perf_counter() - start) + + # ── Results ───────────────────────────────────────────────────────── + print(f"\n{'='*60}") + print(f" yolov8n-640 TRT — cycling batch sizes {BATCH_SIZES}") + print(f" {CYCLES} iterations (recapture: {RECAPTURE_CYCLES})") + print(f"{'='*60}") + print(f" No CUDA graphs: {baseline_fps:>8.1f} fwd/s") + print(f" CUDA graph recapture: {recapture_fps:>8.1f} fwd/s ({recapture_fps / baseline_fps:.2f}x)") + print(f" CUDA graph replay: {replay_fps:>8.1f} fwd/s ({replay_fps / baseline_fps:.2f}x)") + print(f"{'='*60}") + + +if __name__ == "__main__": + main() From 212b2d64231b2f4f01aa883077097037fa9afaf6 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Tue, 10 Feb 2026 01:20:41 +0000 Subject: [PATCH 21/50] fix failing test --- .../inference_models/models/common/trt.py | 31 ++++++++++--------- .../rfdetr_instance_segmentation_trt.py | 1 - .../rfdetr/rfdetr_object_detection_trt.py | 1 - .../yolov8/yolov8_object_detection_trt.py | 1 - 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/inference_models/inference_models/models/common/trt.py b/inference_models/inference_models/models/common/trt.py index d7dc7e2155..8d4f1bf260 
100644 --- a/inference_models/inference_models/models/common/trt.py +++ b/inference_models/inference_models/models/common/trt.py @@ -65,6 +65,7 @@ class TRTCudaGraphState: cuda_stream: torch.cuda.Stream input_buffer: torch.Tensor output_buffers: List[torch.Tensor] + execution_context: trt.IExecutionContext class TRTCudaGraphLRUCache: @@ -292,7 +293,6 @@ def infer_from_trt_engine_with_cudagraph( pre_processed_images: torch.Tensor, trt_config: TRTConfig, engine: trt.ICudaEngine, - context: trt.IExecutionContext, device: torch.device, input_name: str, outputs: List[str], @@ -307,7 +307,6 @@ def infer_from_trt_engine_with_cudagraph( pre_processed_images: Preprocessed input tensor on CUDA device. trt_config: TensorRT configuration object. engine: TensorRT CUDA engine (ICudaEngine). - context: TensorRT execution context (IExecutionContext). device: PyTorch CUDA device. input_name: Name of the input tensor in the TensorRT engine. outputs: List of output tensor names. @@ -321,7 +320,7 @@ def infer_from_trt_engine_with_cudagraph( return infer_from_trt_engine_with_batch_size_boundaries( pre_processed_images=pre_processed_images, engine=engine, - context=context, + context=None, # the graph cache has its own contexts device=device, input_name=input_name, outputs=outputs, @@ -333,7 +332,7 @@ def infer_from_trt_engine_with_cudagraph( return infer_from_trt_engine_with_batch_size_boundaries( pre_processed_images=pre_processed_images, engine=engine, - context=context, + context=None, # the graph cache has its own contexts device=device, input_name=input_name, outputs=outputs, @@ -442,7 +441,6 @@ def execute_trt_engine( results, trt_cuda_graph = _capture_cuda_graph( pre_processed_images=pre_processed_images, engine=engine, - context=context, device=device, input_name=input_name, outputs=outputs, @@ -498,21 +496,25 @@ def execute_trt_engine( def _capture_cuda_graph( pre_processed_images: torch.Tensor, engine: trt.ICudaEngine, - context: trt.IExecutionContext, device: torch.device, 
input_name: str, outputs: List[str], ) -> Tuple[List[torch.Tensor], TRTCudaGraphState]: + # Each CUDA graph needs its own execution context. Sharing a single context + # across graphs for different input shapes causes TRT to reallocate internal + # workspace buffers, invalidating GPU addresses baked into earlier graphs. + graph_context = engine.create_execution_context() + input_buffer = torch.empty_like(pre_processed_images, device=device) input_buffer.copy_(pre_processed_images) - status = context.set_input_shape(input_name, tuple(pre_processed_images.shape)) + status = graph_context.set_input_shape(input_name, tuple(pre_processed_images.shape)) if not status: raise ModelRuntimeError( message="Failed to set TRT model input shape during CUDA graph capture.", help_url="https://todo", ) - status = context.set_tensor_address(input_name, input_buffer.data_ptr()) + status = graph_context.set_tensor_address(input_name, input_buffer.data_ptr()) if not status: raise ModelRuntimeError( message="Failed to set input tensor data pointer during CUDA graph capture.", @@ -521,19 +523,19 @@ def _capture_cuda_graph( output_buffers = [] for output in outputs: - output_tensor_shape = context.get_tensor_shape(output) + output_tensor_shape = graph_context.get_tensor_shape(output) output_tensor_type = trt_dtype_to_torch(engine.get_tensor_dtype(output)) output_buffer = torch.empty( tuple(output_tensor_shape), dtype=output_tensor_type, device=device, ) - context.set_tensor_address(output, output_buffer.data_ptr()) + graph_context.set_tensor_address(output, output_buffer.data_ptr()) output_buffers.append(output_buffer) stream = torch.cuda.Stream(device=device) with torch.cuda.stream(stream): - status = context.execute_async_v3(stream_handle=stream.cuda_stream) + status = graph_context.execute_async_v3(stream_handle=stream.cuda_stream) if not status: raise ModelRuntimeError( message="Failed to execute TRT model warmup before CUDA graph capture.", @@ -543,7 +545,7 @@ def 
_capture_cuda_graph( cuda_graph = torch.cuda.CUDAGraph() with torch.cuda.graph(cuda_graph, stream=stream): - status = context.execute_async_v3(stream_handle=stream.cuda_stream) + status = graph_context.execute_async_v3(stream_handle=stream.cuda_stream) if not status: raise ModelRuntimeError( message="Failed to capture CUDA graph from TRT model execution.", @@ -553,14 +555,15 @@ def _capture_cuda_graph( results = [buf.clone() for buf in output_buffers] stream.synchronize() - trt_cuda_graph_cache = TRTCudaGraphState( + trt_cuda_graph_state = TRTCudaGraphState( cuda_graph=cuda_graph, cuda_stream=stream, input_buffer=input_buffer, output_buffers=output_buffers, + execution_context=graph_context, ) - return results, trt_cuda_graph_cache + return results, trt_cuda_graph_state def trt_dtype_to_torch(trt_dtype): diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py index dba576f7b3..745e2c5cd9 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py @@ -215,7 +215,6 @@ def forward( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, - context=self._execution_context, device=self._device, input_name=self._input_name, outputs=self._output_names, diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py index 4503454e00..d6ac66e84e 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py @@ -218,7 +218,6 @@ def forward( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, - context=self._execution_context, 
device=self._device, input_name=self._input_name, outputs=self._output_names, diff --git a/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py b/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py index f76d922bae..ae0cda31fa 100644 --- a/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py +++ b/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py @@ -203,7 +203,6 @@ def forward( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, - context=self._execution_context, device=self._device, input_name=self._input_name, outputs=self._output_names, From 4204f4f5c7e33399636f9ad72d9461a55516b42c Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Wed, 11 Feb 2026 00:13:08 +0000 Subject: [PATCH 22/50] first stab at responding to Pawel's feedback --- .../inference_models/models/common/trt.py | 159 +++++++----------- .../rfdetr_instance_segmentation_trt.py | 43 ++--- .../rfdetr/rfdetr_object_detection_trt.py | 43 ++--- .../yolov8/yolov8_object_detection_trt.py | 42 ++--- 4 files changed, 111 insertions(+), 176 deletions(-) diff --git a/inference_models/inference_models/models/common/trt.py b/inference_models/inference_models/models/common/trt.py index 8d4f1bf260..1e6ca5c7d8 100644 --- a/inference_models/inference_models/models/common/trt.py +++ b/inference_models/inference_models/models/common/trt.py @@ -164,16 +164,24 @@ def infer_from_trt_engine( pre_processed_images: torch.Tensor, trt_config: TRTConfig, engine: trt.ICudaEngine, - context: trt.IExecutionContext, device: torch.device, input_name: str, outputs: List[str], + context: Optional[trt.IExecutionContext] = None, + trt_cuda_graph_cache: Optional[TRTCudaGraphLRUCache] = None, ) -> List[torch.Tensor]: - """Run inference using a TensorRT engine. + """Run inference using a TensorRT engine, optionally with CUDA graph acceleration. 
+ + Executes inference on preprocessed images using a TensorRT engine. Handles both + static and dynamic batch sizes, automatically splitting large batches if needed. - Executes inference on preprocessed images using a TensorRT engine and execution - context. Handles both static and dynamic batch sizes, automatically splitting - large batches if needed. + When ``trt_cuda_graph_cache`` is provided, CUDA graphs are captured and replayed + for improved performance on repeated inference with the same input shape. Each + graph is keyed by (shape, dtype, device) and stored in the cache. The cache + itself must be created by the caller (typically in the model class). + + When ``trt_cuda_graph_cache`` is ``None``, inference runs through the standard + TRT execution path using the provided ``context``. Args: pre_processed_images: Preprocessed input tensor on CUDA device. @@ -185,6 +193,8 @@ def infer_from_trt_engine( engine: TensorRT CUDA engine (ICudaEngine) to use for inference. context: TensorRT execution context (IExecutionContext) for running inference. + Required when ``trt_cuda_graph_cache`` is ``None``. Ignored when using + CUDA graphs (each cached graph owns its own execution context). device: PyTorch CUDA device to use for inference. @@ -192,12 +202,15 @@ def infer_from_trt_engine( outputs: List of output tensor names to retrieve from the engine. + trt_cuda_graph_cache: Optional CUDA graph cache. When provided, CUDA graphs + are used for inference. When ``None``, standard TRT execution is used. + Returns: List of output tensors from the TensorRT engine, in the order specified by the outputs parameter. Examples: - Run TensorRT inference: + Run TensorRT inference (standard path): >>> from inference_models.developer_tools import ( ... load_trt_model, @@ -228,7 +241,7 @@ def infer_from_trt_engine( ... context=context, ... device=torch.device("cuda:0"), ... input_name=inputs[0], - ... outputs=outputs + ... outputs=outputs, ... 
) Handle large batches: @@ -243,10 +256,25 @@ def infer_from_trt_engine( ... context=context, ... device=torch.device("cuda:0"), ... input_name=inputs[0], - ... outputs=outputs + ... outputs=outputs, ... ) >>> # Results are automatically concatenated + Run with CUDA graph acceleration: + + >>> from inference_models.models.common.trt import TRTCudaGraphLRUCache + >>> cache = TRTCudaGraphLRUCache(capacity=16) + >>> + >>> results = infer_from_trt_engine( + ... pre_processed_images=images, + ... trt_config=trt_config, + ... engine=engine, + ... device=torch.device("cuda:0"), + ... input_name=inputs[0], + ... outputs=outputs, + ... trt_cuda_graph_cache=cache, + ... ) + Note: - Requires TensorRT and PyCUDA to be installed - Input must be on CUDA device @@ -261,100 +289,35 @@ def infer_from_trt_engine( - `get_trt_engine_inputs_and_outputs()`: Get engine tensor names """ if trt_config.static_batch_size is not None: - results, _ = infer_from_trt_engine_with_batch_size_boundaries( - pre_processed_images=pre_processed_images, - engine=engine, - context=context, - device=device, - input_name=input_name, - outputs=outputs, - min_batch_size=trt_config.static_batch_size, - max_batch_size=trt_config.static_batch_size, - use_cuda_graph=False, - trt_cuda_graph_cache=None, - ) - return results - results, _ = infer_from_trt_engine_with_batch_size_boundaries( + min_batch_size = trt_config.static_batch_size + max_batch_size = trt_config.static_batch_size + else: + min_batch_size = trt_config.dynamic_batch_size_min + max_batch_size = trt_config.dynamic_batch_size_max + return _infer_from_trt_engine_with_batch_size_boundaries( pre_processed_images=pre_processed_images, engine=engine, context=context, device=device, input_name=input_name, outputs=outputs, - min_batch_size=trt_config.dynamic_batch_size_min, - max_batch_size=trt_config.dynamic_batch_size_max, - use_cuda_graph=False, - trt_cuda_graph_cache=None, - ) - return results - - -def infer_from_trt_engine_with_cudagraph( - 
pre_processed_images: torch.Tensor, - trt_config: TRTConfig, - engine: trt.ICudaEngine, - device: torch.device, - input_name: str, - outputs: List[str], - trt_cuda_graph_cache: Optional[TRTCudaGraphLRUCache] = None, -) -> Tuple[List[torch.Tensor], Optional[TRTCudaGraphLRUCache]]: - """Run inference using a TensorRT engine with CUDA graph support. - - Similar to `infer_from_trt_engine`, but captures and replays CUDA graphs for - improved performance on repeated inference with the same input shape. - - Args: - pre_processed_images: Preprocessed input tensor on CUDA device. - trt_config: TensorRT configuration object. - engine: TensorRT CUDA engine (ICudaEngine). - device: PyTorch CUDA device. - input_name: Name of the input tensor in the TensorRT engine. - outputs: List of output tensor names. - trt_cuda_graph_cache: Optional state from a previous call for graph replay. - - Returns: - Tuple of (results, trt_cuda_graph_cache) where results is the list of - output tensors and trt_cuda_graph_cache can be passed to subsequent calls. 
- """ - if trt_config.static_batch_size is not None: - return infer_from_trt_engine_with_batch_size_boundaries( - pre_processed_images=pre_processed_images, - engine=engine, - context=None, # the graph cache has its own contexts - device=device, - input_name=input_name, - outputs=outputs, - min_batch_size=trt_config.static_batch_size, - max_batch_size=trt_config.static_batch_size, - use_cuda_graph=True, - trt_cuda_graph_cache=trt_cuda_graph_cache, - ) - return infer_from_trt_engine_with_batch_size_boundaries( - pre_processed_images=pre_processed_images, - engine=engine, - context=None, # the graph cache has its own contexts - device=device, - input_name=input_name, - outputs=outputs, - min_batch_size=trt_config.dynamic_batch_size_min, - max_batch_size=trt_config.dynamic_batch_size_max, - use_cuda_graph=True, + min_batch_size=min_batch_size, + max_batch_size=max_batch_size, trt_cuda_graph_cache=trt_cuda_graph_cache, ) -def infer_from_trt_engine_with_batch_size_boundaries( +def _infer_from_trt_engine_with_batch_size_boundaries( pre_processed_images: torch.Tensor, engine: trt.ICudaEngine, - context: trt.IExecutionContext, + context: Optional[trt.IExecutionContext], device: torch.device, input_name: str, outputs: List[str], min_batch_size: int, max_batch_size: int, - use_cuda_graph: bool = False, trt_cuda_graph_cache: Optional[TRTCudaGraphLRUCache] = None, -) -> Tuple[List[torch.Tensor], Optional[TRTCudaGraphLRUCache]]: +) -> List[torch.Tensor]: if pre_processed_images.shape[0] <= max_batch_size: reminder = min_batch_size - pre_processed_images.shape[0] if reminder > 0: @@ -369,19 +332,18 @@ def infer_from_trt_engine_with_batch_size_boundaries( ), dim=0, ) - results, trt_cuda_graph_cache = execute_trt_engine( + results = _execute_trt_engine( pre_processed_images=pre_processed_images, engine=engine, context=context, device=device, input_name=input_name, outputs=outputs, - use_cuda_graph=use_cuda_graph, trt_cuda_graph_cache=trt_cuda_graph_cache, ) if reminder > 0: 
results = [r[:-reminder] for r in results] - return results, trt_cuda_graph_cache + return results all_results = [] for _ in outputs: all_results.append([]) @@ -400,37 +362,32 @@ def infer_from_trt_engine_with_batch_size_boundaries( ), dim=0, ) - results, trt_cuda_graph_cache = execute_trt_engine( + results = _execute_trt_engine( pre_processed_images=batch, engine=engine, context=context, device=device, input_name=input_name, outputs=outputs, - use_cuda_graph=use_cuda_graph, trt_cuda_graph_cache=trt_cuda_graph_cache, ) if reminder > 0: results = [r[:-reminder] for r in results] for partial_result, all_result_element in zip(results, all_results): all_result_element.append(partial_result) - return [torch.cat(e, dim=0).contiguous() for e in all_results], trt_cuda_graph_cache + return [torch.cat(e, dim=0).contiguous() for e in all_results] -def execute_trt_engine( +def _execute_trt_engine( pre_processed_images: torch.Tensor, engine: trt.ICudaEngine, - context: trt.IExecutionContext, + context: Optional[trt.IExecutionContext], device: torch.device, input_name: str, outputs: List[str], - use_cuda_graph: bool = False, trt_cuda_graph_cache: Optional[TRTCudaGraphLRUCache] = None, -) -> Tuple[List[torch.Tensor], Optional[TRTCudaGraphLRUCache]]: - if use_cuda_graph: - if trt_cuda_graph_cache is None: - trt_cuda_graph_cache = TRTCudaGraphLRUCache(capacity=64) - +) -> List[torch.Tensor]: + if trt_cuda_graph_cache is not None: input_shape = tuple(pre_processed_images.shape) input_dtype = pre_processed_images.dtype cache_key = (input_shape, input_dtype, device) @@ -446,7 +403,7 @@ def execute_trt_engine( outputs=outputs, ) trt_cuda_graph_cache[cache_key] = trt_cuda_graph - return results, trt_cuda_graph_cache + return results else: trt_cuda_graph_state = trt_cuda_graph_cache[cache_key] @@ -456,7 +413,7 @@ def execute_trt_engine( trt_cuda_graph_state.cuda_graph.replay() results = [buf.clone() for buf in trt_cuda_graph_state.output_buffers] stream.synchronize() - return results, 
trt_cuda_graph_cache + return results else: status = context.set_input_shape(input_name, tuple(pre_processed_images.shape)) @@ -490,7 +447,7 @@ def execute_trt_engine( help_url="https://todo", ) stream.synchronize() - return results, None + return results def _capture_cuda_graph( diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py index 745e2c5cd9..22ffdc9a22 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py @@ -33,11 +33,10 @@ pre_process_network_input, ) from inference_models.models.common.trt import ( + TRTCudaGraphLRUCache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, - infer_from_trt_engine_with_cudagraph, load_trt_model, - TRTCudaGraphLRUCache, ) from inference_models.models.rfdetr.class_remapping import ( ClassesReMapping, @@ -82,6 +81,7 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, + cuda_graph_cache_capacity: int = 64, **kwargs, ) -> "RFDetrForInstanceSegmentationTRT": if device.type != "cuda": @@ -150,6 +150,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, + cuda_graph_cache_capacity=cuda_graph_cache_capacity, ) def __init__( @@ -164,6 +165,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + cuda_graph_cache_capacity: int = 64, ): self._engine = engine self._input_name = input_name @@ -175,7 +177,9 @@ def __init__( self._cuda_context = cuda_context self._execution_context = execution_context self._trt_config = trt_config - self._trt_cuda_graph_cache: Optional[TRTCudaGraphLRUCache] = None + self._trt_cuda_graph_cache = TRTCudaGraphLRUCache( + capacity=cuda_graph_cache_capacity, + ) 
self._lock = threading.Lock() @property @@ -207,30 +211,19 @@ def forward( if use_cuda_graph is None: use_cuda_graph = USE_CUDA_GRAPHS_FOR_TRT_BACKEND + cache = self._trt_cuda_graph_cache if use_cuda_graph else None with self._lock: with use_cuda_context(context=self._cuda_context): - if use_cuda_graph: - (detections, labels, masks), self._trt_cuda_graph_cache = ( - infer_from_trt_engine_with_cudagraph( - pre_processed_images=pre_processed_images, - trt_config=self._trt_config, - engine=self._engine, - device=self._device, - input_name=self._input_name, - outputs=self._output_names, - trt_cuda_graph_cache=self._trt_cuda_graph_cache, - ) - ) - else: - detections, labels, masks = infer_from_trt_engine( - pre_processed_images=pre_processed_images, - trt_config=self._trt_config, - engine=self._engine, - context=self._execution_context, - device=self._device, - input_name=self._input_name, - outputs=self._output_names, - ) + detections, labels, masks = infer_from_trt_engine( + pre_processed_images=pre_processed_images, + trt_config=self._trt_config, + engine=self._engine, + context=self._execution_context if not use_cuda_graph else None, + device=self._device, + input_name=self._input_name, + outputs=self._output_names, + trt_cuda_graph_cache=cache, + ) return detections, labels, masks def post_process( diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py index d6ac66e84e..5b163da87c 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py @@ -36,11 +36,10 @@ pre_process_network_input, ) from inference_models.models.common.trt import ( + TRTCudaGraphLRUCache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, - infer_from_trt_engine_with_cudagraph, load_trt_model, - TRTCudaGraphLRUCache, ) from 
inference_models.models.rfdetr.class_remapping import ( ClassesReMapping, @@ -82,6 +81,7 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, + cuda_graph_cache_capacity: int = 64, **kwargs, ) -> "RFDetrForObjectDetectionTRT": if device.type != "cuda": @@ -155,6 +155,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, + cuda_graph_cache_capacity=cuda_graph_cache_capacity, ) def __init__( @@ -169,6 +170,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + cuda_graph_cache_capacity: int = 64, ): self._engine = engine self._input_name = input_name @@ -180,7 +182,9 @@ def __init__( self._cuda_context = cuda_context self._execution_context = execution_context self._trt_config = trt_config - self._trt_cuda_graph_cache: Optional[TRTCudaGraphLRUCache] = None + self._trt_cuda_graph_cache = TRTCudaGraphLRUCache( + capacity=cuda_graph_cache_capacity, + ) self._lock = threading.Lock() @property @@ -210,30 +214,19 @@ def forward( if use_cuda_graph is None: use_cuda_graph = USE_CUDA_GRAPHS_FOR_TRT_BACKEND + cache = self._trt_cuda_graph_cache if use_cuda_graph else None with self._lock: with use_cuda_context(context=self._cuda_context): - if use_cuda_graph: - (detections, labels), self._trt_cuda_graph_cache = ( - infer_from_trt_engine_with_cudagraph( - pre_processed_images=pre_processed_images, - trt_config=self._trt_config, - engine=self._engine, - device=self._device, - input_name=self._input_name, - outputs=self._output_names, - trt_cuda_graph_cache=self._trt_cuda_graph_cache, - ) - ) - else: - detections, labels = infer_from_trt_engine( - pre_processed_images=pre_processed_images, - trt_config=self._trt_config, - engine=self._engine, - context=self._execution_context, - device=self._device, - input_name=self._input_name, - outputs=self._output_names, - ) + detections, labels = 
infer_from_trt_engine( + pre_processed_images=pre_processed_images, + trt_config=self._trt_config, + engine=self._engine, + context=self._execution_context if not use_cuda_graph else None, + device=self._device, + input_name=self._input_name, + outputs=self._output_names, + trt_cuda_graph_cache=cache, + ) return detections, labels def post_process( diff --git a/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py b/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py index ae0cda31fa..3794d8ee1e 100644 --- a/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py +++ b/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py @@ -41,7 +41,6 @@ TRTCudaGraphLRUCache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, - infer_from_trt_engine_with_cudagraph, load_trt_model, ) @@ -77,6 +76,7 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, + cuda_graph_cache_capacity: int = 64, **kwargs, ) -> "YOLOv8ForObjectDetectionTRT": if device.type != "cuda": @@ -142,6 +142,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, + cuda_graph_cache_capacity=cuda_graph_cache_capacity, ) def __init__( @@ -155,6 +156,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + cuda_graph_cache_capacity: int = 64, ): self._engine = engine self._input_name = input_name @@ -165,7 +167,9 @@ def __init__( self._device = device self._cuda_context = cuda_context self._execution_context = execution_context - self._trt_cuda_graph_cache: Optional[TRTCudaGraphLRUCache] = None + self._trt_cuda_graph_cache = TRTCudaGraphLRUCache( + capacity=cuda_graph_cache_capacity, + ) self._lock = threading.Lock() @property @@ -195,31 +199,19 @@ def forward( if use_cuda_graph is None: use_cuda_graph = USE_CUDA_GRAPHS_FOR_TRT_BACKEND 
+ cache = self._trt_cuda_graph_cache if use_cuda_graph else None with self._lock: with use_cuda_context(context=self._cuda_context): - if use_cuda_graph: - results, self._trt_cuda_graph_cache = ( - infer_from_trt_engine_with_cudagraph( - pre_processed_images=pre_processed_images, - trt_config=self._trt_config, - engine=self._engine, - device=self._device, - input_name=self._input_name, - outputs=self._output_names, - trt_cuda_graph_cache=self._trt_cuda_graph_cache, - ) - ) - return results[0] - else: - return infer_from_trt_engine( - pre_processed_images=pre_processed_images, - trt_config=self._trt_config, - engine=self._engine, - context=self._execution_context, - device=self._device, - input_name=self._input_name, - outputs=self._output_names, - )[0] + return infer_from_trt_engine( + pre_processed_images=pre_processed_images, + trt_config=self._trt_config, + engine=self._engine, + context=self._execution_context if not use_cuda_graph else None, + device=self._device, + input_name=self._input_name, + outputs=self._output_names, + trt_cuda_graph_cache=cache, + )[0] def post_process( self, From 51f191ced33f9d725e1d87e8bb7081ce0e9e5a10 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Wed, 11 Feb 2026 18:57:10 +0000 Subject: [PATCH 23/50] working on memory profiling for cudagraphs --- .../profiling/profile_cudagraph_vram.py | 207 ++++++++++++++++++ 1 file changed, 207 insertions(+) create mode 100644 inference_models/development/profiling/profile_cudagraph_vram.py diff --git a/inference_models/development/profiling/profile_cudagraph_vram.py b/inference_models/development/profiling/profile_cudagraph_vram.py new file mode 100644 index 0000000000..0c0b3ddfdd --- /dev/null +++ b/inference_models/development/profiling/profile_cudagraph_vram.py @@ -0,0 +1,207 @@ +"""Profile VRAM usage as the number of cached CUDA graphs grows. 
+ +Loads yolov8n-640 as a TRT model with dynamic batch size, then runs forward +passes with varying batch sizes (in shuffled order) to force new graph captures. + +Measures VRAM two ways after each capture: + - "Tensor bytes": directly summed from input_buffer + output_buffers in the cache. + - "Driver bytes": total GPU memory used, via torch.cuda.mem_get_info() which + queries the NVIDIA driver. This captures opaque allocations (TRT execution + contexts, CUDA graph objects, streams, internal workspace) that are invisible + to PyTorch's allocator. + +The difference (driver - tensor - baseline) isolates the opaque overhead. + +Example invocation: + python profile_cudagraph_vram.py --device cuda:0 --max-batch-size 32 + + python profile_cudagraph_vram.py --device cuda:0 --max-batch-size 16 --output vram.png +""" + +import argparse +import gc +import random +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np +import torch + +from inference_models import AutoModel +from inference_models.models.common.trt import TRTCudaGraphLRUCache, TRTCudaGraphState + +MODEL_ID = "yolov8n-640" + + +def graph_state_tensor_bytes(state: TRTCudaGraphState) -> int: + total = state.input_buffer.nbytes + for buf in state.output_buffers: + total += buf.nbytes + return total + + +def cache_total_tensor_bytes(cache: TRTCudaGraphLRUCache) -> int: + total = 0 + for state in cache.cache.values(): + total += graph_state_tensor_bytes(state) + return total + + +def driver_used_bytes(device: torch.device) -> int: + free, total = torch.cuda.mem_get_info(device) + return total - free + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Profile VRAM usage vs. number of cached CUDA graphs (varying batch size).", + ) + parser.add_argument( + "--device", + type=str, + default="cuda:0", + ) + parser.add_argument( + "--max-batch-size", + type=int, + default=16, + help="Largest batch size to test. 
Each batch size from 1..max creates a new graph.", + ) + parser.add_argument( + "--output", + type=str, + default=None, + help="Path to save the plot image. Defaults to 'vram_yolov8n-640.png'.", + ) + return parser.parse_args() + + +def main() -> None: + args = parse_args() + device = torch.device(args.device) + + model = AutoModel.from_pretrained( + model_id_or_path=MODEL_ID, + device=device, + backend="trt", + batch_size=(1, args.max_batch_size), + cuda_graph_cache_capacity=args.max_batch_size + 10, + ) + + image = (np.random.rand(640, 640, 3) * 255).astype(np.uint8) + single_preprocessed, _ = model.pre_process(image) + + model.forward(single_preprocessed, use_cuda_graph=False) + torch.cuda.synchronize(device) + gc.collect() + torch.cuda.empty_cache() + + baseline_driver_bytes = driver_used_bytes(device) + + model._trt_cuda_graph_cache = TRTCudaGraphLRUCache( + capacity=args.max_batch_size + 10, + ) + + batch_size_order = list(range(1, args.max_batch_size + 1)) + random.Random(42).shuffle(batch_size_order) + + batch_sizes = [] + cumulative_tensor_mb = [] + cumulative_driver_mb = [] + per_graph_tensor_mb = [] + per_graph_driver_mb = [] + + prev_tensor_bytes = 0 + prev_driver_bytes = baseline_driver_bytes + for i, bs in enumerate(batch_size_order): + batched = single_preprocessed.expand(bs, -1, -1, -1).contiguous() + model.forward(batched, use_cuda_graph=True) + torch.cuda.synchronize(device) + + tensor_bytes = cache_total_tensor_bytes(model._trt_cuda_graph_cache) + drv_bytes = driver_used_bytes(device) + + tensor_delta = tensor_bytes - prev_tensor_bytes + driver_delta = drv_bytes - prev_driver_bytes + + batch_sizes.append(bs) + cumulative_tensor_mb.append(tensor_bytes / (1024 ** 2)) + cumulative_driver_mb.append((drv_bytes - baseline_driver_bytes) / (1024 ** 2)) + per_graph_tensor_mb.append(tensor_delta / (1024 ** 2)) + per_graph_driver_mb.append(driver_delta / (1024 ** 2)) + + prev_tensor_bytes = tensor_bytes + prev_driver_bytes = drv_bytes + print( + f"[{i + 
1}/{args.max_batch_size}] " + f"bs={bs:>2d} | " + f"tensors: {tensor_bytes / (1024 ** 2):>7.1f} MB (+{tensor_delta / (1024 ** 2):>6.1f}) | " + f"driver: {(drv_bytes - baseline_driver_bytes) / (1024 ** 2):>7.1f} MB (+{driver_delta / (1024 ** 2):>6.1f})" + ) + + output_path = Path(args.output) if args.output else Path(f"vram_{MODEL_ID}.png") + + fig, axes = plt.subplots(2, 1, figsize=(14, 10)) + fig.suptitle( + f"CUDA Graph Cache VRAM (varying batch size) — {MODEL_ID}", + fontsize=14, + ) + + capture_order = list(range(1, len(batch_sizes) + 1)) + bar_width = 0.35 + + ax_cum = axes[0] + x_cum = np.arange(len(capture_order)) + ax_cum.bar( + x_cum - bar_width / 2, cumulative_driver_mb, bar_width, + color="steelblue", label="Driver-level (total GPU)", + ) + ax_cum.bar( + x_cum + bar_width / 2, cumulative_tensor_mb, bar_width, + color="darkorange", label="Cache tensors only", + ) + ax_cum.set_ylabel("Cumulative VRAM above baseline (MB)") + ax_cum.set_xlabel("Number of Cached Graphs (capture order)") + ax_cum.set_xticks(x_cum) + ax_cum.set_xticklabels( + [f"{n}\n(bs={bs})" for n, bs in zip(capture_order, batch_sizes)], + fontsize=7, + ) + ax_cum.legend() + + sorted_indices = sorted(range(len(batch_sizes)), key=lambda k: batch_sizes[k]) + sorted_bs = [batch_sizes[k] for k in sorted_indices] + sorted_driver = [per_graph_driver_mb[k] for k in sorted_indices] + sorted_tensor = [per_graph_tensor_mb[k] for k in sorted_indices] + + ax_pg = axes[1] + x_pg = np.arange(len(sorted_bs)) + ax_pg.bar( + x_pg - bar_width / 2, sorted_driver, bar_width, + color="steelblue", label="Driver-level (total GPU)", + ) + ax_pg.bar( + x_pg + bar_width / 2, sorted_tensor, bar_width, + color="darkorange", label="Cache tensors only", + ) + ax_pg.set_ylabel("Per-Graph VRAM (MB)") + ax_pg.set_xlabel("Batch Size") + ax_pg.set_xticks(x_pg) + ax_pg.set_xticklabels([str(bs) for bs in sorted_bs]) + ax_pg.legend() + + plt.tight_layout() + fig.savefig(output_path, dpi=150) + print(f"\nPlot saved to 
{output_path}") + + total_tensor = prev_tensor_bytes / (1024 ** 2) + total_driver = (prev_driver_bytes - baseline_driver_bytes) / (1024 ** 2) + n = len(batch_sizes) + print(f"\nAfter {n} graphs:") + print(f" Cache tensor VRAM: {total_tensor:.1f} MB (avg {total_tensor / n:.1f} MB/graph)") + print(f" Driver-level VRAM: {total_driver:.1f} MB (avg {total_driver / n:.1f} MB/graph)") + print(f" Opaque overhead: {total_driver - total_tensor:.1f} MB (avg {(total_driver - total_tensor) / n:.1f} MB/graph)") + + +if __name__ == "__main__": + main() From a80a5727897508a6ae34fb4ce3ecc2f2daa234f4 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Wed, 11 Feb 2026 19:29:49 +0000 Subject: [PATCH 24/50] simplify memory profiling script --- .../profiling/profile_cudagraph_vram.py | 192 +++++++----------- 1 file changed, 77 insertions(+), 115 deletions(-) diff --git a/inference_models/development/profiling/profile_cudagraph_vram.py b/inference_models/development/profiling/profile_cudagraph_vram.py index 0c0b3ddfdd..412458e114 100644 --- a/inference_models/development/profiling/profile_cudagraph_vram.py +++ b/inference_models/development/profiling/profile_cudagraph_vram.py @@ -1,25 +1,20 @@ -"""Profile VRAM usage as the number of cached CUDA graphs grows. +"""Profile GPU and CPU memory usage as CUDA graphs are cached. -Loads yolov8n-640 as a TRT model with dynamic batch size, then runs forward -passes with varying batch sizes (in shuffled order) to force new graph captures. - -Measures VRAM two ways after each capture: - - "Tensor bytes": directly summed from input_buffer + output_buffers in the cache. - - "Driver bytes": total GPU memory used, via torch.cuda.mem_get_info() which - queries the NVIDIA driver. This captures opaque allocations (TRT execution - contexts, CUDA graph objects, streams, internal workspace) that are invisible - to PyTorch's allocator. - -The difference (driver - tensor - baseline) isolates the opaque overhead. 
+Loads yolov8n-640 as a TRT model with dynamic batch size, runs forward passes +with batch sizes 1-16 in a deterministic random order, and after each capture +records both GPU VRAM (driver-level) and process CPU RSS. Produces a two-panel +plot: cumulative memory over capture order, and per-graph delta sorted by batch +size. Example invocation: - python profile_cudagraph_vram.py --device cuda:0 --max-batch-size 32 + python profile_cudagraph_vram.py --device cuda:0 - python profile_cudagraph_vram.py --device cuda:0 --max-batch-size 16 --output vram.png + python profile_cudagraph_vram.py --device cuda:0 --shuffle --max-batch-size 32 --output mem.png """ import argparse import gc +import os import random from pathlib import Path @@ -28,51 +23,31 @@ import torch from inference_models import AutoModel -from inference_models.models.common.trt import TRTCudaGraphLRUCache, TRTCudaGraphState +from inference_models.models.common.trt import TRTCudaGraphLRUCache MODEL_ID = "yolov8n-640" +MB = 1024 ** 2 -def graph_state_tensor_bytes(state: TRTCudaGraphState) -> int: - total = state.input_buffer.nbytes - for buf in state.output_buffers: - total += buf.nbytes - return total - - -def cache_total_tensor_bytes(cache: TRTCudaGraphLRUCache) -> int: - total = 0 - for state in cache.cache.values(): - total += graph_state_tensor_bytes(state) - return total - - -def driver_used_bytes(device: torch.device) -> int: +def gpu_used_bytes(device: torch.device) -> int: free, total = torch.cuda.mem_get_info(device) return total - free +def cpu_rss_bytes() -> int: + with open(f"/proc/{os.getpid()}/statm") as f: + pages = int(f.read().split()[1]) + return pages * os.sysconf("SC_PAGE_SIZE") + + def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( - description="Profile VRAM usage vs. 
number of cached CUDA graphs (varying batch size).", - ) - parser.add_argument( - "--device", - type=str, - default="cuda:0", - ) - parser.add_argument( - "--max-batch-size", - type=int, - default=16, - help="Largest batch size to test. Each batch size from 1..max creates a new graph.", - ) - parser.add_argument( - "--output", - type=str, - default=None, - help="Path to save the plot image. Defaults to 'vram_yolov8n-640.png'.", + description="Profile GPU + CPU memory vs. number of cached CUDA graphs.", ) + parser.add_argument("--device", type=str, default="cuda:0") + parser.add_argument("--max-batch-size", type=int, default=16) + parser.add_argument("--shuffle", action="store_true", help="Randomize batch size order (deterministic seed).") + parser.add_argument("--output", type=str, default=None) return parser.parse_args() @@ -92,115 +67,102 @@ def main() -> None: single_preprocessed, _ = model.pre_process(image) model.forward(single_preprocessed, use_cuda_graph=False) - torch.cuda.synchronize(device) gc.collect() + torch.cuda.synchronize(device) torch.cuda.empty_cache() - baseline_driver_bytes = driver_used_bytes(device) + baseline_gpu = gpu_used_bytes(device) + baseline_cpu = cpu_rss_bytes() model._trt_cuda_graph_cache = TRTCudaGraphLRUCache( capacity=args.max_batch_size + 10, ) batch_size_order = list(range(1, args.max_batch_size + 1)) - random.Random(42).shuffle(batch_size_order) + if args.shuffle: + random.Random(42).shuffle(batch_size_order) batch_sizes = [] - cumulative_tensor_mb = [] - cumulative_driver_mb = [] - per_graph_tensor_mb = [] - per_graph_driver_mb = [] + cumulative_gpu_mb = [] + cumulative_cpu_mb = [] + delta_gpu_mb = [] + delta_cpu_mb = [] + + prev_gpu = baseline_gpu + prev_cpu = baseline_cpu - prev_tensor_bytes = 0 - prev_driver_bytes = baseline_driver_bytes for i, bs in enumerate(batch_size_order): batched = single_preprocessed.expand(bs, -1, -1, -1).contiguous() - model.forward(batched, use_cuda_graph=True) + output = model.forward(batched, 
use_cuda_graph=True) + del output + gc.collect() torch.cuda.synchronize(device) - tensor_bytes = cache_total_tensor_bytes(model._trt_cuda_graph_cache) - drv_bytes = driver_used_bytes(device) - - tensor_delta = tensor_bytes - prev_tensor_bytes - driver_delta = drv_bytes - prev_driver_bytes + gpu = gpu_used_bytes(device) + cpu = cpu_rss_bytes() batch_sizes.append(bs) - cumulative_tensor_mb.append(tensor_bytes / (1024 ** 2)) - cumulative_driver_mb.append((drv_bytes - baseline_driver_bytes) / (1024 ** 2)) - per_graph_tensor_mb.append(tensor_delta / (1024 ** 2)) - per_graph_driver_mb.append(driver_delta / (1024 ** 2)) + cumulative_gpu_mb.append((gpu - baseline_gpu) / MB) + cumulative_cpu_mb.append((cpu - baseline_cpu) / MB) + delta_gpu_mb.append((gpu - prev_gpu) / MB) + delta_cpu_mb.append((cpu - prev_cpu) / MB) - prev_tensor_bytes = tensor_bytes - prev_driver_bytes = drv_bytes print( - f"[{i + 1}/{args.max_batch_size}] " - f"bs={bs:>2d} | " - f"tensors: {tensor_bytes / (1024 ** 2):>7.1f} MB (+{tensor_delta / (1024 ** 2):>6.1f}) | " - f"driver: {(drv_bytes - baseline_driver_bytes) / (1024 ** 2):>7.1f} MB (+{driver_delta / (1024 ** 2):>6.1f})" + f"[{i + 1}/{args.max_batch_size}] bs={bs:>2d} | " + f"GPU: {cumulative_gpu_mb[-1]:>7.1f} MB (+{delta_gpu_mb[-1]:>6.1f}) | " + f"CPU: {cumulative_cpu_mb[-1]:>7.1f} MB (+{delta_cpu_mb[-1]:>6.1f})" ) + prev_gpu = gpu + prev_cpu = cpu - output_path = Path(args.output) if args.output else Path(f"vram_{MODEL_ID}.png") + autogenerated_name = f"vram_{MODEL_ID}_{'shuffle' if args.shuffle else 'sequential'}.png" + output_path = Path(args.output) if args.output else Path(autogenerated_name) - fig, axes = plt.subplots(2, 1, figsize=(14, 10)) + fig, (ax_cum, ax_delta) = plt.subplots(2, 1, figsize=(14, 10)) fig.suptitle( - f"CUDA Graph Cache VRAM (varying batch size) — {MODEL_ID}", + f"Memory vs. 
CUDA Graph Count (varying batch size) — {MODEL_ID}", fontsize=14, ) capture_order = list(range(1, len(batch_sizes) + 1)) - bar_width = 0.35 - - ax_cum = axes[0] - x_cum = np.arange(len(capture_order)) - ax_cum.bar( - x_cum - bar_width / 2, cumulative_driver_mb, bar_width, - color="steelblue", label="Driver-level (total GPU)", - ) - ax_cum.bar( - x_cum + bar_width / 2, cumulative_tensor_mb, bar_width, - color="darkorange", label="Cache tensors only", - ) - ax_cum.set_ylabel("Cumulative VRAM above baseline (MB)") - ax_cum.set_xlabel("Number of Cached Graphs (capture order)") - ax_cum.set_xticks(x_cum) + x = np.arange(len(capture_order)) + w = 0.35 + + ax_cum.bar(x - w / 2, cumulative_gpu_mb, w, color="steelblue", label="GPU VRAM") + ax_cum.bar(x + w / 2, cumulative_cpu_mb, w, color="seagreen", label="CPU RSS") + ax_cum.set_ylabel("Memory above baseline (MB)") + ax_cum.set_xlabel("Capture order") + ax_cum.set_xticks(x) ax_cum.set_xticklabels( [f"{n}\n(bs={bs})" for n, bs in zip(capture_order, batch_sizes)], fontsize=7, ) ax_cum.legend() - sorted_indices = sorted(range(len(batch_sizes)), key=lambda k: batch_sizes[k]) - sorted_bs = [batch_sizes[k] for k in sorted_indices] - sorted_driver = [per_graph_driver_mb[k] for k in sorted_indices] - sorted_tensor = [per_graph_tensor_mb[k] for k in sorted_indices] + sorted_idx = sorted(range(len(batch_sizes)), key=lambda k: batch_sizes[k]) + s_bs = [batch_sizes[k] for k in sorted_idx] + s_gpu = [delta_gpu_mb[k] for k in sorted_idx] + s_cpu = [delta_cpu_mb[k] for k in sorted_idx] - ax_pg = axes[1] - x_pg = np.arange(len(sorted_bs)) - ax_pg.bar( - x_pg - bar_width / 2, sorted_driver, bar_width, - color="steelblue", label="Driver-level (total GPU)", - ) - ax_pg.bar( - x_pg + bar_width / 2, sorted_tensor, bar_width, - color="darkorange", label="Cache tensors only", - ) - ax_pg.set_ylabel("Per-Graph VRAM (MB)") - ax_pg.set_xlabel("Batch Size") - ax_pg.set_xticks(x_pg) - ax_pg.set_xticklabels([str(bs) for bs in sorted_bs]) - 
ax_pg.legend() + x2 = np.arange(len(s_bs)) + ax_delta.bar(x2 - w / 2, s_gpu, w, color="steelblue", label="GPU VRAM") + ax_delta.bar(x2 + w / 2, s_cpu, w, color="seagreen", label="CPU RSS") + ax_delta.set_ylabel("Per-graph memory delta (MB)") + ax_delta.set_xlabel("Batch size") + ax_delta.set_xticks(x2) + ax_delta.set_xticklabels([str(bs) for bs in s_bs]) + ax_delta.legend() plt.tight_layout() fig.savefig(output_path, dpi=150) print(f"\nPlot saved to {output_path}") - total_tensor = prev_tensor_bytes / (1024 ** 2) - total_driver = (prev_driver_bytes - baseline_driver_bytes) / (1024 ** 2) + final_gpu = (prev_gpu - baseline_gpu) / MB + final_cpu = (prev_cpu - baseline_cpu) / MB n = len(batch_sizes) print(f"\nAfter {n} graphs:") - print(f" Cache tensor VRAM: {total_tensor:.1f} MB (avg {total_tensor / n:.1f} MB/graph)") - print(f" Driver-level VRAM: {total_driver:.1f} MB (avg {total_driver / n:.1f} MB/graph)") - print(f" Opaque overhead: {total_driver - total_tensor:.1f} MB (avg {(total_driver - total_tensor) / n:.1f} MB/graph)") + print(f" GPU VRAM: +{final_gpu:.1f} MB total ({final_gpu / n:.1f} MB/graph avg)") + print(f" CPU RSS: +{final_cpu:.1f} MB total ({final_cpu / n:.1f} MB/graph avg)") if __name__ == "__main__": From 845fabd0b601be2cb1c603c596cf6493449b27cc Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Wed, 11 Feb 2026 20:42:49 +0000 Subject: [PATCH 25/50] tweaks --- inference_models/inference_models/models/common/trt.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/inference_models/inference_models/models/common/trt.py b/inference_models/inference_models/models/common/trt.py index 1e6ca5c7d8..c618aadbef 100644 --- a/inference_models/inference_models/models/common/trt.py +++ b/inference_models/inference_models/models/common/trt.py @@ -294,7 +294,7 @@ def infer_from_trt_engine( else: min_batch_size = trt_config.dynamic_batch_size_min max_batch_size = trt_config.dynamic_batch_size_max - return 
_infer_from_trt_engine_with_batch_size_boundaries( + return infer_from_trt_engine_with_batch_size_boundaries( pre_processed_images=pre_processed_images, engine=engine, context=context, @@ -307,7 +307,7 @@ def infer_from_trt_engine( ) -def _infer_from_trt_engine_with_batch_size_boundaries( +def infer_from_trt_engine_with_batch_size_boundaries( pre_processed_images: torch.Tensor, engine: trt.ICudaEngine, context: Optional[trt.IExecutionContext], @@ -332,7 +332,7 @@ def _infer_from_trt_engine_with_batch_size_boundaries( ), dim=0, ) - results = _execute_trt_engine( + results = execute_trt_engine( pre_processed_images=pre_processed_images, engine=engine, context=context, @@ -362,7 +362,7 @@ def _infer_from_trt_engine_with_batch_size_boundaries( ), dim=0, ) - results = _execute_trt_engine( + results = execute_trt_engine( pre_processed_images=batch, engine=engine, context=context, @@ -378,7 +378,7 @@ def _infer_from_trt_engine_with_batch_size_boundaries( return [torch.cat(e, dim=0).contiguous() for e in all_results] -def _execute_trt_engine( +def execute_trt_engine( pre_processed_images: torch.Tensor, engine: trt.ICudaEngine, context: Optional[trt.IExecutionContext], From 3294cae83ea59edcd82ca416c9d54bd19c18d20f Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Wed, 11 Feb 2026 20:51:18 +0000 Subject: [PATCH 26/50] update tests to work with the new cache --- .../integration_tests/models/test_rfdetr_predictions_trt.py | 4 +++- .../models/test_rfdetr_seg_predictions_trt.py | 4 +++- .../models/test_yolov8_object_detection_predictions_trt.py | 6 ++++-- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py b/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py index 4768fc9043..44ac3dec22 100644 --- a/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py 
@@ -4,6 +4,8 @@ import pytest import torch +from inference_models.models.common.trt import TRTCudaGraphLRUCache + @pytest.mark.slow @pytest.mark.trt_extras @@ -25,7 +27,7 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( outputs = [] for pre_processed in [pre_processed_1, pre_processed_2]: no_graph = model.forward(pre_processed, use_cuda_graph=False) - model._trt_cuda_graph_state = None + model._trt_cuda_graph_cache = TRTCudaGraphLRUCache() capture_graph = model.forward(pre_processed, use_cuda_graph=True) replay_graph = model.forward(pre_processed, use_cuda_graph=True) diff --git a/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py b/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py index c5591aab9e..2e8c9759fe 100644 --- a/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py @@ -2,6 +2,8 @@ import pytest import torch +from inference_models.models.common.trt import TRTCudaGraphLRUCache + @pytest.mark.slow @pytest.mark.trt_extras @@ -23,7 +25,7 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( outputs = [] for pre_processed in [pre_processed_1, pre_processed_2]: no_graph = model.forward(pre_processed, use_cuda_graph=False) - model._trt_cuda_graph_cache = None + model._trt_cuda_graph_cache = TRTCudaGraphLRUCache() capture_graph = model.forward(pre_processed, use_cuda_graph=True) replay_graph = model.forward(pre_processed, use_cuda_graph=True) diff --git a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py index 6031df5c6e..f5e8e19001 100644 --- a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py +++ 
b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py @@ -2,6 +2,8 @@ import pytest import torch +from inference_models.models.common.trt import TRTCudaGraphLRUCache + @pytest.mark.slow @pytest.mark.trt_extras @@ -18,7 +20,7 @@ def test_trt_cudagraph_cache_reuses_previously_seen_input_shapes( ) pre_processed_single, _ = model.pre_process(dog_image_numpy) - model._trt_cuda_graph_cache = None + model._trt_cuda_graph_cache = TRTCudaGraphLRUCache() seen_shapes = set() capture_outputs = {} @@ -69,7 +71,7 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( no_graph = model.forward(batch, use_cuda_graph=False) - model._trt_cuda_graph_cache = None + model._trt_cuda_graph_cache = TRTCudaGraphLRUCache() capture_graph = model.forward(batch, use_cuda_graph=True) replay_graph = model.forward(batch, use_cuda_graph=True) From bbb25405c9a3662a0dde20aa9977db09f713c0c2 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Wed, 11 Feb 2026 21:01:26 +0000 Subject: [PATCH 27/50] thanks for the PR review, Claude --- .../inference_models/models/common/trt.py | 13 +++++++++++-- .../test_yolov8_object_detection_predictions_trt.py | 6 +++--- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/inference_models/inference_models/models/common/trt.py b/inference_models/inference_models/models/common/trt.py index c618aadbef..19553cb48f 100644 --- a/inference_models/inference_models/models/common/trt.py +++ b/inference_models/inference_models/models/common/trt.py @@ -95,7 +95,11 @@ def __setitem__( self.cache[key] = value self.cache.move_to_end(key) if len(self.cache) > self.capacity: - self.cache.popitem(last=False) + _, evicted = self.cache.popitem(last=False) + del evicted.cuda_graph + del evicted.input_buffer + del evicted.output_buffers + del evicted.execution_context def get_trt_engine_inputs_and_outputs( @@ -393,7 +397,7 @@ def execute_trt_engine( cache_key = (input_shape, input_dtype, device) if cache_key not in 
trt_cuda_graph_cache: - LOGGER.debug(f"Capturing CUDA graph for shape {input_shape}") + LOGGER.debug("Capturing CUDA graph for shape %s", input_shape) results, trt_cuda_graph = _capture_cuda_graph( pre_processed_images=pre_processed_images, @@ -416,6 +420,11 @@ def execute_trt_engine( return results else: + if context is None: + raise ModelRuntimeError( + message="An execution context is required when not using CUDA graphs.", + help_url="https://todo", + ) status = context.set_input_shape(input_name, tuple(pre_processed_images.shape)) if not status: raise ModelRuntimeError( diff --git a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py index f5e8e19001..cb1b5bc238 100644 --- a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py @@ -46,7 +46,7 @@ def test_trt_cudagraph_cache_reuses_previously_seen_input_shapes( continue assert cache_size_after == cache_size_before - assert torch.allclose(capture_outputs[cache_key], output, atol=1e-3) + assert torch.allclose(capture_outputs[cache_key], output, atol=1e-6) assert set(model._trt_cuda_graph_cache.cache.keys()) == seen_shapes @@ -75,5 +75,5 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( capture_graph = model.forward(batch, use_cuda_graph=True) replay_graph = model.forward(batch, use_cuda_graph=True) - assert torch.allclose(no_graph, capture_graph, atol=1e-3) - assert torch.allclose(no_graph, replay_graph, atol=1e-3) + assert torch.allclose(no_graph, capture_graph, atol=1e-6) + assert torch.allclose(no_graph, replay_graph, atol=1e-6) From 4eb23fce0754379fac6f61ce4c3f92c8cbbcac2d Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Wed, 11 Feb 2026 21:28:30 +0000 Subject: [PATCH 28/50] see effect of cache size on vram 
profile script --- .../profiling/profile_cudagraph_vram.py | 134 +++++++++++------- 1 file changed, 85 insertions(+), 49 deletions(-) diff --git a/inference_models/development/profiling/profile_cudagraph_vram.py b/inference_models/development/profiling/profile_cudagraph_vram.py index 412458e114..1d2d9a0964 100644 --- a/inference_models/development/profiling/profile_cudagraph_vram.py +++ b/inference_models/development/profiling/profile_cudagraph_vram.py @@ -1,15 +1,34 @@ -"""Profile GPU and CPU memory usage as CUDA graphs are cached. +"""Profile GPU and CPU memory usage as CUDA graphs are cached and evicted. Loads yolov8n-640 as a TRT model with dynamic batch size, runs forward passes -with batch sizes 1-16 in a deterministic random order, and after each capture -records both GPU VRAM (driver-level) and process CPU RSS. Produces a two-panel -plot: cumulative memory over capture order, and per-graph delta sorted by batch -size. +with random batch sizes, and after each step records both GPU VRAM +(driver-level) and process CPU RSS. The cache capacity is smaller than the +number of distinct batch sizes, so eviction is exercised and memory usage +should plateau. 
Example invocation: - python profile_cudagraph_vram.py --device cuda:0 - - python profile_cudagraph_vram.py --device cuda:0 --shuffle --max-batch-size 32 --output mem.png + python profile_cudagraph_vram.py \ + --device cuda:0 \ + --num-steps 64 \ + --max-batch-size 16 \ + --cache-capacity 16 \ + --output vram_sequential.png + + python profile_cudagraph_vram.py \ + --device cuda:0 \ + --num-steps 64 \ + --max-batch-size 16 \ + --cache-capacity 16 \ + --shuffle \ + --output vram_shuffle.png + + python profile_cudagraph_vram.py \ + --device cuda:0 \ + --shuffle \ + --num-steps 64 \ + --max-batch-size 16 \ + --cache-capacity 8 \ + --output vram_shuffle_eviction.png """ import argparse @@ -46,7 +65,10 @@ def parse_args() -> argparse.Namespace: ) parser.add_argument("--device", type=str, default="cuda:0") parser.add_argument("--max-batch-size", type=int, default=16) - parser.add_argument("--shuffle", action="store_true", help="Randomize batch size order (deterministic seed).") + parser.add_argument("--cache-capacity", type=int, default=8) + parser.add_argument("--num-steps", type=int, default=32) + parser.add_argument("--shuffle", action="store_true", help="Randomize batch size order instead of sequential cycling.") + parser.add_argument("--seed", type=int, default=42) parser.add_argument("--output", type=str, default=None) return parser.parse_args() @@ -55,12 +77,14 @@ def main() -> None: args = parse_args() device = torch.device(args.device) + rng = random.Random(args.seed) + model = AutoModel.from_pretrained( model_id_or_path=MODEL_ID, device=device, backend="trt", batch_size=(1, args.max_batch_size), - cuda_graph_cache_capacity=args.max_batch_size + 10, + cuda_graph_cache_capacity=args.cache_capacity, ) image = (np.random.rand(640, 640, 3) * 255).astype(np.uint8) @@ -75,23 +99,31 @@ def main() -> None: baseline_cpu = cpu_rss_bytes() model._trt_cuda_graph_cache = TRTCudaGraphLRUCache( - capacity=args.max_batch_size + 10, + capacity=args.cache_capacity, ) - 
batch_size_order = list(range(1, args.max_batch_size + 1)) if args.shuffle: - random.Random(42).shuffle(batch_size_order) + batch_size_sequence = [ + rng.randint(1, args.max_batch_size) for _ in range(args.num_steps) + ] + else: + all_sizes = list(range(1, args.max_batch_size + 1)) + batch_size_sequence = [ + all_sizes[i % len(all_sizes)] for i in range(args.num_steps) + ] + + from collections import defaultdict batch_sizes = [] cumulative_gpu_mb = [] cumulative_cpu_mb = [] - delta_gpu_mb = [] - delta_cpu_mb = [] + gpu_deltas_by_bs: dict[int, list[float]] = defaultdict(list) + cpu_deltas_by_bs: dict[int, list[float]] = defaultdict(list) prev_gpu = baseline_gpu prev_cpu = baseline_cpu - for i, bs in enumerate(batch_size_order): + for i, bs in enumerate(batch_size_sequence): batched = single_preprocessed.expand(bs, -1, -1, -1).contiguous() output = model.forward(batched, use_cuda_graph=True) del output @@ -100,69 +132,73 @@ def main() -> None: gpu = gpu_used_bytes(device) cpu = cpu_rss_bytes() + cache_size = len(model._trt_cuda_graph_cache.cache) + + gpu_delta = (gpu - prev_gpu) / MB + cpu_delta = (cpu - prev_cpu) / MB batch_sizes.append(bs) cumulative_gpu_mb.append((gpu - baseline_gpu) / MB) cumulative_cpu_mb.append((cpu - baseline_cpu) / MB) - delta_gpu_mb.append((gpu - prev_gpu) / MB) - delta_cpu_mb.append((cpu - prev_cpu) / MB) + gpu_deltas_by_bs[bs].append(gpu_delta) + cpu_deltas_by_bs[bs].append(cpu_delta) print( - f"[{i + 1}/{args.max_batch_size}] bs={bs:>2d} | " - f"GPU: {cumulative_gpu_mb[-1]:>7.1f} MB (+{delta_gpu_mb[-1]:>6.1f}) | " - f"CPU: {cumulative_cpu_mb[-1]:>7.1f} MB (+{delta_cpu_mb[-1]:>6.1f})" + f"[{i + 1}/{args.num_steps}] bs={bs:>2d} | " + f"cache: {cache_size}/{args.cache_capacity} | " + f"GPU: {cumulative_gpu_mb[-1]:>7.1f} MB (+{gpu_delta:>6.1f}) | " + f"CPU: {cumulative_cpu_mb[-1]:>7.1f} MB (+{cpu_delta:>6.1f})" ) prev_gpu = gpu prev_cpu = cpu - autogenerated_name = f"vram_{MODEL_ID}_{'shuffle' if args.shuffle else 'sequential'}.png" + mode = 
"shuffle" if args.shuffle else "sequential" + autogenerated_name = f"vram_{MODEL_ID}_cap{args.cache_capacity}_{mode}.png" output_path = Path(args.output) if args.output else Path(autogenerated_name) fig, (ax_cum, ax_delta) = plt.subplots(2, 1, figsize=(14, 10)) fig.suptitle( - f"Memory vs. CUDA Graph Count (varying batch size) — {MODEL_ID}", + f"Memory vs. Step (cache capacity={args.cache_capacity}, " + f"batch sizes 1–{args.max_batch_size}) — {MODEL_ID}", fontsize=14, ) - capture_order = list(range(1, len(batch_sizes) + 1)) - x = np.arange(len(capture_order)) - w = 0.35 + steps = np.arange(len(batch_sizes)) - ax_cum.bar(x - w / 2, cumulative_gpu_mb, w, color="steelblue", label="GPU VRAM") - ax_cum.bar(x + w / 2, cumulative_cpu_mb, w, color="seagreen", label="CPU RSS") + ax_cum.plot(steps, cumulative_gpu_mb, color="steelblue", marker=".", label="GPU VRAM") + ax_cum.plot(steps, cumulative_cpu_mb, color="seagreen", marker=".", label="CPU RSS") ax_cum.set_ylabel("Memory above baseline (MB)") - ax_cum.set_xlabel("Capture order") - ax_cum.set_xticks(x) - ax_cum.set_xticklabels( - [f"{n}\n(bs={bs})" for n, bs in zip(capture_order, batch_sizes)], - fontsize=7, - ) + ax_cum.set_xlabel("Step") + for i, bs in enumerate(batch_sizes): + ax_cum.annotate( + str(bs), (i, cumulative_gpu_mb[i]), + textcoords="offset points", xytext=(0, 6), + fontsize=6, ha="center", color="steelblue", + ) ax_cum.legend() - sorted_idx = sorted(range(len(batch_sizes)), key=lambda k: batch_sizes[k]) - s_bs = [batch_sizes[k] for k in sorted_idx] - s_gpu = [delta_gpu_mb[k] for k in sorted_idx] - s_cpu = [delta_cpu_mb[k] for k in sorted_idx] + sorted_bs = sorted(gpu_deltas_by_bs.keys()) + avg_gpu = [np.mean(gpu_deltas_by_bs[bs]) for bs in sorted_bs] + avg_cpu = [np.mean(cpu_deltas_by_bs[bs]) for bs in sorted_bs] - x2 = np.arange(len(s_bs)) - ax_delta.bar(x2 - w / 2, s_gpu, w, color="steelblue", label="GPU VRAM") - ax_delta.bar(x2 + w / 2, s_cpu, w, color="seagreen", label="CPU RSS") - 
ax_delta.set_ylabel("Per-graph memory delta (MB)") + x2 = np.arange(len(sorted_bs)) + w = 0.35 + ax_delta.bar(x2 - w / 2, avg_gpu, w, color="steelblue", label="GPU VRAM") + ax_delta.bar(x2 + w / 2, avg_cpu, w, color="seagreen", label="CPU RSS") + ax_delta.set_ylabel("Mean per-step memory delta (MB)") ax_delta.set_xlabel("Batch size") ax_delta.set_xticks(x2) - ax_delta.set_xticklabels([str(bs) for bs in s_bs]) + ax_delta.set_xticklabels([str(bs) for bs in sorted_bs]) ax_delta.legend() plt.tight_layout() fig.savefig(output_path, dpi=150) print(f"\nPlot saved to {output_path}") - final_gpu = (prev_gpu - baseline_gpu) / MB - final_cpu = (prev_cpu - baseline_cpu) / MB - n = len(batch_sizes) - print(f"\nAfter {n} graphs:") - print(f" GPU VRAM: +{final_gpu:.1f} MB total ({final_gpu / n:.1f} MB/graph avg)") - print(f" CPU RSS: +{final_cpu:.1f} MB total ({final_cpu / n:.1f} MB/graph avg)") + print(f"\nFinal GPU VRAM above baseline: {cumulative_gpu_mb[-1]:.1f} MB") + print(f"Final CPU RSS above baseline: {cumulative_cpu_mb[-1]:.1f} MB") + print(f"Peak GPU VRAM above baseline: {max(cumulative_gpu_mb):.1f} MB") + print(f"Cache entries at end: {cache_size}/{args.cache_capacity}") if __name__ == "__main__": From aa87393f0b29b4a2d5bc63be418d07568cb18ed1 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Wed, 11 Feb 2026 21:40:34 +0000 Subject: [PATCH 29/50] reduce default cache size to 16 after seeing memory usage --- .../profiling/profile_cudagraph_vram.py | 56 ++++++------------ .../rfdetr_instance_segmentation_trt.py | 2 +- .../rfdetr/rfdetr_object_detection_trt.py | 2 +- .../yolov8/yolov8_object_detection_trt.py | 2 +- ...yolov8_object_detection_predictions_trt.py | 57 +++++++++++++++++++ 5 files changed, 78 insertions(+), 41 deletions(-) diff --git a/inference_models/development/profiling/profile_cudagraph_vram.py b/inference_models/development/profiling/profile_cudagraph_vram.py index 1d2d9a0964..6996c3b98b 100644 --- 
a/inference_models/development/profiling/profile_cudagraph_vram.py +++ b/inference_models/development/profiling/profile_cudagraph_vram.py @@ -29,6 +29,14 @@ --max-batch-size 16 \ --cache-capacity 8 \ --output vram_shuffle_eviction.png + + python profile_cudagraph_vram.py \ + --device cuda:0 \ + --shuffle \ + --num-steps 64 \ + --max-batch-size 2 \ + --cache-capacity 2 \ + --output vram_two_batch_sizes.png """ import argparse @@ -112,16 +120,9 @@ def main() -> None: all_sizes[i % len(all_sizes)] for i in range(args.num_steps) ] - from collections import defaultdict - batch_sizes = [] cumulative_gpu_mb = [] cumulative_cpu_mb = [] - gpu_deltas_by_bs: dict[int, list[float]] = defaultdict(list) - cpu_deltas_by_bs: dict[int, list[float]] = defaultdict(list) - - prev_gpu = baseline_gpu - prev_cpu = baseline_cpu for i, bs in enumerate(batch_size_sequence): batched = single_preprocessed.expand(bs, -1, -1, -1).contiguous() @@ -134,62 +135,41 @@ def main() -> None: cpu = cpu_rss_bytes() cache_size = len(model._trt_cuda_graph_cache.cache) - gpu_delta = (gpu - prev_gpu) / MB - cpu_delta = (cpu - prev_cpu) / MB - batch_sizes.append(bs) cumulative_gpu_mb.append((gpu - baseline_gpu) / MB) cumulative_cpu_mb.append((cpu - baseline_cpu) / MB) - gpu_deltas_by_bs[bs].append(gpu_delta) - cpu_deltas_by_bs[bs].append(cpu_delta) print( f"[{i + 1}/{args.num_steps}] bs={bs:>2d} | " f"cache: {cache_size}/{args.cache_capacity} | " - f"GPU: {cumulative_gpu_mb[-1]:>7.1f} MB (+{gpu_delta:>6.1f}) | " - f"CPU: {cumulative_cpu_mb[-1]:>7.1f} MB (+{cpu_delta:>6.1f})" + f"GPU: {cumulative_gpu_mb[-1]:>7.1f} MB | " + f"CPU: {cumulative_cpu_mb[-1]:>7.1f} MB" ) - prev_gpu = gpu - prev_cpu = cpu mode = "shuffle" if args.shuffle else "sequential" autogenerated_name = f"vram_{MODEL_ID}_cap{args.cache_capacity}_{mode}.png" output_path = Path(args.output) if args.output else Path(autogenerated_name) - fig, (ax_cum, ax_delta) = plt.subplots(2, 1, figsize=(14, 10)) + fig, ax = plt.subplots(figsize=(14, 6)) 
fig.suptitle( f"Memory vs. Step (cache capacity={args.cache_capacity}, " - f"batch sizes 1–{args.max_batch_size}) — {MODEL_ID}", + f"batch sizes 1-{args.max_batch_size}) -- {MODEL_ID}", fontsize=14, ) steps = np.arange(len(batch_sizes)) - ax_cum.plot(steps, cumulative_gpu_mb, color="steelblue", marker=".", label="GPU VRAM") - ax_cum.plot(steps, cumulative_cpu_mb, color="seagreen", marker=".", label="CPU RSS") - ax_cum.set_ylabel("Memory above baseline (MB)") - ax_cum.set_xlabel("Step") + ax.plot(steps, cumulative_gpu_mb, color="steelblue", marker=".", label="GPU VRAM") + ax.plot(steps, cumulative_cpu_mb, color="seagreen", marker=".", label="CPU RSS") + ax.set_ylabel("Memory above baseline (MB)") + ax.set_xlabel("Step") for i, bs in enumerate(batch_sizes): - ax_cum.annotate( + ax.annotate( str(bs), (i, cumulative_gpu_mb[i]), textcoords="offset points", xytext=(0, 6), fontsize=6, ha="center", color="steelblue", ) - ax_cum.legend() - - sorted_bs = sorted(gpu_deltas_by_bs.keys()) - avg_gpu = [np.mean(gpu_deltas_by_bs[bs]) for bs in sorted_bs] - avg_cpu = [np.mean(cpu_deltas_by_bs[bs]) for bs in sorted_bs] - - x2 = np.arange(len(sorted_bs)) - w = 0.35 - ax_delta.bar(x2 - w / 2, avg_gpu, w, color="steelblue", label="GPU VRAM") - ax_delta.bar(x2 + w / 2, avg_cpu, w, color="seagreen", label="CPU RSS") - ax_delta.set_ylabel("Mean per-step memory delta (MB)") - ax_delta.set_xlabel("Batch size") - ax_delta.set_xticks(x2) - ax_delta.set_xticklabels([str(bs) for bs in sorted_bs]) - ax_delta.legend() + ax.legend() plt.tight_layout() fig.savefig(output_path, dpi=150) diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py index 22ffdc9a22..1aa3e3ed9d 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py @@ -81,7 +81,7 @@ def 
from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, - cuda_graph_cache_capacity: int = 64, + cuda_graph_cache_capacity: int = 16, **kwargs, ) -> "RFDetrForInstanceSegmentationTRT": if device.type != "cuda": diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py index 5b163da87c..29be76b5c1 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py @@ -81,7 +81,7 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, - cuda_graph_cache_capacity: int = 64, + cuda_graph_cache_capacity: int = 16, **kwargs, ) -> "RFDetrForObjectDetectionTRT": if device.type != "cuda": diff --git a/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py b/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py index 3794d8ee1e..89b067ffbe 100644 --- a/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py +++ b/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py @@ -156,7 +156,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, - cuda_graph_cache_capacity: int = 64, + cuda_graph_cache_capacity: int = 16, ): self._engine = engine self._input_name = input_name diff --git a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py index cb1b5bc238..bfd9061a3c 100644 --- a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py +++ 
b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py @@ -77,3 +77,60 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( assert torch.allclose(no_graph, capture_graph, atol=1e-6) assert torch.allclose(no_graph, replay_graph, atol=1e-6) + + +@pytest.mark.slow +@pytest.mark.trt_extras +def test_trt_cudagraph_cache_eviction( + yolov8n_640_t4_trt_package: str, + dog_image_numpy: np.ndarray, +) -> None: + from inference_models import AutoModel + + device = torch.device("cuda:0") + model = AutoModel.from_pretrained( + model_id_or_path=yolov8n_640_t4_trt_package, + device=device, + ) + + pre_processed_single, _ = model.pre_process(dog_image_numpy) + capacity = 3 + model._trt_cuda_graph_cache = TRTCudaGraphLRUCache(capacity=capacity) + cache = model._trt_cuda_graph_cache + + batch_sizes = [1, 2, 3] + for bs in batch_sizes: + batch = pre_processed_single.repeat(bs, 1, 1, 1) + model.forward(batch, use_cuda_graph=True) + + assert len(cache.cache) == capacity + keys_before = list(cache.cache.keys()) + + batch_4 = pre_processed_single.repeat(4, 1, 1, 1) + model.forward(batch_4, use_cuda_graph=True) + + assert len(cache.cache) == capacity + assert keys_before[0] not in cache.cache + for key in keys_before[1:]: + assert key in cache.cache + key_4 = (tuple(batch_4.shape), batch_4.dtype, device) + assert key_4 in cache.cache + + batch_2 = pre_processed_single.repeat(2, 1, 1, 1) + model.forward(batch_2, use_cuda_graph=True) + + batch_5 = pre_processed_single.repeat(5, 1, 1, 1) + model.forward(batch_5, use_cuda_graph=True) + + assert len(cache.cache) == capacity + key_3 = (tuple(pre_processed_single.repeat(3, 1, 1, 1).shape), batch_2.dtype, device) + assert key_3 not in cache.cache + + remaining_keys = list(cache.cache.keys()) + key_2 = (tuple(batch_2.shape), batch_2.dtype, device) + key_5 = (tuple(batch_5.shape), batch_5.dtype, device) + assert remaining_keys == [key_4, key_2, key_5] + + no_graph = model.forward(batch_5, 
use_cuda_graph=False) + replay = model.forward(batch_5, use_cuda_graph=True) + assert torch.allclose(no_graph, replay, atol=1e-6) From b5c1f6b2e8c1d9250f267350fcb44a34d7dcf6ee Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Wed, 11 Feb 2026 21:41:36 +0000 Subject: [PATCH 30/50] make style --- inference/core/workflows/core_steps/analytics/overlap/v1.py | 2 +- .../core/workflows/core_steps/sinks/onvif_movement/v1.py | 4 ++-- inference/core/workflows/core_steps/sinks/twilio/sms/v2.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/inference/core/workflows/core_steps/analytics/overlap/v1.py b/inference/core/workflows/core_steps/analytics/overlap/v1.py index 8404e0681a..7bc73b9d39 100644 --- a/inference/core/workflows/core_steps/analytics/overlap/v1.py +++ b/inference/core/workflows/core_steps/analytics/overlap/v1.py @@ -132,7 +132,7 @@ def coords_overlap( # coords are [x1, y1, x2, y2] if overlap_type == "Center Overlap": size = [other[2] - other[0], other[3] - other[1]] - (x, y) = [other[0] + size[0] / 2, other[1] + size[1] / 2] + x, y = [other[0] + size[0] / 2, other[1] + size[1] / 2] return ( x > overlap[0] and x < overlap[2] and y > overlap[1] and y < overlap[3] ) diff --git a/inference/core/workflows/core_steps/sinks/onvif_movement/v1.py b/inference/core/workflows/core_steps/sinks/onvif_movement/v1.py index a26f792e73..8351486ad7 100644 --- a/inference/core/workflows/core_steps/sinks/onvif_movement/v1.py +++ b/inference/core/workflows/core_steps/sinks/onvif_movement/v1.py @@ -874,8 +874,8 @@ def move_camera( xyxy = prediction.xyxy # calculate centers - (x1, y1, x2, y2) = tuple(xyxy[0]) - (image_height, image_width) = tuple(image_dimensions[0]) + x1, y1, x2, y2 = tuple(xyxy[0]) + image_height, image_width = tuple(image_dimensions[0]) center_point = (x1 + (x2 - x1) / 2, y1 + (y2 - y1) / 2) # calculate deltas from center and edge diff --git a/inference/core/workflows/core_steps/sinks/twilio/sms/v2.py 
b/inference/core/workflows/core_steps/sinks/twilio/sms/v2.py index 26c3539881..4bb7493851 100644 --- a/inference/core/workflows/core_steps/sinks/twilio/sms/v2.py +++ b/inference/core/workflows/core_steps/sinks/twilio/sms/v2.py @@ -518,7 +518,7 @@ def format_message( def process_media_urls_for_twilio( - media_url: Union[str, List[Union[str, WorkflowImageData]], WorkflowImageData] + media_url: Union[str, List[Union[str, WorkflowImageData]], WorkflowImageData], ) -> Optional[List[str]]: """ Process media URLs for Twilio MMS. @@ -609,7 +609,7 @@ def _get_mms_placeholder_image_url() -> Optional[str]: def serialize_media_for_api( - media_url: Union[str, List[str], WorkflowImageData, None] + media_url: Union[str, List[str], WorkflowImageData, None], ) -> Tuple[Optional[List[str]], Optional[List[Dict[str, str]]]]: """ Serialize media for API transmission. From a386f3ba6ebf08ef40938ab94cb43d7c80256ede Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Wed, 11 Feb 2026 21:50:38 +0000 Subject: [PATCH 31/50] update default and fix profiling script --- .../development/profiling/profile_rfdetr_trt_cudagraphs.py | 3 ++- inference_models/inference_models/models/common/trt.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py index 733d462216..fe43027db7 100644 --- a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py +++ b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py @@ -7,6 +7,7 @@ from tqdm import tqdm from inference_models import AutoModel +from inference_models.models.common.trt import TRTCudaGraphLRUCache IMAGE_PATH = os.environ.get("IMAGE_PATH", None) DEVICE = os.environ.get("DEVICE", "cuda:0") @@ -40,7 +41,7 @@ def main() -> None: print("Timing with forced CUDA graph recapture each step...") start = time.perf_counter() for _ in range(100): # not using CYCLES here 
bc this is wayyyy slower than the non-graph or the replay modes - model._trt_cuda_graph_cache = None + model._trt_cuda_graph_cache = TRTCudaGraphLRUCache(capacity=16) model.forward(pre_processed, use_cuda_graph=True) cudagraph_recapture_fps = 100 / (time.perf_counter() - start) diff --git a/inference_models/inference_models/models/common/trt.py b/inference_models/inference_models/models/common/trt.py index 19553cb48f..486312bd1b 100644 --- a/inference_models/inference_models/models/common/trt.py +++ b/inference_models/inference_models/models/common/trt.py @@ -69,7 +69,7 @@ class TRTCudaGraphState: class TRTCudaGraphLRUCache: - def __init__(self, capacity: int = 64): + def __init__(self, capacity: int = 16): self.cache: OrderedDict[ Tuple[Tuple[int, ...], torch.dtype, torch.device], TRTCudaGraphState ] = OrderedDict() From 5f4d3ead36de1cf146e92b361c43f9fce0008b27 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Wed, 11 Feb 2026 21:54:54 +0000 Subject: [PATCH 32/50] fix imports in trt tests --- .../models/test_rfdetr_predictions_trt.py | 6 +++--- .../models/test_rfdetr_seg_predictions_trt.py | 5 ++--- .../test_yolov8_object_detection_predictions_trt.py | 9 +++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py b/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py index 44ac3dec22..e84dd1bca5 100644 --- a/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py @@ -4,8 +4,6 @@ import pytest import torch -from inference_models.models.common.trt import TRTCudaGraphLRUCache - @pytest.mark.slow @pytest.mark.trt_extras @@ -15,6 +13,7 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( bike_image_numpy: np.ndarray, ) -> None: from inference_models import AutoModel + from inference_models.models.common.trt import 
TRTCudaGraphLRUCache model = AutoModel.from_pretrained( model_id_or_path=rfdetr_nano_t4_trt_package, @@ -27,7 +26,7 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( outputs = [] for pre_processed in [pre_processed_1, pre_processed_2]: no_graph = model.forward(pre_processed, use_cuda_graph=False) - model._trt_cuda_graph_cache = TRTCudaGraphLRUCache() + model._trt_cuda_graph_cache = TRTCudaGraphLRUCache(capacity=16) capture_graph = model.forward(pre_processed, use_cuda_graph=True) replay_graph = model.forward(pre_processed, use_cuda_graph=True) @@ -64,6 +63,7 @@ def test_trt_outputs_match_expected_shapes( dog_image_numpy: np.ndarray, ) -> None: from inference_models import AutoModel + from inference_models.models.common.trt import TRTCudaGraphLRUCache model = AutoModel.from_pretrained( model_id_or_path=rfdetr_nano_t4_trt_package, diff --git a/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py b/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py index 2e8c9759fe..16cf30512d 100644 --- a/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py @@ -2,8 +2,6 @@ import pytest import torch -from inference_models.models.common.trt import TRTCudaGraphLRUCache - @pytest.mark.slow @pytest.mark.trt_extras @@ -13,6 +11,7 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( dog_image_numpy: np.ndarray, ) -> None: from inference_models import AutoModel + from inference_models.models.common.trt import TRTCudaGraphLRUCache model = AutoModel.from_pretrained( model_id_or_path=rfdetr_seg_nano_t4_trt_package, @@ -25,7 +24,7 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( outputs = [] for pre_processed in [pre_processed_1, pre_processed_2]: no_graph = model.forward(pre_processed, use_cuda_graph=False) - model._trt_cuda_graph_cache = TRTCudaGraphLRUCache() + 
model._trt_cuda_graph_cache = TRTCudaGraphLRUCache(capacity=16) capture_graph = model.forward(pre_processed, use_cuda_graph=True) replay_graph = model.forward(pre_processed, use_cuda_graph=True) diff --git a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py index bfd9061a3c..35752e2abc 100644 --- a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py @@ -2,8 +2,6 @@ import pytest import torch -from inference_models.models.common.trt import TRTCudaGraphLRUCache - @pytest.mark.slow @pytest.mark.trt_extras @@ -12,6 +10,7 @@ def test_trt_cudagraph_cache_reuses_previously_seen_input_shapes( dog_image_numpy: np.ndarray, ) -> None: from inference_models import AutoModel + from inference_models.models.common.trt import TRTCudaGraphLRUCache device = torch.device("cuda:0") model = AutoModel.from_pretrained( @@ -20,7 +19,7 @@ def test_trt_cudagraph_cache_reuses_previously_seen_input_shapes( ) pre_processed_single, _ = model.pre_process(dog_image_numpy) - model._trt_cuda_graph_cache = TRTCudaGraphLRUCache() + model._trt_cuda_graph_cache = TRTCudaGraphLRUCache(capacity=16) seen_shapes = set() capture_outputs = {} @@ -58,6 +57,7 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( dog_image_numpy: np.ndarray, ) -> None: from inference_models import AutoModel + from inference_models.models.common.trt import TRTCudaGraphLRUCache device = torch.device("cuda:0") model = AutoModel.from_pretrained( @@ -71,7 +71,7 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( no_graph = model.forward(batch, use_cuda_graph=False) - model._trt_cuda_graph_cache = TRTCudaGraphLRUCache() + model._trt_cuda_graph_cache = TRTCudaGraphLRUCache(capacity=16) capture_graph = model.forward(batch, 
use_cuda_graph=True) replay_graph = model.forward(batch, use_cuda_graph=True) @@ -86,6 +86,7 @@ def test_trt_cudagraph_cache_eviction( dog_image_numpy: np.ndarray, ) -> None: from inference_models import AutoModel + from inference_models.models.common.trt import TRTCudaGraphLRUCache device = torch.device("cuda:0") model = AutoModel.from_pretrained( From 3f3be28b886856c4d053d13b64ae58278eeee019 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Tue, 3 Mar 2026 18:02:34 +0000 Subject: [PATCH 33/50] further merge conflict resolution --- .../models/rfdetr/rfdetr_instance_segmentation_trt.py | 1 - .../models/yolov8/yolov8_object_detection_trt.py | 1 - 2 files changed, 2 deletions(-) diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py index 6112881c78..6c056b78f9 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py @@ -6,7 +6,6 @@ from inference_models import InstanceDetections, InstanceSegmentationModel from inference_models.configuration import ( - ( DEFAULT_DEVICE, INFERENCE_MODELS_RFDETR_DEFAULT_CONFIDENCE, USE_CUDA_GRAPHS_FOR_TRT_BACKEND, diff --git a/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py b/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py index 47f89bfa7a..c46ea7578c 100644 --- a/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py +++ b/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py @@ -6,7 +6,6 @@ from inference_models import Detections, ObjectDetectionModel from inference_models.configuration import ( - ( DEFAULT_DEVICE, INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_CLASS_AGNOSTIC_NMS, INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_CONFIDENCE, From 
24b8ed48ef223c8430a7064a61f5a80e90bf0c16 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Tue, 3 Mar 2026 18:07:15 +0000 Subject: [PATCH 34/50] Revert accidental formatting changes unrelated to branch --- inference/core/workflows/core_steps/analytics/overlap/v1.py | 2 +- .../core/workflows/core_steps/sinks/onvif_movement/v1.py | 4 ++-- inference/core/workflows/core_steps/sinks/twilio/sms/v2.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/inference/core/workflows/core_steps/analytics/overlap/v1.py b/inference/core/workflows/core_steps/analytics/overlap/v1.py index 7bc73b9d39..8404e0681a 100644 --- a/inference/core/workflows/core_steps/analytics/overlap/v1.py +++ b/inference/core/workflows/core_steps/analytics/overlap/v1.py @@ -132,7 +132,7 @@ def coords_overlap( # coords are [x1, y1, x2, y2] if overlap_type == "Center Overlap": size = [other[2] - other[0], other[3] - other[1]] - x, y = [other[0] + size[0] / 2, other[1] + size[1] / 2] + (x, y) = [other[0] + size[0] / 2, other[1] + size[1] / 2] return ( x > overlap[0] and x < overlap[2] and y > overlap[1] and y < overlap[3] ) diff --git a/inference/core/workflows/core_steps/sinks/onvif_movement/v1.py b/inference/core/workflows/core_steps/sinks/onvif_movement/v1.py index 8351486ad7..a26f792e73 100644 --- a/inference/core/workflows/core_steps/sinks/onvif_movement/v1.py +++ b/inference/core/workflows/core_steps/sinks/onvif_movement/v1.py @@ -874,8 +874,8 @@ def move_camera( xyxy = prediction.xyxy # calculate centers - x1, y1, x2, y2 = tuple(xyxy[0]) - image_height, image_width = tuple(image_dimensions[0]) + (x1, y1, x2, y2) = tuple(xyxy[0]) + (image_height, image_width) = tuple(image_dimensions[0]) center_point = (x1 + (x2 - x1) / 2, y1 + (y2 - y1) / 2) # calculate deltas from center and edge diff --git a/inference/core/workflows/core_steps/sinks/twilio/sms/v2.py b/inference/core/workflows/core_steps/sinks/twilio/sms/v2.py index 4bb7493851..26c3539881 100644 --- 
a/inference/core/workflows/core_steps/sinks/twilio/sms/v2.py +++ b/inference/core/workflows/core_steps/sinks/twilio/sms/v2.py @@ -518,7 +518,7 @@ def format_message( def process_media_urls_for_twilio( - media_url: Union[str, List[Union[str, WorkflowImageData]], WorkflowImageData], + media_url: Union[str, List[Union[str, WorkflowImageData]], WorkflowImageData] ) -> Optional[List[str]]: """ Process media URLs for Twilio MMS. @@ -609,7 +609,7 @@ def _get_mms_placeholder_image_url() -> Optional[str]: def serialize_media_for_api( - media_url: Union[str, List[str], WorkflowImageData, None], + media_url: Union[str, List[str], WorkflowImageData, None] ) -> Tuple[Optional[List[str]], Optional[List[Dict[str, str]]]]: """ Serialize media for API transmission. From 574e684880b0d4cca83620b5ee38104dbde856c5 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Tue, 3 Mar 2026 18:10:53 +0000 Subject: [PATCH 35/50] set this feature flag to false by default --- inference_models/inference_models/configuration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inference_models/inference_models/configuration.py b/inference_models/inference_models/configuration.py index e31f4859c6..dc84aae176 100644 --- a/inference_models/inference_models/configuration.py +++ b/inference_models/inference_models/configuration.py @@ -87,7 +87,7 @@ USE_CUDA_GRAPHS_FOR_TRT_BACKEND = get_boolean_from_env( variable_name="USE_CUDA_GRAPHS_FOR_TRT_BACKEND", - default=True, + default=False, ) # General model parameters defaults From 077732d8db31cd5734b17e8d2a175ea4c33ce2f1 Mon Sep 17 00:00:00 2001 From: Kai Christensen Date: Tue, 10 Mar 2026 21:40:31 +0000 Subject: [PATCH 36/50] fix cache profiling script --- .../development/profiling/profile_yolov8_trt_cudagraphs.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/inference_models/development/profiling/profile_yolov8_trt_cudagraphs.py b/inference_models/development/profiling/profile_yolov8_trt_cudagraphs.py index 
9506b6b1ed..ebbe543a70 100644 --- a/inference_models/development/profiling/profile_yolov8_trt_cudagraphs.py +++ b/inference_models/development/profiling/profile_yolov8_trt_cudagraphs.py @@ -12,6 +12,8 @@ WARMUP = int(os.environ.get("WARMUP", "50")) RECAPTURE_CYCLES = int(os.environ.get("RECAPTURE_CYCLES", "100")) +os.environ["USE_TRT_CUDA_GRAPHS"] = "True" + BATCH_SIZES = [1, 2, 3] @@ -57,14 +59,14 @@ def main() -> None: torch.cuda.synchronize() start = time.perf_counter() for i in range(RECAPTURE_CYCLES): - model._trt_cuda_graph_cache = None + model._trt_cuda_graph_cache.cache.clear() batch = batches[BATCH_SIZES[i % len(BATCH_SIZES)]] model.forward(batch, use_cuda_graph=True) torch.cuda.synchronize() recapture_fps = RECAPTURE_CYCLES / (time.perf_counter() - start) # ── (3) Cycling batch sizes, CUDA graphs with normal caching ──────── - model._trt_cuda_graph_cache = None + model._trt_cuda_graph_cache.cache.clear() for batch in batches.values(): model.forward(batch, use_cuda_graph=True) From b7ea2a09cc6fe2a0bee5475336fab3c6f22af3d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20P=C4=99czek?= Date: Wed, 18 Mar 2026 18:04:11 +0100 Subject: [PATCH 37/50] Add changes to TRT CUDA Graphs cache --- .../profiling/profile_cudagraph_vram.py | 4 +- .../profile_rfdetr_trt_cudagraphs.py | 4 +- .../inference_models/configuration.py | 10 +- .../inference_models/developer_tools.py | 14 +- .../inference_models/models/common/trt.py | 120 +++++++++++++----- .../deep_lab_v3_plus_segmentation_trt.py | 14 ++ .../resnet/resnet_classification_trt.py | 26 ++++ .../rfdetr_instance_segmentation_trt.py | 28 ++-- .../rfdetr/rfdetr_object_detection_trt.py | 28 ++-- .../models/vit/vit_classification_trt.py | 26 ++++ .../yolact_instance_segmentation_trt.py | 14 ++ .../yolo26_instance_segmentation_trt.py | 14 ++ .../yolo26/yolo26_key_points_detection_trt.py | 15 ++- .../yolo26/yolo26_object_detection_trt.py | 14 ++ .../yolonas/yolonas_object_detection_trt.py | 20 ++- 
.../yolov10/yolov10_object_detection_trt.py | 14 ++ .../yolov5_instance_segmentation_trt.py | 18 ++- .../yolov5/yolov5_object_detection_trt.py | 20 ++- .../yolov7_instance_segmentation_trt.py | 18 ++- .../yolov8_instance_segmentation_trt.py | 14 ++ .../yolov8/yolov8_key_points_detection_trt.py | 15 ++- .../yolov8/yolov8_object_detection_trt.py | 28 ++-- .../models/test_rfdetr_predictions_trt.py | 13 +- .../models/test_rfdetr_seg_predictions_trt.py | 5 +- ...yolov8_object_detection_predictions_trt.py | 19 +-- 25 files changed, 404 insertions(+), 111 deletions(-) diff --git a/inference_models/development/profiling/profile_cudagraph_vram.py b/inference_models/development/profiling/profile_cudagraph_vram.py index 6996c3b98b..d129fc38c1 100644 --- a/inference_models/development/profiling/profile_cudagraph_vram.py +++ b/inference_models/development/profiling/profile_cudagraph_vram.py @@ -50,7 +50,7 @@ import torch from inference_models import AutoModel -from inference_models.models.common.trt import TRTCudaGraphLRUCache +from inference_models.models.common.trt import TRTCudaGraphCache MODEL_ID = "yolov8n-640" MB = 1024 ** 2 @@ -106,7 +106,7 @@ def main() -> None: baseline_gpu = gpu_used_bytes(device) baseline_cpu = cpu_rss_bytes() - model._trt_cuda_graph_cache = TRTCudaGraphLRUCache( + model._trt_cuda_graph_cache = TRTCudaGraphCache( capacity=args.cache_capacity, ) diff --git a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py index fe43027db7..2791e24e3e 100644 --- a/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py +++ b/inference_models/development/profiling/profile_rfdetr_trt_cudagraphs.py @@ -7,7 +7,7 @@ from tqdm import tqdm from inference_models import AutoModel -from inference_models.models.common.trt import TRTCudaGraphLRUCache +from inference_models.models.common.trt import TRTCudaGraphCache IMAGE_PATH = os.environ.get("IMAGE_PATH", None) DEVICE 
= os.environ.get("DEVICE", "cuda:0") @@ -41,7 +41,7 @@ def main() -> None: print("Timing with forced CUDA graph recapture each step...") start = time.perf_counter() for _ in range(100): # not using CYCLES here bc this is wayyyy slower than the non-graph or the replay modes - model._trt_cuda_graph_cache = TRTCudaGraphLRUCache(capacity=16) + model._trt_cuda_graph_cache = TRTCudaGraphCache(capacity=16) model.forward(pre_processed, use_cuda_graph=True) cudagraph_recapture_fps = 100 / (time.perf_counter() - start) diff --git a/inference_models/inference_models/configuration.py b/inference_models/inference_models/configuration.py index 5d732c2060..30e34a0c67 100644 --- a/inference_models/inference_models/configuration.py +++ b/inference_models/inference_models/configuration.py @@ -88,11 +88,6 @@ "ALLOW_LOCAL_STORAGE_ACCESS_FOR_REFERENCE_DATA" ) -USE_CUDA_GRAPHS_FOR_TRT_BACKEND = get_boolean_from_env( - variable_name="USE_CUDA_GRAPHS_FOR_TRT_BACKEND", - default=False, -) - # General model parameters defaults INFERENCE_MODELS_DEFAULT_CONFIDENCE = get_float_from_env( @@ -382,3 +377,8 @@ variable_name="INFERENCE_MODELS_YOLOLITE_DEFAULT_CLASS_AGNOSTIC_NMS", default=INFERENCE_MODELS_DEFAULT_CLASS_AGNOSTIC_NMS, ) + +ENABLE_AUTO_CUDA_GRAPHS_FOR_TRT_BACKEND_ENV_NAME = ( + "ENABLE_AUTO_CUDA_GRAPHS_FOR_TRT_BACKEND" +) +DEFAULT_ENABLE_AUTO_CUDA_GRAPHS_FOR_TRT_BACKEND = False diff --git a/inference_models/inference_models/developer_tools.py b/inference_models/inference_models/developer_tools.py index bee4b5ca75..44cbd90da7 100644 --- a/inference_models/inference_models/developer_tools.py +++ b/inference_models/inference_models/developer_tools.py @@ -13,7 +13,7 @@ along with library. Utilities depending on optional dependencies are exposed as lazy imports. 
""" -from typing import Any, Dict +from typing import Any, Dict, Union from inference_models.models.common.model_packages import get_model_package_contents from inference_models.runtime_introspection.core import ( @@ -21,7 +21,7 @@ x_ray_runtime_environment, ) from inference_models.utils.download import download_files_to_directory -from inference_models.utils.imports import LazyFunction +from inference_models.utils.imports import LazyClass, LazyFunction from inference_models.utils.onnx_introspection import ( get_selected_onnx_execution_providers, ) @@ -42,7 +42,7 @@ TRTPackageDetails, ) -OPTIONAL_IMPORTS: Dict[str, LazyFunction] = { +OPTIONAL_IMPORTS: Dict[str, Union[LazyFunction, LazyClass]] = { "use_primary_cuda_context": LazyFunction( module_name="inference_models.models.common.cuda", function_name="use_primary_cuda_context", @@ -79,6 +79,14 @@ module_name="inference_models.models.common.trt", function_name="load_trt_model", ), + "establish_trt_cuda_graph_cache": LazyFunction( + module_name="inference_models.models.common.trt", + function_name="establish_trt_cuda_graph_cache", + ), + "TRTCudaGraphCache": LazyClass( + module_name="inference_models.models.common.trt", + class_name="TRTCudaGraphCache", + ), } diff --git a/inference_models/inference_models/models/common/trt.py b/inference_models/inference_models/models/common/trt.py index 0ca05fab04..accc1cb0ee 100644 --- a/inference_models/inference_models/models/common/trt.py +++ b/inference_models/inference_models/models/common/trt.py @@ -1,9 +1,14 @@ -from typing import List, Optional, Tuple -from dataclasses import dataclass +import threading from collections import OrderedDict +from dataclasses import dataclass +from typing import List, Optional, Tuple import torch +from inference_models.configuration import ( + DEFAULT_ENABLE_AUTO_CUDA_GRAPHS_FOR_TRT_BACKEND, + ENABLE_AUTO_CUDA_GRAPHS_FOR_TRT_BACKEND_ENV_NAME, +) from inference_models.errors import ( CorruptedModelPackageError, MissingDependencyError, @@ 
-11,6 +16,7 @@ ) from inference_models.logger import LOGGER from inference_models.models.common.roboflow.model_packages import TRTConfig +from inference_models.utils.environment import get_boolean_from_env try: import tensorrt as trt @@ -74,38 +80,87 @@ class TRTCudaGraphState: execution_context: trt.IExecutionContext -class TRTCudaGraphLRUCache: - def __init__(self, capacity: int = 16): - self.cache: OrderedDict[ +class TRTCudaGraphCache: + def __init__(self, capacity: int): + self._cache: OrderedDict[ Tuple[Tuple[int, ...], torch.dtype, torch.device], TRTCudaGraphState ] = OrderedDict() - self.capacity = capacity + self._capacity = capacity + self._state_lock = threading.RLock() + + def get_current_size(self) -> int: + return len(self._cache) + + def list_keys(self) -> List[Tuple[Tuple[int, ...], torch.dtype, torch.device]]: + return list(self._cache.keys()) + + def safe_remove( + self, key: Tuple[Tuple[int, ...], torch.dtype, torch.device] + ) -> None: + with self._state_lock: + if key not in self._cache: + return None + evicted = self._cache.pop(key) + self._evict(evicted=evicted) + return None + + def purge(self, n_oldest: Optional[int] = None) -> None: + with self._state_lock: + if n_oldest is None: + n_oldest = len(self._cache) + to_evict = min(len(self._cache), n_oldest) + for _ in range(to_evict): + _, evicted = self._cache.popitem(last=False) + self._evict(evicted=evicted, empty_cuda_cache=False) + torch.cuda.empty_cache() def __contains__( self, key: Tuple[Tuple[int, ...], torch.dtype, torch.device] ) -> bool: - return key in self.cache + return key in self._cache def __getitem__( self, key: Tuple[Tuple[int, ...], torch.dtype, torch.device] ) -> TRTCudaGraphState: - value = self.cache[key] - self.cache.move_to_end(key) - return value + with self._state_lock: + value = self._cache[key] + self._cache.move_to_end(key) + return value def __setitem__( self, key: Tuple[Tuple[int, ...], torch.dtype, torch.device], value: TRTCudaGraphState, ): - self.cache[key] 
= value - self.cache.move_to_end(key) - if len(self.cache) > self.capacity: - _, evicted = self.cache.popitem(last=False) - del evicted.cuda_graph - del evicted.input_buffer - del evicted.output_buffers - del evicted.execution_context + with self._state_lock: + self._cache[key] = value + self._cache.move_to_end(key) + if len(self._cache) > self._capacity: + _, evicted = self._cache.popitem(last=False) + self._evict(evicted=evicted) + + def _evict(self, evicted: TRTCudaGraphState, empty_cuda_cache: bool = True) -> None: + del evicted.cuda_graph + del evicted.input_buffer + del evicted.output_buffers + del evicted.execution_context + if empty_cuda_cache: + torch.cuda.empty_cache() + + +def establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size: int, + cuda_graph_cache: Optional[TRTCudaGraphCache] = None, +) -> Optional[TRTCudaGraphCache]: + if cuda_graph_cache is not None: + return cuda_graph_cache + auto_cuda_graphs_enabled = get_boolean_from_env( + variable_name=ENABLE_AUTO_CUDA_GRAPHS_FOR_TRT_BACKEND_ENV_NAME, + default=DEFAULT_ENABLE_AUTO_CUDA_GRAPHS_FOR_TRT_BACKEND, + ) + if not auto_cuda_graphs_enabled: + return None + return TRTCudaGraphCache(capacity=default_cuda_graph_cache_size) def get_trt_engine_inputs_and_outputs( @@ -174,12 +229,12 @@ def infer_from_trt_engine( pre_processed_images: torch.Tensor, trt_config: TRTConfig, engine: trt.ICudaEngine, + context: trt.IExecutionContext, device: torch.device, input_name: str, outputs: List[str], - context: Optional[trt.IExecutionContext] = None, - trt_cuda_graph_cache: Optional[TRTCudaGraphLRUCache] = None, stream: Optional[torch.cuda.Stream] = None, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, ) -> List[torch.Tensor]: """Run inference using a TensorRT engine, optionally with CUDA graph acceleration. 
@@ -276,8 +331,8 @@ def infer_from_trt_engine( Run with CUDA graph acceleration: - >>> from inference_models.models.common.trt import TRTCudaGraphLRUCache - >>> cache = TRTCudaGraphLRUCache(capacity=16) + >>> from inference_models.models.common.trt import TRTCudaGraphCache + >>> cache = TRTCudaGraphCache(capacity=16) >>> >>> results = infer_from_trt_engine( ... pre_processed_images=images, @@ -324,11 +379,11 @@ def _infer_from_trt_engine( pre_processed_images: torch.Tensor, trt_config: TRTConfig, engine: trt.ICudaEngine, - context: Optional[trt.IExecutionContext], + context: trt.IExecutionContext, device: torch.device, input_name: str, outputs: List[str], - trt_cuda_graph_cache: Optional[TRTCudaGraphLRUCache] = None, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, ) -> List[torch.Tensor]: if trt_config.static_batch_size is not None: min_batch_size = trt_config.static_batch_size @@ -352,13 +407,13 @@ def _infer_from_trt_engine( def _infer_from_trt_engine_with_batch_size_boundaries( pre_processed_images: torch.Tensor, engine: trt.ICudaEngine, - context: Optional[trt.IExecutionContext], + context: trt.IExecutionContext, device: torch.device, input_name: str, outputs: List[str], min_batch_size: int, max_batch_size: int, - trt_cuda_graph_cache: Optional[TRTCudaGraphLRUCache] = None, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, ) -> List[torch.Tensor]: if pre_processed_images.shape[0] <= max_batch_size: reminder = min_batch_size - pre_processed_images.shape[0] @@ -423,11 +478,11 @@ def _infer_from_trt_engine_with_batch_size_boundaries( def _execute_trt_engine( pre_processed_images: torch.Tensor, engine: trt.ICudaEngine, - context: Optional[trt.IExecutionContext], + context: trt.IExecutionContext, device: torch.device, input_name: str, outputs: List[str], - trt_cuda_graph_cache: Optional[TRTCudaGraphLRUCache] = None, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, ) -> List[torch.Tensor]: if trt_cuda_graph_cache is not None: 
input_shape = tuple(pre_processed_images.shape) @@ -458,11 +513,6 @@ def _execute_trt_engine( return results else: - if context is None: - raise ModelRuntimeError( - message="An execution context is required when not using CUDA graphs.", - help_url="https://inference-models.roboflow.com/errors/models-runtime/#modelruntimeerror", - ) status = context.set_input_shape(input_name, tuple(pre_processed_images.shape)) if not status: raise ModelRuntimeError( @@ -511,7 +561,9 @@ def _capture_cuda_graph( input_buffer = torch.empty_like(pre_processed_images, device=device) input_buffer.copy_(pre_processed_images) - status = graph_context.set_input_shape(input_name, tuple(pre_processed_images.shape)) + status = graph_context.set_input_shape( + input_name, tuple(pre_processed_images.shape) + ) if not status: raise ModelRuntimeError( message="Failed to set TRT model input shape during CUDA graph capture.", diff --git a/inference_models/inference_models/models/deep_lab_v3_plus/deep_lab_v3_plus_segmentation_trt.py b/inference_models/inference_models/models/deep_lab_v3_plus/deep_lab_v3_plus_segmentation_trt.py index c807f4a641..2bc949760a 100644 --- a/inference_models/inference_models/models/deep_lab_v3_plus/deep_lab_v3_plus_segmentation_trt.py +++ b/inference_models/inference_models/models/deep_lab_v3_plus/deep_lab_v3_plus_segmentation_trt.py @@ -38,6 +38,8 @@ pre_process_network_input, ) from inference_models.models.common.trt import ( + TRTCudaGraphCache, + establish_trt_cuda_graph_cache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, @@ -81,6 +83,8 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "DeepLabV3PlusForSemanticSegmentationTRT": if device.type != "cuda": @@ -146,6 +150,10 @@ def from_pretrained( message=f"Implementation assume single model 
output, found: {len(outputs)}.", help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, input_name=inputs[0], @@ -157,6 +165,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -171,6 +180,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine self._input_name = input_name @@ -182,6 +192,7 @@ def __init__( self._device = device self._cuda_context = cuda_context self._execution_context = execution_context + self._trt_cuda_graph_cache = trt_cuda_graph_cache self._lock = Lock() self._inference_stream = torch.cuda.Stream(device=self._device) self._thread_local_storage = threading.local() @@ -212,8 +223,10 @@ def pre_process( def forward( self, pre_processed_images: PreprocessedInputs, + disable_cuda_graphs: bool = False, **kwargs, ) -> torch.Tensor: + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with self._lock: with use_cuda_context(context=self._cuda_context): return infer_from_trt_engine( @@ -225,6 +238,7 @@ def forward( input_name=self._input_name, outputs=self._output_names, stream=self._inference_stream, + trt_cuda_graph_cache=cache, )[0] def post_process( diff --git a/inference_models/inference_models/models/resnet/resnet_classification_trt.py b/inference_models/inference_models/models/resnet/resnet_classification_trt.py index e55a999515..8bad13c294 100644 --- a/inference_models/inference_models/models/resnet/resnet_classification_trt.py +++ b/inference_models/inference_models/models/resnet/resnet_classification_trt.py @@ -40,6 +40,8 @@ 
pre_process_network_input, ) from inference_models.models.common.trt import ( + TRTCudaGraphCache, + establish_trt_cuda_graph_cache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, @@ -81,6 +83,8 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "ResNetForClassificationTRT": if device.type != "cuda": @@ -147,6 +151,10 @@ def from_pretrained( message=f"Implementation assume single model output, found: {len(outputs)}.", help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, input_name=inputs[0], @@ -157,6 +165,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -170,6 +179,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine self._input_name = input_name @@ -180,6 +190,7 @@ def __init__( self._device = device self._cuda_context = cuda_context self._execution_context = execution_context + self._trt_cuda_graph_cache = trt_cuda_graph_cache self._lock = Lock() self._inference_stream = torch.cuda.Stream(device=self._device) self._thread_local_storage = threading.local() @@ -212,8 +223,10 @@ def pre_process( def forward( self, pre_processed_images: PreprocessedInputs, + disable_cuda_graphs: bool = False, **kwargs, ) -> torch.Tensor: + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with self._lock: with 
use_cuda_context(context=self._cuda_context): return infer_from_trt_engine( @@ -225,6 +238,7 @@ def forward( input_name=self._input_name, outputs=self._output_names, stream=self._inference_stream, + trt_cuda_graph_cache=cache, )[0] def post_process( @@ -271,6 +285,8 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "ResNetForMultiLabelClassificationTRT": if device.type != "cuda": @@ -337,6 +353,10 @@ def from_pretrained( message=f"Implementation assume single model output, found: {len(outputs)}.", help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, input_name=inputs[0], @@ -347,6 +367,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -360,6 +381,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine self._input_name = input_name @@ -370,6 +392,7 @@ def __init__( self._device = device self._cuda_context = cuda_context self._execution_context = execution_context + self._trt_cuda_graph_cache = trt_cuda_graph_cache self._lock = Lock() self._inference_stream = torch.cuda.Stream(device=self._device) self._thread_local_storage = threading.local() @@ -402,8 +425,10 @@ def pre_process( def forward( self, pre_processed_images: PreprocessedInputs, + disable_cuda_graphs: bool = False, **kwargs, ) -> torch.Tensor: + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with 
self._lock: with use_cuda_context(context=self._cuda_context): return infer_from_trt_engine( @@ -415,6 +440,7 @@ def forward( input_name=self._input_name, outputs=self._output_names, stream=self._inference_stream, + trt_cuda_graph_cache=cache, )[0] def post_process( diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py index 097d374465..ebc59bfdf9 100644 --- a/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_instance_segmentation_trt.py @@ -12,7 +12,6 @@ from inference_models.configuration import ( DEFAULT_DEVICE, INFERENCE_MODELS_RFDETR_DEFAULT_CONFIDENCE, - USE_CUDA_GRAPHS_FOR_TRT_BACKEND, ) from inference_models.entities import ColorFormat from inference_models.errors import ( @@ -35,7 +34,8 @@ parse_trt_config, ) from inference_models.models.common.trt import ( - TRTCudaGraphLRUCache, + TRTCudaGraphCache, + establish_trt_cuda_graph_cache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, @@ -93,7 +93,8 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, - cuda_graph_cache_capacity: int = 16, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "RFDetrForInstanceSegmentationTRT": if device.type != "cuda": @@ -162,6 +163,10 @@ def from_pretrained( message=f"Implementation assume 3 model outputs, found: {len(outputs)}.", help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, input_name=inputs[0], @@ -173,7 +178,7 @@ def from_pretrained( 
device=device, cuda_context=cuda_context, execution_context=execution_context, - cuda_graph_cache_capacity=cuda_graph_cache_capacity, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -188,7 +193,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, - cuda_graph_cache_capacity: int = 64, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine self._input_name = input_name @@ -200,9 +205,7 @@ def __init__( self._cuda_context = cuda_context self._execution_context = execution_context self._trt_config = trt_config - self._trt_cuda_graph_cache = TRTCudaGraphLRUCache( - capacity=cuda_graph_cache_capacity, - ) + self._trt_cuda_graph_cache = trt_cuda_graph_cache self._lock = threading.Lock() self._inference_stream = torch.cuda.Stream(device=self._device) self._thread_local_storage = threading.local() @@ -235,20 +238,17 @@ def pre_process( def forward( self, pre_processed_images: torch.Tensor, - use_cuda_graph: Optional[bool] = None, + disable_cuda_graphs: bool = False, **kwargs, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - if use_cuda_graph is None: - use_cuda_graph = USE_CUDA_GRAPHS_FOR_TRT_BACKEND - - cache = self._trt_cuda_graph_cache if use_cuda_graph else None + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with self._lock: with use_cuda_context(context=self._cuda_context): detections, labels, masks = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, - context=self._execution_context if not use_cuda_graph else None, + context=self._execution_context, device=self._device, input_name=self._input_name, outputs=self._output_names, diff --git a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py index dfe587d095..cecc0e4c9d 100644 --- 
a/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py +++ b/inference_models/inference_models/models/rfdetr/rfdetr_object_detection_trt.py @@ -8,7 +8,6 @@ from inference_models.configuration import ( DEFAULT_DEVICE, INFERENCE_MODELS_RFDETR_DEFAULT_CONFIDENCE, - USE_CUDA_GRAPHS_FOR_TRT_BACKEND, ) from inference_models.entities import ColorFormat from inference_models.errors import ( @@ -34,7 +33,8 @@ rescale_image_detections, ) from inference_models.models.common.trt import ( - TRTCudaGraphLRUCache, + TRTCudaGraphCache, + establish_trt_cuda_graph_cache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, @@ -86,7 +86,8 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, - cuda_graph_cache_capacity: int = 16, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "RFDetrForObjectDetectionTRT": if device.type != "cuda": @@ -160,6 +161,10 @@ def from_pretrained( message=f"Expected model outputs to be named `output0` and `output1`, but found: {outputs}.", help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, input_name=inputs[0], @@ -171,7 +176,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, - cuda_graph_cache_capacity=cuda_graph_cache_capacity, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -186,7 +191,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, - cuda_graph_cache_capacity: int = 64, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine self._input_name = input_name @@ -198,9 
+203,7 @@ def __init__( self._cuda_context = cuda_context self._execution_context = execution_context self._trt_config = trt_config - self._trt_cuda_graph_cache = TRTCudaGraphLRUCache( - capacity=cuda_graph_cache_capacity, - ) + self._trt_cuda_graph_cache = trt_cuda_graph_cache self._lock = threading.Lock() self._inference_stream = torch.cuda.Stream(device=self._device) self._thread_local_storage = threading.local() @@ -231,20 +234,17 @@ def pre_process( def forward( self, pre_processed_images: torch.Tensor, - use_cuda_graph: Optional[bool] = None, + disable_cuda_graphs: bool = False, **kwargs, ) -> Tuple[torch.Tensor, torch.Tensor]: - if use_cuda_graph is None: - use_cuda_graph = USE_CUDA_GRAPHS_FOR_TRT_BACKEND - - cache = self._trt_cuda_graph_cache if use_cuda_graph else None + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with self._lock: with use_cuda_context(context=self._cuda_context): detections, labels = infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, - context=self._execution_context if not use_cuda_graph else None, + context=self._execution_context, device=self._device, input_name=self._input_name, outputs=self._output_names, diff --git a/inference_models/inference_models/models/vit/vit_classification_trt.py b/inference_models/inference_models/models/vit/vit_classification_trt.py index 948d544d56..0ed60ff0f0 100644 --- a/inference_models/inference_models/models/vit/vit_classification_trt.py +++ b/inference_models/inference_models/models/vit/vit_classification_trt.py @@ -40,6 +40,8 @@ pre_process_network_input, ) from inference_models.models.common.trt import ( + TRTCudaGraphCache, + establish_trt_cuda_graph_cache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, @@ -81,6 +83,8 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, + trt_cuda_graph_cache: 
Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "VITForClassificationTRT": if device.type != "cuda": @@ -147,6 +151,10 @@ def from_pretrained( message=f"Implementation assume single model output, found: {len(outputs)}.", help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, input_name=inputs[0], @@ -157,6 +165,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -170,6 +179,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine self._input_name = input_name @@ -180,6 +190,7 @@ def __init__( self._device = device self._cuda_context = cuda_context self._execution_context = execution_context + self._trt_cuda_graph_cache = trt_cuda_graph_cache self._lock = Lock() self._inference_stream = torch.cuda.Stream(device=self._device) self._thread_local_storage = threading.local() @@ -210,8 +221,10 @@ def pre_process( def forward( self, pre_processed_images: PreprocessedInputs, + disable_cuda_graphs: bool = False, **kwargs, ) -> torch.Tensor: + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with self._lock: with use_cuda_context(context=self._cuda_context): return infer_from_trt_engine( @@ -223,6 +236,7 @@ def forward( input_name=self._input_name, outputs=self._output_names, stream=self._inference_stream, + trt_cuda_graph_cache=cache, )[0] def post_process( @@ -270,6 +284,8 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, + 
trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "VITForMultiLabelClassificationTRT": if device.type != "cuda": @@ -336,6 +352,10 @@ def from_pretrained( message=f"Implementation assume single model output, found: {len(outputs)}.", help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, input_name=inputs[0], @@ -346,6 +366,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -359,6 +380,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine self._input_name = input_name @@ -369,6 +391,7 @@ def __init__( self._device = device self._cuda_context = cuda_context self._execution_context = execution_context + self._trt_cuda_graph_cache = trt_cuda_graph_cache self._lock = Lock() self._inference_stream = torch.cuda.Stream(device=self._device) self._thread_local_storage = threading.local() @@ -399,8 +422,10 @@ def pre_process( def forward( self, pre_processed_images: PreprocessedInputs, + disable_cuda_graphs: bool = False, **kwargs, ) -> torch.Tensor: + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with self._lock: with use_cuda_context(context=self._cuda_context): return infer_from_trt_engine( @@ -412,6 +437,7 @@ def forward( input_name=self._input_name, outputs=self._output_names, stream=self._inference_stream, + trt_cuda_graph_cache=cache, )[0] def post_process( diff --git a/inference_models/inference_models/models/yolact/yolact_instance_segmentation_trt.py 
b/inference_models/inference_models/models/yolact/yolact_instance_segmentation_trt.py index ab2f2648b4..dfdfaf4a29 100644 --- a/inference_models/inference_models/models/yolact/yolact_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/yolact/yolact_instance_segmentation_trt.py @@ -46,6 +46,8 @@ pre_process_network_input, ) from inference_models.models.common.trt import ( + TRTCudaGraphCache, + establish_trt_cuda_graph_cache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, @@ -93,6 +95,8 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "YOLOACTForInstanceSegmentationTRT": if device.type != "cuda": @@ -154,6 +158,10 @@ def from_pretrained( message=f"Implementation assume 5 model outputs, found: {len(outputs)}.", help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, input_name=inputs[0], @@ -164,6 +172,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -177,6 +186,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine self._input_name = input_name @@ -187,6 +197,7 @@ def __init__( self._device = device self._cuda_context = cuda_context self._execution_context = execution_context + self._trt_cuda_graph_cache = trt_cuda_graph_cache self._lock = Lock() self._inference_stream = torch.cuda.Stream(device=self._device) 
self._thread_local_storage = threading.local() @@ -217,8 +228,10 @@ def pre_process( def forward( self, pre_processed_images: torch.Tensor, + disable_cuda_graphs: bool = False, **kwargs, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with self._lock: with use_cuda_context(context=self._cuda_context): ( @@ -239,6 +252,7 @@ def forward( input_name=self._input_name, outputs=self._output_names, stream=self._inference_stream, + trt_cuda_graph_cache=cache, ) ) all_loc_data.append(loc_data) diff --git a/inference_models/inference_models/models/yolo26/yolo26_instance_segmentation_trt.py b/inference_models/inference_models/models/yolo26/yolo26_instance_segmentation_trt.py index ca2cbf454f..cf26334653 100644 --- a/inference_models/inference_models/models/yolo26/yolo26_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/yolo26/yolo26_instance_segmentation_trt.py @@ -44,6 +44,8 @@ pre_process_network_input, ) from inference_models.models.common.trt import ( + TRTCudaGraphCache, + establish_trt_cuda_graph_cache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, @@ -89,6 +91,8 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "YOLO26ForInstanceSegmentationTRT": if device.type != "cuda": @@ -155,6 +159,10 @@ def from_pretrained( message=f"Expected model outputs to be named `output0` and `output1`, but found: {outputs}.", help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, 
input_name=inputs[0], @@ -165,6 +173,7 @@ def from_pretrained( device=device, execution_context=execution_context, cuda_context=cuda_context, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -178,6 +187,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine self._input_name = input_name @@ -188,6 +198,7 @@ def __init__( self._device = device self._cuda_context = cuda_context self._execution_context = execution_context + self._trt_cuda_graph_cache = trt_cuda_graph_cache self._session_thread_lock = Lock() self._inference_stream = torch.cuda.Stream(device=self._device) self._thread_local_storage = threading.local() @@ -218,8 +229,10 @@ def pre_process( def forward( self, pre_processed_images: torch.Tensor, + disable_cuda_graphs: bool = False, **kwargs, ) -> Tuple[torch.Tensor, torch.Tensor]: + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): instances, protos = infer_from_trt_engine( @@ -231,6 +244,7 @@ def forward( input_name=self._input_name, outputs=self._output_names, stream=self._inference_stream, + trt_cuda_graph_cache=cache, ) return instances, protos diff --git a/inference_models/inference_models/models/yolo26/yolo26_key_points_detection_trt.py b/inference_models/inference_models/models/yolo26/yolo26_key_points_detection_trt.py index 5dd7bdc141..ee944775cc 100644 --- a/inference_models/inference_models/models/yolo26/yolo26_key_points_detection_trt.py +++ b/inference_models/inference_models/models/yolo26/yolo26_key_points_detection_trt.py @@ -45,6 +45,8 @@ pre_process_network_input, ) from inference_models.models.common.trt import ( + TRTCudaGraphCache, + establish_trt_cuda_graph_cache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, @@ -88,6 +90,8 @@ def from_pretrained( 
model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "YOLO26ForKeyPointsDetectionTRT": if device.type != "cuda": @@ -153,6 +157,10 @@ def from_pretrained( message=f"Implementation assume single model output, found: {len(outputs)}.", help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, input_name=inputs[0], @@ -165,6 +173,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -180,12 +189,14 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine self._input_name = input_name self._output_names = [output_name] self._cuda_context = cuda_context self._execution_context = execution_context + self._trt_cuda_graph_cache = trt_cuda_graph_cache self._class_names = class_names self._skeletons = skeletons self._inference_config = inference_config @@ -193,7 +204,7 @@ def __init__( self._trt_config = trt_config self._device = device self._session_thread_lock = Lock() self._parsed_key_points_metadata = parsed_key_points_metadata self._key_points_classes_for_instances = torch.tensor( [len(e) for e in self._parsed_key_points_metadata], device=device ) @@ -237,8 +247,10 @@ def pre_process( def forward( self, pre_processed_images: torch.Tensor, + disable_cuda_graphs: bool = False, **kwargs, ) -> torch.Tensor: + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with self._session_thread_lock: with
use_cuda_context(context=self._cuda_context): return infer_from_trt_engine( @@ -250,6 +262,7 @@ def forward( input_name=self._input_name, outputs=self._output_names, stream=self._inference_stream, + trt_cuda_graph_cache=cache, )[0] def post_process( diff --git a/inference_models/inference_models/models/yolo26/yolo26_object_detection_trt.py b/inference_models/inference_models/models/yolo26/yolo26_object_detection_trt.py index f7c299aa9a..b87666d40d 100644 --- a/inference_models/inference_models/models/yolo26/yolo26_object_detection_trt.py +++ b/inference_models/inference_models/models/yolo26/yolo26_object_detection_trt.py @@ -37,6 +37,8 @@ pre_process_network_input, ) from inference_models.models.common.trt import ( + TRTCudaGraphCache, + establish_trt_cuda_graph_cache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, @@ -80,6 +82,8 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "YOLO26ForObjectDetectionTRT": if device.type != "cuda": @@ -141,6 +145,10 @@ def from_pretrained( message=f"Implementation assume single model output, found: {len(outputs)}.", help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, input_name=inputs[0], @@ -151,6 +159,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -164,6 +173,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine 
self._input_name = input_name @@ -174,6 +184,7 @@ def __init__( self._device = device self._cuda_context = cuda_context self._execution_context = execution_context + self._trt_cuda_graph_cache = trt_cuda_graph_cache self._lock = threading.Lock() self._inference_stream = torch.cuda.Stream(device=self._device) self._thread_local_storage = threading.local() @@ -204,8 +215,10 @@ def pre_process( def forward( self, pre_processed_images: torch.Tensor, + disable_cuda_graphs: bool = False, **kwargs, ) -> torch.Tensor: + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with self._lock: with use_cuda_context(context=self._cuda_context): return infer_from_trt_engine( @@ -217,6 +230,7 @@ def forward( input_name=self._input_name, outputs=self._output_names, stream=self._inference_stream, + trt_cuda_graph_cache=cache, )[0] def post_process( diff --git a/inference_models/inference_models/models/yolonas/yolonas_object_detection_trt.py b/inference_models/inference_models/models/yolonas/yolonas_object_detection_trt.py index fd8c3c1c59..d74bcc3cb5 100644 --- a/inference_models/inference_models/models/yolonas/yolonas_object_detection_trt.py +++ b/inference_models/inference_models/models/yolonas/yolonas_object_detection_trt.py @@ -38,6 +38,8 @@ pre_process_network_input, ) from inference_models.models.common.trt import ( + TRTCudaGraphCache, + establish_trt_cuda_graph_cache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, @@ -83,6 +85,8 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "YOLONasForObjectDetectionTRT": if device.type != "cuda": @@ -155,6 +159,10 @@ def from_pretrained( help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) # git rid of outputs order and names verification, as 
YOLO-NAS clearly produces different outputs + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, input_name=inputs[0], @@ -165,6 +173,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -178,6 +187,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine self._input_name = input_name @@ -188,6 +198,7 @@ def __init__( self._device = device self._cuda_context = cuda_context self._execution_context = execution_context + self._trt_cuda_graph_cache = trt_cuda_graph_cache self._session_thread_lock = Lock() self._inference_stream = torch.cuda.Stream(device=self._device) self._thread_local_storage = threading.local() @@ -215,7 +226,13 @@ def pre_process( self._pre_process_stream.synchronize() return pre_processed_images, pre_processing_meta - def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: + def forward( + self, + pre_processed_images: torch.Tensor, + disable_cuda_graphs: bool = False, + **kwargs, + ) -> torch.Tensor: + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): results = infer_from_trt_engine( @@ -227,6 +244,7 @@ def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: input_name=self._input_name, outputs=self._output_names, stream=self._inference_stream, + trt_cuda_graph_cache=cache, ) return torch.cat(results, dim=-1) diff --git a/inference_models/inference_models/models/yolov10/yolov10_object_detection_trt.py b/inference_models/inference_models/models/yolov10/yolov10_object_detection_trt.py index fb1ec11c73..0950f3fd5a 
100644 --- a/inference_models/inference_models/models/yolov10/yolov10_object_detection_trt.py +++ b/inference_models/inference_models/models/yolov10/yolov10_object_detection_trt.py @@ -38,6 +38,8 @@ pre_process_network_input, ) from inference_models.models.common.trt import ( + TRTCudaGraphCache, + establish_trt_cuda_graph_cache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, @@ -80,6 +82,8 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "YOLOv10ForObjectDetectionTRT": if device.type != "cuda": @@ -141,6 +145,10 @@ def from_pretrained( message=f"Implementation assume single model output, found: {len(outputs)}.", help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, input_name=inputs[0], @@ -151,6 +159,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -164,6 +173,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine self._input_name = input_name @@ -174,6 +184,7 @@ def __init__( self._device = device self._cuda_context = cuda_context self._execution_context = execution_context + self._trt_cuda_graph_cache = trt_cuda_graph_cache self._session_thread_lock = Lock() self._inference_stream = torch.cuda.Stream(device=self._device) self._thread_local_storage = threading.local() @@ -204,8 +215,10 @@ def pre_process( def forward( self, pre_processed_images: 
torch.Tensor, + disable_cuda_graphs: bool = False, **kwargs, ) -> torch.Tensor: + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): return infer_from_trt_engine( @@ -217,6 +230,7 @@ def forward( input_name=self._input_name, outputs=self._output_names, stream=self._inference_stream, + trt_cuda_graph_cache=cache, )[0] def post_process( diff --git a/inference_models/inference_models/models/yolov5/yolov5_instance_segmentation_trt.py b/inference_models/inference_models/models/yolov5/yolov5_instance_segmentation_trt.py index 71da6f20d7..f3b7af3559 100644 --- a/inference_models/inference_models/models/yolov5/yolov5_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/yolov5/yolov5_instance_segmentation_trt.py @@ -46,6 +46,8 @@ pre_process_network_input, ) from inference_models.models.common.trt import ( + TRTCudaGraphCache, + establish_trt_cuda_graph_cache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, @@ -92,6 +94,8 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "YOLOv5ForInstanceSegmentationTRT": if device.type != "cuda": @@ -158,6 +162,10 @@ def from_pretrained( message=f"Expected model outputs to be named `output0` and `output1`, but found: {outputs}.", help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, input_name=inputs[0], @@ -168,6 +176,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, + 
trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -181,6 +190,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine self._input_name = input_name @@ -191,6 +201,7 @@ def __init__( self._device = device self._cuda_context = cuda_context self._execution_context = execution_context + self._trt_cuda_graph_cache = trt_cuda_graph_cache self._session_thread_lock = Lock() self._inference_stream = torch.cuda.Stream(device=self._device) self._thread_local_storage = threading.local() @@ -219,8 +230,12 @@ def pre_process( return pre_processed_images, pre_processing_meta def forward( - self, pre_processed_images: torch.Tensor, **kwargs + self, + pre_processed_images: torch.Tensor, + disable_cuda_graphs: bool = False, + **kwargs, ) -> Tuple[torch.Tensor, torch.Tensor]: + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): instances, protos = infer_from_trt_engine( @@ -232,6 +247,7 @@ def forward( input_name=self._input_name, outputs=self._output_names, stream=self._inference_stream, + trt_cuda_graph_cache=cache, ) return instances, protos diff --git a/inference_models/inference_models/models/yolov5/yolov5_object_detection_trt.py b/inference_models/inference_models/models/yolov5/yolov5_object_detection_trt.py index d7f671afd1..c61078e3f9 100644 --- a/inference_models/inference_models/models/yolov5/yolov5_object_detection_trt.py +++ b/inference_models/inference_models/models/yolov5/yolov5_object_detection_trt.py @@ -38,6 +38,8 @@ pre_process_network_input, ) from inference_models.models.common.trt import ( + TRTCudaGraphCache, + establish_trt_cuda_graph_cache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, @@ -82,6 +84,8 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, 
engine_host_code_allowed: bool = False, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "YOLOv5ForObjectDetectionTRT": if device.type != "cuda": @@ -143,6 +147,10 @@ def from_pretrained( message=f"Implementation assume single model output, found: {len(outputs)}.", help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, input_name=inputs[0], @@ -153,6 +161,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -166,6 +175,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine self._input_name = input_name @@ -176,6 +186,7 @@ def __init__( self._device = device self._cuda_context = cuda_context self._execution_context = execution_context + self._trt_cuda_graph_cache = trt_cuda_graph_cache self._session_thread_lock = Lock() self._inference_stream = torch.cuda.Stream(device=self._device) self._thread_local_storage = threading.local() @@ -203,7 +214,13 @@ def pre_process( self._pre_process_stream.synchronize() return pre_processed_images, pre_processing_meta - def forward(self, pre_processed_images: torch.Tensor, **kwargs) -> torch.Tensor: + def forward( + self, + pre_processed_images: torch.Tensor, + disable_cuda_graphs: bool = False, + **kwargs, + ) -> torch.Tensor: + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): return infer_from_trt_engine( @@ -215,6 +232,7 @@ def forward(self, pre_processed_images: 
torch.Tensor, **kwargs) -> torch.Tensor: input_name=self._input_name, outputs=self._output_names, stream=self._inference_stream, + trt_cuda_graph_cache=cache, )[0] def post_process( diff --git a/inference_models/inference_models/models/yolov7/yolov7_instance_segmentation_trt.py b/inference_models/inference_models/models/yolov7/yolov7_instance_segmentation_trt.py index 9d8090b34e..044295646f 100644 --- a/inference_models/inference_models/models/yolov7/yolov7_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/yolov7/yolov7_instance_segmentation_trt.py @@ -47,6 +47,8 @@ pre_process_network_input, ) from inference_models.models.common.trt import ( + TRTCudaGraphCache, + establish_trt_cuda_graph_cache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, @@ -92,6 +94,8 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "YOLOv7ForInstanceSegmentationTRT": if device.type != "cuda": @@ -154,6 +158,10 @@ def from_pretrained( help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) output_tensors = [outputs[0], outputs[4]] + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, input_name=inputs[0], @@ -164,6 +172,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -177,6 +186,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine self._input_name = input_name @@ -189,6 +199,7 @@ def __init__( 
self._execution_context = execution_context self._session_thread_lock = Lock() self._inference_stream = torch.cuda.Stream(device=self._device) + self._trt_cuda_graph_cache = trt_cuda_graph_cache self._thread_local_storage = threading.local() @property @@ -215,8 +226,12 @@ def pre_process( return pre_processed_images, pre_processing_meta def forward( - self, pre_processed_images: torch.Tensor, **kwargs + self, + pre_processed_images: torch.Tensor, + disable_cuda_graphs: bool = False, + **kwargs, ) -> Tuple[torch.Tensor, torch.Tensor]: + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): instances, protos = infer_from_trt_engine( @@ -228,6 +243,7 @@ def forward( input_name=self._input_name, outputs=self._output_tensors, stream=self._inference_stream, + trt_cuda_graph_cache=cache, ) return instances, protos diff --git a/inference_models/inference_models/models/yolov8/yolov8_instance_segmentation_trt.py b/inference_models/inference_models/models/yolov8/yolov8_instance_segmentation_trt.py index 56c430ccf5..28aa7f5b39 100644 --- a/inference_models/inference_models/models/yolov8/yolov8_instance_segmentation_trt.py +++ b/inference_models/inference_models/models/yolov8/yolov8_instance_segmentation_trt.py @@ -48,6 +48,8 @@ pre_process_network_input, ) from inference_models.models.common.trt import ( + TRTCudaGraphCache, + establish_trt_cuda_graph_cache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, @@ -93,6 +95,8 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "YOLOv8ForInstanceSegmentationTRT": if device.type != "cuda": @@ -164,6 +168,10 @@ def from_pretrained( message=f"Expected model outputs to be named `output0` and `output1`, but found: 
{outputs}.", help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, input_name=inputs[0], @@ -174,6 +182,7 @@ def from_pretrained( device=device, execution_context=execution_context, cuda_context=cuda_context, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -187,6 +196,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine self._input_name = input_name @@ -200,6 +210,7 @@ def __init__( self._session_thread_lock = Lock() self._inference_stream = torch.cuda.Stream(device=self._device) self._thread_local_storage = threading.local() + self._trt_cuda_graph_cache = trt_cuda_graph_cache @property def class_names(self) -> List[str]: @@ -227,8 +238,10 @@ def pre_process( def forward( self, pre_processed_images: torch.Tensor, + disable_cuda_graphs: bool = False, **kwargs, ) -> Tuple[torch.Tensor, torch.Tensor]: + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): instances, protos = infer_from_trt_engine( @@ -240,6 +253,7 @@ def forward( input_name=self._input_name, outputs=self._output_names, stream=self._inference_stream, + trt_cuda_graph_cache=cache, ) return instances, protos diff --git a/inference_models/inference_models/models/yolov8/yolov8_key_points_detection_trt.py b/inference_models/inference_models/models/yolov8/yolov8_key_points_detection_trt.py index 4adf21965b..cb98489a0c 100644 --- a/inference_models/inference_models/models/yolov8/yolov8_key_points_detection_trt.py +++ b/inference_models/inference_models/models/yolov8/yolov8_key_points_detection_trt.py @@ -49,6 +49,8 @@ 
pre_process_network_input, ) from inference_models.models.common.trt import ( + TRTCudaGraphCache, + establish_trt_cuda_graph_cache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, @@ -92,6 +94,8 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "YOLOv8ForKeyPointsDetectionTRT": if device.type != "cuda": @@ -162,6 +166,10 @@ def from_pretrained( message=f"Implementation assume single model output, found: {len(outputs)}.", help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, input_name=inputs[0], @@ -174,6 +182,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -189,12 +198,14 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine self._input_name = input_name self._output_names = [output_name] self._cuda_context = cuda_context self._execution_context = execution_context + self._trt_cuda_graph_cache = trt_cuda_graph_cache self._class_names = class_names self._skeletons = skeletons self._inference_config = inference_config @@ -202,7 +213,6 @@ def __init__( self._trt_config = trt_config self._device = device self._session_thread_lock = Lock() - self._parsed_key_points_metadata = parsed_key_points_metadata self._key_points_classes_for_instances = torch.tensor( [len(e) for e in self._parsed_key_points_metadata], device=device ) @@ -246,8 +256,10 @@ def pre_process( 
def forward( self, pre_processed_images: torch.Tensor, + disable_cuda_graphs: bool = False, **kwargs, ) -> torch.Tensor: + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with self._session_thread_lock: with use_cuda_context(context=self._cuda_context): return infer_from_trt_engine( @@ -259,6 +271,7 @@ def forward( input_name=self._input_name, outputs=self._output_names, stream=self._inference_stream, + trt_cuda_graph_cache=cache, )[0] def post_process( diff --git a/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py b/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py index 1e5827b827..1c099fd67d 100644 --- a/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py +++ b/inference_models/inference_models/models/yolov8/yolov8_object_detection_trt.py @@ -11,7 +11,6 @@ INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_CONFIDENCE, INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_IOU_THRESHOLD, INFERENCE_MODELS_YOLO_ULTRALYTICS_DEFAULT_MAX_DETECTIONS, - USE_CUDA_GRAPHS_FOR_TRT_BACKEND, ) from inference_models.entities import ColorFormat from inference_models.errors import ( @@ -42,7 +41,8 @@ pre_process_network_input, ) from inference_models.models.common.trt import ( - TRTCudaGraphLRUCache, + TRTCudaGraphCache, + establish_trt_cuda_graph_cache, get_trt_engine_inputs_and_outputs, infer_from_trt_engine, load_trt_model, @@ -86,7 +86,8 @@ def from_pretrained( model_name_or_path: str, device: torch.device = DEFAULT_DEVICE, engine_host_code_allowed: bool = False, - cuda_graph_cache_capacity: int = 64, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache] = None, + default_trt_cuda_graph_cache_size: int = 8, **kwargs, ) -> "YOLOv8ForObjectDetectionTRT": if device.type != "cuda": @@ -153,6 +154,10 @@ def from_pretrained( message=f"Implementation assume single model output, found: {len(outputs)}.", 
help_url="https://inference-models.roboflow.com/errors/model-loading/#corruptedmodelpackageerror", ) + trt_cuda_graph_cache = establish_trt_cuda_graph_cache( + default_cuda_graph_cache_size=default_trt_cuda_graph_cache_size, + cuda_graph_cache=trt_cuda_graph_cache, + ) return cls( engine=engine, input_name=inputs[0], @@ -163,7 +168,7 @@ def from_pretrained( device=device, cuda_context=cuda_context, execution_context=execution_context, - cuda_graph_cache_capacity=cuda_graph_cache_capacity, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) def __init__( @@ -177,7 +182,7 @@ def __init__( device: torch.device, cuda_context: cuda.Context, execution_context: trt.IExecutionContext, - cuda_graph_cache_capacity: int = 16, + trt_cuda_graph_cache: Optional[TRTCudaGraphCache], ): self._engine = engine self._input_name = input_name @@ -188,9 +193,7 @@ def __init__( self._device = device self._cuda_context = cuda_context self._execution_context = execution_context - self._trt_cuda_graph_cache = TRTCudaGraphLRUCache( - capacity=cuda_graph_cache_capacity, - ) + self._trt_cuda_graph_cache = trt_cuda_graph_cache self._lock = threading.Lock() self._inference_stream = torch.cuda.Stream(device=self._device) self._thread_local_storage = threading.local() @@ -221,20 +224,17 @@ def pre_process( def forward( self, pre_processed_images: torch.Tensor, - use_cuda_graph: Optional[bool] = None, + disable_cuda_graphs: bool = False, **kwargs, ) -> torch.Tensor: - if use_cuda_graph is None: - use_cuda_graph = USE_CUDA_GRAPHS_FOR_TRT_BACKEND - - cache = self._trt_cuda_graph_cache if use_cuda_graph else None + cache = self._trt_cuda_graph_cache if not disable_cuda_graphs else None with self._lock: with use_cuda_context(context=self._cuda_context): return infer_from_trt_engine( pre_processed_images=pre_processed_images, trt_config=self._trt_config, engine=self._engine, - context=self._execution_context if not use_cuda_graph else None, + context=self._execution_context, device=self._device, 
input_name=self._input_name, outputs=self._output_names, diff --git a/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py b/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py index aee7dd7d1c..519bff72f4 100644 --- a/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py @@ -465,7 +465,7 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( bike_image_numpy: np.ndarray, ) -> None: from inference_models import AutoModel - from inference_models.models.common.trt import TRTCudaGraphLRUCache + from inference_models.models.common.trt import TRTCudaGraphCache model = AutoModel.from_pretrained( model_id_or_path=rfdetr_nano_t4_trt_package, @@ -478,7 +478,7 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( outputs = [] for pre_processed in [pre_processed_1, pre_processed_2]: no_graph = model.forward(pre_processed, use_cuda_graph=False) - model._trt_cuda_graph_cache = TRTCudaGraphLRUCache(capacity=16) + model._trt_cuda_graph_cache = TRTCudaGraphCache(capacity=16) capture_graph = model.forward(pre_processed, use_cuda_graph=True) replay_graph = model.forward(pre_processed, use_cuda_graph=True) @@ -508,6 +508,7 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( atol=1e-6, ) + @pytest.mark.slow @pytest.mark.trt_extras def test_trt_outputs_match_expected_shapes( @@ -515,7 +516,7 @@ def test_trt_outputs_match_expected_shapes( dog_image_numpy: np.ndarray, ) -> None: from inference_models import AutoModel - from inference_models.models.common.trt import TRTCudaGraphLRUCache + from inference_models.models.common.trt import TRTCudaGraphCache model = AutoModel.from_pretrained( model_id_or_path=rfdetr_nano_t4_trt_package, @@ -529,12 +530,12 @@ def test_trt_outputs_match_expected_shapes( assert output[0].shape == (1, 300, 4) assert output[1].shape == (1, 300, 91) - output = 
model.forward(pre_processed, use_cuda_graph=True) # capture + output = model.forward(pre_processed, use_cuda_graph=True) # capture assert output[0].shape == (1, 300, 4) assert output[1].shape == (1, 300, 91) - output = model.forward(pre_processed, use_cuda_graph=True) # replay + output = model.forward(pre_processed, use_cuda_graph=True) # replay assert output[0].shape == (1, 300, 4) - assert output[1].shape == (1, 300, 91) \ No newline at end of file + assert output[1].shape == (1, 300, 91) diff --git a/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py b/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py index ab6972febc..3dbbd8bb38 100644 --- a/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py @@ -265,7 +265,6 @@ def test_trt_package_torch_batch( assert 16050 <= predictions[1].mask.cpu().sum().item() <= 16100 - @pytest.mark.slow @pytest.mark.trt_extras def test_trt_cudagraph_output_matches_non_cudagraph_output( @@ -274,7 +273,7 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( dog_image_numpy: np.ndarray, ) -> None: from inference_models import AutoModel - from inference_models.models.common.trt import TRTCudaGraphLRUCache + from inference_models.models.common.trt import TRTCudaGraphCache model = AutoModel.from_pretrained( model_id_or_path=rfdetr_seg_nano_t4_trt_package, @@ -287,7 +286,7 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( outputs = [] for pre_processed in [pre_processed_1, pre_processed_2]: no_graph = model.forward(pre_processed, use_cuda_graph=False) - model._trt_cuda_graph_cache = TRTCudaGraphLRUCache(capacity=16) + model._trt_cuda_graph_cache = TRTCudaGraphCache(capacity=16) capture_graph = model.forward(pre_processed, use_cuda_graph=True) replay_graph = model.forward(pre_processed, use_cuda_graph=True) diff --git 
a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py index 2ba7b0d8d4..b03f3954fa 100644 --- a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py @@ -451,7 +451,6 @@ def test_trt_package_torch_batch( ) - @pytest.mark.slow @pytest.mark.trt_extras def test_trt_cudagraph_cache_reuses_previously_seen_input_shapes( @@ -459,7 +458,7 @@ def test_trt_cudagraph_cache_reuses_previously_seen_input_shapes( dog_image_numpy: np.ndarray, ) -> None: from inference_models import AutoModel - from inference_models.models.common.trt import TRTCudaGraphLRUCache + from inference_models.models.common.trt import TRTCudaGraphCache device = torch.device("cuda:0") model = AutoModel.from_pretrained( @@ -468,7 +467,7 @@ def test_trt_cudagraph_cache_reuses_previously_seen_input_shapes( ) pre_processed_single, _ = model.pre_process(dog_image_numpy) - model._trt_cuda_graph_cache = TRTCudaGraphLRUCache(capacity=16) + model._trt_cuda_graph_cache = TRTCudaGraphCache(capacity=16) seen_shapes = set() capture_outputs = {} @@ -506,7 +505,7 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( dog_image_numpy: np.ndarray, ) -> None: from inference_models import AutoModel - from inference_models.models.common.trt import TRTCudaGraphLRUCache + from inference_models.models.common.trt import TRTCudaGraphCache device = torch.device("cuda:0") model = AutoModel.from_pretrained( @@ -520,7 +519,7 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( no_graph = model.forward(batch, use_cuda_graph=False) - model._trt_cuda_graph_cache = TRTCudaGraphLRUCache(capacity=16) + model._trt_cuda_graph_cache = TRTCudaGraphCache(capacity=16) capture_graph = model.forward(batch, use_cuda_graph=True) replay_graph = 
model.forward(batch, use_cuda_graph=True) @@ -535,7 +534,7 @@ def test_trt_cudagraph_cache_eviction( dog_image_numpy: np.ndarray, ) -> None: from inference_models import AutoModel - from inference_models.models.common.trt import TRTCudaGraphLRUCache + from inference_models.models.common.trt import TRTCudaGraphCache device = torch.device("cuda:0") model = AutoModel.from_pretrained( @@ -545,7 +544,7 @@ def test_trt_cudagraph_cache_eviction( pre_processed_single, _ = model.pre_process(dog_image_numpy) capacity = 3 - model._trt_cuda_graph_cache = TRTCudaGraphLRUCache(capacity=capacity) + model._trt_cuda_graph_cache = TRTCudaGraphCache(capacity=capacity) cache = model._trt_cuda_graph_cache batch_sizes = [1, 2, 3] @@ -573,7 +572,11 @@ def test_trt_cudagraph_cache_eviction( model.forward(batch_5, use_cuda_graph=True) assert len(cache.cache) == capacity - key_3 = (tuple(pre_processed_single.repeat(3, 1, 1, 1).shape), batch_2.dtype, device) + key_3 = ( + tuple(pre_processed_single.repeat(3, 1, 1, 1).shape), + batch_2.dtype, + device, + ) assert key_3 not in cache.cache remaining_keys = list(cache.cache.keys()) From 44030c2b69371b87edf28f4046d024ba2c32e83d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20P=C4=99czek?= Date: Wed, 18 Mar 2026 18:17:52 +0100 Subject: [PATCH 38/50] Fix baseline TRT tests --- .../models/test_rfdetr_predictions_trt.py | 16 ++--- .../models/test_rfdetr_seg_predictions_trt.py | 10 ++-- ...yolov8_object_detection_predictions_trt.py | 59 +++++++++---------- 3 files changed, 43 insertions(+), 42 deletions(-) diff --git a/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py b/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py index 519bff72f4..d05af338e8 100644 --- a/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py @@ -467,9 +467,11 @@ def 
test_trt_cudagraph_output_matches_non_cudagraph_output( from inference_models import AutoModel from inference_models.models.common.trt import TRTCudaGraphCache + trt_cuda_graph_cache = TRTCudaGraphCache(capacity=16) model = AutoModel.from_pretrained( model_id_or_path=rfdetr_nano_t4_trt_package, device=torch.device("cuda:0"), + trt_cuda_graph_cache=trt_cuda_graph_cache, ) pre_processed_1, _ = model.pre_process(dog_image_numpy) @@ -477,10 +479,9 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( outputs = [] for pre_processed in [pre_processed_1, pre_processed_2]: - no_graph = model.forward(pre_processed, use_cuda_graph=False) - model._trt_cuda_graph_cache = TRTCudaGraphCache(capacity=16) - capture_graph = model.forward(pre_processed, use_cuda_graph=True) - replay_graph = model.forward(pre_processed, use_cuda_graph=True) + no_graph = model.forward(pre_processed, disable_cuda_graphs=True) + capture_graph = model.forward(pre_processed) + replay_graph = model.forward(pre_processed) outputs.append((no_graph, capture_graph, replay_graph)) @@ -518,6 +519,7 @@ def test_trt_outputs_match_expected_shapes( from inference_models import AutoModel from inference_models.models.common.trt import TRTCudaGraphCache + trt_cuda_graph_cache = TRTCudaGraphCache(capacity=16) model = AutoModel.from_pretrained( model_id_or_path=rfdetr_nano_t4_trt_package, device=torch.device("cuda:0"), @@ -525,17 +527,17 @@ def test_trt_outputs_match_expected_shapes( pre_processed, _ = model.pre_process(dog_image_numpy) - output = model.forward(pre_processed, use_cuda_graph=False) + output = model.forward(pre_processed, disable_cuda_graphs=True) assert output[0].shape == (1, 300, 4) assert output[1].shape == (1, 300, 91) - output = model.forward(pre_processed, use_cuda_graph=True) # capture + output = model.forward(pre_processed) # capture assert output[0].shape == (1, 300, 4) assert output[1].shape == (1, 300, 91) - output = model.forward(pre_processed, use_cuda_graph=True) # replay + output = 
model.forward(pre_processed) # replay assert output[0].shape == (1, 300, 4) assert output[1].shape == (1, 300, 91) diff --git a/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py b/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py index 3dbbd8bb38..52644815d6 100644 --- a/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py @@ -275,9 +275,11 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( from inference_models import AutoModel from inference_models.models.common.trt import TRTCudaGraphCache + trt_cuda_graph_cache = TRTCudaGraphCache(capacity=16) model = AutoModel.from_pretrained( model_id_or_path=rfdetr_seg_nano_t4_trt_package, device=torch.device("cuda:0"), + trt_cuda_graph_cache=trt_cuda_graph_cache, ) pre_processed_1, _ = model.pre_process(snake_image_numpy) @@ -285,11 +287,9 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( outputs = [] for pre_processed in [pre_processed_1, pre_processed_2]: - no_graph = model.forward(pre_processed, use_cuda_graph=False) - model._trt_cuda_graph_cache = TRTCudaGraphCache(capacity=16) - capture_graph = model.forward(pre_processed, use_cuda_graph=True) - replay_graph = model.forward(pre_processed, use_cuda_graph=True) - + no_graph = model.forward(pre_processed, disable_cuda_graphs=True) + capture_graph = model.forward(pre_processed) + replay_graph = model.forward(pre_processed) outputs.append((no_graph, capture_graph, replay_graph)) for image_outputs in outputs: diff --git a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py index b03f3954fa..51248d45cd 100644 --- a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py +++ 
b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py @@ -461,13 +461,14 @@ def test_trt_cudagraph_cache_reuses_previously_seen_input_shapes( from inference_models.models.common.trt import TRTCudaGraphCache device = torch.device("cuda:0") + trt_cuda_graph_cache = TRTCudaGraphCache(capacity=16) model = AutoModel.from_pretrained( model_id_or_path=yolov8n_640_t4_trt_package, device=device, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) pre_processed_single, _ = model.pre_process(dog_image_numpy) - model._trt_cuda_graph_cache = TRTCudaGraphCache(capacity=16) seen_shapes = set() capture_outputs = {} @@ -477,14 +478,11 @@ def test_trt_cudagraph_cache_reuses_previously_seen_input_shapes( batch = pre_processed_single.repeat(batch_size, 1, 1, 1) cache_key = (tuple(batch.shape), batch.dtype, device) - cache_before = model._trt_cuda_graph_cache - cache_size_before = len(cache_before.cache) if cache_before is not None else 0 + cache_size_before = len(trt_cuda_graph_cache.cache) - output = model.forward(batch, use_cuda_graph=True) + output = model.forward(batch) - cache_after = model._trt_cuda_graph_cache - assert cache_after is not None - cache_size_after = len(cache_after.cache) + cache_size_after = len(trt_cuda_graph_cache.cache) if cache_key not in seen_shapes: assert cache_size_after == cache_size_before + 1 @@ -495,7 +493,7 @@ def test_trt_cudagraph_cache_reuses_previously_seen_input_shapes( assert cache_size_after == cache_size_before assert torch.allclose(capture_outputs[cache_key], output, atol=1e-6) - assert set(model._trt_cuda_graph_cache.cache.keys()) == seen_shapes + assert set(trt_cuda_graph_cache.cache.keys()) == seen_shapes @pytest.mark.slow @@ -508,20 +506,21 @@ def test_trt_cudagraph_output_matches_non_cudagraph_output( from inference_models.models.common.trt import TRTCudaGraphCache device = torch.device("cuda:0") + trt_cuda_graph_cache = TRTCudaGraphCache(capacity=16) model = AutoModel.from_pretrained( 
model_id_or_path=yolov8n_640_t4_trt_package, device=device, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) pre_processed_single, _ = model.pre_process(dog_image_numpy) for batch_size in [1, 4]: batch = pre_processed_single.repeat(batch_size, 1, 1, 1) - no_graph = model.forward(batch, use_cuda_graph=False) + no_graph = model.forward(batch, disable_cuda_graphs=True) - model._trt_cuda_graph_cache = TRTCudaGraphCache(capacity=16) - capture_graph = model.forward(batch, use_cuda_graph=True) - replay_graph = model.forward(batch, use_cuda_graph=True) + capture_graph = model.forward(batch) + replay_graph = model.forward(batch) assert torch.allclose(no_graph, capture_graph, atol=1e-6) assert torch.allclose(no_graph, replay_graph, atol=1e-6) @@ -537,53 +536,53 @@ def test_trt_cudagraph_cache_eviction( from inference_models.models.common.trt import TRTCudaGraphCache device = torch.device("cuda:0") + trt_cuda_graph_cache = TRTCudaGraphCache(capacity=3) model = AutoModel.from_pretrained( model_id_or_path=yolov8n_640_t4_trt_package, device=device, + trt_cuda_graph_cache=trt_cuda_graph_cache, ) pre_processed_single, _ = model.pre_process(dog_image_numpy) - capacity = 3 - model._trt_cuda_graph_cache = TRTCudaGraphCache(capacity=capacity) - cache = model._trt_cuda_graph_cache batch_sizes = [1, 2, 3] for bs in batch_sizes: batch = pre_processed_single.repeat(bs, 1, 1, 1) - model.forward(batch, use_cuda_graph=True) + model.forward(batch) - assert len(cache.cache) == capacity - keys_before = list(cache.cache.keys()) + assert len(trt_cuda_graph_cache.cache) == 3 + keys_before = list(trt_cuda_graph_cache.list_keys()) batch_4 = pre_processed_single.repeat(4, 1, 1, 1) - model.forward(batch_4, use_cuda_graph=True) + model.forward(batch_4) - assert len(cache.cache) == capacity - assert keys_before[0] not in cache.cache + assert len(trt_cuda_graph_cache.cache) == 3 + keys_after = trt_cuda_graph_cache.list_keys() + assert keys_before[0] not in keys_after for key in keys_before[1:]: - assert 
key in cache.cache + assert key in keys_after key_4 = (tuple(batch_4.shape), batch_4.dtype, device) - assert key_4 in cache.cache + assert key_4 in trt_cuda_graph_cache batch_2 = pre_processed_single.repeat(2, 1, 1, 1) - model.forward(batch_2, use_cuda_graph=True) + model.forward(batch_2) batch_5 = pre_processed_single.repeat(5, 1, 1, 1) - model.forward(batch_5, use_cuda_graph=True) + model.forward(batch_5) - assert len(cache.cache) == capacity + assert trt_cuda_graph_cache.get_current_size() == 3 key_3 = ( tuple(pre_processed_single.repeat(3, 1, 1, 1).shape), batch_2.dtype, device, ) - assert key_3 not in cache.cache + remaining_keys = trt_cuda_graph_cache.list_keys() + assert key_3 not in remaining_keys - remaining_keys = list(cache.cache.keys()) key_2 = (tuple(batch_2.shape), batch_2.dtype, device) key_5 = (tuple(batch_5.shape), batch_5.dtype, device) assert remaining_keys == [key_4, key_2, key_5] - no_graph = model.forward(batch_5, use_cuda_graph=False) - replay = model.forward(batch_5, use_cuda_graph=True) + no_graph = model.forward(batch_5, disable_cuda_graphs=True) + replay = model.forward(batch_5) assert torch.allclose(no_graph, replay, atol=1e-6) From 917def0f05674e7b90356c2d22525ed4a357f7a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20P=C4=99czek?= Date: Wed, 18 Mar 2026 18:18:47 +0100 Subject: [PATCH 39/50] Bump version --- inference_models/pyproject.toml | 2 +- inference_models/uv.lock | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/inference_models/pyproject.toml b/inference_models/pyproject.toml index 37aedc9c11..e90907440c 100644 --- a/inference_models/pyproject.toml +++ b/inference_models/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "inference-models" -version = "0.20.2" +version = "0.21.0rc1" description = "The new inference engine for Computer Vision models" readme = "README.md" requires-python = ">=3.10,<3.13" diff --git a/inference_models/uv.lock b/inference_models/uv.lock index f539a595de..3470775db6 100644 --- 
a/inference_models/uv.lock +++ b/inference_models/uv.lock @@ -916,7 +916,7 @@ wheels = [ [[package]] name = "inference-models" -version = "0.20.2" +version = "0.21.0rc1" source = { virtual = "." } dependencies = [ { name = "accelerate" }, From 002a4e451de532e64f741f288e31a4f8f48f994e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20P=C4=99czek?= Date: Wed, 18 Mar 2026 18:34:50 +0100 Subject: [PATCH 40/50] Extend tests with multi-forward-pass cases to see if predictions matches with cuda graphs used --- ...ation_tests_inference_experimental_gpu.yml | 12 ++-- .../test_resnet_classifier_predictions_trt.py | 48 +++++++++++++ .../models/test_rfdetr_predictions_trt.py | 65 +++++++++++++++++ .../models/test_rfdetr_seg_predictions_trt.py | 42 +++++++++++ .../test_vit_classifier_predictions_trt.py | 48 +++++++++++++ ...6_instance_segmentation_predictions_trt.py | 42 +++++++++++ ...o26_keypoints_detection_predictions_trt.py | 44 ++++++++++++ ...yolo26_object_detection_predictions_trt.py | 69 +++++++++++++++++++ .../models/test_yolonas_predictions_trt.py | 69 +++++++++++++++++++ ...olov10_object_detection_predictions_trt.py | 41 +++++++++++ ...8_instance_segmentation_predictions_trt.py | 42 +++++++++++ ...ov8_keypoints_detection_predictions_trt.py | 42 +++++++++++ ...yolov8_object_detection_predictions_trt.py | 69 ++++++++++++++++++- 13 files changed, 626 insertions(+), 7 deletions(-) diff --git a/.github/workflows/integration_tests_inference_experimental_gpu.yml b/.github/workflows/integration_tests_inference_experimental_gpu.yml index af092f41f8..12b9f240dc 100644 --- a/.github/workflows/integration_tests_inference_experimental_gpu.yml +++ b/.github/workflows/integration_tests_inference_experimental_gpu.yml @@ -15,6 +15,7 @@ on: - '' - onnx_extras - trt_extras + - trt_extras_with_cuda_graphs - torch_models - hf_vlm_models python_version: @@ -34,10 +35,11 @@ jobs: matrix: python-version: ["3.12"] extras: - - { install: "onnx-cu12,mediapipe", marker: "onnx_extras", 
workers: "auto" } - - { install: "trt10", marker: "trt_extras", workers: "auto" } - - { install: "torch-cu124,mediapipe", marker: "torch_models", workers: "1" } - - { install: "torch-cu124", marker: "hf_vlm_models", workers: "1" } + - { install: "onnx-cu12,mediapipe", marker: "onnx_extras", workers: "auto", enable_auto_cuda_graphs_for_trt: "false" } + - { install: "trt10", marker: "trt_extras", workers: "auto", enable_auto_cuda_graphs_for_trt: "false" } + - { install: "trt10", marker: "trt_extras_with_cuda_graphs", workers: "auto", enable_auto_cuda_graphs_for_trt: "true" } + - { install: "torch-cu124,mediapipe", marker: "torch_models", workers: "1", "enable_auto_cuda_graphs_for_trt": "false" } + - { install: "torch-cu124", marker: "hf_vlm_models", workers: "1", "enable_auto_cuda_graphs_for_trt": "false" } steps: - name: 🛎️ Checkout if: ${{ (github.event.inputs.extras == '' || github.event.inputs.extras == matrix.extras.marker) && (github.event.inputs.python_version == '' || github.event.inputs.python_version == matrix.python-version) }} @@ -107,4 +109,4 @@ jobs: timeout-minutes: 25 run: | source .venv/bin/activate - python -m pytest -n ${{ matrix.extras.workers }} -m "${{ matrix.extras.marker }} and not cpu_only" tests/integration_tests + ENABLE_AUTO_CUDA_GRAPHS_FOR_TRT_BACKEND=${{ matrix.extras.enable_auto_cuda_graphs_for_trt }} python -m pytest -n ${{ matrix.extras.workers }} -m "${{ matrix.extras.marker }} and not cpu_only" tests/integration_tests diff --git a/inference_models/tests/integration_tests/models/test_resnet_classifier_predictions_trt.py b/inference_models/tests/integration_tests/models/test_resnet_classifier_predictions_trt.py index 528135627d..9c0b059f6e 100644 --- a/inference_models/tests/integration_tests/models/test_resnet_classifier_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_resnet_classifier_predictions_trt.py @@ -73,6 +73,30 @@ def test_single_label_trt_package_torch( assert abs(predictions.confidence[0, 
2].item() - 0.9999516010284424) < 1e-3 +@pytest.mark.slow +@pytest.mark.trt_extras +def test_single_label_trt_package_torch_multiple_predictions_in_row( + resnet_single_label_cls_trt_package: str, + bike_image_torch: np.ndarray, +) -> None: + # given + from inference_models.models.resnet.resnet_classification_trt import ( + ResNetForClassificationTRT, + ) + + model = ResNetForClassificationTRT.from_pretrained( + model_name_or_path=resnet_single_label_cls_trt_package, + engine_host_code_allowed=True, + ) + + for _ in range(8): + # when + predictions = model(bike_image_torch) + + # then + assert abs(predictions.confidence[0, 2].item() - 0.9999516010284424) < 1e-3 + + @pytest.mark.slow @pytest.mark.trt_extras def test_single_label_trt_package_torch_list( @@ -191,6 +215,30 @@ def test_multi_label_trt_package_torch( assert abs(predictions[0].confidence[2].item() - 0.99951171875) < 1e-3 +@pytest.mark.slow +@pytest.mark.trt_extras +def test_multi_label_trt_package_torch_multiple_predictions_in_row( + resnet_multi_label_cls_trt_package: str, + dog_image_torch: torch.Tensor, +) -> None: + # given + from inference_models.models.resnet.resnet_classification_trt import ( + ResNetForMultiLabelClassificationTRT, + ) + + model = ResNetForMultiLabelClassificationTRT.from_pretrained( + model_name_or_path=resnet_multi_label_cls_trt_package, + engine_host_code_allowed=True, + ) + + for _ in range(8): + # when + predictions = model(dog_image_torch) + + # then + assert abs(predictions[0].confidence[2].item() - 0.99951171875) < 1e-3 + + @pytest.mark.slow @pytest.mark.trt_extras def test_multi_label_trt_package_torch_list( diff --git a/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py b/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py index d05af338e8..feb75b1507 100644 --- a/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py +++ 
b/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py @@ -243,6 +243,71 @@ def test_trt_package_torch( ) +@pytest.mark.slow +@pytest.mark.trt_extras +def test_trt_package_torch_multiple_predictions_in_row( + rfdetr_coin_counting_trt_package: str, + coins_counting_image_torch: torch.Tensor, +) -> None: + # given + from inference_models.models.rfdetr.rfdetr_object_detection_trt import ( + RFDetrForObjectDetectionTRT, + ) + + model = RFDetrForObjectDetectionTRT.from_pretrained( + model_name_or_path=rfdetr_coin_counting_trt_package, + engine_host_code_allowed=True, + ) + + for _ in range(8): + # when + predictions = model(coins_counting_image_torch) + + # then + assert torch.allclose( + predictions[0].confidence.cpu(), + torch.tensor( + [ + 0.9815, + 0.9674, + 0.9638, + 0.9620, + 0.9584, + 0.9565, + 0.9560, + 0.9543, + 0.9520, + 0.9491, + ] + ).cpu(), + atol=0.01, + ) + assert torch.allclose( + predictions[0].class_id.cpu(), + torch.tensor([4, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=torch.int32).cpu(), + ) + expected_xyxy = torch.tensor( + [ + [1323, 533, 3071, 1970], + [1708, 2572, 1887, 2760], + [1172, 2635, 1372, 2850], + [1744, 2296, 1914, 2472], + [1464, 2305, 1627, 2475], + [1255, 2063, 1423, 2233], + [1091, 2354, 1253, 2524], + [1508, 1884, 1721, 2093], + [929, 1843, 1091, 2004], + [2681, 802, 2867, 976], + ], + dtype=torch.int32, + ) + assert torch.allclose( + predictions[0].xyxy.cpu(), + expected_xyxy.cpu(), + atol=5, + ) + + @pytest.mark.slow @pytest.mark.trt_extras def test_trt_package_torch_list( diff --git a/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py b/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py index 52644815d6..04befce4c7 100644 --- a/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_rfdetr_seg_predictions_trt.py @@ -145,6 +145,48 @@ def test_trt_package_torch( 
assert 16050 <= predictions[0].mask.cpu().sum().item() <= 16100 +@pytest.mark.slow +@pytest.mark.trt_extras +def test_trt_package_torch_multiple_predictions_in_row( + rfdetr_seg_asl_trt_package: str, + asl_image_torch: torch.Tensor, +) -> None: + # given + from inference_models.models.rfdetr.rfdetr_instance_segmentation_trt import ( + RFDetrForInstanceSegmentationTRT, + ) + + model = RFDetrForInstanceSegmentationTRT.from_pretrained( + model_name_or_path=rfdetr_seg_asl_trt_package, + engine_host_code_allowed=True, + ) + + for _ in range(8): + # when + predictions = model(asl_image_torch) + + # then + assert torch.allclose( + predictions[0].confidence.cpu(), + torch.tensor([0.9527]).cpu(), + atol=0.01, + ) + assert torch.allclose( + predictions[0].class_id.cpu(), + torch.tensor([20], dtype=torch.int32).cpu(), + ) + expected_xyxy = torch.tensor( + [[63, 173, 187, 374]], + dtype=torch.int32, + ) + assert torch.allclose( + predictions[0].xyxy.cpu(), + expected_xyxy.cpu(), + atol=5, + ) + assert 16050 <= predictions[0].mask.cpu().sum().item() <= 16100 + + @pytest.mark.slow @pytest.mark.trt_extras def test_trt_package_torch_list( diff --git a/inference_models/tests/integration_tests/models/test_vit_classifier_predictions_trt.py b/inference_models/tests/integration_tests/models/test_vit_classifier_predictions_trt.py index 5ea6481333..70b6985ae3 100644 --- a/inference_models/tests/integration_tests/models/test_vit_classifier_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_vit_classifier_predictions_trt.py @@ -73,6 +73,30 @@ def test_single_label_trt_package_torch( assert abs(predictions.confidence[0, 2].item() - 0.7300973534584045) < 2e-2 +@pytest.mark.slow +@pytest.mark.trt_extras +def test_single_label_trt_package_torch_multiple_predictions_in_row( + vit_single_label_cls_trt_package: str, + bike_image_torch: np.ndarray, +) -> None: + # given + from inference_models.models.vit.vit_classification_trt import ( + VITForClassificationTRT, + ) + + 
model = VITForClassificationTRT.from_pretrained( + model_name_or_path=vit_single_label_cls_trt_package, + engine_host_code_allowed=True, + ) + + for _ in range(8): + # when + predictions = model(bike_image_torch) + + # then + assert abs(predictions.confidence[0, 2].item() - 0.7300973534584045) < 2e-2 + + @pytest.mark.slow @pytest.mark.trt_extras def test_single_label_trt_package_torch_list( @@ -191,6 +215,30 @@ def test_multi_label_trt_package_torch( assert abs(predictions[0].confidence[2].item() - 0.833984375) < 1e-3 +@pytest.mark.slow +@pytest.mark.trt_extras +def test_multi_label_trt_package_torch_multiple_predictions_in_row( + vit_multi_label_cls_trt_package: str, + dog_image_torch: torch.Tensor, +) -> None: + # given + from inference_models.models.vit.vit_classification_trt import ( + VITForMultiLabelClassificationTRT, + ) + + model = VITForMultiLabelClassificationTRT.from_pretrained( + model_name_or_path=vit_multi_label_cls_trt_package, + engine_host_code_allowed=True, + ) + + for _ in range(8): + # when + predictions = model(dog_image_torch) + + # then + assert abs(predictions[0].confidence[2].item() - 0.833984375) < 1e-3 + + @pytest.mark.slow @pytest.mark.trt_extras def test_multi_label_trt_package_torch_list( diff --git a/inference_models/tests/integration_tests/models/test_yolo26_instance_segmentation_predictions_trt.py b/inference_models/tests/integration_tests/models/test_yolo26_instance_segmentation_predictions_trt.py index 65873c080a..14f32ad0a9 100644 --- a/inference_models/tests/integration_tests/models/test_yolo26_instance_segmentation_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_yolo26_instance_segmentation_predictions_trt.py @@ -145,6 +145,48 @@ def test_trt_package_torch( assert 16500 <= predictions[0].mask.cpu().sum().item() <= 16600 +@pytest.mark.slow +@pytest.mark.trt_extras +def test_trt_package_torch_multiple_predictions_in_row( + yolo26_seg_asl_trt_package: str, + asl_image_torch: torch.Tensor, +) -> None: 
+ # given + from inference_models.models.yolo26.yolo26_instance_segmentation_trt import ( + YOLO26ForInstanceSegmentationTRT, + ) + + model = YOLO26ForInstanceSegmentationTRT.from_pretrained( + model_name_or_path=yolo26_seg_asl_trt_package, + engine_host_code_allowed=True, + ) + + # when + for _ in range(8): + predictions = model(asl_image_torch) + + # then + assert torch.allclose( + predictions[0].confidence.cpu(), + torch.tensor([0.9671]).cpu(), + atol=0.01, + ) + assert torch.allclose( + predictions[0].class_id.cpu(), + torch.tensor([20], dtype=torch.int32).cpu(), + ) + expected_xyxy = torch.tensor( + [[63, 174, 186, 368]], + dtype=torch.int32, + ) + assert torch.allclose( + predictions[0].xyxy.cpu(), + expected_xyxy.cpu(), + atol=5, + ) + assert 16500 <= predictions[0].mask.cpu().sum().item() <= 16600 + + @pytest.mark.slow @pytest.mark.trt_extras def test_trt_package_torch_list( diff --git a/inference_models/tests/integration_tests/models/test_yolo26_keypoints_detection_predictions_trt.py b/inference_models/tests/integration_tests/models/test_yolo26_keypoints_detection_predictions_trt.py index ce74b631f6..c4d8083077 100644 --- a/inference_models/tests/integration_tests/models/test_yolo26_keypoints_detection_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_yolo26_keypoints_detection_predictions_trt.py @@ -144,6 +144,50 @@ def test_trt_package_torch( assert abs(predictions[0][0].confidence.sum().item() - 26.268831253051758) < 1e-2 +@pytest.mark.slow +@pytest.mark.trt_extras +def test_trt_package_torch_multiple_predictions_in_row( + yolo26_pose_trt_package: str, + people_walking_image_torch: torch.Tensor, +) -> None: + # given + from inference_models.models.yolo26.yolo26_key_points_detection_trt import ( + YOLO26ForKeyPointsDetectionTRT, + ) + + model = YOLO26ForKeyPointsDetectionTRT.from_pretrained( + model_name_or_path=yolo26_pose_trt_package, + engine_host_code_allowed=True, + ) + + for _ in range(8): + # when + predictions = 
model(people_walking_image_torch) + + # then + assert torch.allclose( + predictions[1][0].confidence.cpu(), + torch.tensor([0.9271, 0.9230]).cpu(), + atol=0.01, + ) + assert torch.allclose( + predictions[1][0].class_id.cpu(), + torch.tensor([0, 0], dtype=torch.int32).cpu(), + ) + expected_xyxy = torch.tensor( + [[353, 129, 539, 758], [618, 123, 822, 771]], + dtype=torch.int32, + ) + assert torch.allclose( + predictions[1][0].xyxy.cpu(), + expected_xyxy.cpu(), + atol=5, + ) + assert ( + abs(predictions[0][0].confidence.sum().item() - 26.268831253051758) < 1e-2 + ) + + @pytest.mark.slow @pytest.mark.trt_extras def test_trt_package_torch_list( diff --git a/inference_models/tests/integration_tests/models/test_yolo26_object_detection_predictions_trt.py b/inference_models/tests/integration_tests/models/test_yolo26_object_detection_predictions_trt.py index ddd5823858..811f32f9cb 100644 --- a/inference_models/tests/integration_tests/models/test_yolo26_object_detection_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_yolo26_object_detection_predictions_trt.py @@ -247,6 +247,75 @@ def test_trt_package_torch( ) +@pytest.mark.slow +@pytest.mark.trt_extras +def test_trt_package_torch_multiple_predictions_in_row( + yolo26_object_detections_coin_counting_trt_package: str, + coins_counting_image_torch: torch.Tensor, +) -> None: + # given + from inference_models.models.yolo26.yolo26_object_detection_trt import ( + YOLO26ForObjectDetectionTRT, + ) + + model = YOLO26ForObjectDetectionTRT.from_pretrained( + model_name_or_path=yolo26_object_detections_coin_counting_trt_package, + engine_host_code_allowed=True, + ) + + # when + for _ in range(8): + predictions = model(coins_counting_image_torch) + + # then + assert torch.allclose( + predictions[0].confidence.cpu(), + torch.tensor( + [ + 0.9837, + 0.9707, + 0.9196, + 0.8495, + 0.8418, + 0.8408, + 0.5737, + 0.4922, + 0.4282, + 0.4273, + 0.2606, + ] + ).cpu(), + atol=0.01, + ) + assert torch.allclose( + 
predictions[0].class_id.cpu(), + torch.tensor([2, 2, 2, 1, 3, 0, 0, 0, 3, 1, 3], dtype=torch.int32).cpu(), + ) + expected_xyxy = torch.tensor( + [ + [ + [1252, 2049, 1431, 2241], + [1741, 2286, 1921, 2480], + [1707, 2565, 1896, 2770], + [1164, 2624, 1382, 2856], + [1502, 1867, 1728, 2096], + [1459, 2296, 1633, 2476], + [923, 1836, 1100, 2009], + [1090, 2346, 1268, 2525], + [1256, 2059, 1425, 2234], + [1164, 2626, 1381, 2857], + [2671, 792, 2875, 979], + ] + ], + dtype=torch.int32, + ) + assert torch.allclose( + predictions[0].xyxy.cpu(), + expected_xyxy.cpu(), + atol=5, + ) + + @pytest.mark.slow @pytest.mark.trt_extras def test_trt_package_torch_list( diff --git a/inference_models/tests/integration_tests/models/test_yolonas_predictions_trt.py b/inference_models/tests/integration_tests/models/test_yolonas_predictions_trt.py index 21dadc4b49..ce7a2c2a2e 100644 --- a/inference_models/tests/integration_tests/models/test_yolonas_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_yolonas_predictions_trt.py @@ -253,6 +253,75 @@ def test_trt_package_torch( ) +@pytest.mark.slow +@pytest.mark.trt_extras +def test_trt_package_torch_multiple_predictions_in_row( + yolo_nas_coin_counting_trt_package: str, + coins_counting_image_torch: torch.Tensor, +) -> None: + # given + from inference_models.models.yolonas.yolonas_object_detection_trt import ( + YOLONasForObjectDetectionTRT, + ) + + model = YOLONasForObjectDetectionTRT.from_pretrained( + model_name_or_path=yolo_nas_coin_counting_trt_package, + engine_host_code_allowed=True, + ) + + # when + for _ in range(8): + predictions = model(coins_counting_image_torch) + + # then + assert torch.allclose( + predictions[0].confidence.cpu(), + torch.tensor( + [ + 0.8929, + 0.8762, + 0.8625, + 0.8573, + 0.8434, + 0.7718, + 0.7705, + 0.7628, + 0.6723, + 0.6343, + 0.4533, + 0.4388, + ] + ).cpu(), + atol=0.01, + ) + assert torch.allclose( + predictions[0].class_id.cpu(), + torch.tensor([2, 1, 0, 0, 0, 0, 3, 3, 2, 2, 
0, 1], dtype=torch.int32).cpu(), + ) + expected_xyxy = torch.tensor( + [ + [1693, 2548, 1910, 2774], + [1161, 2618, 1389, 2868], + [1445, 2291, 1641, 2483], + [913, 1823, 1110, 2017], + [1080, 2334, 1275, 2537], + [1727, 2285, 1931, 2482], + [2664, 763, 2887, 1001], + [1491, 1862, 1740, 2101], + [1727, 2283, 1932, 2487], + [1238, 2041, 1438, 2243], + [1485, 1864, 1743, 2106], + [1236, 2040, 1439, 2245], + ], + dtype=torch.int32, + ) + assert torch.allclose( + predictions[0].xyxy.cpu(), + expected_xyxy.cpu(), + atol=5, + ) + + @pytest.mark.slow @pytest.mark.trt_extras def test_trt_package_torch_list( diff --git a/inference_models/tests/integration_tests/models/test_yolov10_object_detection_predictions_trt.py b/inference_models/tests/integration_tests/models/test_yolov10_object_detection_predictions_trt.py index 00159c653c..e35b16c3a5 100644 --- a/inference_models/tests/integration_tests/models/test_yolov10_object_detection_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_yolov10_object_detection_predictions_trt.py @@ -255,3 +255,44 @@ def test_trt_package_torch_batch( expected_xyxy.cpu(), atol=5, ) + + +@pytest.mark.slow +@pytest.mark.trt_extras +def test_trt_package_torch_multiple_predictions_in_row( + yolov10_object_detection_trt_package: str, + dog_image_torch: torch.Tensor, +) -> None: + # given + from inference_models.models.yolov10.yolov10_object_detection_trt import ( + YOLOv10ForObjectDetectionTRT, + ) + + model = YOLOv10ForObjectDetectionTRT.from_pretrained( + model_name_or_path=yolov10_object_detection_trt_package, + engine_host_code_allowed=True, + ) + + # when + for _ in range(8): + predictions = model(dog_image_torch) + + # then + assert torch.allclose( + predictions[0].confidence.cpu(), + torch.tensor([0.5039]).cpu(), + atol=0.01, + ) + assert torch.allclose( + predictions[0].class_id.cpu(), + torch.tensor([16], dtype=torch.int32).cpu(), + ) + expected_xyxy = torch.tensor( + [[71, 253, 646, 970]], + dtype=torch.int32, + ) + 
assert torch.allclose( + predictions[0].xyxy.cpu(), + expected_xyxy.cpu(), + atol=5, + ) diff --git a/inference_models/tests/integration_tests/models/test_yolov8_instance_segmentation_predictions_trt.py b/inference_models/tests/integration_tests/models/test_yolov8_instance_segmentation_predictions_trt.py index 39a27c75df..01c6bd6ee7 100644 --- a/inference_models/tests/integration_tests/models/test_yolov8_instance_segmentation_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_yolov8_instance_segmentation_predictions_trt.py @@ -145,6 +145,48 @@ def test_trt_package_torch( assert 16100 <= predictions[0].mask.cpu().sum().item() <= 16200 +@pytest.mark.slow +@pytest.mark.trt_extras +def test_trt_package_torch_multiple_predictions_in_row( + yolov8_seg_asl_trt_package: str, + asl_image_torch: torch.Tensor, +) -> None: + # given + from inference_models.models.yolov8.yolov8_instance_segmentation_trt import ( + YOLOv8ForInstanceSegmentationTRT, + ) + + model = YOLOv8ForInstanceSegmentationTRT.from_pretrained( + model_name_or_path=yolov8_seg_asl_trt_package, + engine_host_code_allowed=True, + ) + + # when + for _ in range(8): + predictions = model(asl_image_torch) + + # then + assert torch.allclose( + predictions[0].confidence.cpu(), + torch.tensor([0.9795]).cpu(), + atol=0.01, + ) + assert torch.allclose( + predictions[0].class_id.cpu(), + torch.tensor([20], dtype=torch.int32).cpu(), + ) + expected_xyxy = torch.tensor( + [[63, 174, 187, 368]], + dtype=torch.int32, + ) + assert torch.allclose( + predictions[0].xyxy.cpu(), + expected_xyxy.cpu(), + atol=5, + ) + assert 16100 <= predictions[0].mask.cpu().sum().item() <= 16200 + + @pytest.mark.slow @pytest.mark.trt_extras def test_trt_package_torch_list( diff --git a/inference_models/tests/integration_tests/models/test_yolov8_keypoints_detection_predictions_trt.py b/inference_models/tests/integration_tests/models/test_yolov8_keypoints_detection_predictions_trt.py index 03f6e40db0..a6e60b8bd1 100644 --- 
a/inference_models/tests/integration_tests/models/test_yolov8_keypoints_detection_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_yolov8_keypoints_detection_predictions_trt.py @@ -144,6 +144,48 @@ def test_trt_package_torch( assert abs(predictions[0][0].confidence.sum().item() - 26.07147979736328) < 1e-2 +@pytest.mark.slow +@pytest.mark.trt_extras +def test_trt_package_torch_multiple_predictions_in_row( + yolov8_pose_trt_package: str, + people_walking_image_torch: torch.Tensor, +) -> None: + # given + from inference_models.models.yolov8.yolov8_key_points_detection_trt import ( + YOLOv8ForKeyPointsDetectionTRT, + ) + + model = YOLOv8ForKeyPointsDetectionTRT.from_pretrained( + model_name_or_path=yolov8_pose_trt_package, + engine_host_code_allowed=True, + ) + + for _ in range(8): + # when + predictions = model(people_walking_image_torch) + + # then + assert torch.allclose( + predictions[1][0].confidence.cpu(), + torch.tensor([0.8783, 0.8719]).cpu(), + atol=0.01, + ) + assert torch.allclose( + predictions[1][0].class_id.cpu(), + torch.tensor([0, 0], dtype=torch.int32).cpu(), + ) + expected_xyxy = torch.tensor( + [[351, 124, 540, 756], [619, 120, 824, 767]], + dtype=torch.int32, + ) + assert torch.allclose( + predictions[1][0].xyxy.cpu(), + expected_xyxy.cpu(), + atol=5, + ) + assert abs(predictions[0][0].confidence.sum().item() - 26.07147979736328) < 1e-2 + + @pytest.mark.slow @pytest.mark.trt_extras def test_trt_package_torch_list( diff --git a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py index 51248d45cd..ba580ed21e 100644 --- a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py @@ -237,6 +237,71 @@ def test_trt_package_torch( ) +@pytest.mark.slow 
+@pytest.mark.trt_extras +def test_trt_package_torch_multiple_predictions_in_row( + yolov8_coin_counting_trt_package: str, + coins_counting_image_torch: torch.Tensor, +) -> None: + # given + from inference_models.models.yolov8.yolov8_object_detection_trt import ( + YOLOv8ForObjectDetectionTRT, + ) + + model = YOLOv8ForObjectDetectionTRT.from_pretrained( + model_name_or_path=yolov8_coin_counting_trt_package, + engine_host_code_allowed=True, + ) + + # when + for _ in range(8): + predictions = model(coins_counting_image_torch) + + # then + assert torch.allclose( + predictions[0].confidence.cpu(), + torch.tensor( + [ + 0.9956, + 0.9727, + 0.9653, + 0.9468, + 0.9448, + 0.9390, + 0.9302, + 0.9287, + 0.9155, + 0.9019, + ] + ).cpu(), + atol=0.01, + ) + assert torch.allclose( + predictions[0].class_id.cpu(), + torch.tensor([4, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=torch.int32).cpu(), + ) + expected_xyxy = torch.tensor( + [ + [1304, 614, 3024, 1918], + [1714, 2571, 1884, 2759], + [2678, 806, 2866, 974], + [1744, 2294, 1914, 2469], + [1260, 2058, 1424, 2233], + [1469, 2302, 1624, 2467], + [929, 1843, 1091, 1997], + [1514, 1880, 1718, 2089], + [1177, 2632, 1374, 2846], + [1099, 2348, 1260, 2522], + ], + dtype=torch.int32, + ) + assert torch.allclose( + predictions[0].xyxy.cpu(), + expected_xyxy.cpu(), + atol=5, + ) + + @pytest.mark.slow @pytest.mark.trt_extras def test_trt_package_torch_list( @@ -478,7 +543,7 @@ def test_trt_cudagraph_cache_reuses_previously_seen_input_shapes( batch = pre_processed_single.repeat(batch_size, 1, 1, 1) cache_key = (tuple(batch.shape), batch.dtype, device) - cache_size_before = len(trt_cuda_graph_cache.cache) + cache_size_before = trt_cuda_graph_cache.get_current_size() output = model.forward(batch) @@ -550,7 +615,7 @@ def test_trt_cudagraph_cache_eviction( batch = pre_processed_single.repeat(bs, 1, 1, 1) model.forward(batch) - assert len(trt_cuda_graph_cache.cache) == 3 + assert trt_cuda_graph_cache.get_current_size() == 3 keys_before = 
list(trt_cuda_graph_cache.list_keys()) batch_4 = pre_processed_single.repeat(4, 1, 1, 1) From 6648c5cdb873851c71ee8f0cbd179b1d0bba7df9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20P=C4=99czek?= Date: Wed, 18 Mar 2026 19:08:03 +0100 Subject: [PATCH 41/50] Adjust tests and add docs --- ...ation_tests_inference_experimental_gpu.yml | 4 +- inference_models/docs/changelog.md | 9 + .../docs/how-to/use-cuda-graphs.md | 210 ++++++++++++++++++ inference_models/mkdocs.yml | 1 + .../models/test_rfdetr_predictions_trt.py | 1 + ...yolov8_object_detection_predictions_trt.py | 4 +- 6 files changed, 225 insertions(+), 4 deletions(-) create mode 100644 inference_models/docs/how-to/use-cuda-graphs.md diff --git a/.github/workflows/integration_tests_inference_experimental_gpu.yml b/.github/workflows/integration_tests_inference_experimental_gpu.yml index 12b9f240dc..60fbde84a2 100644 --- a/.github/workflows/integration_tests_inference_experimental_gpu.yml +++ b/.github/workflows/integration_tests_inference_experimental_gpu.yml @@ -28,7 +28,7 @@ on: jobs: integration-tests-inference-models-gpu: - name: ${{ matrix.extras.marker }}:${{ matrix.python-version }} + name: ${{ matrix.extras.marker }}:${{ matrix.python-version }}:cuda-graphs:${{ matrix.extras.enable_auto_cuda_graphs_for_trt }} runs-on: Roboflow-GPU-VM-Runner timeout-minutes: 30 strategy: @@ -37,7 +37,7 @@ jobs: extras: - { install: "onnx-cu12,mediapipe", marker: "onnx_extras", workers: "auto", enable_auto_cuda_graphs_for_trt: "false" } - { install: "trt10", marker: "trt_extras", workers: "auto", enable_auto_cuda_graphs_for_trt: "false" } - - { install: "trt10", marker: "trt_extras_with_cuda_graphs", workers: "auto", enable_auto_cuda_graphs_for_trt: "true" } + - { install: "trt10", marker: "trt_extras", workers: "auto", enable_auto_cuda_graphs_for_trt: "true" } - { install: "torch-cu124,mediapipe", marker: "torch_models", workers: "1", "enable_auto_cuda_graphs_for_trt": "false" } - { install: "torch-cu124", marker: 
"hf_vlm_models", workers: "1", "enable_auto_cuda_graphs_for_trt": "false" } steps: diff --git a/inference_models/docs/changelog.md b/inference_models/docs/changelog.md index 26180bc47f..0ca5d597b7 100644 --- a/inference_models/docs/changelog.md +++ b/inference_models/docs/changelog.md @@ -1,5 +1,14 @@ # Changelog +## `0.21.0` +### Added + +- Support for CUDA Graphs in TRT backend - all TRT models got upgraded - added ability to run with CUDA graphs, at +the expense of additional VRAM allocation, but with caller control on how many execution contexts for different +input shapes should be allowed. + +--- + ## `0.20.2` ### Added diff --git a/inference_models/docs/how-to/use-cuda-graphs.md b/inference_models/docs/how-to/use-cuda-graphs.md new file mode 100644 index 0000000000..b484343d57 --- /dev/null +++ b/inference_models/docs/how-to/use-cuda-graphs.md @@ -0,0 +1,210 @@ +# Using CUDA Graphs with TensorRT Models + +CUDA graphs capture a sequence of GPU operations and replay them as a single unit, eliminating per-call +CPU overhead. For TensorRT models in `inference_models`, this translates to a **7–12% FPS improvement** +on repeated inference with the same input shape. + +## Overview + +When CUDA graphs are enabled, the first `forward()` call for a given input shape captures the TensorRT +execution into a CUDA graph. Subsequent calls with the same shape replay the captured graph instead of +re-launching individual GPU kernels. Captured graphs are stored in an LRU cache keyed by +`(shape, dtype, device)`. + +CUDA graphs work with all TRT model classes that use `infer_from_trt_engine` — including object detection, +instance segmentation, keypoint detection, classification, and semantic segmentation models. 
+ +## Prerequisites + +- A CUDA-capable GPU +- TensorRT installed (brought in by `trt-*` extras of `inference-models`) +- A TRT model package (`.plan` engine file) + +## Quick Start + +The simplest way to enable CUDA graphs is through the `USE_CUDA_GRAPHS_FOR_TRT_BACKEND` environment +variable: + +```bash +export USE_CUDA_GRAPHS_FOR_TRT_BACKEND=True +``` + +With this set, all TRT models loaded via `AutoModel.from_pretrained` will automatically create a CUDA +graph cache and use it during inference. No code changes required. + +```python +import torch +from inference_models import AutoModel + +model = AutoModel.from_pretrained( + model_id_or_path="rfdetr-nano", + device=torch.device("cuda:0"), + backend="trt", +) + +# First call captures the CUDA graph for this input shape +results = model.predict(image) + +# Subsequent calls replay the captured graph — faster +results = model.predict(image) +``` + +## Manual Cache Control + +For more control over cache behavior, create a `TRTCudaGraphCache` explicitly and pass it +to `AutoModel.from_pretrained`: + +```python +import torch +from inference_models import AutoModel +from inference_models.models.common.trt import TRTCudaGraphCache + +cache = TRTCudaGraphCache(capacity=16) + +model = AutoModel.from_pretrained( + model_id_or_path="rfdetr-nano", + device=torch.device("cuda:0"), + backend="trt", + trt_cuda_graph_cache=cache, +) +``` + +The `capacity` parameter controls how many distinct input shapes can be cached simultaneously. +When the cache is full, the least recently used graph is evicted automatically. + +### Inspecting the Cache + +You can query the cache at any time to see what's been captured: + +```python +# Check how many graphs are currently cached +print(cache.get_current_size()) # e.g. 
 3

# List all cached keys — each key is a (shape, dtype, device) tuple
for key in cache.list_keys():
    shape, dtype, device = key
    print(f"  shape={shape}, dtype={dtype}, device={device}")

# Check if a specific shape is cached
key = ((1, 3, 384, 384), torch.float16, torch.device("cuda:0"))
if key in cache:
    print("Graph is cached for this shape")
```

### Removing Specific Entries

Use `safe_remove()` to evict a single cached graph by its key. This releases the associated
CUDA graph, execution context, and GPU buffers immediately. If the key doesn't exist, the
call is a no-op:

```python
key = ((1, 3, 384, 384), torch.float16, torch.device("cuda:0"))
cache.safe_remove(key)
```

### Purging the Cache

Use `purge()` to evict multiple entries at once. When called without arguments, it clears the
entire cache. You can also pass `n_oldest` to evict only the N least recently used entries:

```python
# Evict the 4 oldest (least recently used) entries
cache.purge(n_oldest=4)

# Clear the entire cache
cache.purge()
```

`purge()` is more efficient than calling `safe_remove()` in a loop because it batches the
GPU memory cleanup — `torch.cuda.empty_cache()` is called once at the end rather than after
each individual eviction.

!!! tip "When to purge manually"
    Manual purging is useful when you know the workload is about to change — for example,
    switching from processing video at one resolution to another. Purging stale entries
    frees VRAM for the new shapes before they're captured.

### Sharing a Cache Across Models

Please **do not share a single instance of `TRTCudaGraphCache`** across multiple models — the cache object is
bound to a specific model instance.

### Choosing Cache Capacity

Each cached graph holds its own TensorRT execution context and GPU memory buffers. A reasonable
default is **8–16 entries**. Consider:

- **Fixed input shape** (e.g. always 1×3×640×640): `capacity=1` is sufficient. 
+- **Variable batch sizes** (e.g. batch 1–16): set capacity to the number of distinct batch sizes + you expect, or quantize to powers of two and set `capacity=4–5`. +- **Memory-constrained environments**: lower the capacity to reduce VRAM usage. + +## Disabling CUDA Graphs Per Call + +Even with a cache configured, you can bypass CUDA graphs for individual forward passes using the +`disable_cuda_graphs` flag: + +```python +pre_processed, meta = model.pre_process(image) + +# Standard path — uses CUDA graphs if cache is configured +output = model.forward(pre_processed) + +# Bypass CUDA graphs for this specific call +output = model.forward(pre_processed, disable_cuda_graphs=True) +``` + +This is useful for debugging, benchmarking, or when you need to compare graph vs. non-graph outputs. + + +## How It Works + +The lifecycle of a CUDA graph in `inference_models`: + +1. **Cache miss** — `infer_from_trt_engine` detects that no cached graph exists for the current + `(shape, dtype, device)` key. It creates a dedicated TensorRT execution context, allocates + input/output buffers, runs a warmup pass, then captures the execution into a `torch.cuda.CUDAGraph`. + The graph and its associated state are stored in the cache. + +2. **Cache hit** — On subsequent calls with the same key, the cached graph's input buffer is updated + via `copy_()`, the graph is replayed, and output buffers are cloned and returned. No TensorRT + context setup or kernel launches happen on the CPU side. + +3. **Eviction** — When the cache exceeds its capacity, the least recently used entry is evicted. + The associated CUDA graph, execution context, and GPU buffers are released, and + `torch.cuda.empty_cache()` is called to return memory to the CUDA driver. + + +## Important Considerations + +### VRAM Usage + +Each cache entry consumes GPU memory for input buffers, output buffers, and the TensorRT execution +context's internal workspace. With large models or high cache capacities, this can be significant. 
+Monitor VRAM usage when tuning `capacity`.
+
+### Thread Safety
+
+Cache entries may be managed and evicted from a different thread than the one running the forward pass.
+The cache state is synchronized with a thread lock.
+
+### Dynamic Batch Sizes
+
+CUDA graphs are shape-specific — a graph captured for batch size 4 cannot be replayed for batch size 8.
+If your application uses variable batch sizes, each distinct size will trigger a separate graph capture.
+The LRU cache handles this transparently, but be aware that frequent shape changes will cause cache
+churn and recapture overhead.
+
+!!! tip "Quantize batch sizes for better cache utilization"
+
+    If you control the batching logic, round batch sizes up to the nearest power of two
+    (1, 2, 4, 8, 16). This reduces the number of distinct shapes and keeps the cache small.
+
+### When CUDA Graphs Won't Help
+
+- **Cold start / single inference**: The first call for each shape pays the capture cost, which is
+  slower than a normal forward pass. CUDA graphs only pay off on subsequent replays.
+- **Highly variable input shapes**: If every call has a unique shape, graphs are captured but
+  never replayed.
+- **CPU-bound pipelines**: If your bottleneck is preprocessing or postprocessing, the GPU-side
+  speedup from graph replay won't be visible end-to-end. 
diff --git a/inference_models/mkdocs.yml b/inference_models/mkdocs.yml index b68348e4e8..56b32f565c 100644 --- a/inference_models/mkdocs.yml +++ b/inference_models/mkdocs.yml @@ -103,6 +103,7 @@ nav: - Load Models Locally: how-to/local-packages.md - Understand Roboflow Model Packages: how-to/roboflow-model-packages.md - Manage Cache: how-to/cache-management.md + - Use CUDA Graphs: how-to/use-cuda-graphs.md - Contributors: - Development Environment: contributors/dev-environment.md - Core Architecture: contributors/core-architecture.md diff --git a/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py b/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py index feb75b1507..b067349920 100644 --- a/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_rfdetr_predictions_trt.py @@ -588,6 +588,7 @@ def test_trt_outputs_match_expected_shapes( model = AutoModel.from_pretrained( model_id_or_path=rfdetr_nano_t4_trt_package, device=torch.device("cuda:0"), + trt_cuda_graph_cache=trt_cuda_graph_cache, ) pre_processed, _ = model.pre_process(dog_image_numpy) diff --git a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py index ba580ed21e..3c3853987f 100644 --- a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py @@ -547,7 +547,7 @@ def test_trt_cudagraph_cache_reuses_previously_seen_input_shapes( output = model.forward(batch) - cache_size_after = len(trt_cuda_graph_cache.cache) + cache_size_after = trt_cuda_graph_cache.get_current_size() if cache_key not in seen_shapes: assert cache_size_after == cache_size_before + 1 @@ -621,7 +621,7 @@ def 
test_trt_cudagraph_cache_eviction( batch_4 = pre_processed_single.repeat(4, 1, 1, 1) model.forward(batch_4) - assert len(trt_cuda_graph_cache.cache) == 3 + assert trt_cuda_graph_cache.get_current_size() == 3 keys_after = trt_cuda_graph_cache.list_keys() assert keys_before[0] not in keys_after for key in keys_before[1:]: From f4a2788e0ddb6869785e1a2b6ffd213e6ad90383 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20P=C4=99czek?= Date: Wed, 18 Mar 2026 19:09:45 +0100 Subject: [PATCH 42/50] Adjust docs --- inference_models/docs/how-to/use-cuda-graphs.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inference_models/docs/how-to/use-cuda-graphs.md b/inference_models/docs/how-to/use-cuda-graphs.md index b484343d57..fa86b5d54d 100644 --- a/inference_models/docs/how-to/use-cuda-graphs.md +++ b/inference_models/docs/how-to/use-cuda-graphs.md @@ -22,11 +22,11 @@ instance segmentation, keypoint detection, classification, and semantic segmenta ## Quick Start -The simplest way to enable CUDA graphs is through the `USE_CUDA_GRAPHS_FOR_TRT_BACKEND` environment +The simplest way to enable CUDA graphs is through the `ENABLE_AUTO_CUDA_GRAPHS_FOR_TRT_BACKEND` environment variable: ```bash -export USE_CUDA_GRAPHS_FOR_TRT_BACKEND=True +export ENABLE_AUTO_CUDA_GRAPHS_FOR_TRT_BACKEND=True ``` With this set, all TRT models loaded via `AutoModel.from_pretrained` will automatically create a CUDA From a820aae23dcfa751946fc3b189ce3b0e4d6b77f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20P=C4=99czek?= Date: Wed, 18 Mar 2026 19:11:33 +0100 Subject: [PATCH 43/50] Adjust docs --- inference_models/docs/how-to/use-cuda-graphs.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inference_models/docs/how-to/use-cuda-graphs.md b/inference_models/docs/how-to/use-cuda-graphs.md index fa86b5d54d..58e1eacd8e 100644 --- a/inference_models/docs/how-to/use-cuda-graphs.md +++ b/inference_models/docs/how-to/use-cuda-graphs.md @@ -57,7 +57,7 @@ to 
`AutoModel.from_pretrained`: ```python import torch from inference_models import AutoModel -from inference_models.models.common.trt import TRTCudaGraphCache +from inference_models.developer_tools import TRTCudaGraphCache cache = TRTCudaGraphCache(capacity=16) From 4a9b62b7a922c635af0e5fd6d55980d98244fafc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20P=C4=99czek?= Date: Wed, 18 Mar 2026 19:25:59 +0100 Subject: [PATCH 44/50] Add more docs --- .../trt/establish-trt-cuda-graph-cache.md | 6 + .../trt/get-trt-engine-inputs-and-outputs.md | 2 +- .../trt/trt-cuda-graph-cache.md | 6 + .../inference_models/models/common/trt.py | 207 +++++++++++++++++- inference_models/mkdocs.yml | 2 + ...yolov8_object_detection_predictions_trt.py | 2 +- 6 files changed, 220 insertions(+), 5 deletions(-) create mode 100644 inference_models/docs/api-reference/developer-tools/trt/establish-trt-cuda-graph-cache.md create mode 100644 inference_models/docs/api-reference/developer-tools/trt/trt-cuda-graph-cache.md diff --git a/inference_models/docs/api-reference/developer-tools/trt/establish-trt-cuda-graph-cache.md b/inference_models/docs/api-reference/developer-tools/trt/establish-trt-cuda-graph-cache.md new file mode 100644 index 0000000000..3442d233ac --- /dev/null +++ b/inference_models/docs/api-reference/developer-tools/trt/establish-trt-cuda-graph-cache.md @@ -0,0 +1,6 @@ +# establish_trt_cuda_graph_cache + +::: inference_models.models.common.trt.establish_trt_cuda_graph_cache + options: + show_root_heading: true + show_source: false diff --git a/inference_models/docs/api-reference/developer-tools/trt/get-trt-engine-inputs-and-outputs.md b/inference_models/docs/api-reference/developer-tools/trt/get-trt-engine-inputs-and-outputs.md index 98179cf56c..301102ca68 100644 --- a/inference_models/docs/api-reference/developer-tools/trt/get-trt-engine-inputs-and-outputs.md +++ b/inference_models/docs/api-reference/developer-tools/trt/get-trt-engine-inputs-and-outputs.md @@ -1,4 +1,4 @@ -2# 
get_trt_engine_inputs_and_outputs +# get_trt_engine_inputs_and_outputs ::: inference_models.models.common.trt.get_trt_engine_inputs_and_outputs options: diff --git a/inference_models/docs/api-reference/developer-tools/trt/trt-cuda-graph-cache.md b/inference_models/docs/api-reference/developer-tools/trt/trt-cuda-graph-cache.md new file mode 100644 index 0000000000..e074a3c063 --- /dev/null +++ b/inference_models/docs/api-reference/developer-tools/trt/trt-cuda-graph-cache.md @@ -0,0 +1,6 @@ +# TRTCudaGraphCache + +::: inference_models.models.common.trt.TRTCudaGraphCache + options: + show_root_heading: true + show_source: false diff --git a/inference_models/inference_models/models/common/trt.py b/inference_models/inference_models/models/common/trt.py index accc1cb0ee..982f9644e5 100644 --- a/inference_models/inference_models/models/common/trt.py +++ b/inference_models/inference_models/models/common/trt.py @@ -81,6 +81,42 @@ class TRTCudaGraphState: class TRTCudaGraphCache: + + """LRU cache for captured CUDA graphs used in TensorRT inference. + + Stores captured ``torch.cuda.CUDAGraph`` objects keyed by input + ``(shape, dtype, device)`` tuples. When the cache exceeds its capacity, + the least recently used entry is evicted and its GPU resources are released. + + The cache is thread-safe — all mutating operations acquire an internal + ``threading.RLock``. + + Args: + capacity: Maximum number of CUDA graphs to store. Each entry holds + a dedicated TensorRT execution context and GPU memory buffers, + so higher values increase VRAM usage. + + Examples: + Create a cache and pass it to a model: + + >>> from inference_models.models.common.trt import TRTCudaGraphCache + >>> from inference_models import AutoModel + >>> import torch + >>> + >>> cache = TRTCudaGraphCache(capacity=16) + >>> model = AutoModel.from_pretrained( + ... model_id_or_path="rfdetr-nano", + ... device=torch.device("cuda:0"), + ... backend="trt", + ... trt_cuda_graph_cache=cache, + ... 
) + + See Also: + - ``establish_trt_cuda_graph_cache()``: Factory that creates a cache + based on environment configuration + - ``infer_from_trt_engine()``: Uses the cache during TRT inference + """ + def __init__(self, capacity: int): self._cache: OrderedDict[ Tuple[Tuple[int, ...], torch.dtype, torch.device], TRTCudaGraphState @@ -89,14 +125,67 @@ def __init__(self, capacity: int): self._state_lock = threading.RLock() def get_current_size(self) -> int: - return len(self._cache) + """Return the number of CUDA graphs currently stored in the cache. + + Returns: + Number of cached entries. + + Examples: + >>> cache = TRTCudaGraphCache(capacity=16) + >>> cache.get_current_size() + 0 + """ + with self._state_lock: + return len(self._cache) def list_keys(self) -> List[Tuple[Tuple[int, ...], torch.dtype, torch.device]]: - return list(self._cache.keys()) + """Return a list of all keys currently in the cache. + + Each key is a ``(shape, dtype, device)`` tuple representing a cached + CUDA graph. Keys are returned in insertion order (oldest first), which + reflects eviction priority. + + Returns: + List of ``(shape, dtype, device)`` tuples for all cached entries. + + Examples: + >>> cache = TRTCudaGraphCache(capacity=16) + >>> # ... after some forward passes ... + >>> for shape, dtype, device in cache.list_keys(): + ... print(f"Cached: shape={shape}, dtype={dtype}") + """ + with self._state_lock: + return list(self._cache.keys()) def safe_remove( self, key: Tuple[Tuple[int, ...], torch.dtype, torch.device] ) -> None: + """Remove a single entry from the cache by its key. + + If the key exists, the associated CUDA graph, execution context, and + GPU buffers are released and ``torch.cuda.empty_cache()`` is called. + If the key does not exist, this method is a no-op. + + Args: + key: A ``(shape, dtype, device)`` tuple identifying the entry + to remove. 
+ + Examples: + Remove a cached graph for a specific input shape: + + >>> import torch + >>> key = ((1, 3, 384, 384), torch.float16, torch.device("cuda:0")) + >>> cache.safe_remove(key) + + Safe to call with a non-existent key: + + >>> cache.safe_remove(((99, 99), torch.float32, torch.device("cuda:0"))) + >>> # no error raised + + See Also: + - ``purge()``: Remove multiple entries at once with batched + GPU memory cleanup + """ with self._state_lock: if key not in self._cache: return None @@ -105,6 +194,40 @@ def safe_remove( return None def purge(self, n_oldest: Optional[int] = None) -> None: + """Remove entries from the cache, starting with the least recently used. + + When called without arguments, clears the entire cache. When + ``n_oldest`` is specified, only that many entries are evicted + (or all entries if the cache contains fewer). + + GPU memory cleanup (``torch.cuda.empty_cache()``) is called once + after all evictions, making this more efficient than calling + ``safe_remove()`` in a loop. + + Args: + n_oldest: Number of least recently used entries to evict. + When ``None`` (default), all entries are removed. 
+ + Examples: + Evict the 4 oldest entries: + + >>> cache.purge(n_oldest=4) + + Clear the entire cache: + + >>> cache.purge() + >>> cache.get_current_size() + 0 + + Note: + - Eviction order follows LRU policy — entries that haven't been + accessed recently are removed first + - Each evicted entry's CUDA graph, execution context, and GPU + buffers are released + + See Also: + - ``safe_remove()``: Remove a single entry by key + """ with self._state_lock: if n_oldest is None: n_oldest = len(self._cache) @@ -117,7 +240,8 @@ def purge(self, n_oldest: Optional[int] = None) -> None: def __contains__( self, key: Tuple[Tuple[int, ...], torch.dtype, torch.device] ) -> bool: - return key in self._cache + with self._state_lock: + return key in self._cache def __getitem__( self, key: Tuple[Tuple[int, ...], torch.dtype, torch.device] @@ -152,6 +276,83 @@ def establish_trt_cuda_graph_cache( default_cuda_graph_cache_size: int, cuda_graph_cache: Optional[TRTCudaGraphCache] = None, ) -> Optional[TRTCudaGraphCache]: + """Establish a CUDA graph cache for TensorRT inference acceleration. + + Resolves which CUDA graph cache to use for a TRT model. If the caller + provides a cache instance, it is returned as-is. Otherwise, the function + checks the ``ENABLE_AUTO_CUDA_GRAPHS_FOR_TRT_BACKEND`` environment variable + to decide whether to create a new cache automatically. When the environment + variable is disabled (the default), no cache is created and CUDA graphs + are not used. + + This function is typically called inside ``from_pretrained()`` of TRT model + classes. End users who want explicit control should create a + ``TRTCudaGraphCache`` themselves and pass it to ``AutoModel.from_pretrained``. + + Args: + default_cuda_graph_cache_size: Maximum number of CUDA graphs to cache + when a new cache is created automatically. Each entry holds a + dedicated TensorRT execution context and GPU memory buffers, so + higher values increase VRAM usage. 
+ + cuda_graph_cache: Optional pre-existing cache instance. When provided, + it is returned directly and the environment variable is ignored. + This allows callers to share a single cache across multiple models + or to configure capacity explicitly. + + Returns: + A ``TRTCudaGraphCache`` instance if CUDA graphs should be used, or + ``None`` if they are disabled. When ``None`` is returned, the model + falls back to standard TensorRT execution without graph capture. + + Examples: + Automatic cache creation via environment variable: + + >>> import os + >>> os.environ["ENABLE_AUTO_CUDA_GRAPHS_FOR_TRT_BACKEND"] = "True" + >>> + >>> from inference_models.models.common.trt import ( + ... establish_trt_cuda_graph_cache, + ... ) + >>> + >>> cache = establish_trt_cuda_graph_cache(default_cuda_graph_cache_size=8) + >>> print(type(cache)) # + + Caller-provided cache takes priority: + + >>> from inference_models.models.common.trt import ( + ... TRTCudaGraphCache, + ... establish_trt_cuda_graph_cache, + ... ) + >>> + >>> my_cache = TRTCudaGraphCache(capacity=32) + >>> result = establish_trt_cuda_graph_cache( + ... default_cuda_graph_cache_size=8, + ... cuda_graph_cache=my_cache, + ... ) + >>> assert result is my_cache # returned as-is + + Typical usage inside a model's from_pretrained: + + >>> cache = establish_trt_cuda_graph_cache( + ... default_cuda_graph_cache_size=8, + ... cuda_graph_cache=None, # let env var decide + ... 
) + >>> # cache is None when env var is disabled (default) + + Note: + - The environment variable ``ENABLE_AUTO_CUDA_GRAPHS_FOR_TRT_BACKEND`` + defaults to ``False`` + - When a caller-provided cache is given, the environment variable + is not checked + - CUDA graphs require TensorRT and a CUDA-capable GPU + - Each cached graph consumes VRAM proportional to the model's + execution context size + + See Also: + - ``TRTCudaGraphCache``: The LRU cache class for CUDA graph state + - ``infer_from_trt_engine()``: Uses the cache during TRT inference + """ if cuda_graph_cache is not None: return cuda_graph_cache auto_cuda_graphs_enabled = get_boolean_from_env( diff --git a/inference_models/mkdocs.yml b/inference_models/mkdocs.yml index 56b32f565c..983cc25323 100644 --- a/inference_models/mkdocs.yml +++ b/inference_models/mkdocs.yml @@ -148,6 +148,8 @@ nav: - get_trt_engine_inputs_and_outputs: api-reference/developer-tools/trt/get-trt-engine-inputs-and-outputs.md - infer_from_trt_engine: api-reference/developer-tools/trt/infer-from-trt-engine.md - load_trt_model: api-reference/developer-tools/trt/load-trt-model.md + - establish_trt_cuda_graph_cache: api-reference/developer-tools/trt/establish-trt-cuda-graph-cache.md + - TRTCudaGraphCache: api-reference/developer-tools/trt/trt-cuda-graph-cache.md - Entities: - RuntimeXRayResult: api-reference/developer-tools/runtime-xray-result.md - ModelMetadata: api-reference/developer-tools/model-metadata.md diff --git a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py index 3c3853987f..1648beac82 100644 --- a/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_yolov8_object_detection_predictions_trt.py @@ -558,7 +558,7 @@ def test_trt_cudagraph_cache_reuses_previously_seen_input_shapes( assert 
cache_size_after == cache_size_before assert torch.allclose(capture_outputs[cache_key], output, atol=1e-6) - assert set(trt_cuda_graph_cache.cache.keys()) == seen_shapes + assert set(trt_cuda_graph_cache.list_keys()) == seen_shapes @pytest.mark.slow From f9aeec80a496d2c746b0c32de162fc48a8c29cb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20P=C4=99czek?= Date: Wed, 18 Mar 2026 19:26:40 +0100 Subject: [PATCH 45/50] Fix GH workflow --- .../workflows/integration_tests_inference_experimental_gpu.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/integration_tests_inference_experimental_gpu.yml b/.github/workflows/integration_tests_inference_experimental_gpu.yml index 60fbde84a2..328b583061 100644 --- a/.github/workflows/integration_tests_inference_experimental_gpu.yml +++ b/.github/workflows/integration_tests_inference_experimental_gpu.yml @@ -15,7 +15,6 @@ on: - '' - onnx_extras - trt_extras - - trt_extras_with_cuda_graphs - torch_models - hf_vlm_models python_version: From 3e6dd5cb14b039b128a676db34e5d03843e86ce2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20P=C4=99czek?= Date: Wed, 18 Mar 2026 19:47:33 +0100 Subject: [PATCH 46/50] Enforce replay after cuda graph is recorded to get actual results --- inference_models/inference_models/models/common/trt.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/inference_models/inference_models/models/common/trt.py b/inference_models/inference_models/models/common/trt.py index 982f9644e5..3f70639923 100644 --- a/inference_models/inference_models/models/common/trt.py +++ b/inference_models/inference_models/models/common/trt.py @@ -811,6 +811,12 @@ def _capture_cuda_graph( results = [buf.clone() for buf in output_buffers] stream.synchronize() + # in order to avoid drift of results - it's better to replay to get the results + with torch.cuda.stream(stream): + cuda_graph.replay() + results = [buf.clone() for buf in output_buffers] + stream.synchronize() + trt_cuda_graph_state = 
TRTCudaGraphState( cuda_graph=cuda_graph, cuda_stream=stream, From ba4f5f246e9683a1031c40b5bae01077ada151f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20P=C4=99czek?= Date: Wed, 18 Mar 2026 19:57:42 +0100 Subject: [PATCH 47/50] Alter YOLONAS tests to ensure repeatable predictions with warmup --- .../models/test_yolonas_predictions_trt.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/inference_models/tests/integration_tests/models/test_yolonas_predictions_trt.py b/inference_models/tests/integration_tests/models/test_yolonas_predictions_trt.py index ce7a2c2a2e..3ee3ab4535 100644 --- a/inference_models/tests/integration_tests/models/test_yolonas_predictions_trt.py +++ b/inference_models/tests/integration_tests/models/test_yolonas_predictions_trt.py @@ -20,6 +20,9 @@ def test_trt_package_numpy( ) # when + # warmup + for _ in range(5): + _ = model(coins_counting_image_numpy) predictions = model(coins_counting_image_numpy) # then @@ -88,6 +91,9 @@ def test_trt_package_batch_numpy( ) # when + # warmup + for _ in range(5): + _ = model([coins_counting_image_numpy, coins_counting_image_numpy]) predictions = model([coins_counting_image_numpy, coins_counting_image_numpy]) # then @@ -202,6 +208,9 @@ def test_trt_package_torch( ) # when + # warmup + for _ in range(5): + _ = model(coins_counting_image_torch) predictions = model(coins_counting_image_torch) # then @@ -270,6 +279,9 @@ def test_trt_package_torch_multiple_predictions_in_row( ) # when + # warmup + for _ in range(5): + _ = model(coins_counting_image_torch) for _ in range(8): predictions = model(coins_counting_image_torch) @@ -339,6 +351,9 @@ def test_trt_package_torch_list( ) # when + # warmup + for _ in range(5): + _ = model([coins_counting_image_torch, coins_counting_image_torch]) predictions = model([coins_counting_image_torch, coins_counting_image_torch]) # then @@ -453,6 +468,9 @@ def test_trt_package_torch_batch( ) # when + # warmup + for _ in range(5): + _ = 
model(torch.stack([coins_counting_image_torch, coins_counting_image_torch], dim=0)) predictions = model( torch.stack([coins_counting_image_torch, coins_counting_image_torch], dim=0) ) From d10ecfb73b31f6a2df933c9cd2e34ca6e6d6d245 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20P=C4=99czek?= Date: Wed, 18 Mar 2026 20:37:21 +0100 Subject: [PATCH 48/50] Fix imports in docscrings --- inference_models/inference_models/models/common/trt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inference_models/inference_models/models/common/trt.py b/inference_models/inference_models/models/common/trt.py index 3f70639923..4a07f4c144 100644 --- a/inference_models/inference_models/models/common/trt.py +++ b/inference_models/inference_models/models/common/trt.py @@ -99,7 +99,7 @@ class TRTCudaGraphCache: Examples: Create a cache and pass it to a model: - >>> from inference_models.models.common.trt import TRTCudaGraphCache + >>> from inference_models.developer_tools import TRTCudaGraphCache >>> from inference_models import AutoModel >>> import torch >>> @@ -311,7 +311,7 @@ def establish_trt_cuda_graph_cache( >>> import os >>> os.environ["ENABLE_AUTO_CUDA_GRAPHS_FOR_TRT_BACKEND"] = "True" >>> - >>> from inference_models.models.common.trt import ( + >>> from inference_models.developer_tools import ( ... establish_trt_cuda_graph_cache, ... 
) >>> From 4e89392d0dad4d5d21e587755d8d8f7b07c8fa71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20P=C4=99czek?= Date: Wed, 18 Mar 2026 20:48:15 +0100 Subject: [PATCH 49/50] Bump version --- inference_models/pyproject.toml | 2 +- inference_models/uv.lock | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/inference_models/pyproject.toml b/inference_models/pyproject.toml index e90907440c..b601421715 100644 --- a/inference_models/pyproject.toml +++ b/inference_models/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "inference-models" -version = "0.21.0rc1" +version = "0.21.0" description = "The new inference engine for Computer Vision models" readme = "README.md" requires-python = ">=3.10,<3.13" diff --git a/inference_models/uv.lock b/inference_models/uv.lock index 3470775db6..4708931782 100644 --- a/inference_models/uv.lock +++ b/inference_models/uv.lock @@ -916,7 +916,7 @@ wheels = [ [[package]] name = "inference-models" -version = "0.21.0rc1" +version = "0.21.0" source = { virtual = "." } dependencies = [ { name = "accelerate" }, From 1d4e9614fa166692b026e640eed177a2854ab0fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20P=C4=99czek?= Date: Wed, 18 Mar 2026 20:49:54 +0100 Subject: [PATCH 50/50] Bump version of inference-models in inference requirements --- requirements/_requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/_requirements.txt b/requirements/_requirements.txt index 6c47cf8365..18df5cb1cf 100644 --- a/requirements/_requirements.txt +++ b/requirements/_requirements.txt @@ -50,4 +50,4 @@ filelock>=3.12.0,<=3.17.0 onvif-zeep-async==2.0.0 # versions > 2.0.0 will not work with Python 3.9 despite docs simple-pid~=2.0.1 qrcode~=8.0.0 -inference-models~=0.20.2 +inference-models~=0.21.0