Skip to content
Merged
Show file tree
Hide file tree
Changes from 50 commits
Commits
Show all changes
65 commits
Select commit Hold shift + click to select a range
6f6c44e
pass TRT graph state up and dwon call stack and acache in RFDetrObjDe…
mkaic Jan 23, 2026
549ca10
actually passing it up and down the stack
mkaic Jan 23, 2026
6412efe
three-branch solution
mkaic Jan 23, 2026
08888e3
avoid breaking things due to chagne in infer_with_trt_engine API
mkaic Jan 23, 2026
adda4aa
update unpacking in the rest of the TRT.py files
mkaic Jan 23, 2026
97fdcf0
clean up profiling script
mkaic Jan 23, 2026
470addb
remove tqdm from profiling script
mkaic Jan 23, 2026
8cca264
format
mkaic Jan 23, 2026
5b7d0a5
allow flag to be passed to rfdetr-seg models even though there don't…
mkaic Jan 23, 2026
a27ae37
reduce number of diffed files
mkaic Jan 23, 2026
f1a6afb
Merge branch 'main' into feature/rfdetr-trt-use-cudagraphs
mkaic Jan 23, 2026
04c015a
don't rename existing function
mkaic Jan 23, 2026
ac50a1a
add proper integration test and simplify profiling script
mkaic Jan 23, 2026
c1c1329
Merge branch 'main' into feature/rfdetr-trt-use-cudagraphs
mkaic Jan 23, 2026
9512229
profile how long it takes to capture cuda graph
mkaic Jan 23, 2026
d81b7d5
Merge branch 'main' into feature/rfdetr-trt-use-cudagraphs
mkaic Jan 28, 2026
1ebe492
Merge branch 'main' into feature/rfdetr-trt-use-cudagraphs
mkaic Feb 3, 2026
d5b51f9
add LRU (shape, device, dtype) caching for CUDA graphs
mkaic Feb 3, 2026
dbd45f9
add USE_CUDA_GRAPHS_FOR_TRT_BACKEND environment variable which defaul…
mkaic Feb 3, 2026
9502b8e
fix bug in profiling script
mkaic Feb 3, 2026
320fdef
Merge branch 'main' into feature/rfdetr-trt-use-cudagraphs
mkaic Feb 9, 2026
cb70538
use yolov8 with dynamic batch size to test shape caching for CUDA graphs
mkaic Feb 10, 2026
6b1d430
add instance seg tests
mkaic Feb 10, 2026
7c23300
update conftest
mkaic Feb 10, 2026
a27c80c
add batch-size-cycling profiling for TRT cudagraphs with yolov8
mkaic Feb 10, 2026
14a45ea
Merge branch 'feature/rfdetr-trt-use-cudagraphs' of github.com:robofl…
mkaic Feb 10, 2026
212b2d6
fix failing test
mkaic Feb 10, 2026
4204f4f
first stab at responding to Pawel's feedback
mkaic Feb 11, 2026
51f191c
working on memory profiling for cudagraphs
mkaic Feb 11, 2026
a80a572
simplify memory profiling script
mkaic Feb 11, 2026
845fabd
tweaks
mkaic Feb 11, 2026
3294cae
update tests to work with the new cache
mkaic Feb 11, 2026
31bb420
Merge branch 'main' into feature/rfdetr-trt-use-cudagraphs
mkaic Feb 11, 2026
bbb2540
thanks for the PR review, Claude
mkaic Feb 11, 2026
4eb23fc
see effect of cache size on vram profile script
mkaic Feb 11, 2026
aa87393
reduce default cache size to 16 after seeing memory usage
mkaic Feb 11, 2026
b5c1f6b
make style
mkaic Feb 11, 2026
a386f3b
update default and fix profiling script
mkaic Feb 11, 2026
5f4d3ea
fix imports in trt tests
mkaic Feb 11, 2026
5ed39f0
Merge branch 'main' into feature/rfdetr-trt-use-cudagraphs
mkaic Mar 3, 2026
3f3be28
further merge conflict resolution
mkaic Mar 3, 2026
24b8ed4
Revert accidental formatting changes unrelated to branch
mkaic Mar 3, 2026
574e684
set this feature flag to false by default
mkaic Mar 3, 2026
c2ab80c
Merge branch 'main' into feature/rfdetr-trt-use-cudagraphs
mkaic Mar 3, 2026
dfeb03d
Merge branch 'main' into feature/rfdetr-trt-use-cudagraphs
mkaic Mar 4, 2026
370095e
Merge branch 'main' into feature/rfdetr-trt-use-cudagraphs
mkaic Mar 4, 2026
d0f699a
resolve merge conflicts
mkaic Mar 10, 2026
077732d
fix cache profiling script
mkaic Mar 10, 2026
95c1090
Merge branch 'main' into feature/rfdetr-trt-use-cudagraphs
mkaic Mar 11, 2026
1257f87
Merge branch 'main' into feature/rfdetr-trt-use-cudagraphs
mkaic Mar 13, 2026
04e4d51
Merge branch 'main' into feature/rfdetr-trt-use-cudagraphs
PawelPeczek-Roboflow Mar 18, 2026
b7ea2a0
Add changes to TRT CUDA Graphs cache
PawelPeczek-Roboflow Mar 18, 2026
44030c2
Fix baseline TRT tests
PawelPeczek-Roboflow Mar 18, 2026
917def0
Bump version
PawelPeczek-Roboflow Mar 18, 2026
002a4e4
Extend tests with multi-forward-pass cases to see if predictions matc…
PawelPeczek-Roboflow Mar 18, 2026
6648c5c
Adjust tests and add docs
PawelPeczek-Roboflow Mar 18, 2026
f4a2788
Adjust docs
PawelPeczek-Roboflow Mar 18, 2026
a820aae
Adjust docs
PawelPeczek-Roboflow Mar 18, 2026
4a9b62b
Add more docs
PawelPeczek-Roboflow Mar 18, 2026
f9aeec8
Fix GH workflow
PawelPeczek-Roboflow Mar 18, 2026
3e6dd5c
Enforce replay after cuda graph is recorded to get actual results
PawelPeczek-Roboflow Mar 18, 2026
ba4f5f2
Alter YOLONAS tests to ensure repeatable predictions with warmup
PawelPeczek-Roboflow Mar 18, 2026
d10ecfb
Fix imports in docscrings
PawelPeczek-Roboflow Mar 18, 2026
4e89392
Bump version
PawelPeczek-Roboflow Mar 18, 2026
1d4e961
Bump version of inference-models in inference requirements
PawelPeczek-Roboflow Mar 18, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
185 changes: 185 additions & 0 deletions inference_models/development/profiling/profile_cudagraph_vram.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
"""Profile GPU and CPU memory usage as CUDA graphs are cached and evicted.

Loads yolov8n-640 as a TRT model with dynamic batch size, runs forward passes
with random batch sizes, and after each step records both GPU VRAM
(driver-level) and process CPU RSS. The cache capacity is smaller than the
number of distinct batch sizes, so eviction is exercised and memory usage
should plateau.

Example invocation:
python profile_cudagraph_vram.py \
--device cuda:0 \
--num-steps 64 \
--max-batch-size 16 \
--cache-capacity 16 \
--output vram_sequential.png

python profile_cudagraph_vram.py \
--device cuda:0 \
--num-steps 64 \
--max-batch-size 16 \
--cache-capacity 16 \
--shuffle \
--output vram_shuffle.png

python profile_cudagraph_vram.py \
--device cuda:0 \
--shuffle \
--num-steps 64 \
--max-batch-size 16 \
--cache-capacity 8 \
--output vram_shuffle_eviction.png

python profile_cudagraph_vram.py \
--device cuda:0 \
--shuffle \
--num-steps 64 \
--max-batch-size 2 \
--cache-capacity 2 \
--output vram_two_batch_sizes.png
"""

import argparse
import gc
import os
import random
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import torch

from inference_models import AutoModel
from inference_models.models.common.trt import TRTCudaGraphLRUCache

# Model used for every profiling run; built as a TRT engine with a dynamic
# batch axis (see batch_size=(1, args.max_batch_size) below).
MODEL_ID = "yolov8n-640"
# Bytes per mebibyte; memory deltas are reported in MB.
MB = 1024 ** 2


def gpu_used_bytes(device: torch.device) -> int:
    """Return the number of bytes currently allocated on *device*, as seen by the driver."""
    free_bytes, total_bytes = torch.cuda.mem_get_info(device)
    used = total_bytes - free_bytes
    return used


def cpu_rss_bytes() -> int:
    """Return this process's resident set size in bytes, read from /proc (Linux only)."""
    statm_path = f"/proc/{os.getpid()}/statm"
    with open(statm_path) as statm:
        fields = statm.read().split()
    # Field 1 of statm is the resident page count.
    resident_pages = int(fields[1])
    return resident_pages * os.sysconf("SC_PAGE_SIZE")


def parse_args() -> argparse.Namespace:
    """Parse the command-line options controlling the profiling run."""
    arg_parser = argparse.ArgumentParser(
        description="Profile GPU + CPU memory vs. number of cached CUDA graphs.",
    )
    arg_parser.add_argument("--device", type=str, default="cuda:0")
    arg_parser.add_argument("--max-batch-size", type=int, default=16)
    arg_parser.add_argument("--cache-capacity", type=int, default=8)
    arg_parser.add_argument("--num-steps", type=int, default=32)
    arg_parser.add_argument(
        "--shuffle",
        action="store_true",
        help="Randomize batch size order instead of sequential cycling.",
    )
    arg_parser.add_argument("--seed", type=int, default=42)
    arg_parser.add_argument("--output", type=str, default=None)
    return arg_parser.parse_args()


def main() -> None:
    """Run the memory-profiling loop and save a per-step GPU/CPU memory plot.

    Loads the TRT model, records a memory baseline after one graph-free
    forward pass, then runs ``--num-steps`` graph-enabled forward passes with
    varying batch sizes while logging VRAM and RSS deltas after each step.
    """
    args = parse_args()
    device = torch.device(args.device)

    rng = random.Random(args.seed)

    model = AutoModel.from_pretrained(
        model_id_or_path=MODEL_ID,
        device=device,
        backend="trt",
        batch_size=(1, args.max_batch_size),
        cuda_graph_cache_capacity=args.cache_capacity,
    )

    image = (np.random.rand(640, 640, 3) * 255).astype(np.uint8)
    single_preprocessed, _ = model.pre_process(image)

    # One graph-free pass so lazy allocations (engine workspace, buffers)
    # happen before the baseline is captured.
    model.forward(single_preprocessed, use_cuda_graph=False)
    gc.collect()
    torch.cuda.synchronize(device)
    torch.cuda.empty_cache()

    baseline_gpu = gpu_used_bytes(device)
    baseline_cpu = cpu_rss_bytes()

    # NOTE(review): reaches into a private attribute to install a fresh,
    # empty cache so the profile starts from zero cached graphs.
    model._trt_cuda_graph_cache = TRTCudaGraphLRUCache(
        capacity=args.cache_capacity,
    )

    if args.shuffle:
        # Random batch sizes in [1, max]; may repeat and may skip sizes.
        batch_size_sequence = [
            rng.randint(1, args.max_batch_size) for _ in range(args.num_steps)
        ]
    else:
        # Deterministic cycle 1, 2, ..., max, 1, 2, ...
        all_sizes = list(range(1, args.max_batch_size + 1))
        batch_size_sequence = [
            all_sizes[i % len(all_sizes)] for i in range(args.num_steps)
        ]

    batch_sizes = []
    cumulative_gpu_mb = []
    cumulative_cpu_mb = []

    for i, bs in enumerate(batch_size_sequence):
        # Expand the single preprocessed tensor to the step's batch size;
        # .contiguous() materializes real memory for the expanded view.
        batched = single_preprocessed.expand(bs, -1, -1, -1).contiguous()
        output = model.forward(batched, use_cuda_graph=True)
        del output
        gc.collect()
        torch.cuda.synchronize(device)

        gpu = gpu_used_bytes(device)
        cpu = cpu_rss_bytes()
        cache_size = len(model._trt_cuda_graph_cache.cache)

        batch_sizes.append(bs)
        cumulative_gpu_mb.append((gpu - baseline_gpu) / MB)
        cumulative_cpu_mb.append((cpu - baseline_cpu) / MB)

        print(
            f"[{i + 1}/{args.num_steps}] bs={bs:>2d} | "
            f"cache: {cache_size}/{args.cache_capacity} | "
            f"GPU: {cumulative_gpu_mb[-1]:>7.1f} MB | "
            f"CPU: {cumulative_cpu_mb[-1]:>7.1f} MB"
        )

    mode = "shuffle" if args.shuffle else "sequential"
    autogenerated_name = f"vram_{MODEL_ID}_cap{args.cache_capacity}_{mode}.png"
    output_path = Path(args.output) if args.output else Path(autogenerated_name)

    fig, ax = plt.subplots(figsize=(14, 6))
    fig.suptitle(
        f"Memory vs. Step (cache capacity={args.cache_capacity}, "
        f"batch sizes 1-{args.max_batch_size}) -- {MODEL_ID}",
        fontsize=14,
    )

    steps = np.arange(len(batch_sizes))

    ax.plot(steps, cumulative_gpu_mb, color="steelblue", marker=".", label="GPU VRAM")
    ax.plot(steps, cumulative_cpu_mb, color="seagreen", marker=".", label="CPU RSS")
    ax.set_ylabel("Memory above baseline (MB)")
    ax.set_xlabel("Step")
    # Annotate each point with its batch size so cache hits/misses can be
    # correlated with the memory curve.
    for i, bs in enumerate(batch_sizes):
        ax.annotate(
            str(bs), (i, cumulative_gpu_mb[i]),
            textcoords="offset points", xytext=(0, 6),
            fontsize=6, ha="center", color="steelblue",
        )
    ax.legend()

    plt.tight_layout()
    fig.savefig(output_path, dpi=150)
    print(f"\nPlot saved to {output_path}")

    print(f"\nFinal GPU VRAM above baseline: {cumulative_gpu_mb[-1]:.1f} MB")
    print(f"Final CPU RSS above baseline: {cumulative_cpu_mb[-1]:.1f} MB")
    print(f"Peak GPU VRAM above baseline: {max(cumulative_gpu_mb):.1f} MB")
    # NOTE(review): cache_size is the loop variable from the last step; this
    # line (and the max() above) raises if --num-steps is 0.
    print(f"Cache entries at end: {cache_size}/{args.cache_capacity}")


# Script entry point.
if __name__ == "__main__":
    main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import os
import time

import cv2
import numpy as np
import torch
from tqdm import tqdm

from inference_models import AutoModel
from inference_models.models.common.trt import TRTCudaGraphLRUCache

# Runtime knobs, overridable via environment variables.
# Optional path to a real test image; a random 224x224 image is used when unset.
IMAGE_PATH = os.environ.get("IMAGE_PATH", None)
DEVICE = os.environ.get("DEVICE", "cuda:0")
# Timed forward passes per measurement (int() accepts "_" digit separators).
CYCLES = int(os.environ.get("CYCLES", "10_000"))
# Untimed warmup passes before timing starts.
WARMUP = int(os.environ.get("WARMUP", "50"))


def main() -> None:
    """Benchmark rfdetr-nano TRT forward passes with and without CUDA graphs.

    Compares three modes — plain forward passes, CUDA graphs with a forced
    recapture every iteration, and CUDA graphs replayed from the cache — and
    reports each as forward passes per second plus a speedup factor.
    """
    model = AutoModel.from_pretrained(
        model_id_or_path="rfdetr-nano", device=torch.device(DEVICE), backend="trt"
    )

    if IMAGE_PATH is not None:
        image = cv2.imread(IMAGE_PATH)
    else:
        image = (np.random.rand(224, 224, 3) * 255).astype(np.uint8)

    pre_processed, _ = model.pre_process(image)

    for _ in range(WARMUP):
        model.forward(pre_processed, use_cuda_graph=False)
        model.forward(pre_processed, use_cuda_graph=True)

    # Recapture is far slower than either other mode, so it runs fewer cycles.
    recapture_cycles = 100

    # Synchronize before/after each timed section so perf_counter measures
    # kernel completion rather than just asynchronous launch overhead.
    print("Timing without CUDA graphs...")
    torch.cuda.synchronize()
    start = time.perf_counter()
    for _ in range(CYCLES):
        model.forward(pre_processed, use_cuda_graph=False)
    torch.cuda.synchronize()
    baseline_fps = CYCLES / (time.perf_counter() - start)

    print("Timing with forced CUDA graph recapture each step...")
    torch.cuda.synchronize()
    start = time.perf_counter()
    for _ in range(recapture_cycles):
        # Replacing the cache discards captured graphs, forcing a re-capture.
        model._trt_cuda_graph_cache = TRTCudaGraphLRUCache(capacity=16)
        model.forward(pre_processed, use_cuda_graph=True)
    torch.cuda.synchronize()
    cudagraph_recapture_fps = recapture_cycles / (time.perf_counter() - start)

    print("Timing with CUDA graph caching and replaying...")
    model.forward(pre_processed, use_cuda_graph=True)  # initial capture
    torch.cuda.synchronize()
    start = time.perf_counter()
    for _ in range(CYCLES):
        model.forward(pre_processed, use_cuda_graph=True)
    torch.cuda.synchronize()
    cudagraph_replay_fps = CYCLES / (time.perf_counter() - start)

    print(f"\n{'='*50}")
    print(f"Forward pass FPS (no CUDA graphs): {baseline_fps:.1f}")
    print(f"Forward pass FPS (CUDA graphs recapture): {cudagraph_recapture_fps:.1f}")
    print(f"Speed factor (recapture): {cudagraph_recapture_fps / baseline_fps:.2f}x")
    print(f"Forward pass FPS (CUDA graphs replay): {cudagraph_replay_fps:.1f}")
    print(f"Speed factor (replay): {cudagraph_replay_fps / baseline_fps:.2f}x")
    print(f"{'='*50}")


# Script entry point.
if __name__ == "__main__":
    main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import os
import time

import numpy as np
import torch
from tqdm import tqdm

from inference_models import AutoModel

# Runtime knobs, overridable via environment variables.
DEVICE = os.environ.get("DEVICE", "cuda:0")
# int() accepts "_" digit separators (PEP 515).
CYCLES = int(os.environ.get("CYCLES", "10_000"))
WARMUP = int(os.environ.get("WARMUP", "50"))
RECAPTURE_CYCLES = int(os.environ.get("RECAPTURE_CYCLES", "100"))

# Enable the TRT CUDA-graphs feature flag. The variable name is fixed to
# USE_CUDA_GRAPHS_FOR_TRT_BACKEND, which is what configuration.py actually
# reads; the previous name "USE_TRT_CUDA_GRAPHS" is not defined there.
# NOTE(review): this is set after `inference_models` has been imported — if
# the configuration module reads the environment at import time this takes no
# effect; set it before the import (or in the shell). TODO confirm.
os.environ["USE_CUDA_GRAPHS_FOR_TRT_BACKEND"] = "True"

# Dynamic batch sizes cycled through during profiling.
BATCH_SIZES = [1, 2, 3]


def main() -> None:
    """Benchmark yolov8n-640 TRT forward passes while cycling batch sizes.

    Measures three modes over batch sizes in BATCH_SIZES: no CUDA graphs,
    CUDA graphs with the cache cleared every iteration (forced recapture),
    and CUDA graphs replayed from a warm cache. Reports fwd/s for each.
    """

    model = AutoModel.from_pretrained(
        model_id_or_path="yolov8n-640",
        device=torch.device(DEVICE),
        backend="trt",
        # Dynamic batch axis spanning all profiled batch sizes.
        batch_size=(1, max(BATCH_SIZES)),
    )

    image = (np.random.rand(224, 224, 3) * 255).astype(np.uint8)
    pre_processed_single, _ = model.pre_process(image)

    # One pre-built batch per size so batching cost is excluded from timing.
    batches = {
        bs: pre_processed_single.repeat(bs, 1, 1, 1) for bs in BATCH_SIZES
    }

    # ── Warmup ──────────────────────────────────────────────────────────
    # Runs both modes for every batch size so graphs are captured up front.
    for _ in range(WARMUP):
        for batch in batches.values():
            model.forward(batch, use_cuda_graph=False)
            model.forward(batch, use_cuda_graph=True)

    bs_label = "/".join(str(bs) for bs in BATCH_SIZES)

    # ── (1) Cycling batch sizes, no CUDA graphs ─────────────────────────
    print(f"Timing without CUDA graphs, cycling bs={bs_label}...")
    torch.cuda.synchronize()
    start = time.perf_counter()
    for i in range(CYCLES):
        batch = batches[BATCH_SIZES[i % len(BATCH_SIZES)]]
        model.forward(batch, use_cuda_graph=False)
    torch.cuda.synchronize()
    baseline_fps = CYCLES / (time.perf_counter() - start)

    # ── (2) Cycling batch sizes, CUDA graphs with forced recapture ──────
    print(
        f"Timing with CUDA graph recapture every iteration, cycling bs={bs_label} "
        f"({RECAPTURE_CYCLES} iters)..."
    )
    torch.cuda.synchronize()
    start = time.perf_counter()
    for i in range(RECAPTURE_CYCLES):
        # NOTE(review): clears a private cache attribute to force re-capture.
        model._trt_cuda_graph_cache.cache.clear()
        batch = batches[BATCH_SIZES[i % len(BATCH_SIZES)]]
        model.forward(batch, use_cuda_graph=True)
    torch.cuda.synchronize()
    recapture_fps = RECAPTURE_CYCLES / (time.perf_counter() - start)

    # ── (3) Cycling batch sizes, CUDA graphs with normal caching ────────
    # Re-capture one graph per batch size before timing pure replay.
    model._trt_cuda_graph_cache.cache.clear()
    for batch in batches.values():
        model.forward(batch, use_cuda_graph=True)

    print(f"Timing with CUDA graph cache replay, cycling bs={bs_label}...")
    torch.cuda.synchronize()
    start = time.perf_counter()
    for i in range(CYCLES):
        batch = batches[BATCH_SIZES[i % len(BATCH_SIZES)]]
        model.forward(batch, use_cuda_graph=True)
    torch.cuda.synchronize()
    replay_fps = CYCLES / (time.perf_counter() - start)

    # ── Results ─────────────────────────────────────────────────────────
    print(f"\n{'='*60}")
    print(f" yolov8n-640 TRT — cycling batch sizes {BATCH_SIZES}")
    print(f" {CYCLES} iterations (recapture: {RECAPTURE_CYCLES})")
    print(f"{'='*60}")
    print(f" No CUDA graphs: {baseline_fps:>8.1f} fwd/s")
    print(f" CUDA graph recapture: {recapture_fps:>8.1f} fwd/s ({recapture_fps / baseline_fps:.2f}x)")
    print(f" CUDA graph replay: {replay_fps:>8.1f} fwd/s ({replay_fps / baseline_fps:.2f}x)")
    print(f"{'='*60}")


# Script entry point.
if __name__ == "__main__":
    main()
5 changes: 5 additions & 0 deletions inference_models/inference_models/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,11 @@
"ALLOW_LOCAL_STORAGE_ACCESS_FOR_REFERENCE_DATA"
)

# Opt-in feature flag for using CUDA graphs with the TensorRT backend.
# Defaults to False; enable by setting USE_CUDA_GRAPHS_FOR_TRT_BACKEND=True
# in the environment before this module is imported.
USE_CUDA_GRAPHS_FOR_TRT_BACKEND = get_boolean_from_env(
    variable_name="USE_CUDA_GRAPHS_FOR_TRT_BACKEND",
    default=False,
)

# General model parameters defaults

INFERENCE_MODELS_DEFAULT_CONFIDENCE = get_float_from_env(
Expand Down
Loading
Loading