Skip to content
Merged
Show file tree
Hide file tree
Changes from 50 commits
Commits
Show all changes
65 commits
Select commit Hold shift + click to select a range
6f6c44e
pass TRT graph state up and dwon call stack and acache in RFDetrObjDe…
mkaic Jan 23, 2026
549ca10
actually passing it up and down the stack
mkaic Jan 23, 2026
6412efe
three-branch solution
mkaic Jan 23, 2026
08888e3
avoid breaking things due to chagne in infer_with_trt_engine API
mkaic Jan 23, 2026
adda4aa
update unpacking in the rest of the TRT.py files
mkaic Jan 23, 2026
97fdcf0
clean up profiling script
mkaic Jan 23, 2026
470addb
remove tqdm from profiling script
mkaic Jan 23, 2026
8cca264
format
mkaic Jan 23, 2026
5b7d0a5
allow flag to be passed to rfdetr-seg models even though there don't…
mkaic Jan 23, 2026
a27ae37
reduce number of diffed files
mkaic Jan 23, 2026
f1a6afb
Merge branch 'main' into feature/rfdetr-trt-use-cudagraphs
mkaic Jan 23, 2026
04c015a
don't rename existing function
mkaic Jan 23, 2026
ac50a1a
add proper integration test and simplify profiling script
mkaic Jan 23, 2026
c1c1329
Merge branch 'main' into feature/rfdetr-trt-use-cudagraphs
mkaic Jan 23, 2026
9512229
profile how long it takes to capture cuda graph
mkaic Jan 23, 2026
d81b7d5
Merge branch 'main' into feature/rfdetr-trt-use-cudagraphs
mkaic Jan 28, 2026
1ebe492
Merge branch 'main' into feature/rfdetr-trt-use-cudagraphs
mkaic Feb 3, 2026
d5b51f9
add LRU (shape, device, dtype) caching for CUDA graphs
mkaic Feb 3, 2026
dbd45f9
add USE_CUDA_GRAPHS_FOR_TRT_BACKEND environment variable which defaul…
mkaic Feb 3, 2026
9502b8e
fix bug in profiling script
mkaic Feb 3, 2026
320fdef
Merge branch 'main' into feature/rfdetr-trt-use-cudagraphs
mkaic Feb 9, 2026
cb70538
use yolov8 with dynamic batch size to test shape caching for CUDA graphs
mkaic Feb 10, 2026
6b1d430
add instance seg tests
mkaic Feb 10, 2026
7c23300
update conftest
mkaic Feb 10, 2026
a27c80c
add batch-size-cycling profiling for TRT cudagraphs with yolov8
mkaic Feb 10, 2026
14a45ea
Merge branch 'feature/rfdetr-trt-use-cudagraphs' of github.com:robofl…
mkaic Feb 10, 2026
212b2d6
fix failing test
mkaic Feb 10, 2026
4204f4f
first stab at responding to Pawel's feedback
mkaic Feb 11, 2026
51f191c
working on memory profiling for cudagraphs
mkaic Feb 11, 2026
a80a572
simplify memory profiling script
mkaic Feb 11, 2026
845fabd
tweaks
mkaic Feb 11, 2026
3294cae
update tests to work with the new cache
mkaic Feb 11, 2026
31bb420
Merge branch 'main' into feature/rfdetr-trt-use-cudagraphs
mkaic Feb 11, 2026
bbb2540
thanks for the PR review, Claude
mkaic Feb 11, 2026
4eb23fc
see effect of cache size on vram profile script
mkaic Feb 11, 2026
aa87393
reduce default cache size to 16 after seeing memory usage
mkaic Feb 11, 2026
b5c1f6b
make style
mkaic Feb 11, 2026
a386f3b
update default and fix profiling script
mkaic Feb 11, 2026
5f4d3ea
fix imports in trt tests
mkaic Feb 11, 2026
5ed39f0
Merge branch 'main' into feature/rfdetr-trt-use-cudagraphs
mkaic Mar 3, 2026
3f3be28
further merge conflict resolution
mkaic Mar 3, 2026
24b8ed4
Revert accidental formatting changes unrelated to branch
mkaic Mar 3, 2026
574e684
set this feature flag to false by default
mkaic Mar 3, 2026
c2ab80c
Merge branch 'main' into feature/rfdetr-trt-use-cudagraphs
mkaic Mar 3, 2026
dfeb03d
Merge branch 'main' into feature/rfdetr-trt-use-cudagraphs
mkaic Mar 4, 2026
370095e
Merge branch 'main' into feature/rfdetr-trt-use-cudagraphs
mkaic Mar 4, 2026
d0f699a
resolve merge conflicts
mkaic Mar 10, 2026
077732d
fix cache profiling script
mkaic Mar 10, 2026
95c1090
Merge branch 'main' into feature/rfdetr-trt-use-cudagraphs
mkaic Mar 11, 2026
1257f87
Merge branch 'main' into feature/rfdetr-trt-use-cudagraphs
mkaic Mar 13, 2026
04e4d51
Merge branch 'main' into feature/rfdetr-trt-use-cudagraphs
PawelPeczek-Roboflow Mar 18, 2026
b7ea2a0
Add changes to TRT CUDA Graphs cache
PawelPeczek-Roboflow Mar 18, 2026
44030c2
Fix baseline TRT tests
PawelPeczek-Roboflow Mar 18, 2026
917def0
Bump version
PawelPeczek-Roboflow Mar 18, 2026
002a4e4
Extend tests with multi-forward-pass cases to see if predictions matc…
PawelPeczek-Roboflow Mar 18, 2026
6648c5c
Adjust tests and add docs
PawelPeczek-Roboflow Mar 18, 2026
f4a2788
Adjust docs
PawelPeczek-Roboflow Mar 18, 2026
a820aae
Adjust docs
PawelPeczek-Roboflow Mar 18, 2026
4a9b62b
Add more docs
PawelPeczek-Roboflow Mar 18, 2026
f9aeec8
Fix GH workflow
PawelPeczek-Roboflow Mar 18, 2026
3e6dd5c
Enforce replay after cuda graph is recorded to get actual results
PawelPeczek-Roboflow Mar 18, 2026
ba4f5f2
Alter YOLONAS tests to ensure repeatable predictions with warmup
PawelPeczek-Roboflow Mar 18, 2026
d10ecfb
Fix imports in docscrings
PawelPeczek-Roboflow Mar 18, 2026
4e89392
Bump version
PawelPeczek-Roboflow Mar 18, 2026
1d4e961
Bump version of inference-models in inference requirements
PawelPeczek-Roboflow Mar 18, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
185 changes: 185 additions & 0 deletions inference_models/development/profiling/profile_cudagraph_vram.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
"""Profile GPU and CPU memory usage as CUDA graphs are cached and evicted.

Loads yolov8n-640 as a TRT model with dynamic batch size, runs forward passes
with random batch sizes, and after each step records both GPU VRAM
(driver-level) and process CPU RSS. The cache capacity is smaller than the
number of distinct batch sizes, so eviction is exercised and memory usage
should plateau.

Example invocation:
python profile_cudagraph_vram.py \
--device cuda:0 \
--num-steps 64 \
--max-batch-size 16 \
--cache-capacity 16 \
--output vram_sequential.png

python profile_cudagraph_vram.py \
--device cuda:0 \
--num-steps 64 \
--max-batch-size 16 \
--cache-capacity 16 \
--shuffle \
--output vram_shuffle.png

python profile_cudagraph_vram.py \
--device cuda:0 \
--shuffle \
--num-steps 64 \
--max-batch-size 16 \
--cache-capacity 8 \
--output vram_shuffle_eviction.png

python profile_cudagraph_vram.py \
--device cuda:0 \
--shuffle \
--num-steps 64 \
--max-batch-size 2 \
--cache-capacity 2 \
--output vram_two_batch_sizes.png
"""

import argparse
import gc
import os
import random
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import torch

from inference_models import AutoModel
from inference_models.models.common.trt import TRTCudaGraphLRUCache

# Model used for every profiling run; built as a TRT engine with a dynamic
# batch axis (see batch_size=(1, args.max_batch_size) below).
MODEL_ID = "yolov8n-640"
# Bytes per mebibyte; memory deltas are reported in MB.
MB = 1024 ** 2


def gpu_used_bytes(device: torch.device) -> int:
    """Return the number of bytes currently allocated on *device*, as seen by the driver."""
    free_bytes, total_bytes = torch.cuda.mem_get_info(device)
    used = total_bytes - free_bytes
    return used


def cpu_rss_bytes() -> int:
    """Return this process's resident set size in bytes, read from /proc (Linux only)."""
    statm_path = f"/proc/{os.getpid()}/statm"
    with open(statm_path) as statm:
        fields = statm.read().split()
    # Field 1 of statm is the resident page count.
    resident_pages = int(fields[1])
    return resident_pages * os.sysconf("SC_PAGE_SIZE")


def parse_args() -> argparse.Namespace:
    """Parse the command-line options controlling the profiling run."""
    arg_parser = argparse.ArgumentParser(
        description="Profile GPU + CPU memory vs. number of cached CUDA graphs.",
    )
    arg_parser.add_argument("--device", type=str, default="cuda:0")
    arg_parser.add_argument("--max-batch-size", type=int, default=16)
    arg_parser.add_argument("--cache-capacity", type=int, default=8)
    arg_parser.add_argument("--num-steps", type=int, default=32)
    arg_parser.add_argument(
        "--shuffle",
        action="store_true",
        help="Randomize batch size order instead of sequential cycling.",
    )
    arg_parser.add_argument("--seed", type=int, default=42)
    arg_parser.add_argument("--output", type=str, default=None)
    return arg_parser.parse_args()


def main() -> None:
    """Run the memory-profiling loop and save a per-step GPU/CPU memory plot.

    Loads the TRT model, records a memory baseline after one graph-free
    forward pass, then runs ``--num-steps`` graph-enabled forward passes with
    varying batch sizes while logging VRAM and RSS deltas after each step.
    """
    args = parse_args()
    device = torch.device(args.device)

    rng = random.Random(args.seed)

    model = AutoModel.from_pretrained(
        model_id_or_path=MODEL_ID,
        device=device,
        backend="trt",
        batch_size=(1, args.max_batch_size),
        cuda_graph_cache_capacity=args.cache_capacity,
    )

    image = (np.random.rand(640, 640, 3) * 255).astype(np.uint8)
    single_preprocessed, _ = model.pre_process(image)

    # One graph-free pass so lazy allocations (engine workspace, buffers)
    # happen before the baseline is captured.
    model.forward(single_preprocessed, use_cuda_graph=False)
    gc.collect()
    torch.cuda.synchronize(device)
    torch.cuda.empty_cache()

    baseline_gpu = gpu_used_bytes(device)
    baseline_cpu = cpu_rss_bytes()

    # NOTE(review): reaches into a private attribute to install a fresh,
    # empty cache so the profile starts from zero cached graphs.
    model._trt_cuda_graph_cache = TRTCudaGraphLRUCache(
        capacity=args.cache_capacity,
    )

    if args.shuffle:
        # Random batch sizes in [1, max]; may repeat and may skip sizes.
        batch_size_sequence = [
            rng.randint(1, args.max_batch_size) for _ in range(args.num_steps)
        ]
    else:
        # Deterministic cycle 1, 2, ..., max, 1, 2, ...
        all_sizes = list(range(1, args.max_batch_size + 1))
        batch_size_sequence = [
            all_sizes[i % len(all_sizes)] for i in range(args.num_steps)
        ]

    batch_sizes = []
    cumulative_gpu_mb = []
    cumulative_cpu_mb = []

    for i, bs in enumerate(batch_size_sequence):
        # Expand the single preprocessed tensor to the step's batch size;
        # .contiguous() materializes real memory for the expanded view.
        batched = single_preprocessed.expand(bs, -1, -1, -1).contiguous()
        output = model.forward(batched, use_cuda_graph=True)
        del output
        gc.collect()
        torch.cuda.synchronize(device)

        gpu = gpu_used_bytes(device)
        cpu = cpu_rss_bytes()
        cache_size = len(model._trt_cuda_graph_cache.cache)

        batch_sizes.append(bs)
        cumulative_gpu_mb.append((gpu - baseline_gpu) / MB)
        cumulative_cpu_mb.append((cpu - baseline_cpu) / MB)

        print(
            f"[{i + 1}/{args.num_steps}] bs={bs:>2d} | "
            f"cache: {cache_size}/{args.cache_capacity} | "
            f"GPU: {cumulative_gpu_mb[-1]:>7.1f} MB | "
            f"CPU: {cumulative_cpu_mb[-1]:>7.1f} MB"
        )

    mode = "shuffle" if args.shuffle else "sequential"
    autogenerated_name = f"vram_{MODEL_ID}_cap{args.cache_capacity}_{mode}.png"
    output_path = Path(args.output) if args.output else Path(autogenerated_name)

    fig, ax = plt.subplots(figsize=(14, 6))
    fig.suptitle(
        f"Memory vs. Step (cache capacity={args.cache_capacity}, "
        f"batch sizes 1-{args.max_batch_size}) -- {MODEL_ID}",
        fontsize=14,
    )

    steps = np.arange(len(batch_sizes))

    ax.plot(steps, cumulative_gpu_mb, color="steelblue", marker=".", label="GPU VRAM")
    ax.plot(steps, cumulative_cpu_mb, color="seagreen", marker=".", label="CPU RSS")
    ax.set_ylabel("Memory above baseline (MB)")
    ax.set_xlabel("Step")
    # Annotate each point with its batch size so cache hits/misses can be
    # correlated with the memory curve.
    for i, bs in enumerate(batch_sizes):
        ax.annotate(
            str(bs), (i, cumulative_gpu_mb[i]),
            textcoords="offset points", xytext=(0, 6),
            fontsize=6, ha="center", color="steelblue",
        )
    ax.legend()

    plt.tight_layout()
    fig.savefig(output_path, dpi=150)
    print(f"\nPlot saved to {output_path}")

    print(f"\nFinal GPU VRAM above baseline: {cumulative_gpu_mb[-1]:.1f} MB")
    print(f"Final CPU RSS above baseline: {cumulative_cpu_mb[-1]:.1f} MB")
    print(f"Peak GPU VRAM above baseline: {max(cumulative_gpu_mb):.1f} MB")
    # NOTE(review): cache_size is the loop variable from the last step; this
    # line (and the max() above) raises if --num-steps is 0.
    print(f"Cache entries at end: {cache_size}/{args.cache_capacity}")


# Script entry point.
if __name__ == "__main__":
    main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import os
import time

import cv2
import numpy as np
import torch
from tqdm import tqdm

from inference_models import AutoModel
from inference_models.models.common.trt import TRTCudaGraphLRUCache

# Runtime knobs, overridable via environment variables.
# Optional path to a real test image; a random 224x224 image is used when unset.
IMAGE_PATH = os.environ.get("IMAGE_PATH", None)
DEVICE = os.environ.get("DEVICE", "cuda:0")
# Timed forward passes per measurement (int() accepts "_" digit separators).
CYCLES = int(os.environ.get("CYCLES", "10_000"))
# Untimed warmup passes before timing starts.
WARMUP = int(os.environ.get("WARMUP", "50"))


def main() -> None:
    """Benchmark rfdetr-nano TRT forward passes with and without CUDA graphs.

    Compares three modes — plain forward passes, CUDA graphs with a forced
    recapture every iteration, and CUDA graphs replayed from the cache — and
    reports each as forward passes per second plus a speedup factor.
    """
    model = AutoModel.from_pretrained(
        model_id_or_path="rfdetr-nano", device=torch.device(DEVICE), backend="trt"
    )

    if IMAGE_PATH is not None:
        image = cv2.imread(IMAGE_PATH)
    else:
        image = (np.random.rand(224, 224, 3) * 255).astype(np.uint8)

    pre_processed, _ = model.pre_process(image)

    for _ in range(WARMUP):
        model.forward(pre_processed, use_cuda_graph=False)
        model.forward(pre_processed, use_cuda_graph=True)

    # Recapture is far slower than either other mode, so it runs fewer cycles.
    recapture_cycles = 100

    # Synchronize before/after each timed section so perf_counter measures
    # kernel completion rather than just asynchronous launch overhead.
    print("Timing without CUDA graphs...")
    torch.cuda.synchronize()
    start = time.perf_counter()
    for _ in range(CYCLES):
        model.forward(pre_processed, use_cuda_graph=False)
    torch.cuda.synchronize()
    baseline_fps = CYCLES / (time.perf_counter() - start)

    print("Timing with forced CUDA graph recapture each step...")
    torch.cuda.synchronize()
    start = time.perf_counter()
    for _ in range(recapture_cycles):
        # Replacing the cache discards captured graphs, forcing a re-capture.
        model._trt_cuda_graph_cache = TRTCudaGraphLRUCache(capacity=16)
        model.forward(pre_processed, use_cuda_graph=True)
    torch.cuda.synchronize()
    cudagraph_recapture_fps = recapture_cycles / (time.perf_counter() - start)

    print("Timing with CUDA graph caching and replaying...")
    model.forward(pre_processed, use_cuda_graph=True)  # initial capture
    torch.cuda.synchronize()
    start = time.perf_counter()
    for _ in range(CYCLES):
        model.forward(pre_processed, use_cuda_graph=True)
    torch.cuda.synchronize()
    cudagraph_replay_fps = CYCLES / (time.perf_counter() - start)

    print(f"\n{'='*50}")
    print(f"Forward pass FPS (no CUDA graphs): {baseline_fps:.1f}")
    print(f"Forward pass FPS (CUDA graphs recapture): {cudagraph_recapture_fps:.1f}")
    print(f"Speed factor (recapture): {cudagraph_recapture_fps / baseline_fps:.2f}x")
    print(f"Forward pass FPS (CUDA graphs replay): {cudagraph_replay_fps:.1f}")
    print(f"Speed factor (replay): {cudagraph_replay_fps / baseline_fps:.2f}x")
    print(f"{'='*50}")


# Script entry point.
if __name__ == "__main__":
    main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import os
import time

import numpy as np
import torch
from tqdm import tqdm

from inference_models import AutoModel

# Runtime knobs, overridable via environment variables.
DEVICE = os.environ.get("DEVICE", "cuda:0")
# int() accepts "_" digit separators (PEP 515).
CYCLES = int(os.environ.get("CYCLES", "10_000"))
WARMUP = int(os.environ.get("WARMUP", "50"))
RECAPTURE_CYCLES = int(os.environ.get("RECAPTURE_CYCLES", "100"))

# Enable the TRT CUDA-graphs feature flag. The variable name is fixed to
# USE_CUDA_GRAPHS_FOR_TRT_BACKEND, which is what configuration.py actually
# reads; the previous name "USE_TRT_CUDA_GRAPHS" is not defined there.
# NOTE(review): this is set after `inference_models` has been imported — if
# the configuration module reads the environment at import time this takes no
# effect; set it before the import (or in the shell). TODO confirm.
os.environ["USE_CUDA_GRAPHS_FOR_TRT_BACKEND"] = "True"

# Dynamic batch sizes cycled through during profiling.
BATCH_SIZES = [1, 2, 3]


def main() -> None:
    """Benchmark yolov8n-640 TRT forward passes while cycling batch sizes.

    Measures three modes over batch sizes in BATCH_SIZES: no CUDA graphs,
    CUDA graphs with the cache cleared every iteration (forced recapture),
    and CUDA graphs replayed from a warm cache. Reports fwd/s for each.
    """

    model = AutoModel.from_pretrained(
        model_id_or_path="yolov8n-640",
        device=torch.device(DEVICE),
        backend="trt",
        # Dynamic batch axis spanning all profiled batch sizes.
        batch_size=(1, max(BATCH_SIZES)),
    )

    image = (np.random.rand(224, 224, 3) * 255).astype(np.uint8)
    pre_processed_single, _ = model.pre_process(image)

    # One pre-built batch per size so batching cost is excluded from timing.
    batches = {
        bs: pre_processed_single.repeat(bs, 1, 1, 1) for bs in BATCH_SIZES
    }

    # ── Warmup ──────────────────────────────────────────────────────────
    # Runs both modes for every batch size so graphs are captured up front.
    for _ in range(WARMUP):
        for batch in batches.values():
            model.forward(batch, use_cuda_graph=False)
            model.forward(batch, use_cuda_graph=True)

    bs_label = "/".join(str(bs) for bs in BATCH_SIZES)

    # ── (1) Cycling batch sizes, no CUDA graphs ─────────────────────────
    print(f"Timing without CUDA graphs, cycling bs={bs_label}...")
    torch.cuda.synchronize()
    start = time.perf_counter()
    for i in range(CYCLES):
        batch = batches[BATCH_SIZES[i % len(BATCH_SIZES)]]
        model.forward(batch, use_cuda_graph=False)
    torch.cuda.synchronize()
    baseline_fps = CYCLES / (time.perf_counter() - start)

    # ── (2) Cycling batch sizes, CUDA graphs with forced recapture ──────
    print(
        f"Timing with CUDA graph recapture every iteration, cycling bs={bs_label} "
        f"({RECAPTURE_CYCLES} iters)..."
    )
    torch.cuda.synchronize()
    start = time.perf_counter()
    for i in range(RECAPTURE_CYCLES):
        # NOTE(review): clears a private cache attribute to force re-capture.
        model._trt_cuda_graph_cache.cache.clear()
        batch = batches[BATCH_SIZES[i % len(BATCH_SIZES)]]
        model.forward(batch, use_cuda_graph=True)
    torch.cuda.synchronize()
    recapture_fps = RECAPTURE_CYCLES / (time.perf_counter() - start)

    # ── (3) Cycling batch sizes, CUDA graphs with normal caching ────────
    # Re-capture one graph per batch size before timing pure replay.
    model._trt_cuda_graph_cache.cache.clear()
    for batch in batches.values():
        model.forward(batch, use_cuda_graph=True)

    print(f"Timing with CUDA graph cache replay, cycling bs={bs_label}...")
    torch.cuda.synchronize()
    start = time.perf_counter()
    for i in range(CYCLES):
        batch = batches[BATCH_SIZES[i % len(BATCH_SIZES)]]
        model.forward(batch, use_cuda_graph=True)
    torch.cuda.synchronize()
    replay_fps = CYCLES / (time.perf_counter() - start)

    # ── Results ─────────────────────────────────────────────────────────
    print(f"\n{'='*60}")
    print(f" yolov8n-640 TRT — cycling batch sizes {BATCH_SIZES}")
    print(f" {CYCLES} iterations (recapture: {RECAPTURE_CYCLES})")
    print(f"{'='*60}")
    print(f" No CUDA graphs: {baseline_fps:>8.1f} fwd/s")
    print(f" CUDA graph recapture: {recapture_fps:>8.1f} fwd/s ({recapture_fps / baseline_fps:.2f}x)")
    print(f" CUDA graph replay: {replay_fps:>8.1f} fwd/s ({replay_fps / baseline_fps:.2f}x)")
    print(f"{'='*60}")


# Script entry point.
if __name__ == "__main__":
    main()
5 changes: 5 additions & 0 deletions inference_models/inference_models/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,11 @@
"ALLOW_LOCAL_STORAGE_ACCESS_FOR_REFERENCE_DATA"
)

# Opt-in feature flag for using CUDA graphs with the TensorRT backend.
# Defaults to False; enable by setting USE_CUDA_GRAPHS_FOR_TRT_BACKEND=True
# in the environment before this module is imported.
USE_CUDA_GRAPHS_FOR_TRT_BACKEND = get_boolean_from_env(
    variable_name="USE_CUDA_GRAPHS_FOR_TRT_BACKEND",
    default=False,
)

# General model parameters defaults

INFERENCE_MODELS_DEFAULT_CONFIDENCE = get_float_from_env(
Expand Down
Loading
Loading