Commit e4a6730

Author: David Pichler

feat: add support for compiling onnx models with trtexec

1 parent 7d25e54

10 files changed: +1844 -2 lines

examples/trt_compile/.gitignore

Lines changed: 2 additions & 0 deletions

```
*.onnx
model-repo
```

examples/trt_compile/README.md

Lines changed: 96 additions & 0 deletions

# TensorRT Compilation — ResNet-18 Image Classification

This example compiles a pretrained ResNet-18 ONNX model to a TensorRT engine during the build phase using `trt_compile`, then serves it on Triton Inference Server.

During build, `tsbk` will:

1. Download the ONNX model artifact from MLflow
2. Compile it to a `.plan` file using `trtexec` with fp16 precision (via Docker or a Kubernetes Job)
3. Set the backend to `tensorrt` in the generated `config.pbtxt`
4. Cache the compiled engine locally so subsequent builds skip compilation

## Prerequisites

- Install example requirements:

```bash
pip install -r requirements.txt
```

- **Docker with GPU access** (for local compilation), or
- **Kubernetes cluster with GPU nodes** plus the `TSBK_S3_PREFIX` env var set (for remote compilation)

## Setup

Export a pretrained ResNet-18 to ONNX and register it with MLflow:

```bash
python create-model.py
```

This exports `resnet18.onnx` with:

- Input `image`: `[batch, 3, 224, 224]` float32 (ImageNet-normalized RGB)
- Output `logits`: `[batch, 1000]` float32 (class scores)
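The ImageNet normalization is applied on the client side before sending the tensor. A minimal numpy sketch of building a valid `image` input from a 224x224 RGB frame (the `preprocess` helper is illustrative, not part of this example's code):

```python
import numpy as np

# Standard torchvision ImageNet normalization constants
IMAGENET_MEAN = np.array([0.485, 0.456, 0.406], dtype=np.float32)
IMAGENET_STD = np.array([0.229, 0.224, 0.225], dtype=np.float32)

def preprocess(rgb_uint8: np.ndarray) -> np.ndarray:
    """Turn a [224, 224, 3] uint8 RGB image into the [1, 3, 224, 224]
    float32 tensor the model's `image` input expects."""
    x = rgb_uint8.astype(np.float32) / 255.0   # scale to [0, 1]
    x = (x - IMAGENET_MEAN) / IMAGENET_STD     # per-channel normalization
    x = np.transpose(x, (2, 0, 1))             # HWC -> CHW
    return x[np.newaxis, ...]                  # add batch dimension
```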
## Build and Run (local GPU via Docker)

```bash
python server.py --test
```

This will:

- Build the model repo, compiling the ONNX model to TensorRT with fp16 precision
- Launch Triton server in a Docker container with GPU access
- Run the MLflow registered input example as a test case
- Stop the server

## Build and Run (remote GPU via Kubernetes)

If you don't have a local GPU but have access to a Kubernetes cluster with GPU nodes, pass `--gpu-name` to target a specific GPU type via Karpenter:

```bash
export TSBK_S3_PREFIX=s3://your-bucket/tsbk-cache
python server.py --test --gpu-name a10g
```

The `--gpu-name` value maps to a Karpenter node selector (`karpenter.k8s.aws/instance-gpu-name`) so the compilation job is scheduled on the correct hardware.

## Build Only

```bash
python server.py --build-only
```

After building, the model repo will look like:

```
model-repo/
└── resnet18-trt/
    └── resnet18/
        ├── config.pbtxt   # backend: "tensorrt", max_batch_size: 8
        └── 1/
            └── model.plan # compiled TensorRT engine (fp16)
```
## SDK Usage

The key addition compared to the standard SDK example is the `trt_compile` dict on the model version:

```python
tsbk.TritonModel(
    max_batch_size=8,
    versions=[
        tsbk.TritonModelVersion(
            artifact_uri="models:/resnet18-imagenet/1",
            trt_compile={
                "enabled": True,
                "precision": "fp16",         # optional: fp16, int8, best
                "workspace_size": 4096,      # optional: max workspace in MB
                "gpu_name": "a10g",          # optional: Karpenter GPU node selector for K8s
                "trt_image": "nvcr.io/...",  # optional: override TRT container image
                "extra_args": "--verbose",   # optional: raw trtexec flags
            },
        )
    ],
)
```
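For orientation, these options translate roughly into a `trtexec` command line. A hedged sketch of that mapping (the real command is assembled inside `tsbk.utils.trtexec.build_trt_engine`, which this diff does not show, and exact flag spellings vary between TensorRT versions):

```python
def trtexec_args(onnx_path: str, plan_path: str, trt_compile: dict) -> list[str]:
    """Illustrative reconstruction of the trtexec invocation implied by a
    trt_compile config. Not tsbk's actual implementation."""
    args = [f"--onnx={onnx_path}", f"--saveEngine={plan_path}"]
    precision = trt_compile.get("precision")
    if precision in ("fp16", "int8", "best"):
        args.append(f"--{precision}")  # trtexec precision flags
    if trt_compile.get("workspace_size"):
        # Older trtexec releases spell this --workspace=<MB>; newer ones
        # use --memPoolSize=workspace:<N>M
        args.append(f"--memPoolSize=workspace:{trt_compile['workspace_size']}M")
    if trt_compile.get("extra_args"):
        args.extend(trt_compile["extra_args"].split())
    return ["trtexec", *args]
```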
examples/trt_compile/create-model.py

Lines changed: 39 additions & 0 deletions

```python
import mlflow
import numpy as np
import onnx
import torch
import torchvision.models as models

# Load a pretrained ResNet-18 for image classification
resnet = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
resnet.eval()

# Export to ONNX with a batch dimension and standard ImageNet input shape
# Input: [batch, 3, 224, 224] RGB image normalized to ImageNet stats
# Output: [batch, 1000] class logits
model_path = "resnet18.onnx"
dummy_input = torch.randn(1, 3, 224, 224)
torch.onnx.export(
    resnet,
    dummy_input,
    model_path,
    input_names=["image"],
    output_names=["logits"],
    dynamic_axes={
        "image": {0: "batch_size"},
        "logits": {0: "batch_size"},
    },
    opset_version=17,
)

onnx_model = onnx.load(model_path)

# Log model to MLflow with a sample input (a random "image" tensor)
with mlflow.start_run() as run:
    mlflow.onnx.log_model(
        onnx_model,
        artifact_path="resnet18",
        registered_model_name="resnet18-imagenet",
        input_example={"image": np.random.randn(1, 3, 224, 224).astype(np.float32)},
    )
print(f"Model registered: models:/resnet18-imagenet/1 (run_id={run.info.run_id})")
```
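On the serving side, the raw `logits` output can be turned into a class prediction client-side. A small numpy sketch (the `top1` helper is illustrative, not part of this example's code):

```python
import numpy as np

def top1(logits: np.ndarray) -> tuple[int, float]:
    """Convert a [batch, 1000] logits array into the top-1 class id and
    softmax probability for the first batch element."""
    z = logits[0] - logits[0].max()        # subtract max for numerical stability
    probs = np.exp(z) / np.exp(z).sum()    # softmax over the 1000 classes
    idx = int(probs.argmax())
    return idx, float(probs[idx])
```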
examples/trt_compile/requirements.txt

Lines changed: 5 additions & 0 deletions

```
tsbk
onnx
torch
torchvision
mlflow-skinny
```

examples/trt_compile/server.py

Lines changed: 84 additions & 0 deletions

```python
import argparse

import tsbk


def model_repo(model_repo_path: str, artifact_uri: str, gpu_name: str | None = None) -> tsbk.TritonModelRepo:
    """Build a model repo that compiles a ResNet-18 ONNX model to TensorRT.

    The trt_compile config on the version tells tsbk to:
    1. Download the ONNX model artifact
    2. Compile it to a TensorRT .plan file using trtexec (via Docker or K8s)
    3. Replace the .onnx with the .plan and set backend to tensorrt

    The compiled engine is cached under TSBK_DIR/trt_engines/ — subsequent
    builds with the same ONNX model and compile params skip compilation.

    When gpu_name is set, Kubernetes compilation uses it as a Karpenter node
    selector (karpenter.k8s.aws/instance-gpu-name) to schedule on the right
    GPU hardware. Requires TSBK_S3_PREFIX to be set for artifact transfer.
    """
    trt_compile = {
        "enabled": True,
        "precision": "fp16",
    }
    if gpu_name:
        trt_compile["gpu_name"] = gpu_name

    return tsbk.TritonModelRepo(
        "resnet18-trt",
        path=model_repo_path,
        models={
            "resnet18": tsbk.TritonModel(
                max_batch_size=8,
                versions=[
                    tsbk.TritonModelVersion(
                        artifact_uri=artifact_uri,
                        trt_compile=trt_compile,
                    )
                ],
            )
        },
    )


def main(args):
    repo = model_repo(args.model_repo, args.model_artifact_uri, gpu_name=args.gpu_name)
    repo.build()

    if args.build_only:
        return

    repo.run(detach=args.test, gpus=True)

    if args.test:
        repo.test(url=repo.http_url)
        repo.stop()
        print("Tests passed!")
        return


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Build and serve a TensorRT-compiled ResNet-18 with tsbk")
    parser.add_argument(
        "--model_artifact_uri",
        type=str,
        default="models:/resnet18-imagenet/1",
        help="MLflow model URI for the ONNX ResNet-18",
    )
    parser.add_argument("--model-repo", type=str, default="./model-repo", help="Path to the model repository")
    parser.add_argument(
        "--build-only", action="store_true", help="Only build the model repository without starting the server"
    )
    parser.add_argument("--test", action="store_true", help="Run in test mode")
    parser.add_argument(
        "--gpu-name",
        type=str,
        default=None,
        help="Target GPU for compilation, used as Karpenter node selector (e.g. a10g, t4)",
    )
    args = parser.parse_args()

    assert not (args.build_only and args.test), "Cannot use --build-only and --test together"

    main(args)
```
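The docstring above notes that engines are cached by ONNX model and compile params. The actual cache-key scheme is not shown in this diff; a hypothetical sketch of the idea, hashing the model bytes together with the compile config:

```python
import hashlib
import json

def engine_cache_key(onnx_bytes: bytes, trt_compile: dict) -> str:
    """Hypothetical cache key for a compiled TensorRT engine: identical
    model bytes + identical compile params -> identical key, so a rebuild
    can reuse the cached .plan. Illustrative only, not tsbk's scheme."""
    h = hashlib.sha256()
    h.update(onnx_bytes)  # the ONNX model contents
    h.update(json.dumps(trt_compile, sort_keys=True).encode())  # compile params
    return h.hexdigest()
```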

src/tsbk/model.py

Lines changed: 13 additions & 0 deletions

```diff
@@ -206,6 +206,19 @@ def init(
                 "Please specify a model backend via triton config or by passing the backend directly."
             )
 
+        # Override backend to tensorrt if any version has trt_compile enabled
+        has_trt_compile = any(
+            mv.trt_compile and mv.trt_compile.get("enabled")
+            for mv in self.versions
+            if hasattr(mv, "trt_compile") and mv.trt_compile
+        )
+        if has_trt_compile:
+            if self.backend not in ["tensorrt", "onnxruntime"]:
+                raise ValueError(
+                    "Must specify tensorrt or onnxruntime backend if trt_compile is enabled for any model version."
+                )
+            self.backend = "tensorrt"
+
         if self.python_version and self.backend not in {"python", "mlflow"}:
             raise ValueError("python_version can only be specified for python models")
```

src/tsbk/model_version.py

Lines changed: 30 additions & 2 deletions

```diff
@@ -10,6 +10,7 @@
 from tsbk.utils import link_or_copy
 from tsbk.utils.dbx import download_mlflow_model, get_input_example_from_model
 from tsbk.utils.s3 import download_s3_path, s3_path_exists
+from tsbk.utils.trtexec import _find_onnx_file, build_trt_engine
 
 
 class TritonModelVersion:
@@ -19,17 +20,20 @@ def __init__(
         python_model_file: str | None = None,
         version: int | None = None,
         test_cases: list[TestCase | dict] | None = None,
+        trt_compile: dict | None = None,
     ):
         """A Triton model version.
 
         Args:
             artifact_uri: The URI of the model artifact, which can be an MLflow model or an S3 object.
             python_model_file: The path to the Python model file, which is required for Python models.
             version: The version number of the model.
+            trt_compile: Configuration for compiling ONNX models to TensorRT engines.
         """
         self.artifact_uri = artifact_uri
         self.python_model_file = python_model_file
         self.version = version
+        self.trt_compile = trt_compile
         self.test_cases = test_cases or []
         self.test_cases = [
             TestCase(**test_case) if isinstance(test_case, dict) else test_case for test_case in self.test_cases
@@ -145,8 +149,32 @@ def build(self) -> None:
                 copy_func(dst_path=output_file_path)
 
             case "tensorrt":
-                output_file_path = self.path.joinpath("model.plan").as_posix()
-                copy_func(dst_path=output_file_path)
+                if self.trt_compile and self.trt_compile.get("enabled"):
+                    # Get the ONNX model from cache (no copy into version dir)
+                    if source == "mlflow":
+                        cached_model = copy_func()
+                    else:
+                        cached_model = Path(self.artifact_uri)
+                        if not cached_model.exists():
+                            copy_func(dst_path=cached_model)
+
+                    # Find the .onnx file in the cached model and compile it
+                    onnx_path = _find_onnx_file(cached_model)
+                    plan_path = build_trt_engine(
+                        onnx_path=onnx_path,
+                        precision=self.trt_compile.get("precision"),
+                        workspace_size=self.trt_compile.get("workspace_size"),
+                        extra_args=self.trt_compile.get("extra_args"),
+                        trt_image=self.trt_compile.get("trt_image"),
+                        gpu_name=self.trt_compile.get("gpu_name"),
+                    )
+
+                    # Only place the compiled plan in the version directory
+                    link_or_copy(plan_path, self.path.joinpath("model.plan"))
+                else:
+                    # Standard tensorrt: just copy the .plan file
+                    output_file_path = self.path.joinpath("model.plan").as_posix()
+                    copy_func(dst_path=output_file_path)
 
             case _:
                 output_file_path = self.path.joinpath(self.artifact_uri.split("/")[-1])
```

src/tsbk/spec.py

Lines changed: 19 additions & 0 deletions

```diff
@@ -42,6 +42,23 @@ class TritonDTypeSpec(BaseModel):
     """The dimensions of the Triton data type, which can be used to specify the shape of the input or output tensors."""
 
 
+class TrtCompileSpec(BaseModel):
+    model_config = ConfigDict(extra="forbid")
+
+    enabled: bool = False
+    """Whether to compile ONNX models to TensorRT .plan files using trtexec."""
+    trt_image: str | None = None
+    """Override the TensorRT container image. Defaults to nvcr.io/nvidia/tensorrt:{triton_version}-py3."""
+    precision: str | None = None
+    """Precision mode for trtexec: 'fp16', 'int8', 'best', or None for default (fp32)."""
+    workspace_size: int | None = None
+    """Max workspace size in MB for trtexec."""
+    extra_args: str | None = None
+    """Additional raw trtexec CLI arguments as a string."""
+    gpu_name: str | None = None
+    """Target GPU architecture for compilation. Used as Karpenter node selector for K8s scheduling (e.g., 'A10G', 'T4')."""
+
+
 class TritonModelVersionSpec(BaseModel):
     model_config = ConfigDict(extra="forbid")
@@ -53,6 +70,8 @@
     """The version number of the model"""
     test_cases: list[TestCaseSpec] | None = None
     """A list of test cases for the model version, which can be used to validate the model's behavior."""
+    trt_compile: TrtCompileSpec | None = None
+    """Configuration for compiling ONNX models to TensorRT engines during build."""
 
 
 class TritonModelSpec(BaseModel):
```
