Skip to content

Commit 37c6250

Browse files
Commit message: Specify engine path for TRT evaluation
Signed-off-by: ajrasane <[email protected]>
Parent: 669ae05 · Commit: 37c6250

File tree

4 files changed

+30
-6
lines changed

4 files changed

+30
-6
lines changed

examples/onnx_ptq/evaluate.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,12 @@ def main():
3535
help="""Path to the image classification ONNX model with input shape of
3636
[batch_size,3,224,224] and output shape of [1,1000]""",
3737
)
38+
parser.add_argument(
39+
"--engine_path",
40+
type=str,
41+
required=True,
42+
help="Path to the TensorRT engine",
43+
)
3844
parser.add_argument(
3945
"--imagenet_path", type=str, default=None, help="Path to the imagenet dataset"
4046
)
@@ -80,7 +86,10 @@ def main():
8086
client = RuntimeRegistry.get(deployment)
8187

8288
# Compile the ONNX model to TRT engine and create the device model
83-
compiled_model = client.ir_to_compiled(onnx_bytes)
89+
compilation_args = {
90+
"engine_path": args.engine_path,
91+
}
92+
compiled_model = client.ir_to_compiled(onnx_bytes, compilation_args)
8493
device_model = DeviceModel(client, compiled_model, metadata={})
8594

8695
top1_accuracy, top5_accuracy = 0.0, 0.0

modelopt/torch/_deploy/_runtime/tensorrt/engine_builder.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@ def _update_dynamic_shapes(dynamic_shapes: dict, cmd: list[str]) -> None:
122122
def build_engine(
123123
onnx_bytes: OnnxBytes,
124124
trt_mode: str = TRTMode.FLOAT32,
125+
engine_path: Path | None = None,
125126
calib_cache: str | None = None,
126127
dynamic_shapes: dict | None = None,
127128
plugin_config: dict | None = None,
@@ -133,6 +134,7 @@ def build_engine(
133134
134135
Args:
135136
onnx_bytes: Data of the ONNX model stored as an OnnxBytes object.
137+
engine_path: Path to save the TensorRT engine.
136138
trt_mode: The precision with which the TensorRT engine will be built. Supported modes are:
137139
- TRTMode.FLOAT32
138140
- TRTMode.FLOAT16
@@ -202,22 +204,25 @@ def _build_command(
202204

203205
def _setup_files_and_paths(
204206
tmp_dir_path: Path,
207+
engine_path: Path | None,
205208
) -> tuple[Path, Path, Path | None, Path | None, Path]:
206209
tmp_onnx_dir = tmp_dir_path / "onnx"
207210
onnx_bytes.write_to_disk(str(tmp_onnx_dir))
208211
onnx_path = tmp_onnx_dir / f"{onnx_bytes.model_name}.onnx"
209212

210213
final_output_dir = Path(output_dir or Path(gettempdir()) / DEFAULT_ARTIFACT_DIR)
211214
final_output_dir.mkdir(parents=True, exist_ok=True)
212-
engine_path = final_output_dir / f"{onnx_bytes.model_name}.engine"
215+
engine_path = (
216+
engine_path if engine_path else final_output_dir / f"{onnx_bytes.model_name}.engine"
217+
)
213218
calib_cache_path = final_output_dir / "calib_cache" if calib_cache else None
214219
timing_cache_path = final_output_dir / "timing.cache"
215220

216221
return onnx_path, engine_path, calib_cache_path, timing_cache_path, final_output_dir
217222

218223
with TemporaryDirectory() as tmp_dir:
219224
onnx_path, engine_path, calib_cache_path, timing_cache_path, final_output_dir = (
220-
_setup_files_and_paths(Path(tmp_dir))
225+
_setup_files_and_paths(Path(tmp_dir), engine_path)
221226
)
222227
cmd = _build_command(onnx_path, engine_path, calib_cache_path, timing_cache_path)
223228

modelopt/torch/_deploy/_runtime/trt_client.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,8 @@ def _ir_to_compiled(
7474
7575
Args:
7676
ir_bytes: The ONNX model bytes.
77-
compilation_args: A dictionary of compilation arguments. Supported args: dynamic_shapes, plugin_config.
77+
compilation_args: A dictionary of compilation arguments.
78+
The following arguments are supported: dynamic_shapes, plugin_config, engine_path.
7879
7980
Returns:
8081
The compiled TRT engine bytes.
@@ -86,6 +87,7 @@ def _ir_to_compiled(
8687
onnx_bytes,
8788
dynamic_shapes=compilation_args.get("dynamic_shapes"), # type: ignore[union-attr]
8889
plugin_config=compilation_args.get("plugin_config"), # type: ignore[union-attr]
90+
engine_path=compilation_args.get("engine_path"), # type: ignore[union-attr]
8991
trt_mode=self.deployment["precision"],
9092
verbose=(self.deployment.get("verbose", "false").lower() == "true"),
9193
)

tests/examples/test_onnx_ptq.sh

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,8 @@ for model_path in "${model_paths[@]}"; do
129129
--onnx_path=$model_dir/fp16/model.onnx \
130130
--quantize_mode=$quant_mode \
131131
--calibration_data=$calib_data_path \
132-
--output_path=$model_dir/$quant_mode/model.quant.onnx &
132+
--output_path=$model_dir/$quant_mode/model.quant.onnx \
133+
--calibration_eps=cuda:0 &
133134
pids+=($!)
134135
done
135136

@@ -161,22 +162,29 @@ for model_path in "${model_paths[@]}"; do
161162
quant_mode="${all_modes[$i]}"
162163
gpu_id=$((i % nvidia_gpu_count))
163164

164-
if [ "$quant_mode" == "fp16" ] || [ "$quant_mode" == "int8_iq" ]; then
165+
if [ "$quant_mode" == "fp16" ]; then
165166
eval_model_path=$model_dir/fp16/model.onnx
167+
engine_path=$model_dir/fp16/model.engine
168+
elif [ "$quant_mode" == "int8_iq" ]; then
169+
eval_model_path=$model_dir/fp16/model.onnx
170+
engine_path=$model_dir/int8_iq/model.engine
166171
else
167172
eval_model_path=$model_dir/$quant_mode/model.quant.onnx
173+
engine_path=$model_dir/$quant_mode/model.quant.engine
168174
fi
169175

170176
echo "Starting evaluation of $model_name for mode: $quant_mode on GPU $gpu_id"
171177
if [[ " ${latency_models[@]} " =~ " $model_name " ]]; then
172178
CUDA_VISIBLE_DEVICES=$gpu_id python evaluate.py \
173179
--onnx_path=$eval_model_path \
180+
--engine_path=$engine_path \
174181
--model_name="${timm_model_name[$model_name]}" \
175182
--quantize_mode=$quant_mode \
176183
--results_path=$model_dir/$quant_mode/${model_name}_${quant_mode}.csv &
177184
else
178185
CUDA_VISIBLE_DEVICES=$gpu_id python evaluate.py \
179186
--onnx_path=$eval_model_path \
187+
--engine_path=$engine_path \
180188
--imagenet_path=$imagenet_path \
181189
--eval_data_size=$calib_size \
182190
--batch_size $batch_size \

0 commit comments

Comments (0)