Add timing cache to the evaluate API

ajrasane · ajrasane · commit 67305e414f94 · 2025-10-08T19:42:45.000Z
Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com>
diff --git a/examples/onnx_ptq/evaluate.py b/examples/onnx_ptq/evaluate.py
@@ -38,9 +38,15 @@ def main():
     parser.add_argument(
         "--engine_path",
         type=str,
-        required=True,
+        default=None,
         help="Path to the TensorRT engine",
     )
+    parser.add_argument(
+        "--timing_cache_path",
+        type=str,
+        default=None,
+        help="Path to the TensorRT timing cache",
+    )
     parser.add_argument(
         "--imagenet_path", type=str, default=None, help="Path to the imagenet dataset"
     )
@@ -81,6 +87,7 @@ def main():
     # Compile the ONNX model to TRT engine and create the device model
     compilation_args = {
         "engine_path": args.engine_path,
+        "timing_cache_path": args.timing_cache_path,
     }
     compiled_model = client.ir_to_compiled(onnx_bytes, compilation_args)
     device_model = DeviceModel(client, compiled_model, metadata={})
diff --git a/modelopt/torch/_deploy/_runtime/tensorrt/engine_builder.py b/modelopt/torch/_deploy/_runtime/tensorrt/engine_builder.py
@@ -123,6 +123,7 @@ def build_engine(
     onnx_bytes: OnnxBytes,
     trt_mode: str = TRTMode.FLOAT32,
     engine_path: Path | None = None,
+    timing_cache_path: Path | None = None,
     calib_cache: str | None = None,
     dynamic_shapes: dict | None = None,
     plugin_config: dict | None = None,
@@ -135,6 +136,7 @@ def build_engine(
     Args:
         onnx_bytes: Data of the ONNX model stored as an OnnxBytes object.
         engine_path: Path to save the TensorRT engine.
+        timing_cache_path: Path to save/load the TensorRT timing cache.
         trt_mode: The precision with which the TensorRT engine will be built. Supported modes are:
             - TRTMode.FLOAT32
             - TRTMode.FLOAT16
@@ -205,6 +207,7 @@ def _build_command(
     def _setup_files_and_paths(
         tmp_dir_path: Path,
         engine_path: Path | None,
+        timing_cache_path: Path | None,
     ) -> tuple[Path, Path, Path | None, Path | None, Path]:
         tmp_onnx_dir = tmp_dir_path / "onnx"
         onnx_bytes.write_to_disk(str(tmp_onnx_dir))
@@ -219,13 +222,15 @@ def _setup_files_and_paths(
         )
         engine_path.parent.mkdir(parents=True, exist_ok=True)
         calib_cache_path = final_output_dir / "calib_cache" if calib_cache else None
-        timing_cache_path = final_output_dir / "timing.cache"
+        timing_cache_path = (
+            Path(timing_cache_path) if timing_cache_path else final_output_dir / "timing.cache"
+        )
 
         return onnx_path, engine_path, calib_cache_path, timing_cache_path, final_output_dir
 
     with TemporaryDirectory() as tmp_dir:
         onnx_path, engine_path, calib_cache_path, timing_cache_path, final_output_dir = (
-            _setup_files_and_paths(Path(tmp_dir), engine_path)
+            _setup_files_and_paths(Path(tmp_dir), engine_path, timing_cache_path)
         )
         cmd = _build_command(onnx_path, engine_path, calib_cache_path, timing_cache_path)
 
diff --git a/modelopt/torch/_deploy/_runtime/trt_client.py b/modelopt/torch/_deploy/_runtime/trt_client.py
@@ -74,7 +74,7 @@ def _ir_to_compiled(
         Args:
             ir_bytes: The ONNX model bytes.
             compilation_args: A dictionary of compilation arguments.
-                The following arguments are supported: dynamic_shapes, plugin_config, engine_path.
+                The following arguments are supported: dynamic_shapes, plugin_config, engine_path, timing_cache_path.
 
         Returns:
             The compiled TRT engine bytes.
@@ -87,6 +87,7 @@ def _ir_to_compiled(
             dynamic_shapes=compilation_args.get("dynamic_shapes"),  # type: ignore[union-attr]
             plugin_config=compilation_args.get("plugin_config"),  # type: ignore[union-attr]
             engine_path=compilation_args.get("engine_path"),  # type: ignore[union-attr]
+            timing_cache_path=compilation_args.get("timing_cache_path"),  # type: ignore[union-attr]
             trt_mode=self.deployment["precision"],
             verbose=(self.deployment.get("verbose", "false").lower() == "true"),
         )
diff --git a/tests/examples/test_onnx_ptq.sh b/tests/examples/test_onnx_ptq.sh
@@ -149,7 +149,7 @@ for model_path in "${model_paths[@]}"; do
             --quantize_mode=$quant_mode \
             --calibration_data=$calib_data_path \
             --output_path=$model_dir/$quant_mode/model.quant.onnx \
-            --calibration_eps=cuda:0
+            --calibration_eps=cuda
     done
 
     echo "Completed quantization of all modes for model: $model_name"
@@ -184,7 +184,8 @@ for model_path in "${model_paths[@]}"; do
                 --engine_path=$engine_path \
                 --model_name="${timm_model_name[$model_name]}" \
                 --engine_precision=$precision \
-                --results_path=$model_dir/$quant_mode/${model_name}_${quant_mode}.csv
+                --results_path=$model_dir/$quant_mode/${model_name}_${quant_mode}.csv \
+                --timing_cache_path=build/timing.cache
         else
             python evaluate.py \
                 --onnx_path=$eval_model_path \
@@ -194,7 +195,8 @@ for model_path in "${model_paths[@]}"; do
                 --batch_size $batch_size \
                 --model_name="${timm_model_name[$model_name]}" \
                 --engine_precision=$precision \
-                --results_path=$model_dir/$quant_mode/${model_name}_${quant_mode}.csv
+                --results_path=$model_dir/$quant_mode/${model_name}_${quant_mode}.csv \
+                --timing_cache_path=build/timing.cache
         fi
     done
 
@@ -214,4 +216,5 @@ fi
 popd
 
 
-echo "Total wall time: $(($(date +%s) - start_time)) seconds"
+total_seconds=$(($(date +%s) - start_time))
+printf "Total wall time: %02d:%02d:%02d\n" $((total_seconds/3600)) $(((total_seconds%3600)/60)) $((total_seconds%60))