 # It is recommended to execute this script inside the Model Optimization Toolkit TensorRT Docker container.
 # Please ensure that the ImageNet dataset is available in the container at the specified path.
 
-# Usage: ./test_onnx_ptq.sh [--no-clean] [/path/to/imagenet] [/path/to/models]
+# Usage: ./test_onnx_ptq.sh [--no-clean] [--eval] [/path/to/imagenet] [/path/to/models] [/path/to/timing_cache]
 
 set -exo pipefail
 
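For reference, a hypothetical invocation that exercises every option added in this change; the paths below are illustrative placeholders, not values taken from the diff:

    # Keep build artifacts, run evaluation, and reuse a TensorRT timing cache
    ./test_onnx_ptq.sh --no-clean --eval /data/imagenet /models/onnx /tmp/timing.cache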
@@ -37,20 +37,28 @@ pushd $public_example_dir
 
 # Parse arguments
 clean_mode=true
+eval_mode=false
 imagenet_path=""
 models_folder=""
+timing_cache_path=""
 
 for arg in "$@"; do
     case $arg in
         --no-clean)
             clean_mode=false
             shift
             ;;
+        --eval)
+            eval_mode=true
+            shift
+            ;;
         *)
             if [ -z "$imagenet_path" ]; then
                 imagenet_path="$arg"
             elif [ -z "$models_folder" ]; then
                 models_folder="$arg"
+            elif [ -z "$timing_cache_path" ]; then
+                timing_cache_path="$arg"
             fi
             shift
             ;;
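Because the flags are matched by name in the case statement and everything else falls through to *), flag position is irrelevant, while the bare arguments are bound strictly in order: imagenet path, then models folder, then timing cache path. A hypothetical pair of equivalent invocations (example paths):

    ./test_onnx_ptq.sh --eval /data/imagenet /models/onnx
    ./test_onnx_ptq.sh /data/imagenet --eval /models/onnx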
@@ -63,7 +71,9 @@ export TQDM_DISABLE=1
 # Setting image and model paths (contains 8 models)
 imagenet_path=${imagenet_path:-/data/imagenet/}
 models_folder=${models_folder:-/models/onnx}
-calib_size=64
+timing_cache_path=${timing_cache_path:-/models/onnx/build/timing.cache}
+calib_size=1
+eval_size=100
 batch_size=1
 
 
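These fallbacks rely on Bash's ${var:-default} expansion, which substitutes the default when the variable is unset or empty; because the parser above initializes each path variable to an empty string, any omitted positional argument picks up its default here. A minimal sketch of the mechanism:

    # ${var:-default} substitutes when var is unset *or* empty
    timing_cache_path=""
    timing_cache_path=${timing_cache_path:-/models/onnx/build/timing.cache}
    echo "$timing_cache_path"   # -> /models/onnx/build/timing.cache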
@@ -137,117 +147,84 @@ for model_path in "${model_paths[@]}"; do
     model_name=$(basename "$model_path" .onnx)
     model_dir=build/$model_name
 
-
-    echo "Quantizing model $model_name for all quantization modes in parallel"
-    pids=()
-    for i in "${!quant_modes[@]}"; do
-        quant_mode="${quant_modes[$i]}"
-        gpu_id=$((i % nvidia_gpu_count))
+    echo "Quantizing model $model_name for all quantization modes"
+    for quant_mode in "${quant_modes[@]}"; do
         if [ "$quant_mode" == "int8_iq" ]; then
             continue
         fi
 
-        echo "Starting quantization of $model_name for mode: $quant_mode on GPU $gpu_id"
-        CUDA_VISIBLE_DEVICES=$gpu_id python -m modelopt.onnx.quantization \
+        echo "Starting quantization of $model_name for mode: $quant_mode"
+        python -m modelopt.onnx.quantization \
             --onnx_path=$model_dir/fp16/model.onnx \
             --quantize_mode=$quant_mode \
             --calibration_data=$calib_data_path \
             --output_path=$model_dir/$quant_mode/model.quant.onnx \
-            --calibration_eps=cuda:0 &
-        pids+=($!)
-    done
-
-    # Wait for all quantization processes to complete for this model
-    error_occurred=false
-    for pid in "${pids[@]}"; do
-        if ! wait $pid; then
-            echo "ERROR: Quantization process (PID: $pid) failed"
-            error_occurred=true
-        fi
+            --calibration_eps=cuda
     done
-    if [ "$error_occurred" = true ]; then
-        echo "Stopping execution due to quantization failure for model: $model_name"
-        exit 1
-    fi
 
     echo "Completed quantization of all modes for model: $model_name"
 done
 
 
 # Evaluate the quantized models for each mode
-for model_path in "${model_paths[@]}"; do
-    model_name=$(basename "$model_path" .onnx)
-    model_dir=build/$model_name
-
-    echo "Evaluating model $model_name for all quantization modes in parallel"
-    pids=()
-    for i in "${!all_modes[@]}"; do
-        quant_mode="${all_modes[$i]}"
-        gpu_id=$((i % nvidia_gpu_count))
-
-        if [ "$quant_mode" == "fp16" ]; then
-            eval_model_path=$model_dir/fp16/model.onnx
-            engine_path=$model_dir/fp16/model.engine
-            precision="fp16"
-        elif [ "$quant_mode" == "int8_iq" ]; then
-            eval_model_path=$model_dir/fp16/model.onnx
-            engine_path=$model_dir/int8_iq/model.engine
-            precision="best"
-        else
-            eval_model_path=$model_dir/$quant_mode/model.quant.onnx
-            engine_path=$model_dir/$quant_mode/model.quant.engine
-            precision="stronglyTyped"
-        fi
+if [ "$eval_mode" = true ]; then
+    for model_path in "${model_paths[@]}"; do
+        model_name=$(basename "$model_path" .onnx)
+        model_dir=build/$model_name
+
+        echo "Evaluating model $model_name for all quantization modes"
+        for quant_mode in "${all_modes[@]}"; do
+            if [ "$quant_mode" == "fp16" ]; then
+                eval_model_path=$model_dir/fp16/model.onnx
+                engine_path=$model_dir/fp16/model.engine
+                precision="fp16"
+            elif [ "$quant_mode" == "int8_iq" ]; then
+                eval_model_path=$model_dir/fp16/model.onnx
+                engine_path=$model_dir/int8_iq/model.engine
+                precision="best"
+            else
+                eval_model_path=$model_dir/$quant_mode/model.quant.onnx
+                engine_path=$model_dir/$quant_mode/model.quant.engine
+                precision="stronglyTyped"
+            fi
 
-        echo "Starting evaluation of $model_name for mode: $quant_mode on GPU $gpu_id"
-        if [[ "${latency_models[@]}" =~ "$model_name" ]]; then
-            CUDA_VISIBLE_DEVICES=$gpu_id python evaluate.py \
-                --onnx_path=$eval_model_path \
-                --engine_path=$engine_path \
-                --model_name="${timm_model_name[$model_name]}" \
-                --engine_precision=$precision \
-                --results_path=$model_dir/$quant_mode/${model_name}_${quant_mode}.csv &
-        else
-            CUDA_VISIBLE_DEVICES=$gpu_id python evaluate.py \
-                --onnx_path=$eval_model_path \
-                --engine_path=$engine_path \
-                --imagenet_path=$imagenet_path \
-                --eval_data_size=$calib_size \
-                --batch_size $batch_size \
-                --model_name="${timm_model_name[$model_name]}" \
-                --engine_precision=$precision \
-                --results_path=$model_dir/$quant_mode/${model_name}_${quant_mode}.csv &
-        fi
-        pids+=($!)
-    done
+            echo "Starting evaluation of $model_name for mode: $quant_mode"
+            if [[ "${latency_models[@]}" =~ "$model_name" ]]; then
+                python evaluate.py \
+                    --onnx_path=$eval_model_path \
+                    --engine_path=$engine_path \
+                    --model_name="${timm_model_name[$model_name]}" \
+                    --engine_precision=$precision \
+                    --results_path=$model_dir/$quant_mode/${model_name}_${quant_mode}.csv \
+                    --timing_cache_path=$timing_cache_path
+            else
+                python evaluate.py \
+                    --onnx_path=$eval_model_path \
+                    --engine_path=$engine_path \
+                    --imagenet_path=$imagenet_path \
+                    --eval_data_size=$eval_size \
+                    --batch_size $batch_size \
+                    --model_name="${timm_model_name[$model_name]}" \
+                    --engine_precision=$precision \
+                    --results_path=$model_dir/$quant_mode/${model_name}_${quant_mode}.csv \
+                    --timing_cache_path=$timing_cache_path
+            fi
+        done
 
-    # Wait for all evaluation processes to complete for this model
-    error_occurred=false
-    for pid in "${pids[@]}"; do
-        if ! wait $pid; then
-            echo "ERROR: Evaluation process (PID: $pid) failed"
-            error_occurred=true
-        fi
+        echo "Completed evaluation of all modes for model: $model_name"
     done
-    if [ "$error_occurred" = true ]; then
-        echo "Stopping execution due to evaluation failure for model: $model_name"
-        exit 1
-    fi
-
-    echo "Completed evaluation of all modes for model: $model_name"
-done
 
-python $test_utils_dir/aggregate_results.py --results_dir=build
+    python $test_utils_dir/aggregate_results.py --results_dir=build
+fi
 
 if [ "$clean_mode" = true ]; then
     echo "Cleaning build artifacts..."
     rm -rf build/
     echo "Build artifacts cleaned successfully."
-    popd
-    exit 0
 fi
 
 popd
 
 
-echo "Total wall time: $(($(date +%s) - start_time)) seconds"
+total_seconds=$(($(date +%s) - start_time))
+printf "Total wall time: %02d:%02d:%02d\n" $((total_seconds / 3600)) $(((total_seconds % 3600) / 60)) $((total_seconds % 60))
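The arithmetic above splits the elapsed seconds into hours, minutes, and seconds, with %02d zero-padding each field. A quick check against a made-up value:

    total_seconds=3725   # hypothetical elapsed time: 1 h 2 min 5 s
    printf "Total wall time: %02d:%02d:%02d\n" $((total_seconds / 3600)) $(((total_seconds % 3600) / 60)) $((total_seconds % 60))
    # -> Total wall time: 01:02:05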