diff --git a/examples/onnx_ptq/evaluate.py b/examples/onnx_ptq/evaluate.py
index cc5271515..3c2919fe0 100644
--- a/examples/onnx_ptq/evaluate.py
+++ b/examples/onnx_ptq/evaluate.py
@@ -38,9 +38,15 @@ def main():
     parser.add_argument(
         "--engine_path",
         type=str,
-        required=True,
+        default=None,
         help="Path to the TensorRT engine",
     )
+    parser.add_argument(
+        "--timing_cache_path",
+        type=str,
+        default=None,
+        help="Path to the TensorRT timing cache",
+    )
     parser.add_argument(
         "--imagenet_path", type=str, default=None, help="Path to the imagenet dataset"
     )
@@ -81,6 +87,7 @@ def main():
     # Compile the ONNX model to TRT engine and create the device model
     compilation_args = {
         "engine_path": args.engine_path,
+        "timing_cache_path": args.timing_cache_path,
     }
     compiled_model = client.ir_to_compiled(onnx_bytes, compilation_args)
     device_model = DeviceModel(client, compiled_model, metadata={})
diff --git a/modelopt/torch/_deploy/_runtime/tensorrt/engine_builder.py b/modelopt/torch/_deploy/_runtime/tensorrt/engine_builder.py
index cb9f67ae9..055a1f26b 100644
--- a/modelopt/torch/_deploy/_runtime/tensorrt/engine_builder.py
+++ b/modelopt/torch/_deploy/_runtime/tensorrt/engine_builder.py
@@ -123,6 +123,7 @@ def build_engine(
     onnx_bytes: OnnxBytes,
     trt_mode: str = TRTMode.FLOAT32,
     engine_path: Path | None = None,
+    timing_cache_path: Path | None = None,
     calib_cache: str | None = None,
     dynamic_shapes: dict | None = None,
     plugin_config: dict | None = None,
@@ -135,6 +136,7 @@ def build_engine(
     Args:
         onnx_bytes: Data of the ONNX model stored as an OnnxBytes object.
         engine_path: Path to save the TensorRT engine.
+        timing_cache_path: Path to save/load the TensorRT timing cache.
        trt_mode: The precision with which the TensorRT engine will be built. Supported modes are:
            - TRTMode.FLOAT32
            - TRTMode.FLOAT16
@@ -205,6 +207,7 @@ def _build_command(
    def _setup_files_and_paths(
        tmp_dir_path: Path,
        engine_path: Path | None,
+       timing_cache_path: Path | None,
    ) -> tuple[Path, Path, Path | None, Path | None, Path]:
        tmp_onnx_dir = tmp_dir_path / "onnx"
        onnx_bytes.write_to_disk(str(tmp_onnx_dir))
@@ -219,13 +222,15 @@ def _setup_files_and_paths(
        )
        engine_path.parent.mkdir(parents=True, exist_ok=True)
        calib_cache_path = final_output_dir / "calib_cache" if calib_cache else None
-       timing_cache_path = final_output_dir / "timing.cache"
+       timing_cache_path = (
+           Path(timing_cache_path) if timing_cache_path else final_output_dir / "timing.cache"
+       )
        return onnx_path, engine_path, calib_cache_path, timing_cache_path, final_output_dir

    with TemporaryDirectory() as tmp_dir:
        onnx_path, engine_path, calib_cache_path, timing_cache_path, final_output_dir = (
-           _setup_files_and_paths(Path(tmp_dir), engine_path)
+           _setup_files_and_paths(Path(tmp_dir), engine_path, timing_cache_path)
        )
        cmd = _build_command(onnx_path, engine_path, calib_cache_path, timing_cache_path)
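Context for the new timing_cache_path plumbing: TensorRT can serialize its kernel-timing (tactic) cache and reload it on later builds, so warm builds skip most autotuning. build_engine above forwards the path into the build command it assembles via _build_command; the standalone sketch below shows the same idea directly through the TensorRT Python API. It is illustrative only — build_with_timing_cache and the hard-coded paths are hypothetical and not part of this PR.

# Hypothetical standalone sketch (not part of this PR): load a persisted
# TensorRT timing cache before a build and save it back afterwards.
import os

import tensorrt as trt


def build_with_timing_cache(onnx_path: str, timing_cache_path: str) -> bytes:
    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    )
    parser = trt.OnnxParser(network, logger)
    with open(onnx_path, "rb") as f:
        if not parser.parse(f.read()):
            raise RuntimeError("failed to parse ONNX model")

    config = builder.create_builder_config()
    # Warm-start tactic timings from a previous build, if a cache exists.
    cache_bytes = b""
    if os.path.exists(timing_cache_path):
        with open(timing_cache_path, "rb") as f:
            cache_bytes = f.read()
    timing_cache = config.create_timing_cache(cache_bytes)
    config.set_timing_cache(timing_cache, ignore_mismatch=False)

    engine = builder.build_serialized_network(network, config)
    if engine is None:
        raise RuntimeError("engine build failed")

    # Persist the (possibly updated) cache so the next build skips re-profiling.
    os.makedirs(os.path.dirname(timing_cache_path) or ".", exist_ok=True)
    with open(timing_cache_path, "wb") as f:
        f.write(bytes(timing_cache.serialize()))
    return bytes(engine)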
diff --git a/modelopt/torch/_deploy/_runtime/trt_client.py b/modelopt/torch/_deploy/_runtime/trt_client.py
index 8d9185eec..a9c300eca 100644
--- a/modelopt/torch/_deploy/_runtime/trt_client.py
+++ b/modelopt/torch/_deploy/_runtime/trt_client.py
@@ -74,7 +74,8 @@ def _ir_to_compiled(
        Args:
            ir_bytes: The ONNX model bytes.
            compilation_args: A dictionary of compilation arguments.
-               The following arguments are supported: dynamic_shapes, plugin_config, engine_path.
+               The following arguments are supported: dynamic_shapes, plugin_config,
+               engine_path, timing_cache_path.

        Returns:
            The compiled TRT engine bytes.
@@ -87,6 +88,7 @@ def _ir_to_compiled(
            dynamic_shapes=compilation_args.get("dynamic_shapes"),  # type: ignore[union-attr]
            plugin_config=compilation_args.get("plugin_config"),  # type: ignore[union-attr]
            engine_path=compilation_args.get("engine_path"),  # type: ignore[union-attr]
+           timing_cache_path=compilation_args.get("timing_cache_path"),  # type: ignore[union-attr]
            trt_mode=self.deployment["precision"],
            verbose=(self.deployment.get("verbose", "false").lower() == "true"),
        )
diff --git a/tests/_test_utils/onnx_quantization/lib_test_models.py b/tests/_test_utils/onnx_quantization/lib_test_models.py
index 833c27c80..a19561744 100644
--- a/tests/_test_utils/onnx_quantization/lib_test_models.py
+++ b/tests/_test_utils/onnx_quantization/lib_test_models.py
@@ -124,6 +124,7 @@ def export_as_onnx(
        output_names=output_names,
        opset_version=opset,
        do_constant_folding=do_constant_folding,
+       dynamo=False,
    )
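Context for the dynamo=False pin in export_as_onnx: newer PyTorch releases route torch.onnx.export through the dynamo-based exporter by default, which can produce a different graph than the legacy TorchScript path and destabilize tests. A minimal, self-contained sketch of the pinned export; the toy model, file name, and opset below are illustrative, not taken from this PR.

import torch

model = torch.nn.Sequential(torch.nn.Conv2d(3, 8, 3), torch.nn.ReLU()).eval()
dummy_input = torch.randn(1, 3, 32, 32)

torch.onnx.export(
    model,
    (dummy_input,),
    "toy_model.onnx",
    input_names=["input"],
    output_names=["output"],
    opset_version=17,
    do_constant_folding=True,
    dynamo=False,  # keep the legacy TorchScript-based exporter for stable graphs
)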
diff --git a/tests/examples/test_onnx_ptq.sh b/tests/examples/test_onnx_ptq.sh
index 9b4647a60..b85be0d40 100755
--- a/tests/examples/test_onnx_ptq.sh
+++ b/tests/examples/test_onnx_ptq.sh
@@ -21,7 +21,7 @@
 # It is recommended to execute this script inside the Model Optimization Toolkit TensorRT Docker container.
 # Please ensure that the ImageNet dataset is available in the container at the specified path.

-# Usage: ./test_onnx_ptq.sh [--no-clean] [/path/to/imagenet] [/path/to/models]
+# Usage: ./test_onnx_ptq.sh [--no-clean] [--eval] [/path/to/imagenet] [/path/to/models] [/path/to/timing_cache]

 set -exo pipefail
@@ -37,8 +37,10 @@ pushd $public_example_dir

 # Parse arguments
 clean_mode=true
+eval_mode=false
 imagenet_path=""
 models_folder=""
+timing_cache_path=""

 for arg in "$@"; do
     case $arg in
@@ -46,11 +48,17 @@ for arg in "$@"; do
            clean_mode=false
            shift
            ;;
+       --eval)
+           eval_mode=true
+           shift
+           ;;
        *)
            if [ -z "$imagenet_path" ]; then
                imagenet_path="$arg"
            elif [ -z "$models_folder" ]; then
                models_folder="$arg"
+           elif [ -z "$timing_cache_path" ]; then
+               timing_cache_path="$arg"
            fi
            shift
            ;;
@@ -63,7 +71,9 @@ export TQDM_DISABLE=1

 # Setting image and model paths (contains 8 models)
 imagenet_path=${imagenet_path:-/data/imagenet/}
 models_folder=${models_folder:-/models/onnx}
-calib_size=64
+timing_cache_path=${timing_cache_path:-/models/onnx/build/timing.cache}
+calib_size=1
+eval_size=100
 batch_size=1
@@ -137,117 +147,84 @@ for model_path in "${model_paths[@]}"; do
     model_name=$(basename "$model_path" .onnx)
     model_dir=build/$model_name
-
-    echo "Quantizing model $model_name for all quantization modes in parallel"
-    pids=()
-    for i in "${!quant_modes[@]}"; do
-        quant_mode="${quant_modes[$i]}"
-        gpu_id=$((i % nvidia_gpu_count))
+    echo "Quantizing model $model_name for all quantization modes"
+    for quant_mode in "${quant_modes[@]}"; do
         if [ "$quant_mode" == "int8_iq" ]; then
             continue
         fi

-        echo "Starting quantization of $model_name for mode: $quant_mode on GPU $gpu_id"
-        CUDA_VISIBLE_DEVICES=$gpu_id python -m modelopt.onnx.quantization \
+        echo "Starting quantization of $model_name for mode: $quant_mode"
+        python -m modelopt.onnx.quantization \
             --onnx_path=$model_dir/fp16/model.onnx \
             --quantize_mode=$quant_mode \
             --calibration_data=$calib_data_path \
             --output_path=$model_dir/$quant_mode/model.quant.onnx \
-            --calibration_eps=cuda:0 &
-        pids+=($!)
-    done
-
-    # Wait for all quantization processes to complete for this model
-    error_occurred=false
-    for pid in "${pids[@]}"; do
-        if ! wait $pid; then
-            echo "ERROR: Quantization process (PID: $pid) failed"
-            error_occurred=true
-        fi
+            --calibration_eps=cuda
     done

-    if [ "$error_occurred" = true ]; then
-        echo "Stopping execution due to quantization failure for model: $model_name"
-        exit 1
-    fi
     echo "Completed quantization of all modes for model: $model_name"
 done

 # Evaluate the quantized models for each mode
-for model_path in "${model_paths[@]}"; do
-    model_name=$(basename "$model_path" .onnx)
-    model_dir=build/$model_name
-
-    echo "Evaluating model $model_name for all quantization modes in parallel"
-    pids=()
-    for i in "${!all_modes[@]}"; do
-        quant_mode="${all_modes[$i]}"
-        gpu_id=$((i % nvidia_gpu_count))
-
-        if [ "$quant_mode" == "fp16" ]; then
-            eval_model_path=$model_dir/fp16/model.onnx
-            engine_path=$model_dir/fp16/model.engine
-            precision="fp16"
-        elif [ "$quant_mode" == "int8_iq" ]; then
-            eval_model_path=$model_dir/fp16/model.onnx
-            engine_path=$model_dir/int8_iq/model.engine
-            precision="best"
-        else
-            eval_model_path=$model_dir/$quant_mode/model.quant.onnx
-            engine_path=$model_dir/$quant_mode/model.quant.engine
-            precision="stronglyTyped"
-        fi
+if [ "$eval_mode" = true ]; then
+    for model_path in "${model_paths[@]}"; do
+        model_name=$(basename "$model_path" .onnx)
+        model_dir=build/$model_name
+
+        echo "Evaluating model $model_name for all quantization modes"
+        for quant_mode in "${all_modes[@]}"; do
+            if [ "$quant_mode" == "fp16" ]; then
+                eval_model_path=$model_dir/fp16/model.onnx
+                engine_path=$model_dir/fp16/model.engine
+                precision="fp16"
+            elif [ "$quant_mode" == "int8_iq" ]; then
+                eval_model_path=$model_dir/fp16/model.onnx
+                engine_path=$model_dir/int8_iq/model.engine
+                precision="best"
+            else
+                eval_model_path=$model_dir/$quant_mode/model.quant.onnx
+                engine_path=$model_dir/$quant_mode/model.quant.engine
+                precision="stronglyTyped"
+            fi

-        echo "Starting evaluation of $model_name for mode: $quant_mode on GPU $gpu_id"
-        if [[ " ${latency_models[@]} " =~ " $model_name " ]]; then
-            CUDA_VISIBLE_DEVICES=$gpu_id python evaluate.py \
-                --onnx_path=$eval_model_path \
-                --engine_path=$engine_path \
-                --model_name="${timm_model_name[$model_name]}" \
-                --engine_precision=$precision \
-                --results_path=$model_dir/$quant_mode/${model_name}_${quant_mode}.csv &
-        else
-            CUDA_VISIBLE_DEVICES=$gpu_id python evaluate.py \
-                --onnx_path=$eval_model_path \
-                --engine_path=$engine_path \
-                --imagenet_path=$imagenet_path \
-                --eval_data_size=$calib_size \
-                --batch_size $batch_size \
-                --model_name="${timm_model_name[$model_name]}" \
-                --engine_precision=$precision \
-                --results_path=$model_dir/$quant_mode/${model_name}_${quant_mode}.csv &
-        fi
-        pids+=($!)
-    done
+            echo "Starting evaluation of $model_name for mode: $quant_mode"
+            if [[ " ${latency_models[@]} " =~ " $model_name " ]]; then
+                python evaluate.py \
+                    --onnx_path=$eval_model_path \
+                    --engine_path=$engine_path \
+                    --model_name="${timm_model_name[$model_name]}" \
+                    --engine_precision=$precision \
+                    --results_path=$model_dir/$quant_mode/${model_name}_${quant_mode}.csv \
+                    --timing_cache_path=$timing_cache_path
+            else
+                python evaluate.py \
+                    --onnx_path=$eval_model_path \
+                    --engine_path=$engine_path \
+                    --imagenet_path=$imagenet_path \
+                    --eval_data_size=$eval_size \
+                    --batch_size $batch_size \
+                    --model_name="${timm_model_name[$model_name]}" \
+                    --engine_precision=$precision \
+                    --results_path=$model_dir/$quant_mode/${model_name}_${quant_mode}.csv \
+                    --timing_cache_path=$timing_cache_path
+            fi
+        done

-    # Wait for all evaluation processes to complete for this model
-    error_occurred=false
-    for pid in "${pids[@]}"; do
-        if ! wait $pid; then
-            echo "ERROR: Evaluation process (PID: $pid) failed"
-            error_occurred=true
-        fi
+        echo "Completed evaluation of all modes for model: $model_name"
     done

-    if [ "$error_occurred" = true ]; then
-        echo "Stopping execution due to evaluation failure for model: $model_name"
-        exit 1
-    fi
-
-    echo "Completed evaluation of all modes for model: $model_name"
-done
-python $test_utils_dir/aggregate_results.py --results_dir=build
+    python $test_utils_dir/aggregate_results.py --results_dir=build
+fi

 if [ "$clean_mode" = true ]; then
     echo "Cleaning build artifacts..."
     rm -rf build/
     echo "Build artifacts cleaned successfully."
-    popd
-    exit 0
 fi

 popd
-echo "Total wall time: $(($(date +%s) - start_time)) seconds"
+total_seconds=$(($(date +%s) - start_time))
+printf "Total wall time: %02d:%02d:%02d\n" $((total_seconds/3600)) $(((total_seconds%3600)/60)) $((total_seconds%60))
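The net effect of threading one shared timing cache through every engine build in the test script is that only the first build per configuration pays the full kernel-profiling cost; later builds reuse the recorded tactics. A toy cold-vs-warm demonstration, reusing the hypothetical build_with_timing_cache sketch from earlier ("model.onnx" is assumed to exist):

import time

for label in ("cold", "warm"):
    start = time.perf_counter()
    build_with_timing_cache("model.onnx", "build/timing.cache")
    print(f"{label} build: {time.perf_counter() - start:.1f}s")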