Update ONNX PTQ test to be single-threaded and make it faster #415
@@ -21,7 +21,7 @@
 # It is recommended to execute this script inside the Model Optimization Toolkit TensorRT Docker container.
 # Please ensure that the ImageNet dataset is available in the container at the specified path.

-# Usage: ./test_onnx_ptq.sh [--no-clean] [/path/to/imagenet] [/path/to/models]
+# Usage: ./test_onnx_ptq.sh [--no-clean] [--eval] [/path/to/imagenet] [/path/to/models]

 set -exo pipefail
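For context, the new flag makes evaluation opt-in. A hypothetical invocation of the updated script (the paths are placeholders, not part of the change):

```bash
# Default run: quantize only, clean up build artifacts afterwards
./test_onnx_ptq.sh /data/imagenet /models/onnx

# Quantize, then build engines and evaluate accuracy/latency, keeping artifacts
./test_onnx_ptq.sh --no-clean --eval /data/imagenet /models/onnx
```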
@@ -37,6 +37,7 @@ pushd $public_example_dir

 # Parse arguments
 clean_mode=true
+eval_mode=false
 imagenet_path=""
 models_folder=""
@@ -46,6 +47,10 @@ for arg in "$@"; do
             clean_mode=false
             shift
             ;;
+        --eval)
+            eval_mode=true
+            shift
+            ;;
         *)
             if [ -z "$imagenet_path" ]; then
                 imagenet_path="$arg"
@@ -63,7 +68,8 @@ export TQDM_DISABLE=1
 # Setting image and model paths (contains 8 models)
 imagenet_path=${imagenet_path:-/data/imagenet/}
 models_folder=${models_folder:-/models/onnx}
-calib_size=64
+calib_size=1
+eval_size=100
 batch_size=1
@@ -137,117 +143,84 @@ for model_path in "${model_paths[@]}"; do
     model_name=$(basename "$model_path" .onnx)
     model_dir=build/$model_name

-    echo "Quantizing model $model_name for all quantization modes in parallel"
-    pids=()
-    for i in "${!quant_modes[@]}"; do
-        quant_mode="${quant_modes[$i]}"
-        gpu_id=$((i % nvidia_gpu_count))
+    echo "Quantizing model $model_name for all quantization modes"
+    for quant_mode in "${quant_modes[@]}"; do
         if [ "$quant_mode" == "int8_iq" ]; then
             continue
         fi

-        echo "Starting quantization of $model_name for mode: $quant_mode on GPU $gpu_id"
-        CUDA_VISIBLE_DEVICES=$gpu_id python -m modelopt.onnx.quantization \
+        echo "Starting quantization of $model_name for mode: $quant_mode"
+        python -m modelopt.onnx.quantization \
             --onnx_path=$model_dir/fp16/model.onnx \
             --quantize_mode=$quant_mode \
             --calibration_data=$calib_data_path \
             --output_path=$model_dir/$quant_mode/model.quant.onnx \
-            --calibration_eps=cuda:0 &
-        pids+=($!)
-    done
-
-    # Wait for all quantization processes to complete for this model
-    error_occurred=false
-    for pid in "${pids[@]}"; do
-        if ! wait $pid; then
-            echo "ERROR: Quantization process (PID: $pid) failed"
-            error_occurred=true
-        fi
+            --calibration_eps=cuda
     done
-    if [ "$error_occurred" = true ]; then
-        echo "Stopping execution due to quantization failure for model: $model_name"
-        exit 1
-    fi

     echo "Completed quantization of all modes for model: $model_name"
 done


 # Evaluate the quantized models for each mode
-for model_path in "${model_paths[@]}"; do
-    model_name=$(basename "$model_path" .onnx)
-    model_dir=build/$model_name
-
-    echo "Evaluating model $model_name for all quantization modes in parallel"
-    pids=()
-    for i in "${!all_modes[@]}"; do
-        quant_mode="${all_modes[$i]}"
-        gpu_id=$((i % nvidia_gpu_count))
-
-        if [ "$quant_mode" == "fp16" ]; then
-            eval_model_path=$model_dir/fp16/model.onnx
-            engine_path=$model_dir/fp16/model.engine
-            precision="fp16"
-        elif [ "$quant_mode" == "int8_iq" ]; then
-            eval_model_path=$model_dir/fp16/model.onnx
-            engine_path=$model_dir/int8_iq/model.engine
-            precision="best"
-        else
-            eval_model_path=$model_dir/$quant_mode/model.quant.onnx
-            engine_path=$model_dir/$quant_mode/model.quant.engine
-            precision="stronglyTyped"
-        fi
+if [ "$eval_mode" = true ]; then
+    for model_path in "${model_paths[@]}"; do
+        model_name=$(basename "$model_path" .onnx)
+        model_dir=build/$model_name

+        echo "Evaluating model $model_name for all quantization modes"
+        for quant_mode in "${all_modes[@]}"; do
+            if [ "$quant_mode" == "fp16" ]; then
+                eval_model_path=$model_dir/fp16/model.onnx
+                engine_path=$model_dir/fp16/model.engine
+                precision="fp16"
+            elif [ "$quant_mode" == "int8_iq" ]; then
+                eval_model_path=$model_dir/fp16/model.onnx
+                engine_path=$model_dir/int8_iq/model.engine
+                precision="best"
+            else
+                eval_model_path=$model_dir/$quant_mode/model.quant.onnx
+                engine_path=$model_dir/$quant_mode/model.quant.engine
+                precision="stronglyTyped"
+            fi

-        echo "Starting evaluation of $model_name for mode: $quant_mode on GPU $gpu_id"
-        if [[ " ${latency_models[@]} " =~ " $model_name " ]]; then
-            CUDA_VISIBLE_DEVICES=$gpu_id python evaluate.py \
-                --onnx_path=$eval_model_path \
-                --engine_path=$engine_path \
-                --model_name="${timm_model_name[$model_name]}" \
-                --engine_precision=$precision \
-                --results_path=$model_dir/$quant_mode/${model_name}_${quant_mode}.csv &
-        else
-            CUDA_VISIBLE_DEVICES=$gpu_id python evaluate.py \
-                --onnx_path=$eval_model_path \
-                --engine_path=$engine_path \
-                --imagenet_path=$imagenet_path \
-                --eval_data_size=$calib_size \
-                --batch_size $batch_size \
-                --model_name="${timm_model_name[$model_name]}" \
-                --engine_precision=$precision \
-                --results_path=$model_dir/$quant_mode/${model_name}_${quant_mode}.csv &
-        fi
-        pids+=($!)
-    done
+            echo "Starting evaluation of $model_name for mode: $quant_mode"
+            if [[ " ${latency_models[@]} " =~ " $model_name " ]]; then
+                python evaluate.py \
+                    --onnx_path=$eval_model_path \
+                    --engine_path=$engine_path \
+                    --model_name="${timm_model_name[$model_name]}" \
+                    --engine_precision=$precision \
+                    --results_path=$model_dir/$quant_mode/${model_name}_${quant_mode}.csv \
+                    --timing_cache_path=build/timing.cache
+            else
+                python evaluate.py \
+                    --onnx_path=$eval_model_path \
+                    --engine_path=$engine_path \
+                    --imagenet_path=$imagenet_path \
+                    --eval_data_size=$eval_size \
+                    --batch_size $batch_size \
+                    --model_name="${timm_model_name[$model_name]}" \
+                    --engine_precision=$precision \
+                    --results_path=$model_dir/$quant_mode/${model_name}_${quant_mode}.csv \
+                    --timing_cache_path=build/timing.cache
+            fi
+        done

-    # Wait for all evaluation processes to complete for this model
-    error_occurred=false
-    for pid in "${pids[@]}"; do
-        if ! wait $pid; then
-            echo "ERROR: Evaluation process (PID: $pid) failed"
-            error_occurred=true
-        fi
+        echo "Completed evaluation of all modes for model: $model_name"
     done
-    if [ "$error_occurred" = true ]; then
-        echo "Stopping execution due to evaluation failure for model: $model_name"
-        exit 1
-    fi
-
-    echo "Completed evaluation of all modes for model: $model_name"
-done

-python $test_utils_dir/aggregate_results.py --results_dir=build
+    python $test_utils_dir/aggregate_results.py --results_dir=build
+fi

 if [ "$clean_mode" = true ]; then
     echo "Cleaning build artifacts..."
     rm -rf build/
     echo "Build artifacts cleaned successfully."
     popd
     exit 0
 fi

 popd

-echo "Total wall time: $(($(date +%s) - start_time)) seconds"
+total_seconds=$(($(date +%s) - start_time))
+printf "Total wall time: %02d:%02d:%02d\n" $((total_seconds/3600)) $(((total_seconds%3600)/60)) $((total_seconds%60))
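One note on the speed-up: the evaluation runs now share a single TensorRT timing cache (`build/timing.cache`), so tactic timings profiled while building the first engine can be reused by later builds instead of being re-measured. A rough sketch of the same mechanism with trtexec, illustrative only (the model path is a placeholder; evaluate.py presumably wires the cache into its own engine build):

```bash
# First build populates the cache; later builds pointing at the same file reuse
# the recorded tactic timings and skip most of the kernel auto-tuning work.
trtexec --onnx=build/<model_name>/fp16/model.onnx \
        --saveEngine=build/<model_name>/fp16/model.engine \
        --timingCacheFile=build/timing.cache
```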
Does this support multi-GPU calibration, using all available GPUs instead of cuda:0?
Yes, we should be able to control which GPU is used via CUDA_VISIBLE_DEVICES. For now, however, I have disabled GPU parallelism in the test until I figure out the root cause.
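For reference, a minimal sketch of the per-GPU fan-out the parallel version used, assuming `quant_modes`, `model_dir`, and `calib_data_path` are already set as in the script (illustrative only; this is the pattern the PR disables for now):

```bash
# Round-robin one quantization job per visible GPU, then fail if any job failed.
nvidia_gpu_count=$(nvidia-smi --list-gpus | wc -l)
pids=()
for i in "${!quant_modes[@]}"; do
    quant_mode="${quant_modes[$i]}"
    gpu_id=$((i % nvidia_gpu_count))
    # The child process sees only the selected device and addresses it as cuda:0.
    CUDA_VISIBLE_DEVICES=$gpu_id python -m modelopt.onnx.quantization \
        --onnx_path=$model_dir/fp16/model.onnx \
        --quantize_mode=$quant_mode \
        --calibration_data=$calib_data_path \
        --output_path=$model_dir/$quant_mode/model.quant.onnx \
        --calibration_eps=cuda:0 &
    pids+=($!)
done
for pid in "${pids[@]}"; do
    wait "$pid" || { echo "Quantization job $pid failed"; exit 1; }
done
```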