9 changes: 8 additions & 1 deletion examples/onnx_ptq/evaluate.py
@@ -38,9 +38,15 @@ def main():
parser.add_argument(
"--engine_path",
type=str,
required=True,
default=None,
help="Path to the TensorRT engine",
)
parser.add_argument(
"--timing_cache_path",
type=str,
default=None,
help="Path to the TensorRT timing cache",
)
parser.add_argument(
"--imagenet_path", type=str, default=None, help="Path to the imagenet dataset"
)
@@ -81,6 +87,7 @@ def main():
# Compile the ONNX model to TRT engine and create the device model
compilation_args = {
"engine_path": args.engine_path,
"timing_cache_path": args.timing_cache_path,
}
compiled_model = client.ir_to_compiled(onnx_bytes, compilation_args)
device_model = DeviceModel(client, compiled_model, metadata={})
9 changes: 7 additions & 2 deletions modelopt/torch/_deploy/_runtime/tensorrt/engine_builder.py
@@ -123,6 +123,7 @@ def build_engine(
onnx_bytes: OnnxBytes,
trt_mode: str = TRTMode.FLOAT32,
engine_path: Path | None = None,
timing_cache_path: Path | None = None,
calib_cache: str | None = None,
dynamic_shapes: dict | None = None,
plugin_config: dict | None = None,
@@ -135,6 +136,7 @@
Args:
onnx_bytes: Data of the ONNX model stored as an OnnxBytes object.
engine_path: Path to save the TensorRT engine.
timing_cache_path: Path to save/load the TensorRT timing cache.
trt_mode: The precision with which the TensorRT engine will be built. Supported modes are:
- TRTMode.FLOAT32
- TRTMode.FLOAT16
@@ -205,6 +207,7 @@ def _build_command(
def _setup_files_and_paths(
tmp_dir_path: Path,
engine_path: Path | None,
timing_cache_path: Path | None,
) -> tuple[Path, Path, Path | None, Path | None, Path]:
tmp_onnx_dir = tmp_dir_path / "onnx"
onnx_bytes.write_to_disk(str(tmp_onnx_dir))
@@ -219,13 +222,15 @@ def _setup_files_and_paths(
)
engine_path.parent.mkdir(parents=True, exist_ok=True)
calib_cache_path = final_output_dir / "calib_cache" if calib_cache else None
timing_cache_path = final_output_dir / "timing.cache"
timing_cache_path = (
Path(timing_cache_path) if timing_cache_path else final_output_dir / "timing.cache"
)

return onnx_path, engine_path, calib_cache_path, timing_cache_path, final_output_dir

with TemporaryDirectory() as tmp_dir:
onnx_path, engine_path, calib_cache_path, timing_cache_path, final_output_dir = (
_setup_files_and_paths(Path(tmp_dir), engine_path)
_setup_files_and_paths(Path(tmp_dir), engine_path, timing_cache_path)
)
cmd = _build_command(onnx_path, engine_path, calib_cache_path, timing_cache_path)

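For illustration, here is a minimal sketch of how a caller might pass a persistent timing cache to build_engine under the signature shown above. The OnnxBytes construction, the concrete paths, and the handling of the return value are assumptions made for this example and are not taken from the diff:

    from pathlib import Path

    # Module path taken from the file header above; OnnxBytes/TRTMode imports are
    # elided and assumed importable from the same package.
    from modelopt.torch._deploy._runtime.tensorrt.engine_builder import build_engine

    # Hypothetical caller (keyword names follow the signature above). Sharing one
    # timing cache across builds lets TensorRT reuse layer-timing results instead
    # of re-profiling tactics for every engine build.
    onnx_bytes = OnnxBytes("build/resnet50/model.onnx")  # assumed constructor, illustration only
    result = build_engine(                               # return value not shown in this diff
        onnx_bytes,
        trt_mode=TRTMode.FLOAT16,
        engine_path=Path("build/resnet50/model.engine"),
        timing_cache_path=Path("/models/onnx/build/timing.cache"),  # reused across runs
    )
    # If timing_cache_path is None, the builder falls back to
    # <final_output_dir>/timing.cache, as _setup_files_and_paths above shows.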
3 changes: 2 additions & 1 deletion modelopt/torch/_deploy/_runtime/trt_client.py
@@ -74,7 +74,7 @@ def _ir_to_compiled(
Args:
ir_bytes: The ONNX model bytes.
compilation_args: A dictionary of compilation arguments.
The following arguments are supported: dynamic_shapes, plugin_config, engine_path.
The following arguments are supported: dynamic_shapes, plugin_config, engine_path, timing_cache_path.

Returns:
The compiled TRT engine bytes.
@@ -87,6 +87,7 @@ def _ir_to_compiled(
dynamic_shapes=compilation_args.get("dynamic_shapes"), # type: ignore[union-attr]
plugin_config=compilation_args.get("plugin_config"), # type: ignore[union-attr]
engine_path=compilation_args.get("engine_path"), # type: ignore[union-attr]
timing_cache_path=compilation_args.get("timing_cache_path"), # type: ignore[union-attr]
trt_mode=self.deployment["precision"],
verbose=(self.deployment.get("verbose", "false").lower() == "true"),
)
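To make the docstring above concrete, a compilation_args dictionary covering every supported key might look like the sketch below. The values are placeholders, and the expected formats of dynamic_shapes and plugin_config are not shown in this diff:

    # Hypothetical compilation_args covering all keys listed in the docstring above.
    # None is used where this diff does not show the expected value format.
    compilation_args = {
        "dynamic_shapes": None,       # shape specification; format not shown here
        "plugin_config": None,        # TensorRT plugin configuration, if any
        "engine_path": "build/model.engine",
        "timing_cache_path": "build/timing.cache",  # newly supported key
    }
    compiled_model = client.ir_to_compiled(onnx_bytes, compilation_args)  # as in evaluate.py above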
1 change: 1 addition & 0 deletions tests/_test_utils/onnx_quantization/lib_test_models.py
@@ -124,6 +124,7 @@ def export_as_onnx(
output_names=output_names,
opset_version=opset,
do_constant_folding=do_constant_folding,
dynamo=False,
)


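For context, dynamo=False pins torch.onnx.export to the legacy TorchScript-based exporter rather than the newer dynamo-based one. A self-contained sketch of such a call is shown below; the model, input shape, and opset are illustrative and not taken from the test helper:

    import torch

    # Minimal export call that pins the legacy exporter, mirroring the
    # dynamo=False argument added above (model and shapes are illustrative).
    model = torch.nn.Conv2d(3, 8, kernel_size=3).eval()
    dummy_input = torch.randn(1, 3, 224, 224)
    torch.onnx.export(
        model,
        (dummy_input,),
        "conv.onnx",
        input_names=["input"],
        output_names=["output"],
        opset_version=17,
        do_constant_folding=True,
        dynamo=False,  # use the TorchScript exporter instead of the dynamo-based one
    )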
151 changes: 64 additions & 87 deletions tests/examples/test_onnx_ptq.sh
@@ -21,7 +21,7 @@
# It is recommended to execute this script inside the Model Optimization Toolkit TensorRT Docker container.
# Please ensure that the ImageNet dataset is available in the container at the specified path.

# Usage: ./test_onnx_ptq.sh [--no-clean] [/path/to/imagenet] [/path/to/models]
# Usage: ./test_onnx_ptq.sh [--no-clean] [--eval] [/path/to/imagenet] [/path/to/models] [/path/to/timing_cache]

set -exo pipefail

@@ -37,20 +37,28 @@ pushd $public_example_dir

# Parse arguments
clean_mode=true
eval_mode=false
imagenet_path=""
models_folder=""
timing_cache_path=""

for arg in "$@"; do
case $arg in
--no-clean)
clean_mode=false
shift
;;
--eval)
eval_mode=true
shift
;;
*)
if [ -z "$imagenet_path" ]; then
imagenet_path="$arg"
elif [ -z "$models_folder" ]; then
models_folder="$arg"
elif [ -z "$timing_cache_path" ]; then
timing_cache_path="$arg"
fi
shift
;;
@@ -63,7 +71,9 @@ export TQDM_DISABLE=1
# Setting image and model paths (contains 8 models)
imagenet_path=${imagenet_path:-/data/imagenet/}
models_folder=${models_folder:-/models/onnx}
calib_size=64
timing_cache_path=${timing_cache_path:-/models/onnx/build/timing.cache}
calib_size=1
eval_size=100
batch_size=1


@@ -137,117 +147,84 @@ for model_path in "${model_paths[@]}"; do
model_name=$(basename "$model_path" .onnx)
model_dir=build/$model_name


echo "Quantizing model $model_name for all quantization modes in parallel"
pids=()
for i in "${!quant_modes[@]}"; do
quant_mode="${quant_modes[$i]}"
gpu_id=$((i % nvidia_gpu_count))
echo "Quantizing model $model_name for all quantization modes"
for quant_mode in "${quant_modes[@]}"; do
if [ "$quant_mode" == "int8_iq" ]; then
continue
fi

echo "Starting quantization of $model_name for mode: $quant_mode on GPU $gpu_id"
CUDA_VISIBLE_DEVICES=$gpu_id python -m modelopt.onnx.quantization \
echo "Starting quantization of $model_name for mode: $quant_mode"
python -m modelopt.onnx.quantization \

Inline review comment:
Collaborator: Does this support multi-GPU calibration to use all available GPUs instead of cuda:0?
Contributor Author: Yes, we should be able to control which GPU is used via CUDA_VISIBLE_DEVICES. For now, however, I have disabled the GPU parallelism in the test until I figure out the root cause.


--onnx_path=$model_dir/fp16/model.onnx \
--quantize_mode=$quant_mode \
--calibration_data=$calib_data_path \
--output_path=$model_dir/$quant_mode/model.quant.onnx \
--calibration_eps=cuda:0 &
pids+=($!)
done

# Wait for all quantization processes to complete for this model
error_occurred=false
for pid in "${pids[@]}"; do
if ! wait $pid; then
echo "ERROR: Quantization process (PID: $pid) failed"
error_occurred=true
fi
--calibration_eps=cuda
done
if [ "$error_occurred" = true ]; then
echo "Stopping execution due to quantization failure for model: $model_name"
exit 1
fi

echo "Completed quantization of all modes for model: $model_name"
done


# Evaluate the quantized models for each mode
for model_path in "${model_paths[@]}"; do
model_name=$(basename "$model_path" .onnx)
model_dir=build/$model_name

echo "Evaluating model $model_name for all quantization modes in parallel"
pids=()
for i in "${!all_modes[@]}"; do
quant_mode="${all_modes[$i]}"
gpu_id=$((i % nvidia_gpu_count))

if [ "$quant_mode" == "fp16" ]; then
eval_model_path=$model_dir/fp16/model.onnx
engine_path=$model_dir/fp16/model.engine
precision="fp16"
elif [ "$quant_mode" == "int8_iq" ]; then
eval_model_path=$model_dir/fp16/model.onnx
engine_path=$model_dir/int8_iq/model.engine
precision="best"
else
eval_model_path=$model_dir/$quant_mode/model.quant.onnx
engine_path=$model_dir/$quant_mode/model.quant.engine
precision="stronglyTyped"
fi
if [ "$eval_mode" = true ]; then
for model_path in "${model_paths[@]}"; do
model_name=$(basename "$model_path" .onnx)
model_dir=build/$model_name

echo "Evaluating model $model_name for all quantization modes"
for quant_mode in "${all_modes[@]}"; do
if [ "$quant_mode" == "fp16" ]; then
eval_model_path=$model_dir/fp16/model.onnx
engine_path=$model_dir/fp16/model.engine
precision="fp16"
elif [ "$quant_mode" == "int8_iq" ]; then
eval_model_path=$model_dir/fp16/model.onnx
engine_path=$model_dir/int8_iq/model.engine
precision="best"
else
eval_model_path=$model_dir/$quant_mode/model.quant.onnx
engine_path=$model_dir/$quant_mode/model.quant.engine
precision="stronglyTyped"
fi

echo "Starting evaluation of $model_name for mode: $quant_mode on GPU $gpu_id"
if [[ " ${latency_models[@]} " =~ " $model_name " ]]; then
CUDA_VISIBLE_DEVICES=$gpu_id python evaluate.py \
--onnx_path=$eval_model_path \
--engine_path=$engine_path \
--model_name="${timm_model_name[$model_name]}" \
--engine_precision=$precision \
--results_path=$model_dir/$quant_mode/${model_name}_${quant_mode}.csv &
else
CUDA_VISIBLE_DEVICES=$gpu_id python evaluate.py \
--onnx_path=$eval_model_path \
--engine_path=$engine_path \
--imagenet_path=$imagenet_path \
--eval_data_size=$calib_size \
--batch_size $batch_size \
--model_name="${timm_model_name[$model_name]}" \
--engine_precision=$precision \
--results_path=$model_dir/$quant_mode/${model_name}_${quant_mode}.csv &
fi
pids+=($!)
done
echo "Starting evaluation of $model_name for mode: $quant_mode"
if [[ " ${latency_models[@]} " =~ " $model_name " ]]; then
python evaluate.py \
--onnx_path=$eval_model_path \
--engine_path=$engine_path \
--model_name="${timm_model_name[$model_name]}" \
--engine_precision=$precision \
--results_path=$model_dir/$quant_mode/${model_name}_${quant_mode}.csv \
--timing_cache_path=$timing_cache_path
else
python evaluate.py \
--onnx_path=$eval_model_path \
--engine_path=$engine_path \
--imagenet_path=$imagenet_path \
--eval_data_size=$eval_size \
--batch_size $batch_size \
--model_name="${timm_model_name[$model_name]}" \
--engine_precision=$precision \
--results_path=$model_dir/$quant_mode/${model_name}_${quant_mode}.csv \
--timing_cache_path=$timing_cache_path
fi
done

# Wait for all evaluation processes to complete for this model
error_occurred=false
for pid in "${pids[@]}"; do
if ! wait $pid; then
echo "ERROR: Evaluation process (PID: $pid) failed"
error_occurred=true
fi
echo "Completed evaluation of all modes for model: $model_name"
done
if [ "$error_occurred" = true ]; then
echo "Stopping execution due to evaluation failure for model: $model_name"
exit 1
fi

echo "Completed evaluation of all modes for model: $model_name"
done

python $test_utils_dir/aggregate_results.py --results_dir=build
python $test_utils_dir/aggregate_results.py --results_dir=build
fi

if [ "$clean_mode" = true ]; then
echo "Cleaning build artifacts..."
rm -rf build/
echo "Build artifacts cleaned successfully."
popd
exit 0
fi

popd


echo "Total wall time: $(($(date +%s) - start_time)) seconds"
total_seconds=$(($(date +%s) - start_time))
printf "Total wall time: %02d:%02d:%02d\n" $((total_seconds/3600)) $(((total_seconds%3600)/60)) $((total_seconds%60))