
Commit 31c84a8

Merge branch 'main' into jingyux/fixed-trtexec-cicd
2 parents 0164991 + 4716131

27 files changed: +212 / -94 lines changed


CHANGELOG.rst

Lines changed: 2 additions & 0 deletions
@@ -5,10 +5,12 @@ Model Optimizer Changelog (Linux)
 ^^^^^^^^^^^^^^^^^

 **Deprecations**
+- Deprecated the ``quantize_mode`` argument in ``examples/onnx_ptq/evaluate.py`` to support strong typing. Use ``engine_precision`` instead.

 **Bug Fixes**

 **New Features**
+- ``high_precision_dtype`` now defaults to fp16 in ONNX quantization, i.e. quantized output model weights are now FP16 by default.

 0.35 (2025-09-04)
 ^^^^^^^^^^^^^^^^^

examples/llm_ptq/hf_ptq.py

Lines changed: 1 addition & 1 deletion
@@ -742,7 +742,7 @@ def output_decode(generated_ids, input_shape):
     )
     parser.add_argument(
         "--verbose",
-        help="Print verbose output (e.g. quantization summary). Disable by --no_verbose.",
+        help="Print verbose output (e.g. quantization summary). Disable by --no-verbose.",
         default=True,
         action=argparse.BooleanOptionalAction,
     )
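
The help-text fix above matches what `argparse.BooleanOptionalAction` actually generates: the negated form uses a dash (`--no-verbose`), not an underscore. A minimal standalone sketch (standard library only, not code from this repository) of that flag pair:

```python
import argparse

# Rebuild just the --verbose option from the diff above.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--verbose",
    help="Print verbose output (e.g. quantization summary). Disable by --no-verbose.",
    default=True,
    action=argparse.BooleanOptionalAction,  # auto-generates --verbose / --no-verbose
)

print(parser.parse_args([]).verbose)                # True  (default)
print(parser.parse_args(["--no-verbose"]).verbose)  # False (the spelling the help text now documents)
```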

examples/onnx_ptq/README.md

Lines changed: 2 additions & 2 deletions
@@ -120,7 +120,7 @@ The following evaluation requires the `val` directory of the [ImageNet dataset](
 python evaluate.py \
     --onnx_path=<path to classification model> \
     --imagenet_path=<path to the ImageNet dataset> \
-    --quantize_mode=<fp8|int8|int4> \
+    --engine_precision=stronglyTyped \
     --model_name=vit_base_patch16_224
 ```

@@ -165,7 +165,7 @@ If the input model is of type image classification, use the following script to
 python evaluate.py \
     --onnx_path=<path to the exported ONNX model> \
     --imagenet_path=<path to the ImageNet dataset> \
-    --quantize_mode=stronglyTyped \
+    --engine_precision=stronglyTyped \
     --model_name=vit_base_patch16_224
 ```

examples/onnx_ptq/evaluate.py

Lines changed: 5 additions & 12 deletions
@@ -48,29 +48,22 @@ def main():
     parser.add_argument(
         "--eval_data_size", type=int, default=None, help="Number of examples to evaluate"
     )
-    # By default, TensorRT autotunes tensor types to generate the fastest engine. When you specify
-    # to TensorRT that a network is strongly typed, it infers a type for each intermediate and
-    # output tensor using the rules in the operator type specification. For networks quantized in
-    # INT4 or FP8 mode, stronglyTyped as the mode is recommended for TensorRT deployment. Though
-    # INT8 networks are generally compiled with int8 mode, certain INT8 ViT networks compiled with
-    # stronglyTyped precision have shown better performance.
     parser.add_argument(
-        "--quantize_mode",
+        "--engine_precision",
         type=str,
         default="stronglyTyped",
-        choices=["fp8", "fp16", "fp32", "int4", "int8", "int8_iq", "bf16", "best", "stronglyTyped"],
-        help="Quantization mode for the TensorRT engine. \
-            Supported options: fp8, fp16, fp32, int8, int8_iq(implicit quantization), bf16, best, stronglyTyped",
+        choices=["best", "fp16", "stronglyTyped"],
+        help="Precision mode for the TensorRT engine. \
+            stronglyTyped is recommended, all other modes have been deprecated in TensorRT",
     )
     parser.add_argument(
         "--results_path", type=str, default=None, help="Save the results to the specified path"
     )

     args = parser.parse_args()
-
     deployment = {
         "runtime": "TRT",
-        "precision": args.quantize_mode,
+        "precision": args.engine_precision,
     }

     # Create an ONNX bytes object with the specified path
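
For context, the renamed option feeds straight into the `deployment` dictionary at the end of the hunk. A minimal sketch of just that wiring (everything outside the diff is omitted; the real `evaluate.py` defines more arguments and hands `deployment` to the TensorRT evaluation path):

```python
import argparse

# Only the renamed flag and the deployment dict from the diff above.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--engine_precision",
    type=str,
    default="stronglyTyped",
    choices=["best", "fp16", "stronglyTyped"],
    help="Precision mode for the TensorRT engine. stronglyTyped is recommended.",
)
args = parser.parse_args([])  # no CLI override -> "stronglyTyped"

deployment = {
    "runtime": "TRT",
    "precision": args.engine_precision,
}
print(deployment)  # {'runtime': 'TRT', 'precision': 'stronglyTyped'}
```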

examples/onnx_ptq/evaluation.py

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@
 deployment = {
     "runtime": "TRT",
     "accelerator": "GPU",
-    "precision": "fp32",
+    "precision": "stronglyTyped",
     "onnx_opset": "21",
 }

examples/onnx_ptq/torch_quant_to_onnx.py

Lines changed: 9 additions & 3 deletions
@@ -83,12 +83,12 @@ def forward_loop(model):
     return quantized_model


-def get_model_input_shape(model_name):
+def get_model_input_shape(model_name, batch_size):
     """Get the input shape from timm model configuration."""
     model = timm.create_model(model_name, pretrained=True, num_classes=1000)
     data_config = timm.data.resolve_model_data_config(model)
     input_size = data_config["input_size"]
-    return (1, *tuple(input_size))  # Add batch dimension
+    return (batch_size, *tuple(input_size))  # Add batch dimension


 def main():
@@ -119,11 +119,17 @@ def main():
         default=512,
         help="Number of images to use in calibration [1-512]",
     )
+    parser.add_argument(
+        "--batch_size",
+        type=int,
+        default=1,
+        help="Batch size for calibration and ONNX model export.",
+    )

     args = parser.parse_args()

     # Get input shape from model config
-    input_shape = get_model_input_shape(args.timm_model_name)
+    input_shape = get_model_input_shape(args.timm_model_name, args.batch_size)

     # Create model and move to appropriate device
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
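
A small usage sketch of the updated helper, assuming `timm` is installed; `vit_base_patch16_224` is used purely as an illustrative model name. The spatial size comes from the timm data config and the new `--batch_size` value is prepended:

```python
import timm

def get_model_input_shape(model_name: str, batch_size: int) -> tuple:
    """Resolve (batch, C, H, W) from the timm model's data config."""
    model = timm.create_model(model_name, pretrained=True, num_classes=1000)
    data_config = timm.data.resolve_model_data_config(model)
    input_size = data_config["input_size"]   # e.g. (3, 224, 224)
    return (batch_size, *tuple(input_size))  # prepend the batch dimension

# e.g. --timm_model_name=vit_base_patch16_224 --batch_size=8  ->  (8, 3, 224, 224)
print(get_model_input_shape("vit_base_patch16_224", 8))
```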

examples/vlm_ptq/README.md

Lines changed: 9 additions & 1 deletion
@@ -41,6 +41,7 @@ Please refer to the [llm_ptq/README.md](../llm_ptq/README.md#current-out-of-the-
 | Llava | llava ||||||
 | VILA | vila ||||||
 | Phi-3-vision | phi ||||||
+| Qwen2.5-VL | qwen ||||||

 > *<sup>1.</sup>The w4a8_awq is an experimental quantization scheme that may result in a higher accuracy penalty.* \
 > *<sup>2.</sup>A selective set of the popular models are internally tested. The actual model support list may be longer. NVFP4 inference requires Blackwell GPUs and TensorRT-LLM v0.17 or later.*
@@ -51,7 +52,7 @@ Please refer to the [llm_ptq/README.md](../llm_ptq/README.md#current-out-of-the-

 Please refer to the [llm_ptq/README.md](../llm_ptq/README.md) about the details of model quantization.

-The following scripts provide an all-in-one and step-by-step model quantization example for Llava, VILA and Phi-3-vision models. The quantization format and the number of GPUs will be supplied as inputs to these scripts. By default, we build the engine for the fp8 format and 1 GPU.
+The following scripts provide an all-in-one and step-by-step model quantization example for Llava, VILA, Phi-3-vision and Qwen2.5-VL models. The quantization format and the number of GPUs will be supplied as inputs to these scripts. By default, we build the engine for the fp8 format and 1 GPU.

 ### Hugging Face Example [Script](./scripts/huggingface_example.sh)

@@ -76,6 +77,13 @@ git clone https://huggingface.co/microsoft/Phi-3-vision-128k-instruct
 scripts/huggingface_example.sh --type phi --model Phi-3-vision-128k-instruct --quant [fp8|int8_sq|int4_awq|w4a8_awq]
 ```

+For [Qwen2.5-VL](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct):
+
+```bash
+git clone https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct
+scripts/huggingface_example.sh --type qwen --model Qwen2.5-VL-7B-Instruct --export_fmt hf --quant [fp8|nvfp4|int8_sq|int4_awq|w4a8_awq]
+```
+
 The example scripts above also have an additional flag `--tasks gqa`, which will trigger evaluation of the built TensorRT engine using the GQA benchmark. Details of the evaluation are explained in this [tutorial](../vlm_eval/README.md).

 If you encounter Out of Memory (OOM) issues during inference or evaluation, you can try lowering the `--kv_cache_free_gpu_memory_fraction` argument (default is 0.8) to reduce GPU memory usage for kv_cache:

examples/vlm_ptq/scripts/huggingface_example.sh

Lines changed: 13 additions & 5 deletions
@@ -30,10 +30,10 @@ for i in $(env | grep ^PMI_ | cut -d"=" -f 1); do unset -v $i; done
 for i in $(env | grep ^PMIX_ | cut -d"=" -f 1); do unset -v $i; done

 case $MODEL_TYPE in
-    llava|phi|vila|mllama)
+    llava|phi|vila|mllama|qwen)
         ;;
     *)
-        echo "Unsupported type argument: Expected one of: [llava, phi, vila, mllama]" >&2
+        echo "Unsupported type argument: Expected one of: [llava, phi, vila, mllama, qwen]" >&2
         exit 1
 esac

@@ -58,10 +58,10 @@ case $SPARSITY_FMT in
 esac

 case $QFORMAT in
-    fp8|int8_sq|int4_awq|w4a8_awq|fp16|bf16)
+    fp8|nvfp4|int8_sq|int4_awq|w4a8_awq|fp16|bf16)
         ;;
     *)
-        echo "Unknown quant argument: Expected one of: [fp8, int8_sq, int4_awq, w4a8_awq, fp16, bf16]" >&2
+        echo "Unknown quant argument: Expected one of: [fp8, nvfp4, int8_sq, int4_awq, w4a8_awq, fp16, bf16]" >&2
         exit 1
 esac

@@ -91,7 +91,7 @@ fi

 BUILD_MAX_OUTPUT_LEN=512

-if [ "$MODEL_TYPE" = "llava" ] || [ "$MODEL_TYPE" = "vila" ]; then
+if [ "$MODEL_TYPE" = "llava" ] || [ "$MODEL_TYPE" = "vila" ] || [ "$MODEL_TYPE" = "qwen" ]; then
     BUILD_MAX_BATCH_SIZE=20
 else
     BUILD_MAX_BATCH_SIZE=4
@@ -149,6 +149,9 @@ case "${MODEL_TYPE}" in
         PTQ_ARGS+=" --kv_cache_qformat none "
         VLM_ARGS=" --max_encoder_input_len=6404 --skip_run"
         ;;
+    "qwen")
+        PTQ_ARGS+=" --kv_cache_qformat none "
+        ;;
 esac

 if [ "${MODEL_TYPE}" = "vila" ]; then
@@ -177,6 +180,7 @@ if [[ $TASKS =~ "build" ]] || [[ ! -d "$ENGINE_DIR" ]] || [[ ! $(ls -A $ENGINE_D
         --inference_tensor_parallel=$TP \
         --inference_pipeline_parallel=$PP \
         --export_fmt=$EXPORT_FORMAT \
+        --no-verbose \
         $PTQ_ARGS
 else
     echo "Quantized model config $MODEL_CONFIG exists, skipping the quantization stage"
@@ -213,6 +217,10 @@ case "${MODEL_TYPE}" in
     "phi")
         VISUAL_MODEL_TYPE="phi-3-vision"
         ;;
+    "qwen")
+        # Map generic type to TRT-LLM multimodal model type
+        VISUAL_MODEL_TYPE="qwen2_vl"
+        ;;
 esac


Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
 datasets>=2.14.5
+onnx==1.18.0
 torch==2.6.0
 transformers==4.49.0

modelopt/onnx/quantization/__main__.py

Lines changed: 4 additions & 6 deletions
@@ -180,11 +180,11 @@ def get_parser() -> argparse.ArgumentParser:
     argparser.add_argument(
         "--high_precision_dtype",
         type=str,
-        default=None,
+        default="fp16",
         choices=["fp32", "fp16", "bf16"],
         help=(
-            "High precision data type, one of ['fp32', 'fp16', 'bf16']. For int8 quantization, the default value is "
-            "'fp32' and 'fp16' for other quantization modes."
+            "High precision data type of the output model. If the input model is of dtype fp32, "
+            "it will be converted to fp16 dtype by default."
         ),
     )
     argparser.add_argument(
@@ -262,8 +262,6 @@ def main():
     # Convert the NpzFile object to a Python dictionary
     calibration_data = {key: calibration_data[key] for key in calibration_data.files}

-    default_high_precision_dtype = "fp32" if args.quantize_mode == "int8" else "fp16"
-
     quantize(
         args.onnx_path,
         quantize_mode=args.quantize_mode,
@@ -284,7 +282,7 @@
         log_file=args.log_file,
         trt_plugins=args.trt_plugins,
         trt_plugins_precision=args.trt_plugins_precision,
-        high_precision_dtype=args.high_precision_dtype or default_high_precision_dtype,
+        high_precision_dtype=args.high_precision_dtype,
         mha_accumulation_dtype=args.mha_accumulation_dtype,
         disable_mha_qdq=args.disable_mha_qdq,
         dq_only=args.dq_only,
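
The net effect of the two hunks in `main()` is that the per-mode fallback disappears and the argparse default does the work. A plain-Python sketch of the old versus new resolution logic (illustrative only, not the modelopt API):

```python
# Old behaviour: --high_precision_dtype defaulted to None and main() chose the
# fallback per quantize mode.
def resolve_old(cli_value, quantize_mode):
    default_high_precision_dtype = "fp32" if quantize_mode == "int8" else "fp16"
    return cli_value or default_high_precision_dtype

# New behaviour: argparse already defaults to "fp16", so the value passes through unchanged.
def resolve_new(cli_value="fp16"):
    return cli_value

print(resolve_old(None, "int8"))  # fp32 -- old int8 default
print(resolve_new())              # fp16 -- new default for every mode
print(resolve_new("fp32"))        # fp32 -- an explicit --high_precision_dtype fp32 still wins
```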
