
Commit 31c84a8

Merge branch 'main' into jingyux/fixed-trtexec-cicd
2 parents 0164991 + 4716131

27 files changed: +212 / -94 lines changed


CHANGELOG.rst

Lines changed: 2 additions & 0 deletions
@@ -5,10 +5,12 @@ Model Optimizer Changelog (Linux)
 ^^^^^^^^^^^^^^^^^

 **Deprecations**
+- Deprecated the ``quantize_mode`` argument in ``examples/onnx_ptq/evaluate.py`` to support strong typing. Use ``engine_precision`` instead.

 **Bug Fixes**

 **New Features**
+- ``high_precision_dtype`` now defaults to fp16 in ONNX quantization, i.e. quantized output model weights are now FP16 by default.

 0.35 (2025-09-04)
 ^^^^^^^^^^^^^^^^^

examples/llm_ptq/hf_ptq.py

Lines changed: 1 addition & 1 deletion
@@ -742,7 +742,7 @@ def output_decode(generated_ids, input_shape):
     )
     parser.add_argument(
         "--verbose",
-        help="Print verbose output (e.g. quantization summary). Disable by --no_verbose.",
+        help="Print verbose output (e.g. quantization summary). Disable by --no-verbose.",
         default=True,
         action=argparse.BooleanOptionalAction,
     )
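
The help-text fix above matches what `argparse.BooleanOptionalAction` actually generates: the negated form uses a dash (`--no-verbose`), not an underscore. A minimal standalone sketch (standard library only, not code from this repository) of that flag pair:

```python
import argparse

# Rebuild just the --verbose option from the diff above.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--verbose",
    help="Print verbose output (e.g. quantization summary). Disable by --no-verbose.",
    default=True,
    action=argparse.BooleanOptionalAction,  # auto-generates --verbose / --no-verbose
)

print(parser.parse_args([]).verbose)                # True  (default)
print(parser.parse_args(["--no-verbose"]).verbose)  # False (the spelling the help text now documents)
```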

examples/onnx_ptq/README.md

Lines changed: 2 additions & 2 deletions
@@ -120,7 +120,7 @@ The following evaluation requires the `val` directory of the [ImageNet dataset](
 python evaluate.py \
     --onnx_path=<path to classification model> \
     --imagenet_path=<path to the ImageNet dataset> \
-    --quantize_mode=<fp8|int8|int4> \
+    --engine_precision=stronglyTyped \
     --model_name=vit_base_patch16_224
 ```

@@ -165,7 +165,7 @@ If the input model is of type image classification, use the following script to
 python evaluate.py \
     --onnx_path=<path to the exported ONNX model> \
     --imagenet_path=<path to the ImageNet dataset> \
-    --quantize_mode=stronglyTyped \
+    --engine_precision=stronglyTyped \
     --model_name=vit_base_patch16_224
 ```

examples/onnx_ptq/evaluate.py

Lines changed: 5 additions & 12 deletions
@@ -48,29 +48,22 @@ def main():
     parser.add_argument(
         "--eval_data_size", type=int, default=None, help="Number of examples to evaluate"
     )
-    # By default, TensorRT autotunes tensor types to generate the fastest engine. When you specify
-    # to TensorRT that a network is strongly typed, it infers a type for each intermediate and
-    # output tensor using the rules in the operator type specification. For networks quantized in
-    # INT4 or FP8 mode, stronglyTyped as the mode is recommended for TensorRT deployment. Though
-    # INT8 networks are generally compiled with int8 mode, certain INT8 ViT networks compiled with
-    # stronglyTyped precision have shown better performance.
     parser.add_argument(
-        "--quantize_mode",
+        "--engine_precision",
         type=str,
         default="stronglyTyped",
-        choices=["fp8", "fp16", "fp32", "int4", "int8", "int8_iq", "bf16", "best", "stronglyTyped"],
-        help="Quantization mode for the TensorRT engine. \
-            Supported options: fp8, fp16, fp32, int8, int8_iq(implicit quantization), bf16, best, stronglyTyped",
+        choices=["best", "fp16", "stronglyTyped"],
+        help="Precision mode for the TensorRT engine. \
+            stronglyTyped is recommended, all other modes have been deprecated in TensorRT",
     )
     parser.add_argument(
         "--results_path", type=str, default=None, help="Save the results to the specified path"
     )

     args = parser.parse_args()
-
     deployment = {
         "runtime": "TRT",
-        "precision": args.quantize_mode,
+        "precision": args.engine_precision,
     }

     # Create an ONNX bytes object with the specified path
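
For context, the renamed option feeds straight into the `deployment` dictionary at the end of the hunk. A minimal sketch of just that wiring (everything outside the diff is omitted; the real `evaluate.py` defines more arguments and hands `deployment` to the TensorRT evaluation path):

```python
import argparse

# Only the renamed flag and the deployment dict from the diff above.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--engine_precision",
    type=str,
    default="stronglyTyped",
    choices=["best", "fp16", "stronglyTyped"],
    help="Precision mode for the TensorRT engine. stronglyTyped is recommended.",
)
args = parser.parse_args([])  # no CLI override -> "stronglyTyped"

deployment = {
    "runtime": "TRT",
    "precision": args.engine_precision,
}
print(deployment)  # {'runtime': 'TRT', 'precision': 'stronglyTyped'}
```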

examples/onnx_ptq/evaluation.py

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@
 deployment = {
     "runtime": "TRT",
     "accelerator": "GPU",
-    "precision": "fp32",
+    "precision": "stronglyTyped",
     "onnx_opset": "21",
 }

examples/onnx_ptq/torch_quant_to_onnx.py

Lines changed: 9 additions & 3 deletions
@@ -83,12 +83,12 @@ def forward_loop(model):
     return quantized_model


-def get_model_input_shape(model_name):
+def get_model_input_shape(model_name, batch_size):
     """Get the input shape from timm model configuration."""
     model = timm.create_model(model_name, pretrained=True, num_classes=1000)
     data_config = timm.data.resolve_model_data_config(model)
     input_size = data_config["input_size"]
-    return (1, *tuple(input_size))  # Add batch dimension
+    return (batch_size, *tuple(input_size))  # Add batch dimension


 def main():
@@ -119,11 +119,17 @@ def main():
         default=512,
         help="Number of images to use in calibration [1-512]",
     )
+    parser.add_argument(
+        "--batch_size",
+        type=int,
+        default=1,
+        help="Batch size for calibration and ONNX model export.",
+    )

     args = parser.parse_args()

     # Get input shape from model config
-    input_shape = get_model_input_shape(args.timm_model_name)
+    input_shape = get_model_input_shape(args.timm_model_name, args.batch_size)

     # Create model and move to appropriate device
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
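
A small usage sketch of the updated helper, assuming `timm` is installed; `vit_base_patch16_224` is used purely as an illustrative model name. The spatial size comes from the timm data config and the new `--batch_size` value is prepended:

```python
import timm

def get_model_input_shape(model_name: str, batch_size: int) -> tuple:
    """Resolve (batch, C, H, W) from the timm model's data config."""
    model = timm.create_model(model_name, pretrained=True, num_classes=1000)
    data_config = timm.data.resolve_model_data_config(model)
    input_size = data_config["input_size"]   # e.g. (3, 224, 224)
    return (batch_size, *tuple(input_size))  # prepend the batch dimension

# e.g. --timm_model_name=vit_base_patch16_224 --batch_size=8  ->  (8, 3, 224, 224)
print(get_model_input_shape("vit_base_patch16_224", 8))
```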

examples/vlm_ptq/README.md

Lines changed: 9 additions & 1 deletion
@@ -41,6 +41,7 @@ Please refer to the [llm_ptq/README.md](../llm_ptq/README.md#current-out-of-the-
 | Llava | llava ||||||
 | VILA | vila ||||||
 | Phi-3-vision | phi ||||||
+| Qwen2.5-VL | qwen ||||||

 > *<sup>1.</sup>The w4a8_awq is an experimental quantization scheme that may result in a higher accuracy penalty.* \
 > *<sup>2.</sup>A selective set of the popular models are internally tested. The actual model support list may be longer. NVFP4 inference requires Blackwell GPUs and TensorRT-LLM v0.17 or later.*
@@ -51,7 +52,7 @@ Please refer to the [llm_ptq/README.md](../llm_ptq/README.md#current-out-of-the-

 Please refer to the [llm_ptq/README.md](../llm_ptq/README.md) about the details of model quantization.

-The following scripts provide an all-in-one and step-by-step model quantization example for Llava, VILA and Phi-3-vision models. The quantization format and the number of GPUs will be supplied as inputs to these scripts. By default, we build the engine for the fp8 format and 1 GPU.
+The following scripts provide an all-in-one and step-by-step model quantization example for Llava, VILA, Phi-3-vision and Qwen2.5-VL models. The quantization format and the number of GPUs will be supplied as inputs to these scripts. By default, we build the engine for the fp8 format and 1 GPU.

 ### Hugging Face Example [Script](./scripts/huggingface_example.sh)

@@ -76,6 +77,13 @@ git clone https://huggingface.co/microsoft/Phi-3-vision-128k-instruct
 scripts/huggingface_example.sh --type phi --model Phi-3-vision-128k-instruct --quant [fp8|int8_sq|int4_awq|w4a8_awq]
 ```

+For [Qwen2.5-VL](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct):
+
+```bash
+git clone https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct
+scripts/huggingface_example.sh --type qwen --model Qwen2.5-VL-7B-Instruct --export_fmt hf --quant [fp8|nvfp4|int8_sq|int4_awq|w4a8_awq]
+```
+
 The example scripts above also have an additional flag `--tasks gqa`, which will trigger evaluation of the built TensorRT engine using the GQA benchmark. Details of the evaluation are explained in this [tutorial](../vlm_eval/README.md).

 If you encounter Out of Memory (OOM) issues during inference or evaluation, you can try lowering the `--kv_cache_free_gpu_memory_fraction` argument (default is 0.8) to reduce GPU memory usage for kv_cache:

examples/vlm_ptq/scripts/huggingface_example.sh

Lines changed: 13 additions & 5 deletions
@@ -30,10 +30,10 @@ for i in $(env | grep ^PMI_ | cut -d"=" -f 1); do unset -v $i; done
 for i in $(env | grep ^PMIX_ | cut -d"=" -f 1); do unset -v $i; done

 case $MODEL_TYPE in
-    llava|phi|vila|mllama)
+    llava|phi|vila|mllama|qwen)
         ;;
     *)
-        echo "Unsupported type argument: Expected one of: [llava, phi, vila, mllama]" >&2
+        echo "Unsupported type argument: Expected one of: [llava, phi, vila, mllama, qwen]" >&2
         exit 1
 esac

@@ -58,10 +58,10 @@ case $SPARSITY_FMT in
 esac

 case $QFORMAT in
-    fp8|int8_sq|int4_awq|w4a8_awq|fp16|bf16)
+    fp8|nvfp4|int8_sq|int4_awq|w4a8_awq|fp16|bf16)
         ;;
     *)
-        echo "Unknown quant argument: Expected one of: [fp8, int8_sq, int4_awq, w4a8_awq, fp16, bf16]" >&2
+        echo "Unknown quant argument: Expected one of: [fp8, nvfp4, int8_sq, int4_awq, w4a8_awq, fp16, bf16]" >&2
         exit 1
 esac

@@ -91,7 +91,7 @@ fi

 BUILD_MAX_OUTPUT_LEN=512

-if [ "$MODEL_TYPE" = "llava" ] || [ "$MODEL_TYPE" = "vila" ]; then
+if [ "$MODEL_TYPE" = "llava" ] || [ "$MODEL_TYPE" = "vila" ] || [ "$MODEL_TYPE" = "qwen" ]; then
     BUILD_MAX_BATCH_SIZE=20
 else
     BUILD_MAX_BATCH_SIZE=4
@@ -149,6 +149,9 @@ case "${MODEL_TYPE}" in
         PTQ_ARGS+=" --kv_cache_qformat none "
         VLM_ARGS=" --max_encoder_input_len=6404 --skip_run"
         ;;
+    "qwen")
+        PTQ_ARGS+=" --kv_cache_qformat none "
+        ;;
 esac

 if [ "${MODEL_TYPE}" = "vila" ]; then
@@ -177,6 +180,7 @@ if [[ $TASKS =~ "build" ]] || [[ ! -d "$ENGINE_DIR" ]] || [[ ! $(ls -A $ENGINE_D
         --inference_tensor_parallel=$TP \
         --inference_pipeline_parallel=$PP \
         --export_fmt=$EXPORT_FORMAT \
+        --no-verbose \
         $PTQ_ARGS
 else
     echo "Quantized model config $MODEL_CONFIG exists, skipping the quantization stage"
@@ -213,6 +217,10 @@ case "${MODEL_TYPE}" in
     "phi")
         VISUAL_MODEL_TYPE="phi-3-vision"
         ;;
+    "qwen")
+        # Map generic type to TRT-LLM multimodal model type
+        VISUAL_MODEL_TYPE="qwen2_vl"
+        ;;
 esac


Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
 datasets>=2.14.5
+onnx==1.18.0
 torch==2.6.0
 transformers==4.49.0

modelopt/onnx/quantization/__main__.py

Lines changed: 4 additions & 6 deletions
@@ -180,11 +180,11 @@ def get_parser() -> argparse.ArgumentParser:
     argparser.add_argument(
         "--high_precision_dtype",
         type=str,
-        default=None,
+        default="fp16",
         choices=["fp32", "fp16", "bf16"],
         help=(
-            "High precision data type, one of ['fp32', 'fp16', 'bf16']. For int8 quantization, the default value is "
-            "'fp32' and 'fp16' for other quantization modes."
+            "High precision data type of the output model. If the input model is of dtype fp32, "
+            "it will be converted to fp16 dtype by default."
         ),
     )
     argparser.add_argument(
@@ -262,8 +262,6 @@ def main():
     # Convert the NpzFile object to a Python dictionary
     calibration_data = {key: calibration_data[key] for key in calibration_data.files}

-    default_high_precision_dtype = "fp32" if args.quantize_mode == "int8" else "fp16"
-
     quantize(
         args.onnx_path,
         quantize_mode=args.quantize_mode,
@@ -284,7 +282,7 @@
         log_file=args.log_file,
         trt_plugins=args.trt_plugins,
         trt_plugins_precision=args.trt_plugins_precision,
-        high_precision_dtype=args.high_precision_dtype or default_high_precision_dtype,
+        high_precision_dtype=args.high_precision_dtype,
         mha_accumulation_dtype=args.mha_accumulation_dtype,
         disable_mha_qdq=args.disable_mha_qdq,
         dq_only=args.dq_only,
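
The net effect of the two hunks in `main()` is that the per-mode fallback disappears and the argparse default does the work. A plain-Python sketch of the old versus new resolution logic (illustrative only, not the modelopt API):

```python
# Old behaviour: --high_precision_dtype defaulted to None and main() chose the
# fallback per quantize mode.
def resolve_old(cli_value, quantize_mode):
    default_high_precision_dtype = "fp32" if quantize_mode == "int8" else "fp16"
    return cli_value or default_high_precision_dtype

# New behaviour: argparse already defaults to "fp16", so the value passes through unchanged.
def resolve_new(cli_value="fp16"):
    return cli_value

print(resolve_old(None, "int8"))  # fp32 -- old int8 default
print(resolve_new())              # fp16 -- new default for every mode
print(resolve_new("fp32"))        # fp32 -- an explicit --high_precision_dtype fp32 still wins
```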
