
Commit 043d2ce

Merge branch 'main' into rislam/qwen-fix
2 parents 0bd0218 + b913290

File tree: 23 files changed (+184, -89 lines)


CHANGELOG.rst

Lines changed: 2 additions & 0 deletions
@@ -5,10 +5,12 @@ Model Optimizer Changelog (Linux)
 ^^^^^^^^^^^^^^^^^

 **Deprecations**
+- Deprecated the ``quantize_mode`` argument in ``examples/onnx_ptq/evaluate.py`` in favor of strong typing. Use ``engine_precision`` instead.

 **Bug Fixes**

 **New Features**
+- ``high_precision_dtype`` now defaults to fp16 in ONNX quantization, i.e. quantized output model weights are FP16 by default.

 0.35 (2025-09-04)
 ^^^^^^^^^^^^^^^^^

examples/onnx_ptq/README.md

Lines changed: 2 additions & 2 deletions
@@ -120,7 +120,7 @@ The following evaluation requires the `val` directory of the [ImageNet dataset](
 python evaluate.py \
     --onnx_path=<path to classification model> \
     --imagenet_path=<path to the ImageNet dataset> \
-    --quantize_mode=<fp8|int8|int4> \
+    --engine_precision=stronglyTyped \
     --model_name=vit_base_patch16_224
 ```

@@ -165,7 +165,7 @@ If the input model is of type image classification, use the following script to
 python evaluate.py \
     --onnx_path=<path to the exported ONNX model> \
     --imagenet_path=<path to the ImageNet dataset> \
-    --quantize_mode=stronglyTyped \
+    --engine_precision=stronglyTyped \
     --model_name=vit_base_patch16_224
 ```

examples/onnx_ptq/docker/Dockerfile

Lines changed: 5 additions & 4 deletions
@@ -12,10 +12,11 @@ RUN python -m pip install --upgrade pip \

 WORKDIR /workspace

-RUN pip install tensorrt==10.13.2.6 && \
-    export TRT_PATH=$(python -c "import tensorrt; import os; print(os.path.dirname(tensorrt.__file__))") && \
-    export LD_LIBRARY_PATH="$TRT_PATH/lib:/usr/include:${LD_LIBRARY_PATH}" && \
-    export PATH="$TRT_PATH/bin:${PATH}"
+RUN pip install tensorrt==10.13.2.6
+ENV TRT_PATH=/usr/local/lib/python3.12/dist-packages/tensorrt
+ENV CUDNN_LIB_DIR=/usr/local/lib/python3.12/dist-packages/nvidia/cudnn/lib
+ENV LD_LIBRARY_PATH="${CUDNN_LIB_DIR}:${TRT_PATH}/lib:/usr/include:${LD_LIBRARY_PATH}"
+ENV PATH="${TRT_PATH}/bin:${PATH}"

 # Copy application code and install requirements
 COPY modelopt modelopt/modelopt
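
A note on this change: `export` statements inside a `RUN` instruction only affect that single build layer, whereas `ENV` values persist into every later layer and into the running container. A minimal sanity-check sketch, not part of this commit, that one might run inside the built image to confirm the paths resolve (how the image is invoked is an assumption):

```python
# check_trt_env.py - hedged sanity check, not part of this commit.
# Run inside the built container to confirm the ENV-based paths are visible.
import os

import tensorrt as trt

print(trt.__version__)  # expected to match the pinned 10.13.2.6
print(os.environ.get("TRT_PATH"))  # /usr/local/lib/python3.12/dist-packages/tensorrt
print("cudnn/lib" in os.environ.get("LD_LIBRARY_PATH", ""))  # True if the cuDNN libs are on the path
```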

examples/onnx_ptq/evaluate.py

Lines changed: 5 additions & 12 deletions
@@ -48,29 +48,22 @@ def main():
     parser.add_argument(
         "--eval_data_size", type=int, default=None, help="Number of examples to evaluate"
     )
-    # By default, TensorRT autotunes tensor types to generate the fastest engine. When you specify
-    # to TensorRT that a network is strongly typed, it infers a type for each intermediate and
-    # output tensor using the rules in the operator type specification. For networks quantized in
-    # INT4 or FP8 mode, stronglyTyped as the mode is recommended for TensorRT deployment. Though
-    # INT8 networks are generally compiled with int8 mode, certain INT8 ViT networks compiled with
-    # stronglyTyped precision have shown better performance.
     parser.add_argument(
-        "--quantize_mode",
+        "--engine_precision",
         type=str,
         default="stronglyTyped",
-        choices=["fp8", "fp16", "fp32", "int4", "int8", "int8_iq", "bf16", "best", "stronglyTyped"],
-        help="Quantization mode for the TensorRT engine. \
-            Supported options: fp8, fp16, fp32, int8, int8_iq(implicit quantization), bf16, best, stronglyTyped",
+        choices=["best", "fp16", "stronglyTyped"],
+        help="Precision mode for the TensorRT engine. \
+            stronglyTyped is recommended, all other modes have been deprecated in TensorRT",
     )
     parser.add_argument(
         "--results_path", type=str, default=None, help="Save the results to the specified path"
     )

     args = parser.parse_args()
-
     deployment = {
         "runtime": "TRT",
-        "precision": args.quantize_mode,
+        "precision": args.engine_precision,
     }

     # Create an ONNX bytes object with the specified path

examples/onnx_ptq/evaluation.py

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@
 deployment = {
     "runtime": "TRT",
     "accelerator": "GPU",
-    "precision": "fp32",
+    "precision": "stronglyTyped",
     "onnx_opset": "21",
 }

examples/onnx_ptq/torch_quant_to_onnx.py

Lines changed: 9 additions & 3 deletions
@@ -83,12 +83,12 @@ def forward_loop(model):
     return quantized_model


-def get_model_input_shape(model_name):
+def get_model_input_shape(model_name, batch_size):
     """Get the input shape from timm model configuration."""
     model = timm.create_model(model_name, pretrained=True, num_classes=1000)
     data_config = timm.data.resolve_model_data_config(model)
     input_size = data_config["input_size"]
-    return (1, *tuple(input_size))  # Add batch dimension
+    return (batch_size, *tuple(input_size))  # Add batch dimension


 def main():

@@ -119,11 +119,17 @@ def main():
         default=512,
         help="Number of images to use in calibration [1-512]",
     )
+    parser.add_argument(
+        "--batch_size",
+        type=int,
+        default=1,
+        help="Batch size for calibration and ONNX model export.",
+    )

     args = parser.parse_args()

     # Get input shape from model config
-    input_shape = get_model_input_shape(args.timm_model_name)
+    input_shape = get_model_input_shape(args.timm_model_name, args.batch_size)

     # Create model and move to appropriate device
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
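
For context, the updated helper simply threads the new `--batch_size` argument into the leading dimension of the timm-reported input size. A standalone sketch of the same logic, using the same timm calls as the diff (the model name is only an example, and `pretrained=False` is used here to avoid a weight download):

```python
# Hedged sketch mirroring get_model_input_shape() above; not part of the commit.
import timm


def input_shape_for(model_name: str, batch_size: int) -> tuple:
    model = timm.create_model(model_name, pretrained=False, num_classes=1000)
    data_config = timm.data.resolve_model_data_config(model)
    return (batch_size, *tuple(data_config["input_size"]))  # batch dim comes from the CLI now


print(input_shape_for("vit_base_patch16_224", 4))  # expected: (4, 3, 224, 224)
```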
Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
 datasets>=2.14.5
+onnx==1.18.0
 torch==2.6.0
 transformers==4.49.0

modelopt/onnx/quantization/__main__.py

Lines changed: 4 additions & 6 deletions
@@ -180,11 +180,11 @@ def get_parser() -> argparse.ArgumentParser:
     argparser.add_argument(
         "--high_precision_dtype",
         type=str,
-        default=None,
+        default="fp16",
         choices=["fp32", "fp16", "bf16"],
         help=(
-            "High precision data type, one of ['fp32', 'fp16', 'bf16']. For int8 quantization, the default value is "
-            "'fp32' and 'fp16' for other quantization modes."
+            "High precision data type of the output model. If the input model is of dtype fp32, "
+            "it will be converted to fp16 dtype by default."
         ),
     )
     argparser.add_argument(

@@ -262,8 +262,6 @@ def main():
     # Convert the NpzFile object to a Python dictionary
     calibration_data = {key: calibration_data[key] for key in calibration_data.files}

-    default_high_precision_dtype = "fp32" if args.quantize_mode == "int8" else "fp16"
-
     quantize(
         args.onnx_path,
         quantize_mode=args.quantize_mode,

@@ -284,7 +282,7 @@ def main():
         log_file=args.log_file,
         trt_plugins=args.trt_plugins,
         trt_plugins_precision=args.trt_plugins_precision,
-        high_precision_dtype=args.high_precision_dtype or default_high_precision_dtype,
+        high_precision_dtype=args.high_precision_dtype,
         mha_accumulation_dtype=args.mha_accumulation_dtype,
         disable_mha_qdq=args.disable_mha_qdq,
         dq_only=args.dq_only,
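
The practical effect of this change: `--high_precision_dtype` now defaults to fp16 for every quantize mode, where int8 previously fell back to fp32. A hedged sketch of the Python-side equivalent, assuming the `quantize` called in `__main__.py` is the `modelopt.onnx.quantization.quantize` entry point (the path below is a placeholder):

```python
# Hedged sketch, not part of the commit: keep the old int8 behavior by passing
# high_precision_dtype explicitly, since the implicit fp32 fallback is gone.
from modelopt.onnx.quantization import quantize  # assumed import path for the CLI's entry point

quantize(
    "model.onnx",                 # placeholder path
    quantize_mode="int8",
    high_precision_dtype="fp32",  # new default is "fp16"; set "fp32" to match pre-change output
)
```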

modelopt/onnx/quantization/int8.py

Lines changed: 1 addition & 1 deletion
@@ -124,7 +124,7 @@ def quantize(
     use_external_data_format: bool = False,
     intermediate_generated_files: list[str] = [],
     trt_extra_plugin_lib_paths: list[str] | None = None,
-    high_precision_dtype: str = "fp32",
+    high_precision_dtype: str = "fp16",
     passes: list[str] = ["concat_elimination"],
     log_level: str = "INFO",
     calibrate_per_node: bool = False,

modelopt/onnx/quantization/qdq_utils.py

Lines changed: 71 additions & 14 deletions
@@ -23,7 +23,6 @@
 import onnx_graphsurgeon as gs
 import torch
 from onnx import numpy_helper
-from onnx.reference.custom_element_types import float8e4m3fn

 from modelopt.onnx import utils
 from modelopt.onnx.logging_config import logger

@@ -50,6 +49,7 @@
 onnx_dtype_map = {
     "BFloat16": onnx.TensorProto.BFLOAT16,
     "Float": onnx.TensorProto.FLOAT,
+    "Float4": onnx.TensorProto.FLOAT4E2M1,
     "Float8": onnx.TensorProto.FLOAT8E4M3FN,
     "Half": onnx.TensorProto.FLOAT16,
     "INT8": onnx.TensorProto.INT8,

@@ -529,6 +529,11 @@ def _get_successive_consumers(
     quantized_node = tensor_consumers.get(dq_node.output[0], [None])[0]
     if not quantized_node:
         raise ValueError(f"No consumer found for {dq_node.name}")
+    if quantized_node.op_type == "Cast":
+        next_node = tensor_consumers.get(quantized_node.output[0], [None])[0]
+        if not next_node:
+            raise ValueError(f"No consumer found after Cast for {quantized_node.name}")
+        quantized_node = next_node

     return dq_node, quantized_node
@@ -592,7 +597,7 @@ def _convert_weight(
         zp_array = zp_array.reshape(*reshape_dims)

     # Convert to INT8/FP8
-    if zp_array.dtype == float8e4m3fn:
+    if zp_array.dtype == onnx_dtype_map["Float8"]:
         scaled = np.asarray(weight_array / scale_array) + zp_array
     else:
         scaled = np.asarray((weight_array / scale_array).round())
@@ -607,17 +612,26 @@ def _cast_fp8(array: np.ndarray) -> np.ndarray:
     if torch.cuda.is_available():
         array_f32_t = array_f32_t.cuda()
     array_f8_t = array_f32_t.clamp(min=-448, max=448).to(torch.float8_e4m3fn).view(torch.uint8)
-    array_f8 = array_f8_t.cpu().numpy().astype((np.uint8, [("e4m3fn", "u1")]))
+    array_f8 = array_f8_t.cpu().numpy().astype(np.uint8)
     return array_f8


 def _cast_fp4(array: np.ndarray) -> np.ndarray:
-    """Cast a numpy array to FLOAT4E2M1 using PyTorch."""
+    """Cast a numpy array to FLOAT4E2M1 using PyTorch.
+
+    Note: The first dimension of the array must be divisible by 2
+    as two FP4 values are packed into a single byte.
+    """
     array_f32_t = torch.from_numpy(array)
+    array_f32_t_shape = array_f32_t.shape
+    assert array_f32_t_shape[0] % 2 == 0, "array_f32_t_shape[0] must be divisible by 2"
+    array_f4_t_shape = (array_f32_t_shape[0] // 2, *array_f32_t_shape[1:])
     if torch.cuda.is_available():
         array_f32_t = array_f32_t.cuda()
     array_f4_t = NVFP4QTensor._cast_fp4(array_f32_t)
-    array_f4 = array_f4_t.cpu().numpy().astype((np.uint8, [("float4e2m1", "u1")]))
+    array_f4_t = array_f4_t.flatten()
+    array_f4_t_packed = (array_f4_t[::2] | (array_f4_t[1::2] << 4)).reshape(array_f4_t_shape)
+    array_f4 = array_f4_t_packed.cpu().numpy().astype(np.uint8)
     return array_f4
@@ -685,7 +699,7 @@ def qdq_to_dq(onnx_model: onnx.ModelProto) -> onnx.ModelProto:
     scaled = _convert_weight(weight_array, scale_array, zp_array, quantized_node)

     # Create and update new weight tensor
-    if zp_array.dtype == float8e4m3fn:
+    if zp_array.dtype == onnx_dtype_map["Float8"]:
         new_weight = _create_fp8_tensor(scaled, weight_name)
         logger.debug(f"Converted {weight_name} to FP8")
     else:

@@ -925,6 +939,10 @@ def quantize_weights_to_int4(
     assert reshape_node.op_type == "Reshape", f"Expected Reshape node for {node.name}"
     reshape_node_output = reshape_node.output[0]

+    # Remove constant node from reshape node
+    shape_constant_name = next(input for input in reshape_node.input if "Constant" in input)
+    nodes_to_remove.append(tensor_producer_map[shape_constant_name].name)
+
     # Get the shape of the output of the reshape node
     reshape_output_value_info = value_info_map.get(reshape_node_output)
     if reshape_output_value_info is not None:

@@ -942,12 +960,17 @@
     scale_shape = [*weight_shape[:-1], weight_shape[-1] // block_size]
     scale = scale.reshape(scale_shape)
     reshape_child_nodes = [n for n in graph.node if reshape_node.output[0] in n.input]
-    # reshape_node.input = []
     assert len(reshape_child_nodes) == 1, f"Expected exactly one transpose node for {node.name}"

+    # Remove unnecessary Cast node
+    cast_node = reshape_child_nodes[0]
+    assert cast_node.op_type == "Cast", f"Expected Cast node for {node.name}"
+    nodes_to_remove.append(cast_node.name)
+    cast_child_nodes = [n for n in graph.node if cast_node.output[0] in n.input]
+
     # Transpose weights and scales if present
-    if reshape_child_nodes[0].op_type == "Transpose":
-        transpose_node = reshape_child_nodes[0]
+    if cast_child_nodes[0].op_type == "Transpose":
+        transpose_node = cast_child_nodes[0]
         nodes_to_remove.append(transpose_node.name)
         assert transpose_node.op_type == "Transpose", f"Expected Transpose node for {node.name}"
         perm = None

@@ -964,7 +987,7 @@
             )
             matmul_node = transpose_child_nodes[0]
         else:
-            matmul_node = reshape_child_nodes[0]
+            matmul_node = cast_child_nodes[0]
         assert matmul_node.op_type in ["MatMul", "Gemm"], (
             f"Expected MatMul or Gemm node for {node.name}"
         )
@@ -995,9 +1018,24 @@
         initializer_map[weight_name].CopyFrom(weights_int4_onnx)
         logger.debug(f"Converted {weight_name} to INT4 precision")

+    def is_pre_quant_scale_node(node: onnx.NodeProto) -> bool:
+        has_pqs_input = any(input for input in node.input if "_pre_quant_scale" in input)
+        return node.op_type == "Mul" and has_pqs_input
+
+    # Remove unnecessary Cast after pre-quant scale
+    for node in graph.node:
+        if is_pre_quant_scale_node(node):
+            pqs_child_nodes = [n for n in graph.node if node.output[0] in n.input]
+            assert len(pqs_child_nodes) == 1, f"Expected exactly one child node for {node.name}"
+            cast_node = pqs_child_nodes[0]
+            assert cast_node.op_type == "Cast", f"Expected Cast node for {node.name}"
+            node.output.clear()
+            node.output.extend(cast_node.output)
+            nodes_to_remove.append(cast_node.name)
+
     # Remove transpose and reshape nodes
     new_nodes = [node for node in graph.node if node.name not in nodes_to_remove]
-    graph.node.clear()
+    del graph.node[:]
     graph.node.extend(new_nodes)

     def is_fp32_cast(node: onnx.NodeProto) -> bool:
@@ -1009,7 +1047,7 @@ def is_fp32_cast(node: onnx.NodeProto) -> bool:
     for node in graph.node:
         if node.op_type == "Cast":
             # Skip Cast nodes that are part of normalization layers and outputs
-            if ("norm/Cast" in node.name and is_fp32_cast(node)) or node.name == "/Cast":
+            if "norm/Cast" in node.name and is_fp32_cast(node):
                 continue
             for attr in node.attribute:
                 if attr.name == "to" and attr.i == onnx.TensorProto.FLOAT:
@@ -1104,7 +1142,13 @@ def quantize_weights_to_mxfp8(
     # Expand block array so that it can be broadcasted with weight
     se8m0_fp32 = np.repeat(se8m0_fp32, block_size, axis=quant_axis)
     scaled_weight = weight / np.exp2(se8m0_fp32 - e8_m0_bias)
-    weights_e4m3 = onnx.numpy_helper.from_array(_cast_fp8(scaled_weight), weight_name)
+    weights_e4m3 = onnx.helper.make_tensor(
+        name=weight_name,
+        data_type=onnx_dtype_map["Float8"],
+        dims=[*scaled_weight.shape],
+        vals=_cast_fp8(scaled_weight).tobytes(),
+        raw=True,
+    )
     initializer_map[weight_name].CopyFrom(weights_e4m3)
     logger.debug(f"Converted {weight_name} to MXFP8")

@@ -1186,11 +1230,24 @@ def _add_input_value_info(graph, tensor_proto):
     sw_f32_per_tensor_name = sw_f8_per_block_name + "_f32_scale"

     # Create TensorProto for initializers
-    w_f4_proto = onnx.numpy_helper.from_array(w_f4, w_f4_name)
+    w_f4_proto = onnx.helper.make_tensor(
+        name=w_f4_name,
+        data_type=onnx_dtype_map["Float4"],
+        dims=[w_f4.shape[0] * 2, *w_f4.shape[1:]],
+        vals=w_f4.tobytes(),
+        raw=True,
+    )
     sw_f32_per_tensor_proto = onnx.numpy_helper.from_array(
         sw_f32_per_tensor, sw_f32_per_tensor_name
     )
     sw_f8_per_block_proto = onnx.numpy_helper.from_array(sw_f8_per_block, sw_f8_per_block_name)
+    sw_f8_per_block_proto = onnx.helper.make_tensor(
+        name=sw_f8_per_block_name,
+        data_type=onnx_dtype_map["Float8"],
+        dims=[*sw_f8_per_block.shape],
+        vals=sw_f8_per_block.tobytes(),
+        raw=True,
+    )

     # Add ValueInfo for the initializers if not present
     _add_input_value_info(graph, w_f4_proto)
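
Both hunks switch from `onnx.numpy_helper.from_array`, which infers the data type from the numpy dtype and so cannot express FP8/FP4 payloads stored as plain uint8, to `onnx.helper.make_tensor` with raw bytes and an explicit `data_type`. A hedged, self-contained sketch of that pattern (the tensor name and byte values below are made up):

```python
# Hedged sketch of building an 8-bit-float initializer from raw bytes.
import numpy as np
import onnx
from onnx import helper

payload = np.array([0x3C, 0x40, 0x44, 0x48], dtype=np.uint8)  # pretend these are FP8E4M3 codes
fp8_init = helper.make_tensor(
    name="example_weight_fp8",  # hypothetical initializer name
    data_type=onnx.TensorProto.FLOAT8E4M3FN,
    dims=list(payload.shape),   # dims describe logical elements, one byte per element for FP8
    vals=payload.tobytes(),
    raw=True,
)
print(fp8_init.data_type == onnx.TensorProto.FLOAT8E4M3FN, len(fp8_init.raw_data))  # True 4
```

For FLOAT4E2M1 the dims describe logical FP4 elements while the raw payload holds two elements per byte, which is why the hunk above multiplies `w_f4.shape[0]` by 2.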
