3 changes: 1 addition & 2 deletions examples/onnx_ptq/README.md
@@ -215,8 +215,7 @@ python -m modelopt.onnx.quantization \
     --quantize_mode=<int8/fp8> \
     --calibration_data=calib.npy \
     --calibrate_per_node \
-    --output_path=vit_base_patch16_224.quant.onnx \
-    --calibration_shapes=input:1x3x224x224
+    --output_path=vit_base_patch16_224.quant.onnx
 ```
 
 > **Note**: Per node calibration is not available for INT4 quantization methods (`awq_clip`, `rtn_dq`)
2 changes: 1 addition & 1 deletion modelopt/onnx/quantization/fp8.py
@@ -164,7 +164,7 @@ def quantize(
     calibration_method: str = "entropy",
     calibration_data_reader: CalibrationDataReader = None,
     calibration_cache_path: str | None = None,
-    calibration_shapes: str | None = None,
+    calibration_shapes: str | dict | None = None,
     calibration_eps: list[str] = ["cpu", "cuda:0", "trt"],
     op_types_to_quantize: list[str] | None = None,
     op_types_to_exclude: list[str] | None = None,
19 changes: 16 additions & 3 deletions modelopt/onnx/quantization/graph_utils.py
@@ -72,6 +72,15 @@ def has_const_input(node: Node) -> bool:
     return any(is_const_input(tensor) for tensor in node.inputs)
 
 
+def get_input_shapes(onnx_path: str) -> dict[str, list[int]]:
+    """Returns the input shapes of the given ONNX model."""
+    onnx_model = onnx.load(onnx_path)
+    input_shape_dict = {}
+    for input in onnx_model.graph.input:
+        input_shape_dict[input.name] = [x.dim_value for x in input.type.tensor_type.shape.dim]
+    return input_shape_dict
+
+
 def has_path_type(
     node: Node,
     graph: Graph,
@@ -923,7 +932,7 @@ def find_nodes_from_matmul_to_exclude(
     intermediate_generated_files: list[str] | None = None,
     calibration_data_reader: CalibrationDataReader = None,
     calibration_eps: list[str] = ["cpu", "cuda:0", "trt"],
-    calibration_shapes: str | None = None,
+    calibration_shapes: str | dict | None = None,
 ) -> list[str]:
     """Find MatMul nodes that meets gemv condition to exclude.
 
@@ -1050,7 +1059,7 @@ def find_nodes_from_convs_to_exclude(graph: Graph, quantize_mode: str = "int8"):
 
 
 def _exclude_matmuls_by_symbolic_inference(
-    model: onnx.ModelProto, matmul_nodes: list, calibration_shapes: str | None = None
+    model: onnx.ModelProto, matmul_nodes: list, calibration_shapes: str | dict | None = None
 ) -> list[str]:
     """Use symbolic shape inference to find MatMuls with dimension 1."""
     # Prepare model for symbolic inference
@@ -1061,7 +1070,11 @@ def _exclude_matmuls_by_symbolic_inference(
             dim.dim_value = 1
 
     # Apply calibration shapes if provided
-    input_shapes = parse_shapes_spec(calibration_shapes) if calibration_shapes else {}
+    input_shapes = (
+        parse_shapes_spec(calibration_shapes)
+        if (calibration_shapes and isinstance(calibration_shapes, str))
+        else {}
+    )
     for graph_input in model.graph.input:
         if graph_input.name in input_shapes:
             input_shape = input_shapes[graph_input.name]
2 changes: 1 addition & 1 deletion modelopt/onnx/quantization/int8.py
@@ -115,7 +115,7 @@ def quantize(
     calibration_method: str = "entropy",
     calibration_data_reader: CalibrationDataReader = None,
     calibration_cache_path: str | None = None,
-    calibration_shapes: str | None = None,
+    calibration_shapes: str | dict | None = None,
     calibration_eps: list[str] = ["cpu", "cuda:0", "trt"],
     op_types_to_quantize: list[str] | None = None,
     op_types_to_exclude: list[str] | None = None,
6 changes: 6 additions & 0 deletions modelopt/onnx/quantization/quantize.py
@@ -53,6 +53,7 @@
 from modelopt.onnx.quantization.graph_utils import (
     cast_custom_ops,
     find_nodes_from_mha_to_exclude,
+    get_input_shapes,
     print_stat,
     remove_redundant_cast_nodes,
     validate_op_types_spelling,
@@ -255,6 +256,8 @@ def quantize(
             Path to pre-calculated activation tensor ranges, also known as calibration cache.
         calibration_shapes:
             Input shapes used for calibration process.
+            It should be provided as a string representing the shape of each input tensors for one calibration step.
+            Example input shapes spec: input0:1x3x256x256,input1:1x3x128x128
         calibration_eps:
             Priority order for the execution providers (EP) to calibrate the model.
             Any subset of ['NvTensorRtRtx', 'trt', 'cuda:x', 'dml:x', 'cpu'], where 'x' is the device id.
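
For reference, the spec format documented in the added docstring lines maps each input name to an "x"-separated list of dimensions. A minimal sketch of how such a string could be turned into a dict (the repository's own `parse_shapes_spec` helper may handle edge cases differently):

```python
def parse_calibration_shapes(spec: str) -> dict[str, list[int]]:
    """Parse 'input0:1x3x256x256,input1:1x3x128x128' into {name: [dims]}."""
    shapes: dict[str, list[int]] = {}
    for entry in spec.split(","):
        name, _, dims = entry.rpartition(":")
        shapes[name] = [int(d) for d in dims.split("x")]
    return shapes


# {'input0': [1, 3, 256, 256], 'input1': [1, 3, 128, 128]}
print(parse_calibration_shapes("input0:1x3x256x256,input1:1x3x128x128"))
```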
@@ -467,6 +470,9 @@ def quantize(
         calibration_eps,
     )
 
+    if not calibration_shapes:
+        calibration_shapes = get_input_shapes(onnx_path)
+
Comment on lines +473 to +475

@coderabbitai coderabbitai bot Sep 30, 2025

⚠️ Potential issue | 🟠 Major

Guard against zero/unknown dims when auto-populating calibration shapes

graph_utils.get_input_shapes returns 0 for any dynamic axis (the common “batch” dim). Feeding that dict directly into the calibration readers produces zero-sized batches and the per-node calibrator errors out (“need at least one calibration sample”). We need to verify every inferred dimension is >0 (or fallback to a safe default) before adopting the auto-generated shapes; otherwise force the caller to supply explicit shapes.

🤖 Prompt for AI Agents
In modelopt/onnx/quantization/quantize.py around lines 473-475, the
auto-populated calibration_shapes from get_input_shapes may contain zero values
for dynamic axes (e.g., batch), which leads to zero-sized batches; validate
every inferred dimension and if any dimension is <= 0 either replace it with a
safe default (e.g., 1) and emit a warning or raise a clear ValueError forcing
the caller to provide explicit shapes; implement the check immediately after
calling get_input_shapes, update calibration_shapes with the fixed dimensions,
and log/raise appropriately so downstream calibration readers never receive
zero-sized dimensions.
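
As a concrete illustration of the guard being requested, a sketch along these lines would default any non-positive (dynamic) dimension before the shapes reach the calibration readers; the helper name and the choice of 1 as the default are assumptions, not the repository's actual fix:

```python
import logging

import onnx


def get_static_input_shapes(onnx_path: str, default_dim: int = 1) -> dict[str, list[int]]:
    """Like get_input_shapes, but replaces dynamic axes (dim_value == 0) with a safe default."""
    model = onnx.load(onnx_path)
    shapes: dict[str, list[int]] = {}
    for graph_input in model.graph.input:
        dims = []
        for dim in graph_input.type.tensor_type.shape.dim:
            if dim.dim_value > 0:
                dims.append(dim.dim_value)
            else:
                # Symbolic dims (e.g. a dynamic batch axis) report dim_value == 0.
                logging.warning(
                    "Input '%s' has a dynamic dim; defaulting it to %d for calibration.",
                    graph_input.name,
                    default_dim,
                )
                dims.append(default_dim)
        shapes[graph_input.name] = dims
    return shapes
```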

@gcunhase (Contributor) Oct 2, 2025

@ajrasane are there any risks with always setting calibration_shapes or should we only set it if --calibrate_per_node is given?

@coderabbitai coderabbitai bot

Seems like the humans are having a chat. I'll hop back into my burrow for now. If you need me again, just tag @coderabbitai in a new comment, and I'll come hopping out!

Contributor


Also, need to check what is the shape for dynamic shapes: 1 or 0, as indicated by the CodeRabbit comment above.

     if quantize_mode in ["fp8", "int8"]:
         quantize_func = quantize_int8 if quantize_mode == "int8" else quantize_fp8
         onnx_model = quantize_func(
3 changes: 3 additions & 0 deletions modelopt/onnx/trt_utils.py
@@ -266,6 +266,9 @@ def load_onnx_model(
     custom_ops = []
     has_custom_op = False
 
+    # Infer shapes
+    onnx.shape_inference.infer_shapes_path(onnx_path)
+
Comment on lines +269 to +271

@coderabbitai coderabbitai bot Sep 30, 2025


⚠️ Potential issue | 🔴 Critical

Do not call infer_shapes_path on large/external-data models

onnx.shape_inference.infer_shapes_path rewrites the model in-place using onnx.save_model without save_as_external_data=True. For the >2 GB ONNX graphs we routinely quantize (Llama, ViT-H, etc.), this raises ValueError: Message onnx.ModelProto exceeds maximum protobuf size of 2GB and stops the run—this worked before the change. Please switch to running shape inference on an already loaded ModelProto (e.g., onnx.shape_inference.infer_shapes(onnx_model)) and persist it via our existing save_onnx, which already handles external data formats, instead of infer_shapes_path.

🤖 Prompt for AI Agents
In modelopt/onnx/trt_utils.py around lines 269 to 271, do not call
onnx.shape_inference.infer_shapes_path(onnx_path) because it rewrites and saves
the model without external-data support and will fail for >2GB models; instead
load the ModelProto first, run onnx.shape_inference.infer_shapes(onnx_model) on
that in-memory proto, then persist the result using our existing save_onnx
function (which handles external data) so shape inference is applied but saving
uses the external-data-aware saver.
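
For context, the flow the bot is proposing looks roughly like the sketch below; `save_onnx` in the comment refers to the repository's existing external-data-aware saver, so the `onnx.save_model` call here is only a stand-in for it. Note that the author disputes the premise in a reply further down:

```python
import onnx


def infer_shapes_in_memory(onnx_path: str, output_path: str) -> onnx.ModelProto:
    """Run shape inference on a loaded ModelProto and persist it with external data."""
    model = onnx.load(onnx_path, load_external_data=True)
    model = onnx.shape_inference.infer_shapes(model)
    # Stand-in for the repo's save_onnx helper: write tensors as external data so
    # saving does not hit the 2GB protobuf limit.
    onnx.save_model(
        model,
        output_path,
        save_as_external_data=True,
        all_tensors_to_one_file=True,
    )
    return model
```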

Contributor


@ajrasane look into this suggestion

@coderabbitai coderabbitai bot

Seems like the humans are having a chat. I'll hop back into my burrow for now. If you need me again, just tag @coderabbitai in a new comment, and I'll come hopping out!

Contributor Author


This comment is incorrect. The infer_shapes_path API is specifically designed to handle models larger than 2GB. It should also be able to handle smaller models if we pass their path: https://onnx.ai/onnx/api/shape_inference.html#infer-shapes-path
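
For reference, the path-based API the author cites operates on files rather than an in-memory proto, which is how it avoids the 2GB serialization limit; a minimal usage sketch (the paths here are illustrative):

```python
import onnx

# Runs shape inference on the serialized model at the given path and writes the
# inferred model to output_path (an empty output_path overwrites the input file).
onnx.shape_inference.infer_shapes_path(
    "model.onnx",
    output_path="model.inferred.onnx",
)
```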

     # Load the model and weights
     onnx_model = onnx.load(onnx_path, load_external_data=True)
     size_threshold = 2 * (1024**3)  # 2GB