@@ -36,7 +36,9 @@
 from modelopt.onnx.op_types import is_fusible_scaling_op
 from modelopt.onnx.quantization.calib_utils import RandomDataProvider
 from modelopt.onnx.quantization.graph_utils import (
-    _find_quantizable_weights,
+    _find_int4_quantizable_weights as _find_quantizable_weights,
+)
+from modelopt.onnx.quantization.graph_utils import (
     expand_node_names_from_patterns,
     get_precision_info,
     get_tensor_consumer_nodes,
@@ -50,9 +52,9 @@
     find_scales,
     get_num_bits,
     quant_tensor,
+    reshape_scales_for_per_channel_nodes,
     rtn,
     update_block_size,
-    update_scale_map_for_per_channel_nodes,
 )
 from modelopt.onnx.utils import save_onnx
 
@@ -121,6 +123,7 @@ def _quantize_gather_nodes(
                 continue
             name = in_tensor.name
             w = in_tensor.values
+            # Update the block size: for 8-bit quantization, per-channel quantization is used.
             num_bits = get_num_bits(precision_info, name)
             block_size_updated = update_block_size(
                 num_bits, block_size, w=w, quantize_axis=gather_quantize_axis
@@ -170,7 +173,7 @@ def _quantize_gather_nodes(
         )
     else:
         logger.info("Found 0 Gather nodes to quantize")
-    scales_map = update_scale_map_for_per_channel_nodes(scales_map, block_size, precision_info)
+    scales_map = reshape_scales_for_per_channel_nodes(scales_map, block_size, precision_info)
     return weights_map, scales_map, zero_point_map
 
 
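The update_block_size calls that this change threads through each code path implement the per-channel fallback described in the repeated comment. A minimal sketch of the intended behavior, assuming (hypothetically) that for 8-bit weights the block collapses to one block spanning the quantize axis while 4-bit keeps the configured block size; the real helper lives alongside find_scales/quant_tensor and may differ in detail:

import numpy as np

def update_block_size_sketch(num_bits, block_size, w, quantize_axis=0):
    # Hypothetical: 8-bit weights use per-channel quantization, so one
    # block covers the whole quantize axis, overriding the configured
    # 4-bit block size.
    if num_bits == 8:
        return w.shape[quantize_axis]
    return block_size

w = np.zeros((4096, 11008), dtype=np.float32)
assert update_block_size_sketch(4, 128, w) == 128   # 4-bit: block-wise
assert update_block_size_sketch(8, 128, w) == 4096  # 8-bit: per-channel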
@@ -221,6 +224,7 @@ def quantize_rtn(
     precision_info = get_precision_info(onnx_model, nodes_to_exclude, **kwargs)
     for name, w in gemm_weights.items():
         logger.debug(f"Computing scales for weight {name} of shape {w.shape}")
+        # Update the block size: for 8-bit quantization, per-channel quantization is used.
         num_bits = get_num_bits(precision_info, name)
         block_size_updated = update_block_size(num_bits, block_size, w=w)
         s, zp = find_scales(np.asarray(w), block_size_updated, num_bits=num_bits)
@@ -258,14 +262,15 @@ def quantize_rtn(
     gemm_weights_quantized = {}
     for name, w in gemm_weights.items():
         logger.debug(f"Quantizing weight {name}")
+        # Update the block size: for 8-bit quantization, per-channel quantization is used.
         num_bits = get_num_bits(precision_info, name)
         block_size_updated = update_block_size(num_bits, block_size, w=w)
         qw = rtn(np.asarray(w), scales[name], block_size_updated, num_bits=num_bits)
         if has_cupy:
             qw = np.asnumpy(qw)
             scales[name] = np.asnumpy(scales[name])
         gemm_weights_quantized[name] = numpy.asarray(qw)
-    scales = update_scale_map_for_per_channel_nodes(scales, block_size, precision_info)
+    scales = reshape_scales_for_per_channel_nodes(scales, block_size, precision_info)
     qdq.insert_dq_nodes(
         graph,
         scales,
@@ -285,7 +290,7 @@ def quantize_rtn(
         if has_cupy:
             for name in scales:
                 scales[name] = np.asnumpy(scales[name])
-        scales = update_scale_map_for_per_channel_nodes(scales, block_size, precision_info)
+        scales = reshape_scales_for_per_channel_nodes(scales, block_size, precision_info)
         qdq.insert_qdq_nodes(graph, scales, weight_map=gemm_tensors, precision_info=precision_info)
         if gather_w_map is not None:
             assert gather_s_map is not None, "scale-map not found for quantizable gather nodes"
@@ -497,6 +502,7 @@ def _quantize_awq_clip(
             w = w.T
         w = np.asarray(w)
         num_bits = get_num_bits(precision_info, weight_tensor.name)
+        # Update the block size: for 8-bit quantization, per-channel quantization is used.
         block_size_updated = update_block_size(num_bits, block_size, w=w)
         awq_clip = AWQClipHelper(w, block_size_updated, **kwargs)
         _clip_search(x, w, awq_clip, num_bits=num_bits, **kwargs)
@@ -524,7 +530,9 @@ def _quantize_awq_clip(
 
         alpha = alphas.get(weight_tensor.name, 1)
         num_bits = get_num_bits(precision_info, weight_tensor.name)
-        qw, scale, _ = quant_tensor(w, block_size, alpha=alpha, num_bits=num_bits)
+        # Update the block size: for 8-bit quantization, per-channel quantization is used.
+        block_size_updated = update_block_size(num_bits, block_size, w=w)
+        qw, scale, _ = quant_tensor(w, block_size_updated, alpha=alpha, num_bits=num_bits)
         if has_cupy:
             qw = np.asnumpy(qw)
             scale = np.asnumpy(scale)
@@ -561,7 +569,7 @@ def _quantize_awq_clip(
 
     t = time.time()
     dq_node_attributes = {"axis": 0, "block_size": block_size}
-    scales = update_scale_map_for_per_channel_nodes(scales, block_size, precision_info)
+    scales = reshape_scales_for_per_channel_nodes(scales, block_size, precision_info)
     qdq.insert_dq_nodes(
         graph_gs,
         scales,
@@ -716,6 +724,7 @@ def run_awq_scale_search_per_node(
             x = np.concatenate(output_dicts[act_tensor.name], axis=0).reshape(
                 (-1, w.shape[0])
             )  # n_token, ci
+            # Update the block size: for 8-bit quantization, per-channel quantization is used.
             num_bits = get_num_bits(precision_info, weight_tensor.name)
             block_size_updated = update_block_size(num_bits, block_size, w=w)
             awq_lite[i] = AWQLiteHelper(x, w, block_size_updated, **kwargs)
@@ -1129,6 +1138,7 @@ def _quantize_awq_lite(
         assert enable_weight_clipping or (alpha == 1), (
             "clip range enabled without enabling weight-clipping param"
         )
+        # Update the block size: for 8-bit quantization, per-channel quantization is used.
         num_bits = get_num_bits(precision_info, weight_tensor.name)
         block_size_updated = update_block_size(num_bits, block_size, w=w_scaled)
         qw, scale, zp = quant_tensor(
@@ -1262,7 +1272,7 @@ def _quantize_awq_lite(
 
     t = time.time()
     dq_node_attributes = {"axis": 0, "block_size": block_size}
-    scales = update_scale_map_for_per_channel_nodes(scales, block_size, precision_info)
+    scales = reshape_scales_for_per_channel_nodes(scales, block_size, precision_info)
     qdq.insert_dq_nodes(
         graph_gs,
         scales,
@@ -1371,7 +1381,7 @@ def quantize(
             Default: 32.
         - **enable_mixed_quant** (bool): If True, enable mixed quantization.
             Default: False.
-        - **int8_layers** (str): comma-separated list of layer patterns to quantize to INT8 instead of INT4.
+        - **layers_8bit** (str): comma-separated list of layer patterns to quantize to INT8 instead of INT4.
             Default: [].
     **Returns**: A quantized ONNX model in ONNX ModelProto format.
     """
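A minimal usage sketch for the renamed layers_8bit kwarg; the import path, the positional model argument, and the pattern strings are assumptions for illustration, not the documented API:

import onnx
from modelopt.onnx.quantization.int4 import quantize  # assumed module path

model = onnx.load("model.onnx")
# Mixed INT4/INT8 quantization: layers matching the comma-separated
# patterns in layers_8bit stay at 8 bits; everything else goes to 4 bits.
quantized = quantize(
    model,
    block_size=32,                   # documented default
    enable_mixed_quant=True,
    layers_8bit="lm_head,layers.0",  # hypothetical layer patterns
)
onnx.save(quantized, "model.quant.onnx")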