
Commit 916e1b5

ynankani authored and kevalmorabia97 committed
[5620217] Mixed-precision handle 8bit layer name matching error (#535)
## What does this PR do?

Handle the 8-bit layer name matching error when running with a mixed-precision config.

**Type of change:** Bug fix

**Overview:** Due to variations in export methods, the model's `weight_tensor.name` may appear either as an ID or as a name, for example `onnx::MatMul_9335` or `model.layers.2.attn.qkv_proj.MatMul.weight`. The comparison of the 8-bit layer list with node names must be adjusted to handle this variation.

## Testing

- Tested using mixed_int4_experiment.py
- Executed with the downloaded model from onnx-community/Qwen2.5-1.5B-Instruct
- Also tested using the onnxruntime-genai exported model from meta-llama/Llama-3.1-8B-Instruct

## Before your PR is "*Ready for review*"

- **Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CONTRIBUTING.md)** and your commits are signed.
- **Is this change backward compatible?**: Yes
- **Did you write any new necessary tests?**: Yes/No
- **Did you add or update any necessary documentation?**: Yes/No
- **Did you update [Changelog](https://github.com/NVIDIA/TensorRT-Model-Optimizer/blob/main/CHANGELOG.rst)?**: Yes/No

## Additional Information

Signed-off-by: unknown <[email protected]>
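To make the failure mode concrete, here is a minimal sketch of the name mismatch the PR fixes (the layer set and names below are illustrative, not taken from the repository):

```python
# The 8-bit layer list is expressed in terms of *node* names.
layers_8bit = {"/model/layers.2/attn/qkv_proj/MatMul"}

# Depending on the exporter, the weight initializer name is either an
# opaque ID or a readable path:
weight_name_id = "onnx::MatMul_9335"
weight_name_readable = "model.layers.2.attn.qkv_proj.MatMul.weight"

# The MatMul node's own name is stable across both export styles:
node_name = "/model/layers.2/attn/qkv_proj/MatMul"

print(weight_name_id in layers_8bit)  # False: an ID can never match a layer pattern
print(node_name in layers_8bit)       # True: matching on node names is robust
```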
1 parent 957ce07 commit 916e1b5

File tree: 2 files changed (+29 −22 lines changed)

modelopt/onnx/quantization/graph_utils.py

Lines changed: 13 additions & 6 deletions
@@ -642,8 +642,12 @@ def _find_nodes_from_op_types_to_exclude(graph: Graph, op_types_to_exclude=None)
 def _find_int4_quantizable_weights(
     graph: onnx.GraphProto,
     nodes_to_exclude: list[str],
-) -> list[tuple[onnx.ValueInfoProto, onnx.ValueInfoProto, bool, int]]:
-    """Finds the int4 quantizable weights from the graph."""
+) -> list[tuple[onnx.ValueInfoProto, onnx.ValueInfoProto, bool, int, str]]:
+    """Finds the int4 quantizable weights from the graph.
+
+    Returns:
+        list of tuples: (act_tensor, weight_tensor, do_transpose, gemm_io_type, node_name)
+    """
     wa_pack = []
     gemm_nodes = [
         node
@@ -674,7 +678,8 @@ def _find_int4_quantizable_weights(
             attr.name == "transB" and attr.i > 0 for attr in gemm.attribute
         )
 
-        wa_pack.append((act_tensor, weight_tensor, do_transpose, gemm_io_type))
+        # Include node name for proper matching with layers_8bit_set
+        wa_pack.append((act_tensor, weight_tensor, do_transpose, gemm_io_type, gemm.name))
 
     return wa_pack
 
@@ -762,6 +767,8 @@ def get_layer_precision_mapping(
     pattern_regexes = [
         re.compile(r"^/model/layers\.(\d+)/attn/qkv_proj/MatMul$"),
         re.compile(r"^/model/layers\.(\d+)/attn/v_proj/MatMul$"),
+        re.compile(r"^/model/layers\.(\d+)/self_attn/qkv_proj/MatMul$"),
+        re.compile(r"^/model/layers\.(\d+)/self_attn/v_proj/MatMul$"),
         re.compile(r"^/model/layers\.(\d+)/mlp/down_proj/MatMul$"),
     ]
 
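As a quick illustration of why the two added `self_attn` patterns matter, the snippet below checks one layer path from each naming convention against the updated regex list (the paths are invented examples; exporters differ on whether the attention block is named `attn` or `self_attn`):

```python
import re

pattern_regexes = [
    re.compile(r"^/model/layers\.(\d+)/attn/qkv_proj/MatMul$"),
    re.compile(r"^/model/layers\.(\d+)/attn/v_proj/MatMul$"),
    re.compile(r"^/model/layers\.(\d+)/self_attn/qkv_proj/MatMul$"),
    re.compile(r"^/model/layers\.(\d+)/self_attn/v_proj/MatMul$"),
    re.compile(r"^/model/layers\.(\d+)/mlp/down_proj/MatMul$"),
]

for name in (
    "/model/layers.0/attn/qkv_proj/MatMul",       # "attn"-style export
    "/model/layers.0/self_attn/qkv_proj/MatMul",  # "self_attn"-style export
):
    # Both print True once the self_attn patterns are included
    print(name, any(p.match(name) for p in pattern_regexes))
```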
@@ -812,12 +819,12 @@ def layer_idx(name):
             if (i - rest_start) % 3 == 0:
                 layers_8bit_set.add(names_sorted[i])
     layers_list_8bit = list(layers_8bit_set)
-
     # NEW: Create layer info mapping with precision, block_size, and axis
     layer_info = {}
-    for i, (act_tensor, weight_tensor, do_transpose, gemm_io_type) in enumerate(wa_pack):
+    for i, (act_tensor, weight_tensor, do_transpose, gemm_io_type, node_name) in enumerate(wa_pack):
         weight_name = weight_tensor.name
-        if should_quantize_to_8bit(weight_name, layers_list_8bit):
+        # Use node_name for matching against layers_8bit patterns
+        if should_quantize_to_8bit(node_name, layers_list_8bit):
             layer_info[weight_name] = {
                 "precision": 8,
                 "block_size": -1,  # Per-channel for 8-bit

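After this change, `layer_info` is still keyed by the weight tensor name (so downstream code is unaffected), while membership in the 8-bit set is decided by the node name. An illustrative entry, with invented names and only the fields visible in this diff:

```python
layer_info = {
    "model.layers.2.attn.qkv_proj.MatMul.weight": {
        "precision": 8,
        "block_size": -1,  # per-channel for 8-bit
    },
    # weights whose node names miss the 8-bit patterns keep the int4 path
}
```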
modelopt/onnx/quantization/int4.py

Lines changed: 16 additions & 16 deletions
@@ -445,11 +445,11 @@ def _clip_search(
 
 def _augment_graph(
     graph: onnx.GraphProto,
-    wa_pack: list[tuple[gs.Tensor, gs.Tensor, bool, int]],
+    wa_pack: list[tuple[gs.Tensor, gs.Tensor, bool, int, str]],
 ):
     """Extend graph outputs with MatMuls activation input."""
     augmented_outputs = {tensor.name for tensor in graph.output}
-    for act_tensor, _, _, _ in wa_pack:
+    for act_tensor, _, _, _, _ in wa_pack:
         if act_tensor.name not in augmented_outputs:
             graph.output.append(act_tensor)
             augmented_outputs.add(act_tensor.name)
@@ -522,7 +522,7 @@ def _quantize_awq_clip(
     t = time.time()
     alphas = {}
     for i in tqdm(range(len(wa_pack)), desc="Running clip search..."):
-        act_tensor, weight_tensor, do_transpose, gemm_io_type = wa_pack[i]
+        act_tensor, weight_tensor, do_transpose, gemm_io_type, _ = wa_pack[i]
 
         # First capture all the activation values after calibration data sweep
         output_dicts = {}
@@ -554,7 +554,7 @@ def _quantize_awq_clip(
     # Compute quantized weights and scales which are needed for DQ nodes
     t = time.time()
     for i in tqdm(range(len(wa_pack)), desc="Quantizing the weights..."):
-        act_tensor, weight_tensor, do_transpose, gemm_io_type = wa_pack[i]
+        act_tensor, weight_tensor, do_transpose, gemm_io_type, _ = wa_pack[i]
         gemm_io_type = cast("onnx.TensorProto.DataType", gemm_io_type)
 
         if force_fp16:
@@ -707,7 +707,7 @@ def get_scale(x_max, w_max, alpha):
 
 
 def run_awq_scale_search_per_node(
-    wa_pack: list[tuple[gs.Tensor, gs.Tensor, bool, int]],
+    wa_pack: list[tuple[gs.Tensor, gs.Tensor, bool, int, str]],
     augmented_onnx_path,
     block_size,
     use_zero_point,
@@ -728,7 +728,7 @@ def run_awq_scale_search_per_node(
         range(len(wa_pack)),
         desc="Running AWQ scale search per node" + tqdm_msg_append_str,
     ):
-        act_tensor, weight_tensor, do_transpose, gemm_io_type = wa_pack[i]
+        act_tensor, weight_tensor, do_transpose, gemm_io_type, _ = wa_pack[i]
 
         output_dicts = {}
 
@@ -802,7 +802,7 @@
 
 
 def get_act_to_weight_map_and_act_to_wa_pack_map(
-    wa_pack: list[tuple[gs.Tensor, gs.Tensor, bool, int]],
+    wa_pack: list[tuple[gs.Tensor, gs.Tensor, bool, int, str]],
 ):
     """Method to return subgraph related maps based on activation-name as key.
 
@@ -813,7 +813,7 @@ def get_act_to_weight_map_and_act_to_wa_pack_map(
     act_to_wa_pack_map = {}
     act_to_quant_nodes_weight_shape_map = {}
     for i in tqdm(range(len(wa_pack)), desc="Getting activation names maps..."):
-        act_tensor, weight_tensor, do_transpose, gemm_io_type = wa_pack[i]
+        act_tensor, weight_tensor, do_transpose, gemm_io_type, _ = wa_pack[i]
         # wa_pack index is stored in map to represent quant nodes
         act_to_wa_pack_map.setdefault(act_tensor.name, []).append(i)
         act_to_quant_nodes_weight_shape_map.setdefault(act_tensor.name, []).append(
@@ -828,7 +828,7 @@
 
 
 def get_x_w_mean_for_subgraph(
-    wa_pack: list[tuple[gs.Tensor, gs.Tensor, bool, int]],
+    wa_pack: list[tuple[gs.Tensor, gs.Tensor, bool, int, str]],
     wa_pack_idx_list,
     augmented_onnx_path,
     x,
@@ -842,7 +842,7 @@ def get_x_w_mean_for_subgraph(
 
     w_concatenated = None
     for wa_pack_idx in wa_pack_idx_list:
-        act_tensor, weight_tensor, do_transpose, gemm_io_type = wa_pack[wa_pack_idx]
+        act_tensor, weight_tensor, do_transpose, gemm_io_type, _ = wa_pack[wa_pack_idx]
         w = numpy_helper.to_array(
             weight_tensor, base_dir=os.path.dirname(augmented_onnx_path)
         ).copy()
@@ -880,7 +880,7 @@
 
 
 def run_awq_scale_search_per_subgraph(
-    wa_pack: list[tuple[gs.Tensor, gs.Tensor, bool, int]],
+    wa_pack: list[tuple[gs.Tensor, gs.Tensor, bool, int, str]],
    act_to_wa_pack_map,
    act_to_quant_nodes_weight_shape_map,
    augmented_onnx_path,
@@ -931,7 +931,7 @@ def run_awq_scale_search_per_subgraph(
         awq_scale[np.isinf(awq_scale)] = 1
         awq_scale[np.isnan(awq_scale)] = 1
         for wa_pack_idx in wa_pack_idx_list:
-            _, weight_tensor, do_transpose, _ = wa_pack[wa_pack_idx]
+            _, weight_tensor, do_transpose, _, _ = wa_pack[wa_pack_idx]
             w = numpy_helper.to_array(
                 weight_tensor, base_dir=os.path.dirname(augmented_onnx_path)
             ).copy()
@@ -975,15 +975,15 @@
 
 def get_parent_child_nodes_map(
     graph: onnx.GraphProto,
-    wa_pack: list[tuple[gs.Tensor, gs.Tensor, bool, int]],
+    wa_pack: list[tuple[gs.Tensor, gs.Tensor, bool, int, str]],
     nodes_to_exclude: list[str],
 ):
     """Get mapping of parent nodes to their MatMul/Gemm nodes with quantizable weights."""
     parent_child_nodes_map = {}
     output_name_to_node = get_tensor_producer_nodes(graph)
     input_name_to_nodes = get_tensor_consumer_nodes(graph)
 
-    for act_tensor, _, _, _ in wa_pack:
+    for act_tensor, _, _, _, _ in wa_pack:
         parent_name = output_name_to_node[act_tensor.name].name
         parent_child_nodes_map[parent_name] = []
         for node in input_name_to_nodes[act_tensor.name]:
@@ -1069,7 +1069,7 @@ def _quantize_awq_lite(
 
     tensor_names_list = []
     for i in tqdm(range(len(wa_pack)), desc="Getting tensor names..."):
-        act_tensor, weight_tensor, do_transpose, gemm_io_type = wa_pack[i]
+        act_tensor, weight_tensor, do_transpose, gemm_io_type, _ = wa_pack[i]
         tensor_names_list.append(act_tensor.name)
 
     for i in tqdm(range(len(inputs)), desc="Caching activations..."):
@@ -1157,7 +1157,7 @@
             awq_lite[wa_pack_idx].best_scale = mean_awq_scale
 
     for i in tqdm(range(len(wa_pack)), desc="Quantizing the weights..."):
-        act_tensor, weight_tensor, do_transpose, gemm_io_type = wa_pack[i]
+        act_tensor, weight_tensor, do_transpose, gemm_io_type, _ = wa_pack[i]
         gemm_io_type = cast("onnx.TensorProto.DataType", gemm_io_type)
 
         if force_fp16:
