Skip to content

Commit 7c854c7

Browse files
committed
Implement function to detect tensor inputs to keep in high precision
Signed-off-by: gcunhase <[email protected]>
1 parent 00d6d1d commit 7c854c7

File tree

2 files changed

+33
-20
lines changed

2 files changed

+33
-20
lines changed

modelopt/onnx/autocast/convert.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -236,12 +236,11 @@ def convert_to_f16(
236236
keep_io_types=keep_io_types,
237237
low_precision_type=low_precision_type,
238238
custom_ops=sanitizer.custom_ops,
239+
tensor_block_dict=tensor_block_dict,
239240
)
240241
high_precision_nodes = [node.name for node in model.graph.node if node.op_type in op_block_list]
241242
low_precision_nodes = [
242243
node.name for node in model.graph.node if node.op_type not in op_block_list
243244
]
244-
model_mod = precision_converter.convert(
245-
high_precision_nodes, low_precision_nodes, tensor_block_dict
246-
)
245+
model_mod = precision_converter.convert(high_precision_nodes, low_precision_nodes)
247246
return model_mod

modelopt/onnx/autocast/precisionconverter.py

Lines changed: 31 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ def __init__(
9999
min_opset: int = 13,
100100
max_ir_version: int | None = None,
101101
trt_plugins: list[str] | None = [],
102+
tensor_block_dict: dict[str, dict[str, list[int]]] = {},
102103
) -> None:
103104
"""Initialize PrecisionConverter.
104105
@@ -112,6 +113,10 @@ def __init__(
112113
init_conversion_max_bytes: Maximum size in bytes for initializer conversion. Larger initializers will be
113114
cast at runtime.
114115
custom_ops: List of custom ops.
116+
min_opset: Minimum opset for conversion.
117+
max_ir_version: Max IR version for conversion.
118+
trt_plugins: List of custom TensorRT plugin library paths in .so format (compiled shared library).
119+
tensor_block_dict: Dictionary of tensors (operation type and I/O indices) that should remain in FP32.
115120
"""
116121
self.model = deepcopy(model)
117122
self.value_info_map = value_info_map
@@ -148,18 +153,19 @@ def __init__(
148153
)
149154
)
150155

156+
# Custom mapping of op types to indices of inputs that should not be converted to low precision
157+
self.skip_inputs_map = self._create_skip_inputs_mapping(tensor_block_dict)
158+
151159
def convert(
152160
self,
153161
high_precision_nodes: list[str],
154162
low_precision_nodes: list[str],
155-
tensor_block_dict: dict[str, dict[str, list[int]]] = {},
156163
) -> onnx.ModelProto:
157164
"""Convert model to mixed precision.
158165
159166
Args:
160167
high_precision_nodes: List of node names to keep in high precision.
161168
low_precision_nodes: List of node names to convert to low precision.
162-
tensor_block_dict: Dictionary of tensors (operation type and I/O indices) that should remain in FP32.
163169
164170
Returns:
165171
onnx.ModelProto: The converted mixed precision model.
@@ -190,7 +196,7 @@ def convert(
190196
input.type.tensor_type.elem_type = self.low_precision_type.onnx_type
191197

192198
cast_down_tensors, cast_up_tensors, fp32_input_to_low_precision_node = (
193-
self._get_tensors_to_cast(low_precision_nodes, tensor_block_dict)
199+
self._get_tensors_to_cast(low_precision_nodes)
194200
)
195201
logger.debug(f"cast down (to {self.low_precision_type.str_full}): {cast_down_tensors}")
196202
logger.debug(f"cast up (to {self.high_precision_type.str_full}): {cast_up_tensors}")
@@ -483,11 +489,8 @@ def _get_tensors_to_cast(
483489
for node in self.model.graph.node:
484490
if node.name in low_precision_nodes:
485491
# Cast inputs to FP16 nodes down to FP16
486-
high_precision_tensor = high_precision_tensors.get(node.op_type, {})
487-
for idx, input in enumerate(node.input):
488-
if self._should_skip_low_precision_input_conversion(
489-
node, input
490-
) or idx in high_precision_tensor.get("inp", []):
492+
for input in node.input:
493+
if self._should_skip_low_precision_input_conversion(node, input):
491494
cast_to_fp32.append(input)
492495
fp32_input_to_low_precision_node[input].append(node)
493496
else:
@@ -1280,13 +1283,9 @@ def _sanitize_model(self):
12801283
graph_sanitizer.sanitize()
12811284
self.model = graph_sanitizer.model
12821285

1283-
def _should_skip_low_precision_input_conversion(
1284-
self, node: onnx.NodeProto, input_name: str
1285-
) -> bool:
1286-
"""Check if the input should be skipped for low precision conversion.
1287-
1288-
This is used for nodes that have inputs that MUST remain in FP32.
1289-
"""
1286+
def _create_skip_inputs_mapping(self, tensor_block_dict: dict[str, dict[str, list[int]]] = {}):
1287+
"""Create mapping of op types to indices of inputs that should not be converted to low precision."""
1288+
skip_inputs_map = {}
12901289
match self.low_precision_type.str_short:
12911290
case "fp16":
12921291
skip_inputs_map = SKIP_LOW_PRECISION_MAPPING_FP16
@@ -1295,12 +1294,27 @@ def _should_skip_low_precision_input_conversion(
12951294
case _:
12961295
raise ValueError(f"Unsupported low precision type: {self.low_precision_type}")
12971296

1298-
if node.op_type in skip_inputs_map:
1297+
# Update mapping with user-defined information
1298+
for op, tensor_map in tensor_block_dict.items():
1299+
high_precision_tensor = tensor_map.get("inp", [])
1300+
if high_precision_tensor:
1301+
skip_inputs_map.update({op: set(high_precision_tensor)})
1302+
1303+
return skip_inputs_map
1304+
1305+
def _should_skip_low_precision_input_conversion(
1306+
self, node: onnx.NodeProto, input_name: str
1307+
) -> bool:
1308+
"""Check if the input should be skipped for low precision conversion.
1309+
1310+
This is used for nodes that have inputs that MUST remain in FP32.
1311+
"""
1312+
if node.op_type in self.skip_inputs_map:
12991313
# Figure out the index of the input in the node input
13001314
inputs_lst = list(node.input)
13011315
if input_name not in inputs_lst:
13021316
raise ValueError(f"Input {input_name} not found in node {node.name}.")
13031317
input_index = inputs_lst.index(input_name)
13041318
# Check if we should skip this input for low precision conversion
1305-
return input_index in skip_inputs_map[node.op_type]
1319+
return input_index in self.skip_inputs_map[node.op_type]
13061320
return False

0 commit comments

Comments (0)