Insert cast nodes for 'FP32 required' custom ops

gcunhase · gcunhase · commit 62564db6a826 · 2025-09-25T09:42:19.000-04:00
Signed-off-by: gcunhase <4861122+gcunhase@users.noreply.github.com>
diff --git a/modelopt/onnx/quantization/fp8.py b/modelopt/onnx/quantization/fp8.py
@@ -178,6 +178,7 @@ def quantize(
     passes: list[str] = ["concat_elimination"],
     log_level: str = "INFO",
     calibrate_per_node: bool = False,
+    custom_ops_to_cast_fp32: list[str] = [],
     custom_ops_to_quantize: list[str] = [],
     direct_io_types: bool = False,
     **kwargs,
@@ -318,7 +319,7 @@ def quantize(
         onnx_model = convert_to_f16(
             onnx_model,
             keep_io_types=not direct_io_types,
-            op_block_list=["Resize"],
+            op_block_list=custom_ops_to_cast_fp32,
             low_precision_type=high_precision_dtype,
             trt_plugins=trt_extra_plugin_lib_paths,
         )
diff --git a/modelopt/onnx/quantization/int8.py b/modelopt/onnx/quantization/int8.py
@@ -128,6 +128,7 @@ def quantize(
     passes: list[str] = ["concat_elimination"],
     log_level: str = "INFO",
     calibrate_per_node: bool = False,
+    custom_ops_to_cast_fp32: list[str] = [],
     custom_ops_to_quantize: list[str] = [],
     direct_io_types: bool = False,
     **kwargs,
@@ -279,6 +280,7 @@ def quantize(
         onnx_model = convert_to_f16(
             onnx_model,
             keep_io_types=not direct_io_types,
+            op_block_list=custom_ops_to_cast_fp32,
             low_precision_type=high_precision_dtype,
             trt_plugins=trt_extra_plugin_lib_paths,
         )
diff --git a/modelopt/onnx/quantization/quantize.py b/modelopt/onnx/quantization/quantize.py
@@ -81,7 +81,7 @@ def _preprocess_onnx(
     override_shapes: str,
     simplify: bool = False,
     quantize_mode: str = "int8",
-) -> tuple[str, onnx.ModelProto, list[str], bool, bool, bool, dict]:
+) -> tuple[str, onnx.ModelProto, list[str], bool, bool, bool, dict, dict]:
     logger.info(f"Preprocessing the model {onnx_path}")
     intermediate_generated_files = []
     output_dir = os.path.dirname(output_path)
@@ -180,13 +180,14 @@ def _preprocess_onnx(
         intermediate_generated_files.append(onnx_path)
 
     # If custom op precisions are given, add Cast or Q/DQ where appropriate.
+    custom_ops_to_cast = {}
     custom_ops_to_quantize = {}
     if trt_plugins_precision:
         custom_ops_to_cast, custom_ops_to_quantize = interpret_trt_plugins_precision_flag(
             onnx_model, trt_plugins_precision, quantize_mode
         )
-        if custom_ops_to_cast:
-            onnx_model = cast_custom_ops(onnx_model, custom_ops_to_cast)
+        if custom_ops_to_cast.get("fp16", {}):
+            onnx_model = cast_custom_ops(onnx_model, custom_ops_to_cast["fp16"])
             onnx_path = os.path.join(output_dir, f"{model_name}_castFP16.onnx")
             save_onnx(onnx_model, onnx_path, use_external_data_format)
             logger.info(f"Model is cloned to {onnx_path} after casting tensors to FP16")
@@ -199,6 +200,7 @@ def _preprocess_onnx(
         has_custom_op,
         has_dds_op,
         use_external_data_format,
+        custom_ops_to_cast.get("fp32", {}),
         custom_ops_to_quantize,
     )
 
@@ -406,6 +408,7 @@ def quantize(
         has_custom_op,
         has_dds_op,
         use_external_data_format,
+        custom_ops_to_cast_fp32,
         custom_ops_to_quantize,
     ) = _preprocess_onnx(
         onnx_path,
@@ -471,6 +474,7 @@ def quantize(
             passes=passes,
             log_level=log_level,
             calibrate_per_node=calibrate_per_node,
+            custom_ops_to_cast_fp32=list(custom_ops_to_cast_fp32.keys()),
             custom_ops_to_quantize=list(custom_ops_to_quantize.keys()),
             direct_io_types=direct_io_types,
             **kwargs,
diff --git a/modelopt/onnx/trt_utils.py b/modelopt/onnx/trt_utils.py
@@ -367,10 +367,12 @@ def interpret_trt_plugins_precision_flag(
         if trt_plugin_precision.count(":") == 1:
             if precision not in supported_precisions:
                 logger.warning(f"Precision {precision} is not supported. Skipping.")
-            if precision == "fp16":
-                custom_ops_to_cast[op_type] = {
-                    "inp": list(range(num_inps)),
-                    "out": list(range(num_outs)),
+            if precision in ["fp16", "fp32"]:
+                custom_ops_to_cast[precision] = {
+                    op_type: {
+                        "inp": list(range(num_inps)),
+                        "out": list(range(num_outs)),
+                    }
                 }
             if precision in ["int8", "fp8"]:
                 if precision != quantize_mode:
@@ -408,10 +410,14 @@ def interpret_trt_plugins_precision_flag(
                     f"Setting the custom op precision to be the same as quantize mode."
                 )
 
-            # Will cast the inputs to FP16 and the outputs back to FP32
-            inp_precision_cast = [i for i, p in enumerate(inp_precision) if p == "fp16"]
-            out_precision_cast = [i for i, p in enumerate(out_precision) if p in ["fp16", "fp32"]]
-            custom_ops_to_cast[op_type] = {"inp": inp_precision_cast, "out": out_precision_cast}
+            # Will cast the inputs to FP16/FP32 and the outputs back to FP32
+            for precision in ["fp16", "fp32"]:
+                inp_precision_cast = [i for i, p in enumerate(inp_precision) if p == precision]
+                out_precision_cast = [i for i, p in enumerate(out_precision) if p == precision]
+                if inp_precision_cast:
+                    custom_ops_to_cast[precision] = {
+                        op_type: {"inp": inp_precision_cast, "out": out_precision_cast}
+                    }
 
             # Will add Q/DQ nodes in the requested I/O indices
             inp_precision_quant = [i for i, p in enumerate(inp_precision) if p in ["int8", "fp8"]]