Skip to content

Commit aec1a03

Browse files
committed
Added flag '--op_types_to_exclude_fp16' to allow any op to be excluded from FP16 conversion
Signed-off-by: gcunhase <[email protected]>
1 parent 0853ce1 commit aec1a03

File tree

4 files changed

+22
-5
lines changed

4 files changed

+22
-5
lines changed

modelopt/onnx/quantization/__main__.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,16 @@ def get_parser() -> argparse.ArgumentParser:
101101
nargs="+",
102102
help="A space-separated list of node types to exclude from quantization.",
103103
)
104+
argparser.add_argument(
105+
"--op_types_to_exclude_fp16",
106+
type=str,
107+
default=[],
108+
nargs="+",
109+
help=(
110+
"A space-separated list of node types to exclude from FP16 conversion. "
111+
"This is only relevant if '--high_precision_dtype != fp32'."
112+
),
113+
)
104114
argparser.add_argument(
105115
"--nodes_to_quantize",
106116
type=str,
@@ -274,6 +284,7 @@ def main():
274284
override_shapes=args.override_shapes,
275285
op_types_to_quantize=args.op_types_to_quantize,
276286
op_types_to_exclude=args.op_types_to_exclude,
287+
op_types_to_exclude_fp16=args.op_types_to_exclude_fp16,
277288
nodes_to_quantize=args.nodes_to_quantize,
278289
nodes_to_exclude=args.nodes_to_exclude,
279290
use_external_data_format=args.use_external_data_format,

modelopt/onnx/quantization/fp8.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,7 @@ def quantize(
168168
calibration_eps: list[str] = ["cpu", "cuda:0", "trt"],
169169
op_types_to_quantize: list[str] | None = None,
170170
op_types_to_exclude: list[str] | None = None,
171+
op_types_to_exclude_fp16: list[str] = [],
171172
nodes_to_quantize: list[str] | None = None,
172173
nodes_to_exclude: list[str] | None = None,
173174
use_external_data_format: bool = False,
@@ -178,7 +179,6 @@ def quantize(
178179
passes: list[str] = ["concat_elimination"],
179180
log_level: str = "INFO",
180181
calibrate_per_node: bool = False,
181-
custom_ops_to_cast_fp32: list[str] = [],
182182
custom_ops_to_quantize: list[str] = [],
183183
direct_io_types: bool = False,
184184
**kwargs,
@@ -319,7 +319,7 @@ def quantize(
319319
onnx_model = convert_to_f16(
320320
onnx_model,
321321
keep_io_types=not direct_io_types,
322-
op_block_list=custom_ops_to_cast_fp32,
322+
op_block_list=op_types_to_exclude_fp16,
323323
low_precision_type=high_precision_dtype,
324324
trt_plugins=trt_extra_plugin_lib_paths,
325325
)

modelopt/onnx/quantization/int8.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ def quantize(
119119
calibration_eps: list[str] = ["cpu", "cuda:0", "trt"],
120120
op_types_to_quantize: list[str] | None = None,
121121
op_types_to_exclude: list[str] | None = None,
122+
op_types_to_exclude_fp16: list[str] = [],
122123
nodes_to_quantize: list[str] | None = None,
123124
nodes_to_exclude: list[str] | None = None,
124125
use_external_data_format: bool = False,
@@ -128,7 +129,6 @@ def quantize(
128129
passes: list[str] = ["concat_elimination"],
129130
log_level: str = "INFO",
130131
calibrate_per_node: bool = False,
131-
custom_ops_to_cast_fp32: list[str] = [],
132132
custom_ops_to_quantize: list[str] = [],
133133
direct_io_types: bool = False,
134134
**kwargs,
@@ -280,7 +280,7 @@ def quantize(
280280
onnx_model = convert_to_f16(
281281
onnx_model,
282282
keep_io_types=not direct_io_types,
283-
op_block_list=custom_ops_to_cast_fp32,
283+
op_block_list=op_types_to_exclude_fp16,
284284
low_precision_type=high_precision_dtype,
285285
trt_plugins=trt_extra_plugin_lib_paths,
286286
)

modelopt/onnx/quantization/quantize.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,7 @@ def quantize(
216216
override_shapes: str | None = None,
217217
op_types_to_quantize: list[str] | None = None,
218218
op_types_to_exclude: list[str] | None = None,
219+
op_types_to_exclude_fp16: list[str] | None = None,
219220
nodes_to_quantize: list[str] | None = None,
220221
nodes_to_exclude: list[str] | None = None,
221222
use_external_data_format: bool = False,
@@ -267,6 +268,9 @@ def quantize(
267268
This flag does not support regular expression.
268269
op_types_to_exclude:
269270
List of op types to exclude from quantization. This flag does not support regular expression.
271+
op_types_to_exclude_fp16:
272+
List of op types to exclude from FP16 conversion.
273+
This is only relevant if '--high_precision_dtype != fp32'.
270274
nodes_to_quantize:
271275
List of node names to quantize. If None (default), all supported nodes are quantized.
272276
This flag supports regular expression.
@@ -422,6 +426,8 @@ def quantize(
422426
quantize_mode,
423427
)
424428
trt_plugins = update_trt_ep_support(calibration_eps, has_dds_op, has_custom_op, trt_plugins) # type: ignore[arg-type]
429+
op_types_to_exclude_fp16 = op_types_to_exclude_fp16 or []
430+
op_types_to_exclude_fp16.extend(list(custom_ops_to_cast_fp32.keys()))
425431

426432
# Use random scales if calibration data is not supplied
427433
if calibration_data is None:
@@ -464,6 +470,7 @@ def quantize(
464470
calibration_eps=calibration_eps,
465471
op_types_to_quantize=op_types_to_quantize,
466472
op_types_to_exclude=op_types_to_exclude,
473+
op_types_to_exclude_fp16=op_types_to_exclude_fp16,
467474
nodes_to_quantize=nodes_to_quantize,
468475
nodes_to_exclude=nodes_to_exclude,
469476
use_external_data_format=use_external_data_format,
@@ -474,7 +481,6 @@ def quantize(
474481
passes=passes,
475482
log_level=log_level,
476483
calibrate_per_node=calibrate_per_node,
477-
custom_ops_to_cast_fp32=list(custom_ops_to_cast_fp32.keys()),
478484
custom_ops_to_quantize=list(custom_ops_to_quantize.keys()),
479485
direct_io_types=direct_io_types,
480486
**kwargs,

0 commit comments

Comments (0)