Commit c119095

Fix onnx quantize
Signed-off-by: Charlie Truong <chtruong@nvidia.com>
1 parent c8d23e1 commit c119095

File tree

1 file changed: +13 -1 lines changed

nemo_export/onnx_llm_exporter.py

Lines changed: 13 additions & 1 deletion
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 
+import contextlib
 import logging
 import warnings
 from pathlib import Path
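The new contextlib import exists for contextlib.nullcontext(), which serves further down as the no-op default when no quantizer context is needed. A minimal, self-contained sketch of that idiom (the use_quantizer flag and file name are illustrative, not from this patch):

import contextlib

# nullcontext() is a context manager that does nothing, so it can stand in
# when a block is only sometimes wrapped in a real context manager.
use_quantizer = False  # illustrative flag
ctx = contextlib.nullcontext()
if use_quantizer:
    ctx = open("calibration.log")  # any real context manager would do

with ctx:
    print("the body runs the same either way")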
@@ -40,12 +41,14 @@
 
 try:
     import modelopt.torch.quantization as mtq
+    from modelopt.torch.quantization.export_onnx import configure_linear_module_onnx_quantizers
 
     HAVE_MODELOPT = True
 except (ImportError, ModuleNotFoundError):
     from unittest.mock import MagicMock
 
     mtq = MagicMock()
+    configure_linear_module_onnx_quantizers = MagicMock()
     HAVE_MODELOPT = False
 
 
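The MagicMock placeholders only keep this module importable when modelopt is absent; real use still has to be gated on HAVE_MODELOPT. A sketch of the call-time guard such code is typically paired with (require_modelopt and its message are hypothetical, not part of this file):

from unittest.mock import MagicMock

try:
    import modelopt.torch.quantization as mtq
    HAVE_MODELOPT = True
except (ImportError, ModuleNotFoundError):
    mtq = MagicMock()  # import-time placeholder; never meant to be called
    HAVE_MODELOPT = False

def require_modelopt():
    # Hypothetical helper: fail loudly at call time rather than letting a
    # MagicMock silently swallow a quantization call.
    if not HAVE_MODELOPT:
        raise RuntimeError("nvidia-modelopt is required for quantization")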

@@ -135,6 +138,8 @@ def __init__(
         self.model_output_names = None
         self.onnx_runtime_session = None
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self._is_quantized = False
+        self._quant_cfg_short_name = ""
 
         if self.model_name_or_path is not None:
             if model is not None:
@@ -229,7 +234,11 @@ def _export_to_onnx(
 
         Path(self.onnx_model_dir).mkdir(parents=True, exist_ok=True)
 
-        with torch.autocast(device_type=get_model_device_type(self.model), dtype=export_dtype):
+        quantizer_context = contextlib.nullcontext()
+        if self._is_quantized and (self._quant_cfg_short_name == "nvfp4" or self._quant_cfg_short_name == "mxfp8"):
+            quantizer_context = configure_linear_module_onnx_quantizers(self.model)
+
+        with quantizer_context, torch.autocast(device_type=get_model_device_type(self.model), dtype=export_dtype):
             torch.onnx.export(
                 model=self.model,
                 args=(example_inputs,),
@@ -239,6 +248,7 @@ def _export_to_onnx(
                 dynamic_axes={**dynamic_axes_input, **dynamic_axes_output},
                 verbose=verbose,
                 opset_version=opset,
+                dynamo=not self._is_quantized,
             )
         logger.info(f"Successfully exported PyTorch model to ONNX model {self.onnx_model_path}")
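Taken together, the two _export_to_onnx hunks mean a quantized nvfp4/mxfp8 model is exported inside modelopt's quantizer context and via the legacy TorchScript exporter (dynamo=False), while everything else keeps the prior behavior. A runnable toy version of that control flow, assuming a PyTorch recent enough for torch.onnx.export to accept dynamo= (TinyModel and the flag values stand in for the exporter's state):

import contextlib

import torch


class TinyModel(torch.nn.Module):
    # Stand-in for self.model; the real exporter wraps an LLM.
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(4, 4)

    def forward(self, x):
        return self.linear(x)


model = TinyModel()
is_quantized = False       # would be self._is_quantized
quant_cfg_short_name = ""  # would be self._quant_cfg_short_name

# Same selection logic as the patch, written as a membership test.
quantizer_context = contextlib.nullcontext()
if is_quantized and quant_cfg_short_name in ("nvfp4", "mxfp8"):
    # Requires modelopt; the branch is not taken in this toy run.
    quantizer_context = configure_linear_module_onnx_quantizers(model)

with quantizer_context:
    torch.onnx.export(
        model,
        (torch.randn(1, 4),),
        "tiny.onnx",
        dynamo=not is_quantized,  # quantized models use the TorchScript exporter
    )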

@@ -484,13 +494,15 @@ def quantize(
 
         quant_cfg_choices = get_quant_cfg_choices()
         if isinstance(quant_cfg, str):
+            self._quant_cfg_short_name = quant_cfg
             assert quant_cfg in quant_cfg_choices, (
                 f"Quantization config {quant_cfg} is not supported. Supported configs: {list(quant_cfg_choices)}"
             )
             quant_cfg = quant_cfg_choices[quant_cfg]
 
         logger.info("Starting quantization...")
         mtq.quantize(self.model, quant_cfg, forward_loop=forward_loop)
+        self._is_quantized = True
         logger.info("Quantization is completed.")
 
     @property
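With the bookkeeping added to quantize(), a caller's existing two-step flow picks up the new export path automatically. A hypothetical usage sketch (the exporter class name, constructor argument, and export call are illustrative; only quantize(quant_cfg=..., forward_loop=...) appears in this diff):

# Hypothetical usage; names outside quantize() are illustrative.
exporter = OnnxLLMExporter(model_name_or_path="/path/to/model")
exporter.quantize(quant_cfg="nvfp4", forward_loop=calibration_loop)
# quantize() now records _is_quantized = True and _quant_cfg_short_name = "nvfp4",
# so the subsequent export wraps torch.onnx.export in the nvfp4 quantizer
# context and passes dynamo=False.
exporter.export(...)  # illustrative; triggers _export_to_onnx internally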
