8 changes: 8 additions & 0 deletions CHANGELOG-Windows.rst
@@ -2,6 +2,14 @@
Model Optimizer Changelog (Windows)
===================================

0.33 (2025-07-21)
^^^^^^^^^^^^^^^^^

**New Features**

- TensorRT Model Optimizer for Windows now supports the `NvTensorRtRtx <https://onnxruntime.ai/docs/execution-providers/TensorRTRTX-ExecutionProvider.html>`_ execution provider.


0.27 (2025-04-30)
^^^^^^^^^^^^^^^^^

5 changes: 4 additions & 1 deletion CHANGELOG.rst
@@ -8,13 +8,15 @@ Model Optimizer Changelog (Linux)

**Deprecations**

- Deprecate ``torch<2.5`` support.
- Deprecate ``torch<2.6`` support.

**New Features**

- (Experimental) Add quantization support for custom TensorRT op in ONNX models.
- Add support for Minifinetuning (MFT; https://arxiv.org/abs/2506.15702) self-corrective distillation, which enables training on small datasets while severely mitigating catastrophic forgetting.
- Add tree decoding support for Megatron Eagle models.
- For most VLMs, we now explicitly disable quantization on the vision part so that these modules are added to ``excluded_modules`` during HF export.
- Add support for ``hidden_size`` and ``num_layers`` pruning for Megatron Core Mamba models in ``mcore_gpt_minitron`` mode.

0.33 (2025-07-14)
^^^^^^^^^^^^^^^^^
@@ -36,6 +38,7 @@ Model Optimizer Changelog (Linux)
- ModelOpt now supports quantization of tensor-parallel sharded Huggingface transformer models. This requires ``transformers>=4.52.0``.
- Support quantization of FSDP2 wrapped models and add FSDP2 support in the ``llm_qat`` example.
- Add NeMo 2 Simplified Flow examples for quantization aware training/distillation (QAT/QAD), speculative decoding, pruning & distillation.
- Fix a Qwen3 MOE model export issue.

0.31 (2025-06-04)
^^^^^^^^^^^^^^^^^
2 changes: 1 addition & 1 deletion docs/source/getting_started/_installation_for_Linux.rst
@@ -16,7 +16,7 @@ Latest Model Optimizer (``nvidia-modelopt``) currently has the following system
+-------------------------+-----------------------------+
| CUDA | >=12.0 |
+-------------------------+-----------------------------+
| PyTorch | >=2.4 |
| PyTorch | >=2.6 |
+-------------------------+-----------------------------+
| TensorRT-LLM (Optional) | 0.20 |
+-------------------------+-----------------------------+
7 changes: 7 additions & 0 deletions docs/source/guides/6_save_load.rst
@@ -166,6 +166,13 @@ Here is an example of how to enable ModelOpt save/restore with the Huggingface A
# Save the ModelOpt-modified model architecture and weights using Huggingface APIs
model.save_pretrained(f"ModelOpt_{model_path}")
By default, the modelopt state is saved in the same directory as the model weights.
You can disable this by setting ``save_modelopt_state`` to ``False`` in the ``save_pretrained`` API, as shown below:

.. code-block:: python

    model.save_pretrained(f"ModelOpt_{model_path}", save_modelopt_state=False)

The model saved as above can be restored using the Huggingface ``from_pretrained`` API.
Do not forget to call :meth:`mto.enable_huggingface_checkpointing() <modelopt.torch.opt.plugins.huggingface.enable_huggingface_checkpointing>`
before loading the model. This needs to be done only once in the program.
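
For illustration, a minimal restore sketch (assuming the ``ModelOpt_{model_path}`` directory saved above contains a causal LM):

.. code-block:: python

    import modelopt.torch.opt as mto
    from transformers import AutoModelForCausalLM

    # Register ModelOpt's Huggingface checkpointing hooks once per program,
    # before any load call.
    mto.enable_huggingface_checkpointing()

    # Restores both the ModelOpt-modified architecture and the weights.
    model = AutoModelForCausalLM.from_pretrained(f"ModelOpt_{model_path}")
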
2 changes: 1 addition & 1 deletion docs/source/guides/8_autocast.rst
@@ -2,7 +2,7 @@ AutoCast (ONNX)
###############

AutoCast is a tool for converting FP32 ONNX models to mixed precision FP32-FP16 or FP32-BF16 models.
While casting FP32 to FP6/BF16, some nodes might be more sensitive to effecting accuracy.
While casting FP32 to FP16/BF16, some nodes might be more sensitive to the reduced precision, affecting accuracy.
AutoCast intelligently selects nodes to keep in FP32 precision to maintain model accuracy while benefiting from
reduced precision on the rest of the nodes. AutoCast automatically injects cast operations around the selected
nodes.
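
As a rough illustration of the cast-injection idea (not AutoCast's actual implementation; the op, tensor names, and shapes below are made up), a single FP32-sensitive node can be fenced with ``Cast`` ops while its neighbors exchange FP16 tensors:

.. code-block:: python

    import onnx
    from onnx import TensorProto, helper

    # Hypothetical sensitive node kept in FP32; the surrounding graph stays FP16.
    cast_up = helper.make_node("Cast", ["x_fp16"], ["x_fp32"], to=TensorProto.FLOAT)
    sensitive = helper.make_node("Exp", ["x_fp32"], ["y_fp32"], name="sensitive_node")
    cast_down = helper.make_node("Cast", ["y_fp32"], ["y_fp16"], to=TensorProto.FLOAT16)

    graph = helper.make_graph(
        [cast_up, sensitive, cast_down],
        "mixed_precision_sketch",
        inputs=[helper.make_tensor_value_info("x_fp16", TensorProto.FLOAT16, [1, 16])],
        outputs=[helper.make_tensor_value_info("y_fp16", TensorProto.FLOAT16, [1, 16])],
    )
    onnx.checker.check_model(helper.make_model(graph))
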
70 changes: 14 additions & 56 deletions examples/llm_ptq/example_utils.py
@@ -18,15 +18,10 @@
from typing import Any

import torch
import transformers
from accelerate import infer_auto_device_map, init_empty_weights
from accelerate.utils import get_max_memory
from transformers import (
AutoConfig,
AutoModelForCausalLM,
AutoProcessor,
AutoTokenizer,
Llama4ForConditionalGeneration,
)
from transformers import AutoConfig, AutoModelForCausalLM, AutoProcessor, AutoTokenizer

from modelopt.torch.utils.image_processor import MllamaImageProcessor

@@ -148,7 +143,7 @@ def get_model(
if device == "cpu":
device_map = "cpu"

config_kwargs = {"trust_remote_code": trust_remote_code}
config_kwargs = {"trust_remote_code": trust_remote_code} if trust_remote_code else {}
if attn_implementation is not None:
config_kwargs["attn_implementation"] = attn_implementation

@@ -182,61 +177,24 @@ def get_model(
max_memory = {key: value * gpu_mem_percentage for key, value in max_memory.items()}
model_kwargs["max_memory"] = max_memory

if hf_config.model_type == "bart":
# device_map "auto" and "cuda" triggers error regarding meta tensor from safetensors
device_map = None

if is_speculative(hf_config):
model = AutoModelForCausalLM.from_pretrained(
ckpt_path,
device_map=device_map,
**model_kwargs,
)
elif hf_config.model_type == "llava":
from transformers import LlavaForConditionalGeneration

hf_llava = LlavaForConditionalGeneration.from_pretrained(
ckpt_path, device_map=device_map, **model_kwargs
)
model = hf_llava.language_model
elif hf_config.model_type == "t5":
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(
ckpt_path, device_map=device_map, **model_kwargs
)
elif hf_config.model_type == "bart":
from transformers import AutoModelForSeq2SeqLM

# device_map "auto" and "cuda" triggers error regarding meta tensor from safetensors
model = AutoModelForSeq2SeqLM.from_pretrained(
ckpt_path, device_map=None, **model_kwargs
).to(device)
elif hf_config.model_type == "whisper":
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained(
ckpt_path, device_map=device_map, **model_kwargs
)
elif hf_config.model_type == "glm":
from transformers import AutoModelForSeq2SeqLM
else:
architecture = hf_config.architectures[0]

model = AutoModelForSeq2SeqLM.from_pretrained(
ckpt_path,
device_map="cuda",
**model_kwargs,
assert hasattr(transformers, architecture), (
f"Architecture {architecture} not found in transformers: {transformers.__version__}"
)
elif hf_config.model_type == "mllama":
from transformers import MllamaForConditionalGeneration
auto_model_module = getattr(transformers, architecture)

model = MllamaForConditionalGeneration.from_pretrained(
ckpt_path,
device_map=device_map,
**model_kwargs,
)
elif hf_config.model_type == "llama4":
model = Llama4ForConditionalGeneration.from_pretrained(
ckpt_path,
device_map=device_map,
**model_kwargs,
)
else:
with init_empty_weights():
# When computing the device_map, assuming half precision by default,
# unless specified by the hf_config.
@@ -246,7 +204,7 @@ def get_model(
# DeciLMForCausalLM does not support max_memory argument
if "architectures" in hf_config and "DeciLMForCausalLM" in hf_config.architectures:
model_kwargs2.pop("max_memory", None)
model = AutoModelForCausalLM.from_config(
model = auto_model_module._from_config(
hf_config,
**model_kwargs2,
)
@@ -269,7 +227,7 @@ def get_model(
)
model_kwargs["max_memory"] = max_memory

model = AutoModelForCausalLM.from_pretrained(
model = auto_model_module.from_pretrained(
ckpt_path,
device_map=device_map,
**model_kwargs,
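
Pulled together from the fragments above, the new generic loading path boils down to roughly the following sketch (the checkpoint name is illustrative; the real function also handles device maps, max memory, and speculative models):

import transformers
from transformers import AutoConfig

ckpt_path = "gpt2"  # illustrative checkpoint
hf_config = AutoConfig.from_pretrained(ckpt_path)

# Resolve the concrete model class from the checkpoint's architecture name
# instead of branching on hf_config.model_type (llava, t5, bart, whisper, ...).
architecture = hf_config.architectures[0]  # e.g. "GPT2LMHeadModel"
assert hasattr(transformers, architecture), (
    f"Architecture {architecture} not found in transformers: {transformers.__version__}"
)
auto_model_cls = getattr(transformers, architecture)
model = auto_model_cls.from_pretrained(ckpt_path)
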
74 changes: 44 additions & 30 deletions examples/llm_ptq/hf_ptq.py
@@ -46,6 +46,7 @@
create_forward_loop,
get_dataset_dataloader,
get_max_batch_size,
get_supported_datasets,
)
from modelopt.torch.utils.image_processor import MllamaImageProcessor
from modelopt.torch.utils.memory_monitor import launch_memory_monitor
@@ -195,6 +196,9 @@ def main(args):
# launch a memory monitor to read the currently used GPU memory.
launch_memory_monitor()

# Force eager execution for all model types.
torch.compiler.set_stance("force_eager")

# Check that only one quantization format is provided for non auto_quant case
if not args.auto_quantize_bits:
assert len(args.qformat.split(",")) == 1, (
@@ -267,14 +271,6 @@ def main(args):
full_model = model

if model_type == "mllama":
if args.dataset is None:
args.dataset = "scienceqa"
warnings.warn(
"Currently only the scienceqa dataset is supported for the mllama model. "
"Overriding dataset to scienceqa."
)
elif args.dataset != "scienceqa":
raise ValueError("Only the scienceqa dataset is supported for the mllama model.")
processor = get_processor(
args.pyt_ckpt_path,
model_type,
@@ -283,20 +279,12 @@
attn_implementation=args.attn_implementation,
)
elif model_type == "whisper":
if args.dataset is None:
args.dataset = "peoples_speech"
warnings.warn(
"Currently only the peoples_speech dataset is supported for the whisper model. "
"Overriding dataset to peoples_speech."
)
elif args.dataset != "peoples_speech":
raise ValueError("Only the peoples_speech dataset is supported for the whisper model.")
processor = get_processor(
args.pyt_ckpt_path, model_type, device, trust_remote_code=args.trust_remote_code
)
else:
if args.dataset is None:
args.dataset = "cnn_dailymail"
args.dataset = ["cnn_dailymail"]
warnings.warn("No dataset specified. Defaulting to cnn_dailymail.")
tokenizer = get_tokenizer(args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code)
default_padding_side = tokenizer.padding_side
@@ -305,16 +293,31 @@

# We only quantize the language model for VLMs other than the type supported above.
if hasattr(model, "language_model"):
assert model_type == "llama4", (
"Only llama4 should reach here. Please uncomment this check if you are modelopt developers."
)
parent_model = model # llama4 case
if isinstance(type(model).__dict__.get("language_model"), property):
assert hasattr(model, "model") and hasattr(model.model, "language_model"), (
"Expected language_model in model.model, but attribute not found. "
"This may indicate an unsupported model structure."
)
parent_model = model.model # gemma3, qwen2.5 VL case

disabled_quant_cfg = {
"quant_cfg": {"default": {"enable": False}},
"algorithm": "max",
}

for name, child in parent_model.named_children():
# Apply disabled quant to all children except language_model so we can exclude them during HF export.
if name != "language_model":
mtq.quantize(child, disabled_quant_cfg, forward_loop=None)

model = model.language_model

if args.sparsity_fmt != "dense":
if args.batch_size == 0:
# Sparse algorithm takes more GPU memory so we reduce the batch_size by 4.
args.batch_size = max(get_max_batch_size(model) // 4, 1)
args.batch_size = min(args.batch_size, args.calib_size)
args.batch_size = min(args.batch_size, sum(args.calib_size))

print(f"Use calib batch_size {args.batch_size}")

@@ -373,7 +376,7 @@ def main(args):
sample_input_single_batch=sample_input_single_batch,
enable_grad=run_auto_quant,
)
args.batch_size = min(args.batch_size, args.calib_size)
args.batch_size = min(args.batch_size, sum(args.calib_size))

print(f"Use calib batch_size {args.batch_size}")

@@ -383,17 +386,17 @@
"The MllamaImageProcessor must be set."
)
calib_dataloader = get_vlm_dataset_dataloader(
dataset_name=args.dataset,
dataset_name=args.dataset[0] if args.dataset else "scienceqa",
processor=processor,
batch_size=args.batch_size,
num_samples=args.calib_size,
num_samples=args.calib_size[0],
)
elif model_type == "whisper":
assert processor is not None and isinstance(processor, WhisperProcessor), (
"The AutoProcessor must be set."
)
calib_dataloader, first_text = get_speech_dataset_dataloader(
dataset_name=args.dataset,
dataset_name=args.dataset[0] if args.dataset else "peoples_speech",
processor=processor,
batch_size=args.batch_size,
num_samples=args.calib_size,
@@ -454,7 +457,7 @@ def main(args):
"input_features" if model_type == "whisper" else "input_ids"
][0:1]
try:
generated_ids_before_ptq = model.generate(input_ids, max_new_tokens=100)
generated_ids_before_ptq = full_model.generate(input_ids, max_new_tokens=100)
except Exception as e:
print(
"Error during model generation. Please check if your transformers version is "
@@ -472,7 +475,8 @@
torch.cuda.empty_cache()
generated_ids_after_ptq = None
if model_type != "llama4":
generated_ids_after_ptq = model.generate(input_ids, max_new_tokens=100)
# Our fake quantizer may not be fully compatible with torch.compile.
generated_ids_after_ptq = full_model.generate(input_ids, max_new_tokens=100)
else:
warnings.warn(
"Llama4 Maverick generation after quantization has a bug. Skipping generation sample."
@@ -600,15 +604,23 @@ def output_decode(generated_ids, input_shape):
default=0,
)
parser.add_argument(
"--calib_size", help="Number of samples for calibration.", type=int, default=512
"--calib_size",
help=(
"Number of samples for calibration. If a comma separated list of values is provided, "
"each value will be used as the calibration size for the corresponding dataset."
),
type=str,
default="512",
)
parser.add_argument("--export_path", default="exported_model")
parser.add_argument(
"--dataset",
help="name of dataset.",
help=(
f"name of a dataset, or a comma separated list of datasets. "
f"dataset choices are {get_supported_datasets()}"
),
type=str,
default=None,
choices=["magpie", "cnn_dailymail", "pile", "pg19", "wikipedia"],
)
parser.add_argument("--inference_tensor_parallel", type=int, default=1)
parser.add_argument("--inference_pipeline_parallel", type=int, default=1)
@@ -695,4 +707,6 @@

args = parser.parse_args()

args.dataset = args.dataset.split(",") if args.dataset else None
args.calib_size = [int(num_sample) for num_sample in args.calib_size.split(",")]
main(args)
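
A small sketch of how the new comma-separated ``--dataset`` and ``--calib_size`` values are interpreted (dataset names and sizes here are illustrative):

# e.g. invoked as: python hf_ptq.py ... --dataset cnn_dailymail,pile --calib_size 256,256
dataset_arg = "cnn_dailymail,pile"
calib_size_arg = "256,256"

datasets = dataset_arg.split(",") if dataset_arg else None  # ["cnn_dailymail", "pile"]
calib_sizes = [int(n) for n in calib_size_arg.split(",")]  # [256, 256]

# The calibration batch size is now capped by the total sample budget
# across all requested datasets (sum(args.calib_size) in the code above).
batch_size = min(64, sum(calib_sizes))  # 64 is an illustrative default
print(datasets, calib_sizes, batch_size)
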
2 changes: 1 addition & 1 deletion examples/llm_ptq/run_tensorrt_llm.py
@@ -66,7 +66,7 @@ def run(args):

print("TensorRT-LLM example outputs:")

llm = LLM(args.engine_dir, tokenizer=tokenizer)
llm = LLM(args.engine_dir, tokenizer=tokenizer, max_batch_size=len(input_texts))
torch.cuda.cudart().cudaProfilerStart()
outputs = llm.generate_text(input_texts, args.max_output_len)
torch.cuda.cudart().cudaProfilerStop()
7 changes: 4 additions & 3 deletions examples/llm_qat/launch.sh
@@ -166,13 +166,14 @@ if [[ "${DISTILL}" == "True" ]]; then
FSDP_ARGS="$FSDP_ARGS --fsdp_cpu_ram_efficient_loading False"
fi

# real quantization does not work with FSDP
if [[ "${COMPRESS,,}" == "true" ]]; then
echo "Compression is not supported with FSDP. Disabling FSDP."
# real quantization does not work with FSDP, only works with FSDP2
if [[ "${COMPRESS,,}" == "true" && "${USE_FSDP2,,}" != "true" ]]; then
echo "Compression is not supported with FSDP. Disabling FSDP and using DDP."
FSDP_ARGS=""
CONFIG_FILE="ddp.yaml"
fi


CMD="accelerate launch --config-file accelerate_config/$CONFIG_FILE $FSDP_ARGS \
main.py \
--model_name_or_path $MODEL \