3 changes: 2 additions & 1 deletion .github/CODEOWNERS
@@ -43,8 +43,9 @@ examples/llm_eval @NVIDIA/modelopt-examples-llm_ptq-codeowners
examples/llm_ptq @NVIDIA/modelopt-examples-llm_ptq-codeowners
examples/llm_qat @NVIDIA/modelopt-examples-llm_qat-codeowners
examples/llm_sparsity @NVIDIA/modelopt-torch-sparsity-codeowners
examples/megatron-lm @NVIDIA/modelopt-examples-megatron-codeowners
examples/model_hub @NVIDIA/modelopt-examples-model_hub-codeowners
examples/nemo_run @NVIDIA/modelopt-examples-nemo_run-codeowners
examples/nemo_run @NVIDIA/modelopt-examples-megatron-codeowners
examples/onnx_ptq @NVIDIA/modelopt-onnx-codeowners
examples/pruning @NVIDIA/modelopt-torch-nas-prune-codeowners
examples/speculative_decoding @NVIDIA/modelopt-torch-speculative-codeowners
1 change: 1 addition & 0 deletions CHANGELOG.rst
@@ -26,6 +26,7 @@ Model Optimizer Changelog (Linux)
- Add support for ``mamba_num_heads``, ``mamba_head_dim``, ``hidden_size`` and ``num_layers`` pruning for Megatron Core Mamba or Hybrid Transformer Mamba models in ``mcore_minitron`` (previously ``mcore_gpt_minitron``) mode.
- Add example for QAT/QAD training with `LLaMA Factory <https://github.com/hiyouga/LLaMA-Factory/tree/main>`_. See ``examples/llm_qat/llama_factory`` for more details.
- Upgrade TensorRT-LLM dependency to 1.0.0rc6.
- Add unified HuggingFace model export support for quantized NVFP4 GPT-OSS models.

0.33 (2025-07-14)
^^^^^^^^^^^^^^^^^
@@ -1107,6 +1107,7 @@ def main(input_args: list[str] | None = None) -> None:
format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO,
force=True,
)
logger.info(accelerator.state, main_process_only=False)
if accelerator.is_local_main_process:
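The added `force=True` matters because the root logger has typically already been configured by the time this `basicConfig` call runs (for example via Accelerate's logging setup), and without it a second `basicConfig` call is silently ignored. A minimal standalone sketch of that stdlib behavior, separate from the script itself:

```python
# Standalone sketch of logging.basicConfig(force=True) semantics (Python >= 3.8).
import logging

# Pretend an earlier import already attached a handler to the root logger.
logging.basicConfig(level=logging.WARNING)

# Without force=True this second call would be a no-op because the root logger
# already has handlers; force=True removes them first so the new config applies.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
    force=True,
)

logging.getLogger(__name__).info("emitted at INFO with the new format")
```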
18 changes: 16 additions & 2 deletions examples/diffusers/quantization/diffusion_trt.py
@@ -22,7 +22,7 @@
remove_nesting,
update_dynamic_axes,
)
from quantize import create_pipeline
from quantize import ModelType, PipelineManager

import modelopt.torch.opt as mto
from modelopt.torch._deploy._runtime import RuntimeRegistry
@@ -31,6 +31,20 @@
from modelopt.torch._deploy.device_model import DeviceModel
from modelopt.torch._deploy.utils import get_onnx_bytes_and_metadata

MODEL_ID = {
"sdxl-1.0": ModelType.SDXL_BASE,
"sdxl-turbo": ModelType.SDXL_TURBO,
"sd3-medium": ModelType.SD3_MEDIUM,
"flux-dev": ModelType.FLUX_DEV,
"flux-schnell": ModelType.FLUX_SCHNELL,
}

dtype_map = {
"Half": torch.float16,
"BFloat16": torch.bfloat16,
"Float": torch.float32,
}


def generate_image(pipe, prompt, image_name):
seed = 42
@@ -91,7 +105,7 @@ def main():

image_name = args.save_image_as if args.save_image_as else f"{args.model}.png"

pipe = create_pipeline(args.model, args.model_dtype, args.override_model_path)
pipe = PipelineManager.create_pipeline_from(MODEL_ID[args.model], dtype_map[args.model_dtype])

# Save the backbone of the pipeline and move it to the GPU
add_embedding = None
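Together, the hunks above replace the old `create_pipeline(args.model, args.model_dtype, args.override_model_path)` helper with a lookup through `MODEL_ID` and `dtype_map` followed by `PipelineManager.create_pipeline_from`. A rough usage sketch of the new call path; only `ModelType`, `PipelineManager`, and `create_pipeline_from` come from this diff, while the prompt, step count, and filenames are made up:

```python
# Illustrative only: mirrors the MODEL_ID / dtype_map lookups defined above.
import torch
from quantize import ModelType, PipelineManager

model_type = {"flux-dev": ModelType.FLUX_DEV}["flux-dev"]  # MODEL_ID[args.model]
torch_dtype = {"BFloat16": torch.bfloat16}["BFloat16"]     # dtype_map[args.model_dtype]

pipe = PipelineManager.create_pipeline_from(model_type, torch_dtype)
pipe.to("cuda")

image = pipe("a photo of an astronaut riding a horse", num_inference_steps=30).images[0]
image.save("flux-dev.png")
```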
31 changes: 31 additions & 0 deletions examples/diffusers/quantization/quantize.py
@@ -306,6 +306,37 @@ def __init__(self, config: ModelConfig, logger: logging.Logger):
self.pipe: DiffusionPipeline | None = None
self.pipe_upsample: LTXLatentUpsamplePipeline | None = None # For LTX-Video upsampling

@staticmethod
def create_pipeline_from(
model_type: ModelType, torch_dtype: torch.dtype = torch.bfloat16
) -> DiffusionPipeline:
"""
Create and return an appropriate pipeline based on configuration.
Returns:
Configured diffusion pipeline
Raises:
ValueError: If model type is unsupported
"""
try:
model_id = MODEL_REGISTRY[model_type]
if model_type == ModelType.SD3_MEDIUM:
pipe = StableDiffusion3Pipeline.from_pretrained(model_id, torch_dtype=torch_dtype)
elif model_type in [ModelType.FLUX_DEV, ModelType.FLUX_SCHNELL]:
pipe = FluxPipeline.from_pretrained(model_id, torch_dtype=torch_dtype)
else:
# SDXL models
pipe = DiffusionPipeline.from_pretrained(
model_id,
torch_dtype=torch_dtype,
use_safetensors=True,
)
pipe.set_progress_bar_config(disable=True)
return pipe
except Exception as e:
raise e

def create_pipeline(self) -> DiffusionPipeline:
"""
Create and return an appropriate pipeline based on configuration.
2 changes: 2 additions & 0 deletions examples/gpt-oss/README.md
@@ -49,6 +49,8 @@ model = mtq.quantize(model, config, forward_loop)
train(model, train_loader, optimizer, scheduler, ...)
```

For an end-to-end example showcasing the above workflow, check out [qat-finetune-transformers.ipynb](/examples/gpt-oss/qat-finetune-transformers.ipynb).

If you are training Huggingface models with Huggingface trainer classes such as [SFTTrainer](https://huggingface.co/docs/trl/en/sft_trainer), performing QAT is even easier: simply replace the trainer with its ModelOpt equivalent, `QATSFTTrainer`, and pass it the additional quantization arguments. `QATSFTTrainer` performs the necessary quantization steps in the backend and trains the model just like the original `SFTTrainer`.
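
Below is a rough sketch of that swap. The import path and the extra quantization argument shown here are assumptions made for illustration, not the documented ModelOpt API; `sft.py` in this folder shows the actual usage.

```python
# Sketch only: QATSFTTrainer's real import path and quantization-argument names
# may differ from what is shown here -- see sft.py in this folder for the
# working example.
from datasets import load_dataset
from trl import SFTConfig

from modelopt.torch.quantization.plugins import QATSFTTrainer  # assumed import path

dataset = load_dataset("trl-lib/Capybara", split="train")

trainer = QATSFTTrainer(  # drop-in replacement for trl.SFTTrainer
    model="openai/gpt-oss-20b",
    args=SFTConfig(output_dir="gpt-oss-20b-qat"),
    train_dataset=dataset,
    quant_cfg="<quantization config>",  # placeholder for the extra quantization argument(s)
)
trainer.train()
```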

A real end-to-end example for this is in `sft.py` in this folder. To perform QAT with full-parameter SFT on the GPT-OSS 20B model, run:
22 changes: 13 additions & 9 deletions examples/gpt-oss/convert_oai_mxfp4_weight_only.py
@@ -23,11 +23,8 @@
from transformers import AutoModelForCausalLM, AutoTokenizer, Mxfp4Config
from utils import get_original_huggingface_quant_method

import modelopt.torch.opt as mto
from modelopt.torch.quantization.qtensor import MXFP4QTensor

mto.enable_huggingface_checkpointing()


def _to_oai_mxfp4_weight_only(model, block_size=32):
new_state_dict = {}
@@ -36,15 +33,20 @@ def _to_oai_mxfp4_weight_only(model, block_size=32):
# Only convert experts weights, skip bias and other modules
if "experts" in name and "bias" not in name:
param = param.transpose(-1, -2).contiguous()
quantized, scales = MXFP4QTensor.quantize(param, block_size=block_size)

shape = quantized._quantized_data.shape
quantized_tensors = []
scales_tensors = []
for expert in param:
quantized, scales = MXFP4QTensor.quantize(expert, block_size=block_size)
quantized_tensors.append(quantized._quantized_data)
scales_tensors.append(scales)
quantized = torch.stack(quantized_tensors)
scales = torch.stack(scales_tensors)

shape = quantized.shape
# Add converted weights and scales to state_dict
new_state_dict.update(
{
f"{name}_blocks": quantized._quantized_data.view(
shape[0], shape[1], -1, block_size // 2
).cpu(),
f"{name}_blocks": quantized.view(shape[0], shape[1], -1, block_size // 2).cpu(),
f"{name}_scales": scales.view(shape[0], shape[1], -1).cpu(),
}
)
@@ -134,6 +136,8 @@ def create_parser():
if args.lora_path:
model = PeftModel.from_pretrained(model, args.lora_path)
model = model.merge_and_unload() # Merge LoRA-QAT adapter weights to base model
torch.cuda.empty_cache()
gc.collect()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
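The per-expert loop above quantizes each expert weight separately and stacks the results, so the exported `*_blocks` and `*_scales` tensors keep the leading expert dimension. A shape-only sketch of the packing arithmetic (the dimensions are illustrative and the tensors hold random bytes, not real MXFP4 data):

```python
# Shape-only sketch: MXFP4 packs two 4-bit values per byte, so each 32-value
# block takes block_size // 2 = 16 bytes plus one shared scale byte.
import torch

num_experts, rows, cols, block_size = 32, 2880, 2880, 32  # made-up dimensions

packed = torch.randint(0, 256, (num_experts, rows, cols // 2), dtype=torch.uint8)
scales = torch.randint(0, 256, (num_experts, rows, cols // block_size), dtype=torch.uint8)

blocks = packed.view(num_experts, rows, -1, block_size // 2)
print(blocks.shape)  # torch.Size([32, 2880, 90, 16])
print(scales.shape)  # torch.Size([32, 2880, 90])
```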