NVIDIA
diff --git a/examples/llm_ptq/multinode-ptq.py b/examples/llm_ptq/multinode-ptq.py
Lines changed: 24 additions & 38 deletions
diff --git a/modelopt/torch/export/layer_utils.py b/modelopt/torch/export/layer_utils.py
Lines changed: 1 addition & 3 deletions
diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
Lines changed: 16 additions & 5 deletions
@@ -21,9 +21,7 @@
 import modelopt.torch.opt as mto
 import modelopt.torch.quantization as mtq
 from modelopt.torch.export import get_model_type
-from modelopt.torch.export.convert_hf_config import convert_hf_quant_config_format
-from modelopt.torch.export.quant_utils import postprocess_state_dict
-from modelopt.torch.export.unified_export_hf import _export_hf_checkpoint
+from modelopt.torch.export.unified_export_hf import export_hf_checkpoint
 from modelopt.torch.quantization.config import need_calibration
 from modelopt.torch.quantization.utils import patch_fsdp_mp_dtypes
 from modelopt.torch.utils.dataset_utils import get_dataset_dataloader, get_supported_datasets
@@ -121,11 +119,6 @@ def parse_args():
         action="store_true",
         help="Trust remote code for HuggingFace models",
     )
-    parser.add_argument(
-        "--attn_implementation",
-        type=str,
-        help="Attention implementation to use (passed to HF model loading)",
-    )
     parser.add_argument("--awq_block_size", default=0, type=int)
 
     args = parser.parse_args()
@@ -159,6 +152,8 @@ def load_and_prepare_model(
     )
     model.eval()
     model_type = get_model_type(model)
+    # Save the original architectures now; they are needed at export time because
+    # FSDP2 wrapping adds an FSDP prefix to the architecture names
     original_architectures = model.config.architectures
 
     # FSDP2 requires an optimizer to be prepared together with the model
@@ -274,6 +269,8 @@ def calibrate(unwrapped_model):
                     for k, v in batch.items()
                 }
             # Use outer model (FSDP-wrapped), not the parameter
+            # Important: run the forward pass on the FSDP-wrapped `model`, not on
+            # `unwrapped_model`, which mtq.quantize unwraps and passes to the forward_loop
             model(**batch)
 
     return calibrate
@@ -293,41 +290,27 @@ def export_model(
         export_path: Directory to export model to
     """
     export_dir = Path(export_path)
-    export_dir.mkdir(parents=True, exist_ok=True)
 
     # Get quantization config
-    _, hf_quant_config = _export_hf_checkpoint(model, dtype=torch.bfloat16)
-
-    # Gather and post-process state dict
-    model_state_dict = accelerator.get_state_dict(model)
-    post_state_dict = postprocess_state_dict(model_state_dict, 1.0, None)
-
-    # Save quantization config
-    if accelerator.is_main_process:
-        with open(export_dir / "hf_quant_config.json", "w") as f:
-            json.dump(hf_quant_config, f, indent=4)
-
-        # Convert config format
-        hf_quant_config = convert_hf_quant_config_format(hf_quant_config)
-
-        # Save model
-        model.save_pretrained(
-            export_dir,
-            state_dict=post_state_dict,
-            save_modelopt_state=False,
-        )
+    export_hf_checkpoint(
+        model,
+        dtype=torch.bfloat16,
+        export_dir=export_dir,
+        save_modelopt_state=False,
+        is_fsdp2=True,
+        accelerator=accelerator,
+    )
 
-        # Update config with quantization info
-        config_path = export_dir / "config.json"
-        with open(config_path) as f:
-            config_data = json.load(f)
+    # Update config with quantization info
+    config_path = export_dir / "config.json"
+    with open(config_path) as f:
+        config_data = json.load(f)
 
-        config_data["quantization_config"] = hf_quant_config
-        # Update architectures with original architecture. FSDP prefix must be removed for FSDP wrapped models.
-        config_data["architectures"] = architectures
+    # Update architectures with original architecture. FSDP prefix must be removed for FSDP wrapped models.
+    config_data["architectures"] = architectures
 
-        with open(config_path, "w") as f:
-            json.dump(config_data, f, indent=4)
+    with open(config_path, "w") as f:
+        json.dump(config_data, f, indent=4)
 
 
 def main(args):
@@ -402,10 +385,13 @@ def main(args):
         print(f"Quantization completed in {elapsed:.2f}s")
         mtq.print_quant_summary(model)
 
+    start_time = time.time()
     export_model(model, accelerator, args.export_path, original_architectures)
+    elapsed = time.time() - start_time
 
     if accelerator.is_main_process:
         # Export the model
+        print(f"Export completed in {elapsed:.2f}s")
         print(f"Model exported to {args.export_path}")
 
     print("Unpatching FSDP2 MP dtypes")
 
@@ -345,9 +345,7 @@ def is_moe(module: nn.Module) -> bool:
 
 def is_quantlinear(module: nn.Module) -> bool:
     """Returns whether the module is a quantized linear layer."""
-    return (
-        "QuantLinear" in type(module).__name__ and "lora" not in type(module).__name__.lower()
-    ) or ("Quant" in type(module).__name__ and "Linear" in type(module).__name__)
+    return "QuantLinear" in type(module).__name__ and "lora" not in type(module).__name__.lower()
 
 
 def dup_kv_weight(v: torch.Tensor, head_size: int, num_head: int, tp_size: int) -> torch.Tensor:
 
@@ -26,13 +26,13 @@
 
 import torch
 import torch.nn as nn
+from accelerate import Accelerator
 from safetensors.torch import save_file
 
 from modelopt.torch.quantization import set_quantizer_by_cfg_context
 from modelopt.torch.quantization.nn import SequentialQuantizer, TensorQuantizer
 from modelopt.torch.quantization.qtensor import NVFP4QTensor
-from modelopt.torch.quantization.qtensor.base_qtensor import fsdp2_aware_weight_update
-from modelopt.torch.quantization.utils import quantizer_attr_names
+from modelopt.torch.quantization.utils import fsdp2_aware_weight_update, quantizer_attr_names
 
 from .convert_hf_config import convert_hf_quant_config_format
 from .layer_utils import (
@@ -344,7 +344,10 @@ def _export_quantized_weight(
 
 
 def _export_hf_checkpoint(
-    model: nn.Module, dtype: torch.dtype | None = None
+    model: nn.Module,
+    dtype: torch.dtype | None = None,
+    is_fsdp2: bool = False,
+    accelerator: Accelerator | None = None,
 ) -> tuple[dict[str, Any], dict[str, Any]]:
     """Exports the torch model to the packed checkpoint with original HF naming.
 
@@ -490,7 +493,11 @@ def _export_hf_checkpoint(
                     with fsdp2_aware_weight_update(model, sub_module):
                         _export_quantized_weight(sub_module, dtype, weight_name)
 
-    quantized_state_dict = model.state_dict()
+    if is_fsdp2:
+        assert accelerator is not None, "Accelerator is required for FSDP2 export"
+        quantized_state_dict = accelerator.get_state_dict(model)
+    else:
+        quantized_state_dict = model.state_dict()
 
     quantized_state_dict = postprocess_state_dict(
         quantized_state_dict, kv_cache_max_bound, kv_cache_format
@@ -508,6 +515,8 @@ def export_hf_checkpoint(
     dtype: torch.dtype | None = None,
     export_dir: Path | str = tempfile.gettempdir(),
     save_modelopt_state: bool = False,
+    is_fsdp2: bool = False,
+    accelerator: Accelerator | None = None,
 ):
     """Exports the torch model to unified checkpoint and saves to export_dir.
 
@@ -529,7 +538,9 @@ def export_hf_checkpoint(
         return
 
     try:
-        post_state_dict, hf_quant_config = _export_hf_checkpoint(model, dtype)
+        post_state_dict, hf_quant_config = _export_hf_checkpoint(
+            model, dtype, is_fsdp2, accelerator
+        )
 
         # Save hf_quant_config.json for backward compatibility
         with open(f"{export_dir}/hf_quant_config.json", "w") as file: