
Commit 417f17a

added optimization for export and extra note on performance
Signed-off-by: Suguna Velury <[email protected]>
1 parent d95e2fd commit 417f17a

3 files changed: +19 −7 lines

- examples/llm_ptq/README.md
- modelopt/torch/export/unified_export_hf.py
- modelopt/torch/quantization/utils.py

examples/llm_ptq/README.md

Lines changed: 1 addition & 1 deletion
@@ -265,7 +265,7 @@ accelerate launch --config_file fsdp2.yaml \
 
 The exported checkpoint can be deployed using TensorRT-LLM/ vLLM/ SGLang. For more details refer to the [deployment section](#deployment) of this document.
 
-> *Performance Note: FSDP2 is designed for training workloads and may result in longer calibration and export times. For faster calibration, maximize the batch size based on available GPU memory.*
+> *Performance Note: FSDP2 is designed for training workloads and may result in longer calibration and export times. For faster calibration, maximize the batch size based on available GPU memory and choose the right number of GPUs to avoid unnecessary communication.*
 >
 ## Framework Scripts
 
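To make the batch-size half of that note concrete, here is a rough heuristic sketch (the helper name `pick_calib_batch_size` and the per-sample memory estimate are assumptions for illustration, not part of the PTQ scripts): derive the calibration batch size from the free memory reported on the current GPU.

```python
import torch

def pick_calib_batch_size(per_sample_bytes: int, safety: float = 0.8) -> int:
    """Heuristic sketch: derive a calibration batch size from free GPU memory."""
    free_bytes, _total_bytes = torch.cuda.mem_get_info()  # free/total bytes on the current CUDA device
    return max(1, int(free_bytes * safety) // per_sample_bytes)
```

The GPU-count half of the note follows from FSDP2 sharding: each additional rank participates in the gather/reshard traffic during calibration and export, so roughly speaking, use only as many GPUs as the model actually needs to fit.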

modelopt/torch/export/unified_export_hf.py

Lines changed: 15 additions & 3 deletions
@@ -32,6 +32,7 @@
 except ImportError:  # pragma: no cover
     Accelerator = None
 from safetensors.torch import save_file
+from torch.distributed.fsdp import FSDPModule
 
 from modelopt.torch.quantization import set_quantizer_by_cfg_context
 from modelopt.torch.quantization.nn import SequentialQuantizer, TensorQuantizer
@@ -350,7 +351,7 @@ def _export_quantized_weight(
 def _export_hf_checkpoint(
     model: nn.Module,
     dtype: torch.dtype | None = None,
-    accelerator: Accelerator | None = None,
+    **kwargs,
 ) -> tuple[dict[str, Any], dict[str, Any]]:
     """Exports the torch model to the packed checkpoint with original HF naming.
 
@@ -373,6 +374,8 @@
             f"({dtype}), which may lead to numerical errors."
         )
 
+    accelerator = kwargs.get("accelerator")
+
     # Create a model layer pool
     # If `model.model` exists use that, otherwise use `model` itself, e.g., Nemotron-H
     root = getattr(model, "model", model)
@@ -470,12 +473,21 @@
 
     # Track if any layers are quantized to properly set exclude_modules
     has_quantized_layers = False
+    fsdp_module_to_reshard = None
 
     for name, sub_module in layer_pool.items():
+        # Optimization to perform resharding only once per decoder layer to avoid extra communication overhead
+        if isinstance(sub_module, FSDPModule):
+            # Every time we encounter a new FSDPModule, we need to reshard the previous one
+            if fsdp_module_to_reshard is not None:
+                fsdp_module_to_reshard.reshard()
+
+            fsdp_module_to_reshard = sub_module
+
         if get_quantization_format(sub_module) != QUANTIZATION_NONE:
             has_quantized_layers = True
             if is_quantlinear(sub_module):
-                with fsdp2_aware_weight_update(model, sub_module):
+                with fsdp2_aware_weight_update(model, sub_module, reshard=False):
                     _export_quantized_weight(sub_module, dtype)
             elif (
                 "Llama4TextExperts" in type(sub_module).__name__
@@ -494,7 +506,7 @@
                 )
                 # Export the quantized weights
                 for weight_name in ["gate_up_proj", "down_proj"]:
-                    with fsdp2_aware_weight_update(model, sub_module):
+                    with fsdp2_aware_weight_update(model, sub_module, reshard=False):
                         _export_quantized_weight(sub_module, dtype, weight_name)
 
     if accelerator is not None:
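The loop above defers resharding so that each FSDP2-wrapped decoder layer is resharded at most once, instead of after every individual weight update. A minimal standalone sketch of the same pattern (the `layer_pool`/`export_fn` names and the trailing reshard are illustrative assumptions, not the exporter's exact code):

```python
from torch.distributed.fsdp import FSDPModule

def export_with_deferred_reshard(layer_pool: dict, export_fn) -> None:
    """Visit each layer, resharding the previously visited FSDP2 module exactly once."""
    pending: FSDPModule | None = None  # last FSDPModule whose reshard is still deferred
    for name, sub_module in layer_pool.items():
        if isinstance(sub_module, FSDPModule):
            if pending is not None:
                pending.reshard()  # free the previous layer's unsharded parameters
            pending = sub_module
        export_fn(name, sub_module)  # weight updates run with reshard=False (see utils.py below)
    if pending is not None:
        pending.reshard()  # reshard whichever layer is still outstanding
```

The effect is that at most one decoder layer sits unsharded at a time, while the per-update reshard (and the re-gather it would force on the next update) is skipped.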

modelopt/torch/quantization/utils.py

Lines changed: 3 additions & 3 deletions
@@ -593,7 +593,7 @@ def enable_fake_quant(module):
 
 
 @contextmanager
-def fsdp2_aware_weight_update(root_model, modules_to_update):
+def fsdp2_aware_weight_update(root_model, modules_to_update, reshard=True):
     """Context manager to update the FSDPParam list if an update is made to a submodule of an FSDPModule."""
     try:
         if isinstance(root_model, FSDPModule):
@@ -672,5 +672,5 @@ def fsdp2_aware_weight_update(root_model, modules_to_update):
                 fsdp_param_group.fsdp_params = list(fsdp_param_mapping.values())
 
         # Reshard FSDP root module
-        # TODO: Add a check to reshard only if necessary, can help performance during export
-        root_module.reshard()
+        if reshard:
+            root_module.reshard()
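The `reshard` flag hands the reshard decision to the caller: the export loop above passes `reshard=False` and reshards each decoder layer itself. A minimal sketch of that division of labor (the `weight_update` name and the unshard/reshard placement are assumptions for illustration, not the library's implementation):

```python
from contextlib import contextmanager

from torch.distributed.fsdp import FSDPModule

@contextmanager
def weight_update(root_module: FSDPModule, reshard: bool = True):
    """Unshard for an in-place weight update; optionally leave resharding to the caller."""
    root_module.unshard()  # gather the full parameters so they can be rewritten in place
    try:
        yield
    finally:
        if reshard:  # export passes reshard=False and reshards once per decoder layer instead
            root_module.reshard()
```

With `reshard=False`, back-to-back updates to modules inside the same decoder layer reuse the already-gathered parameters instead of paying an unshard/reshard round trip per update.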
