
Commit 53a4430

Restored FSDP2 sharded state_dict support

Signed-off-by: realAsma <[email protected]>

1 parent 77c48fe · commit 53a4430

File tree

4 files changed (+129 -23 lines)

examples/llm_qat/convert_sharded_ckpt.py

Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+
+from transformers import AutoModelForCausalLM
+
+from modelopt.torch.quantization.plugins.transformers_trainer import (
+    convert_sharded_model_to_hf_format,
+)
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Convert sharded checkpoint to HuggingFace format")
+    parser.add_argument(
+        "--hf_model_path", type=str, required=True, help="Path to the original HuggingFace model"
+    )
+    parser.add_argument(
+        "--sharded_ckpt_path",
+        type=str,
+        required=True,
+        help="Path to the sharded checkpoint directory",
+    )
+    parser.add_argument(
+        "--output_path", type=str, default="", help="Output path to save the converted model"
+    )
+
+    args = parser.parse_args()
+
+    model = AutoModelForCausalLM.from_pretrained(args.hf_model_path)
+    if os.path.exists(os.path.join(args.sharded_ckpt_path, "pytorch_model_fsdp_0")):
+        convert_sharded_model_to_hf_format(
+            model, args.sharded_ckpt_path, "modelopt_state_train.pth", args.output_path
+        )
+
+
+if __name__ == "__main__":
+    main()
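
Note: the script above is a thin CLI wrapper around convert_sharded_model_to_hf_format. The expected invocation mirrors the line this commit adds to launch.sh (all three paths below are placeholders, not defaults):

python convert_sharded_ckpt.py \
    --hf_model_path <original_hf_model> \
    --sharded_ckpt_path <qat_output_dir> \
    --output_path <converted_model_dir>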

examples/llm_qat/launch.sh

Lines changed: 2 additions & 1 deletion
@@ -180,4 +180,5 @@ CMD="accelerate launch --config-file accelerate_config/$CONFIG_FILE $FSDP_ARGS \
 
 start_time=$(date +%s)
 sh -c "$CMD"
-echo "Total time taken: $(( $(date +%s) - $start_time )) seconds"
+echo "Total time taken: $(( $(date +%s) - $start_time )) seconds"
+python convert_sharded_ckpt.py --hf_model_path $MODEL --sharded_ckpt_path $OUTPUT_DIR --output_path $OUTPUT_DIR

examples/llm_qat/utils.py

Lines changed: 2 additions & 1 deletion
@@ -171,5 +171,6 @@ def new_func(original_f_name, trainer, *args, **kwargs):
 
 def get_metrics_with_perplexity(metrics):
     """Add perplexity to the metrics."""
-    metrics = {"perplexity": float(torch.exp(torch.tensor(metrics["eval_loss"]))), **metrics}
+    if "eval_loss" in metrics:
+        metrics["perplexity"] = float(torch.exp(torch.tensor(metrics["eval_loss"])))
     return metrics
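
The guard added here matters because metrics from train-only logging carry no eval_loss key; a quick sketch of the guarded computation (sample values made up for illustration):

import torch

def get_metrics_with_perplexity(metrics):
    """Add perplexity to the metrics."""
    if "eval_loss" in metrics:
        metrics["perplexity"] = float(torch.exp(torch.tensor(metrics["eval_loss"])))
    return metrics

print(get_metrics_with_perplexity({"eval_loss": 2.0}))
# -> {'eval_loss': 2.0, 'perplexity': 7.389056...}  (perplexity = exp(eval_loss))
print(get_metrics_with_perplexity({"train_loss": 1.5}))
# -> {'train_loss': 1.5}  (passes through instead of raising KeyError)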

modelopt/torch/quantization/plugins/transformers_trainer.py

Lines changed: 74 additions & 21 deletions
@@ -21,6 +21,8 @@
 from dataclasses import dataclass, field
 
 import torch
+import torch.distributed.checkpoint as dist_cp
+from accelerate.utils import save_fsdp_model
 from tqdm import tqdm
 
 import modelopt.torch.opt as mto
@@ -114,6 +116,15 @@ def check_awq_smoothquant(quant_cfg):
     return is_awq_smoothquant
 
 
+def restore_modelopt_state_with_weights(model, modelopt_state_path):
+    """Restore the modelopt weights for fsdp2 models."""
+    _modelopt_state = torch.load(modelopt_state_path, weights_only=False)
+    modelopt_weights = _modelopt_state.pop("modelopt_state_weights", None)
+    restore_from_modelopt_state(model, _modelopt_state)
+    if modelopt_weights is not None:
+        set_quantizer_state_dict(model, modelopt_weights)
+
+
 class QATTrainer(ModelOptHFTrainer):
     """A drop-in replacement of HuggingFace's Trainer for quantization aware training with ModelOpt.
 
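
Read together with _save_modelopt_state_with_weights below, this restore helper implies the following layout for the saved state file; the concrete contents are produced by the save path, so the values here are placeholders:

# Assumed structure of the dict loaded by restore_modelopt_state_with_weights:
_modelopt_state = {
    "modelopt_state_dict": [...],     # conversion states, minus kd_loss/export_student entries
    "modelopt_state_weights": {...},  # quantizer tensors from get_quantizer_state_dict(model)
}
# The helper pops "modelopt_state_weights" first, so the remaining dict is
# exactly what restore_from_modelopt_state(model, ...) expects.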
@@ -165,10 +176,12 @@ def __init__(
         self._modelopt_state_path = os.path.join(self.args.output_dir, "modelopt_state_train.pth")
         if os.path.exists(self._modelopt_state_path):
             self._restore_modelopt_state_with_weights()
-            print_rank_0("Restored modelopt state with weights.")
+        elif is_quantized(self.model):
+            self._save_modelopt_state_with_weights()
 
     def _save_modelopt_state_with_weights(self):
         """Save the modelopt weights for fsdp2 models."""
+        print_rank_0(f"Saving modelopt state to {self._modelopt_state_path}")
         if torch.distributed.is_initialized():
             torch.distributed.barrier()
 
@@ -179,18 +192,13 @@ def _save_modelopt_state_with_weights(self):
             for state in modelopt_state["modelopt_state_dict"]
             if "kd_loss" not in state and "export_student" not in state
         ]
-        modelopt_full_state = {
-            "modelopt_state": modelopt_state,
-            "modelopt_state_weights": get_quantizer_state_dict(self.model),
-        }
-
+        modelopt_state["modelopt_state_weights"] = get_quantizer_state_dict(self.model)
         if self.args.should_save:
-            torch.save(modelopt_full_state, self._modelopt_state_path)
+            torch.save(modelopt_state, self._modelopt_state_path)
 
     def _restore_modelopt_state_with_weights(self):
-        modelopt_full_state = torch.load(self._modelopt_state_path, weights_only=False)
-        restore_from_modelopt_state(self.model, modelopt_full_state["modelopt_state"])
-        set_quantizer_state_dict(self.model, modelopt_full_state["modelopt_state_weights"])
+        restore_modelopt_state_with_weights(self.model, self._modelopt_state_path)
+        print_rank_0("Restored modelopt state with weights.")
 
     def _quantize_model(self):
         """Quantize the model. Restore the quantization state if it exists."""
@@ -219,7 +227,6 @@ def forward_loop(model):
         # Force garbage collection to free up memory
         gc.collect()
 
-        print_rank_0(f"Saving modelopt state to {self._modelopt_state_path}")
         self._save_modelopt_state_with_weights()
         torch.cuda.empty_cache()
 
@@ -247,17 +254,29 @@ def evaluate(self, *args, **kwargs):
             self.model, _ = self.accelerator.prepare(self.model, dummy_optimizer)
         return super().evaluate(*args, **kwargs)
 
-    def save_model(self, *args, **kwargs):
+    def save_model(
+        self, output_dir: str | None = None, _internal_call: bool = False, *args, **kwargs
+    ):
         """Save the quantized model."""
-        if (
-            (not self.is_in_train)
-            and self.is_fsdp_enabled
-            and self.accelerator.state.fsdp_plugin.state_dict_type != "FULL_STATE_DICT"
-        ):
-            print_rank_0("Setting state_dict_type to FULL_STATE_DICT for final checkpoint save.")
-            # TODO: test is this fix works for multi-node training
-            self.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT")
-        return super().save_model(*args, **kwargs)
+        dict_type = (
+            str(self.accelerator.state.fsdp_plugin.state_dict_type) if self.is_fsdp_enabled else ""
+        )
+        if not _internal_call and self.is_fsdp_enabled and "SHARDED_STATE_DICT" in dict_type:
+            # The default save_model in Trainer doesn't save checkpoint with SHARDED_STATE_DICT + FSDP.
+            # We save the model manually at the end of the training in order to convert the last
+            # checkpoint from distcp to HF compatible format.
+            if output_dir is None:
+                output_dir = self.args.output_dir
+            save_fsdp_model(
+                self.accelerator.state.fsdp_plugin,
+                self.accelerator,
+                self.model,
+                output_dir,
+            )
+            self.processing_class.save_pretrained(output_dir)
+            self.model.config.save_pretrained(output_dir)
+        else:
+            super().save_model(output_dir, _internal_call, *args, **kwargs)
 
     def _patch_accelerate_for_fsdp2_fix(self):
         """Fixes for accelerate prepare.
@@ -360,3 +379,37 @@ def save_model(
         return KDTrainer.save_model(
             self, output_dir, _internal_call, export_student, *args, **kwargs
         )
+
+
+def convert_sharded_model_to_hf_format(
+    model, model_path, modelopt_state_name="modelopt_state.pth", output_path=None
+):
+    """Convert a sharded model to HF format.
+
+    Args:
+        model: The original HF model.
+        model_path: The path to the sharded model with pytorch_model_fsdp_0 directory.
+        modelopt_state_name: The name of the modelopt state file. If not provided, the default name
+            "modelopt_state.pth" will be used.
+        output_path: The path to save the converted model. If not provided, the model will be saved
+            to the same directory as the sharded model.
+    """
+    if output_path is None:
+        output_path = model_path
+    os.makedirs(output_path, exist_ok=True)
+    state_dict = {"model": model.state_dict()}
+    sharded_model_path = os.path.join(model_path, "pytorch_model_fsdp_0")
+    modelopt_state_path = os.path.join(model_path, modelopt_state_name)
+    if not os.path.exists(sharded_model_path):
+        print_rank_0(f"Sharded model path does not exist: {sharded_model_path}")
+        return model
+    dist_cp.load_state_dict(
+        state_dict=state_dict,
+        storage_reader=dist_cp.FileSystemReader(sharded_model_path),
+        no_dist=True,
+    )
+    model.load_state_dict(state_dict["model"])
+    restore_modelopt_state_with_weights(model, modelopt_state_path)
+    mto.enable_huggingface_checkpointing()
+    model.save_pretrained(output_path)
+    return model
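
For reference, a minimal offline use of the converter, equivalent to what the new convert_sharded_ckpt.py entry point does (model and directory paths are placeholders):

from transformers import AutoModelForCausalLM

from modelopt.torch.quantization.plugins.transformers_trainer import (
    convert_sharded_model_to_hf_format,
)

model = AutoModelForCausalLM.from_pretrained("path/to/original_hf_model")
convert_sharded_model_to_hf_format(
    model,
    model_path="path/to/qat_output_dir",  # must contain pytorch_model_fsdp_0/
    modelopt_state_name="modelopt_state_train.pth",
    output_path="path/to/converted_hf_model",  # defaults to model_path when omitted
)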
