Skip to content

Commit 8c6b915

Browse files
Fix megatron distributed checkpoint metadata pass through (#431)
Signed-off-by: Chenhan Yu <[email protected]> Co-authored-by: Keval Morabia <[email protected]>
1 parent c692074 commit 8c6b915

File tree

1 file changed

+15
-1
lines changed

1 file changed

+15
-1
lines changed

modelopt/torch/opt/plugins/mcore_dist_checkpointing.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,7 @@ def _load_extra_state_from_sharded_checkpoint(
     model: torch.nn.Module,
     checkpoint_name: str | Path,
     prefix: str,
+    metadata: dict[str, Any] | None = None,
 ) -> None:
     """Load extra state from sharded checkpoint.
@@ -187,6 +188,12 @@ def _load_extra_state_from_sharded_checkpoint(
         model: the model to load extra state into
         checkpoint_name: the checkpoint folder path
         prefix: the prefix to add to the modelopt_state keys
+        metadata: the metadata for distributed checkpointing
+
+    Note:
+        The metadata includes several breaking changes. For example, `singleton_local_shards`
+        is set to `True` (was not set before) in megatron-core-0.15.0. This flag affects the
+        sharded state_dict format and must be consistent between saving and loading.
     """
     sharded_state_dict = model.sharded_state_dict(prefix=prefix)
     extra_sharded_state_dict = {k: v for k, v in sharded_state_dict.items() if "_extra_state" in k}
@@ -208,13 +215,20 @@ def restore_sharded_modelopt_state(
     model: list[torch.nn.Module],
     checkpoint_name: str | Path,
     prefix: str = "",
+    metadata: dict[str, Any] | None = None,
 ) -> None:
     """Restore modelopt_state from the sharded state_dict format.

     Args:
         model: the model to restore the modelopt optimization
         checkpoint_name: the checkpoint folder path
         prefix: the prefix to add to the modelopt_state keys ("model." for NeMo)
+        metadata: the metadata for distributed checkpointing
+
+    Note:
+        The metadata includes several breaking changes. For example, `singleton_local_shards`
+        is set to `True` (was not set before) in megatron-core-0.15.0. This flag affects the
+        sharded state_dict format and must be consistent between saving and loading.
     """
     if len(model) > 1:
         raise ValueError("sharded_modelopt_state does not support virtual pipeline parallel!")
@@ -247,4 +261,4 @@ def restore_sharded_modelopt_state(
     #
     model[0] = mto.restore_from_modelopt_state(model[0], common_modelopt_state)

-    _load_extra_state_from_sharded_checkpoint(model[0], checkpoint_name, prefix)
+    _load_extra_state_from_sharded_checkpoint(model[0], checkpoint_name, prefix, metadata=metadata)

0 commit comments

Comments
 (0)