remove ckpt_save_pre_mcore_014 support (#15146)

dimapihtar · web-flow · commit 77026c36a3ec · 2025-12-15T10:52:39.000-08:00
* remove ckpt_save_pre_mcore_014 param

Signed-off-by: dimapihtar <dpihtar@gmail.com>

* remove imports

Signed-off-by: dimapihtar <dpihtar@gmail.com>

---------

Signed-off-by: dimapihtar <dpihtar@gmail.com>
diff --git a/nemo/lightning/pytorch/strategies/megatron_strategy.py b/nemo/lightning/pytorch/strategies/megatron_strategy.py
@@ -58,7 +58,6 @@
     from megatron.core.dist_checkpointing.validation import StrictHandling
     from megatron.core.distributed import DistributedDataParallelConfig
     from megatron.core.optimizer import OptimizerConfig
-    from megatron.core.utils import get_torch_version, is_torch_min_version
 
     HAVE_MEGATRON_CORE = True
 except (ImportError, ModuleNotFoundError):
@@ -216,11 +215,6 @@ class MegatronStrategy(DDPStrategy, io.IOMixin):
             If not None, overwrites the `strict` flag passed to `load_checkpoint`.
             Defaults to None. For a list of supported values, refer to the Megatron Core documentation:
             https://github.com/NVIDIA/Megatron-LM/blob/d4e72c0d33edc0c53aeb624f617eb77cebce6ae9/megatron/core/dist_checkpointing/validation.py#L46
-        ckpt_save_pre_mcore_014 (bool, optional): if True, brings back sharded state dict definition from
-            before Megatron-Core v0.14 versions for checkpoint saving. It doesn't affect loading as the
-            loading format is determined based on metadata stored in the checkpoint. This flag  is provided
-            temporarily as a fallback to previous behavior in case of unexpected issues with the new formats.
-            Defaults to False.
         ckpt_optim_fully_reshardable (bool, optional): switches to a fully reshardable (TP/PP/DP/EP)
             optimizer format. Defaults to False, in which case a DP-only reshardable format is used.
         distrib_optim_fully_reshardable_mem_efficient (bool, optional): minimizes CUDA and host memory
@@ -301,7 +295,6 @@ def __init__(
         ckpt_parallel_save_optim: Optional[bool] = None,
         ckpt_load_directly_on_device: bool = True,
         ckpt_load_strictness: Optional['StrictHandling'] = None,
-        ckpt_save_pre_mcore_014: bool = False,
         ckpt_optim_fully_reshardable: bool = False,
         distrib_optim_fully_reshardable_mem_efficient: bool = False,
         setup_optimizers: bool = True,
@@ -352,7 +345,6 @@ def __init__(
         self.ckpt_save_optimizer = ckpt_save_optimizer
         self.ckpt_load_main_params = ckpt_load_main_params
         self.ckpt_load_strictness = ckpt_load_strictness
-        self.ckpt_save_pre_mcore_014 = ckpt_save_pre_mcore_014
         self.ckpt_optim_fully_reshardable = ckpt_optim_fully_reshardable
         self.distrib_optim_fully_reshardable_mem_efficient = distrib_optim_fully_reshardable_mem_efficient
         self.use_te_rng_tracker = use_te_rng_tracker
@@ -442,11 +434,10 @@ def __init__(
         if self.ckpt_load_optimizer and self.ckpt_load_main_params:
             raise ValueError("ckpt_load_optimizer and ckpt_load_main_params cannot be both set to True.")
 
-        if self.parallel_save_optim is not None and not self.ckpt_save_pre_mcore_014:
+        if self.parallel_save_optim is not None:
             logging.warning(
                 "`ckpt_parallel_save_optim` argument is replaced with"
                 " `ckpt_optim_fully_reshardable` and does not have any effect"
-                " (unless used together with `ckpt_save_pre_mcore_014=True`)"
             )
 
         if isinstance(self.ddp_config, DistributedDataParallelConfig):
@@ -1228,28 +1219,14 @@ def sharded_state_dict_metadata(self):
         if use_distributed_optimizer and use_megatron_fsdp:
             metadata["distrib_optim_sharding_type"] = "fsdp_dtensor"
 
-        force_pre_mcore_014 = not is_torch_min_version("2.6a0")
-        if force_pre_mcore_014:
-            logging.warning(
-                f"PyTorch version {get_torch_version()} below 2.6 detected."
-                f" Forcing ckpt_save_pre_mcore_014 behavior."
-            )
-
-        if self.ckpt_save_pre_mcore_014 or force_pre_mcore_014:
-            if use_distributed_optimizer and not use_megatron_fsdp:
-                if self.parallel_save_optim:
-                    metadata["distrib_optim_sharding_type"] = "fully_sharded_model_space"
-                else:
-                    metadata["distrib_optim_sharding_type"] = "dp_zero_gather_scatter"
-        else:
-            if use_distributed_optimizer and not use_megatron_fsdp:
-                if self.ckpt_optim_fully_reshardable:
-                    metadata['distrib_optim_sharding_type'] = 'fully_reshardable'
-                    metadata['distrib_optim_fully_reshardable_mem_efficient'] = (
-                        self.distrib_optim_fully_reshardable_mem_efficient
-                    )
-                else:
-                    metadata['distrib_optim_sharding_type'] = 'dp_reshardable'
+        if use_distributed_optimizer and not use_megatron_fsdp:
+            if self.ckpt_optim_fully_reshardable:
+                metadata['distrib_optim_sharding_type'] = 'fully_reshardable'
+                metadata['distrib_optim_fully_reshardable_mem_efficient'] = (
+                    self.distrib_optim_fully_reshardable_mem_efficient
+                )
+            else:
+                metadata['distrib_optim_sharding_type'] = 'dp_reshardable'
         return metadata
 
     def selective_restore(self) -> None:
diff --git a/tests/lightning/pytorch/strategies/test_megatron_strategy.py b/tests/lightning/pytorch/strategies/test_megatron_strategy.py
@@ -22,25 +22,18 @@
 
 
 def get_metadata(
-    ckpt_save_pre_mcore_014: bool = None,
     ckpt_parallel_save_optim: bool = None,
     ckpt_optim_fully_reshardable: bool = None,
 ) -> dict:
     metadata = {
         'singleton_local_shards': False,
         'chained_optim_avoid_prefix': True,
     }
-    if ckpt_save_pre_mcore_014:
-        if ckpt_parallel_save_optim:
-            metadata['distrib_optim_sharding_type'] = 'fully_sharded_model_space'
-        else:
-            metadata['distrib_optim_sharding_type'] = 'dp_zero_gather_scatter'
+    if ckpt_optim_fully_reshardable:
+        metadata['distrib_optim_sharding_type'] = 'fully_reshardable'
+        metadata['distrib_optim_fully_reshardable_mem_efficient'] = False
     else:
-        if ckpt_optim_fully_reshardable:
-            metadata['distrib_optim_sharding_type'] = 'fully_reshardable'
-            metadata['distrib_optim_fully_reshardable_mem_efficient'] = False
-        else:
-            metadata['distrib_optim_sharding_type'] = 'dp_reshardable'
+        metadata['distrib_optim_sharding_type'] = 'dp_reshardable'
 
     return metadata
 
@@ -95,18 +88,10 @@ def test_ckpt_load_main_params_without_state_dict(self):
         strategy.optimizers[0].reload_model_params.assert_called_once_with(checkpoint)
 
     def test_sharded_state_dict_metadata(self):
-        strategy = MegatronStrategy(ckpt_save_pre_mcore_014=False, ckpt_parallel_save_optim=True)
+        strategy = MegatronStrategy(ckpt_parallel_save_optim=True)
 
         ddp = DistributedDataParallelConfig(use_distributed_optimizer=True)
 
-        strategy = MegatronStrategy(ckpt_save_pre_mcore_014=True, ckpt_parallel_save_optim=True, ddp=ddp)
-        metadata = strategy.sharded_state_dict_metadata
-        assert metadata == get_metadata(ckpt_save_pre_mcore_014=True, ckpt_parallel_save_optim=True)
-
-        strategy = MegatronStrategy(ckpt_save_pre_mcore_014=True, ddp=ddp)
-        metadata = strategy.sharded_state_dict_metadata
-        assert metadata == get_metadata(ckpt_save_pre_mcore_014=True)
-
         strategy = MegatronStrategy(ckpt_optim_fully_reshardable=True, ddp=ddp)
         metadata = strategy.sharded_state_dict_metadata
         assert metadata == get_metadata(ckpt_optim_fully_reshardable=True)