(9/n) Support 2D Parallelism - Remaining Checkpoint Logic (#19888)

awaelchli · lantiga · web-flow · commit 414c86332e74 · 2024-05-22T18:13:41.000-04:00
Co-authored-by: Luca Antiga <luca.antiga@gmail.com>
diff --git a/src/lightning/fabric/strategies/model_parallel.py b/src/lightning/fabric/strategies/model_parallel.py
@@ -412,6 +412,7 @@ def _load_checkpoint(
     path: Path,
     state: Dict[str, Union[Module, Optimizer, Any]],
     strict: bool = True,
+    optimizer_states_from_list: bool = False,
 ) -> Dict[str, Any]:
     from torch.distributed.checkpoint.state_dict import (
         StateDictOptions,
@@ -473,8 +474,15 @@ def _load_checkpoint(
             full_state_dict=True,
             strict=strict,
         )
-        for optimizer_name, optimizer in optimizers.items():
-            optimizer_state = _rekey_optimizer_state_if_needed(checkpoint.pop(optimizer_name), module)
+        for optimizer_idx, (optimizer_name, optimizer) in enumerate(optimizers.items()):
+            if optimizer_states_from_list:
+                # This code path is only used by `lightning.pytorch`, which saves optimizer states as a list
+                # rather than individual states at the top level.
+                optimizer_state = checkpoint["optimizer_states"][optimizer_idx]
+            else:
+                optimizer_state = checkpoint.pop(optimizer_name)
+
+            optimizer_state = _rekey_optimizer_state_if_needed(optimizer_state, module)
             set_optimizer_state_dict(
                 module,
                 optimizer,
diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md
@@ -18,6 +18,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Added support for PyTorch 2.3 ([#19708](https://github.com/Lightning-AI/pytorch-lightning/pull/19708))
 
+- Added `ModelParallelStrategy` to support 2D parallelism ([#19878](https://github.com/Lightning-AI/pytorch-lightning/pull/19878), [#19888](https://github.com/Lightning-AI/pytorch-lightning/pull/19888))
+
+
 
 ### Changed
 
diff --git a/src/lightning/pytorch/strategies/model_parallel.py b/src/lightning/pytorch/strategies/model_parallel.py
@@ -11,8 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import shutil
 from contextlib import contextmanager, nullcontext
 from datetime import timedelta
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Dict, Generator, List, Literal, Mapping, Optional, Union
 
 import torch
@@ -22,9 +24,13 @@
 from typing_extensions import override
 
 import lightning.pytorch as pl
-from lightning.fabric.plugins import CheckpointIO
 from lightning.fabric.plugins.collectives.torch_collective import default_pg_timeout
-from lightning.fabric.strategies.model_parallel import _setup_device_mesh
+from lightning.fabric.strategies.model_parallel import (
+    _distributed_checkpoint_save,
+    _is_sharded_checkpoint,
+    _load_checkpoint,
+    _setup_device_mesh,
+)
 from lightning.fabric.utilities.distributed import (
     _distributed_is_initialized,
     _get_default_process_group_backend_for_device,
@@ -34,6 +40,7 @@
 from lightning.fabric.utilities.distributed import group as _group
 from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_3
 from lightning.fabric.utilities.init import _materialize_distributed_module
+from lightning.fabric.utilities.load import _METADATA_FILENAME
 from lightning.fabric.utilities.optimizer import _optimizers_to_device
 from lightning.fabric.utilities.seed import reset_seed
 from lightning.fabric.utilities.types import _PATH, ReduceOp
@@ -95,16 +102,6 @@ def device_mesh(self) -> "DeviceMesh":
             raise RuntimeError("Accessing the device mesh before processes have initialized is not allowed.")
         return self._device_mesh
 
-    @property
-    @override
-    def checkpoint_io(self) -> CheckpointIO:
-        raise NotImplementedError(f"The `{type(self).__name__}` does not use the `CheckpointIO` plugin interface.")
-
-    @checkpoint_io.setter
-    @override
-    def checkpoint_io(self, io: CheckpointIO) -> None:
-        raise NotImplementedError(f"The `{type(self).__name__}` does not support setting a `CheckpointIO` plugin.")
-
     @property
     @override
     def root_device(self) -> torch.device:
@@ -253,6 +250,11 @@ def teardown(self) -> None:
 
     @override
     def lightning_module_state_dict(self) -> Dict[str, Any]:
+        """Collects the state dict of the model.
+
+        Only returns a non-empty state dict on rank 0 if ``save_distributed_checkpoint=False``.
+
+        """
         from torch.distributed.checkpoint.state_dict import StateDictOptions, get_model_state_dict
 
         state_dict_options = StateDictOptions(full_state_dict=(not self._save_distributed_checkpoint), cpu_offload=True)
@@ -266,6 +268,11 @@ def load_model_state_dict(self, checkpoint: Mapping[str, Any], strict: bool = Tr
 
     @override
     def optimizer_state(self, optimizer: Optimizer) -> Dict[str, Any]:
+        """Collects the state of the given optimizer.
+
+        Only returns a non-empty state dict on rank 0 if ``save_distributed_checkpoint=False``.
+
+        """
         from torch.distributed.checkpoint.state_dict import StateDictOptions, get_optimizer_state_dict
         from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
         from torch.distributed.fsdp import OptimStateKeyType
@@ -275,8 +282,9 @@ def optimizer_state(self, optimizer: Optimizer) -> Dict[str, Any]:
             optimizer = optimizer._optimizer
 
         assert self.model is not None
+
         state_dict = get_optimizer_state_dict(self.model, optimizer, options=state_dict_options)
-        if not self._save_distributed_checkpoint:
+        if not self._save_distributed_checkpoint and self.global_rank == 0:
             # Store the optimizer state dict in standard format
             state_dict = FSDP.rekey_optim_state_dict(state_dict, OptimStateKeyType.PARAM_ID, self.model)
         return state_dict
@@ -295,11 +303,45 @@ def save_checkpoint(
                 f"`{type(self).__name__}.save_checkpoint(..., storage_options=...)` is not supported because"
                 f" `{type(self).__name__}` does not use the `CheckpointIO`."
             )
-        raise NotImplementedError("Checkpoint saving is not yet implemented.")
+        # broadcast the path from rank 0 to ensure all the checkpoints are saved to a common path
+        path = Path(self.broadcast(filepath))
+        if path.is_dir() and not self._save_distributed_checkpoint and not _is_sharded_checkpoint(path):
+            raise IsADirectoryError(f"The checkpoint path exists and is a directory: {path}")
+
+        if self._save_distributed_checkpoint:
+            if path.is_file():
+                path.unlink()
+            path.mkdir(parents=True, exist_ok=True)
+
+            converted_state = {"state_dict": checkpoint.pop("state_dict")}
+            converted_state.update({
+                f"optimizer_{idx}": optim_state
+                for idx, optim_state in enumerate(checkpoint.pop("optimizer_states", []))
+            })
+            _distributed_checkpoint_save(converted_state, path)
+
+            if self.global_rank == 0:
+                torch.save(checkpoint, path / _METADATA_FILENAME)
+        else:
+            if _is_sharded_checkpoint(path):
+                shutil.rmtree(path)
+            return super().save_checkpoint(checkpoint=checkpoint, filepath=path)
 
     @override
     def load_checkpoint(self, checkpoint_path: _PATH) -> Dict[str, Any]:
-        raise NotImplementedError("Checkpoint loading is not yet implemented.")
+        # broadcast the path from rank 0 to ensure all the states are loaded from a common path
+        path = Path(self.broadcast(checkpoint_path))
+        state = {
+            "state_dict": self.model,
+            **{f"optimizer_{idx}": optimizer for idx, optimizer in enumerate(self.optimizers)},
+        }
+        assert self.lightning_module is not None
+        return _load_checkpoint(
+            path=path,
+            state=state,
+            strict=self.lightning_module.strict_loading,
+            optimizer_states_from_list=True,
+        )
 
     def _setup_distributed(self) -> None:
         super().setup_environment()
diff --git a/tests/tests_pytorch/strategies/test_fsdp.py b/tests/tests_pytorch/strategies/test_fsdp.py
@@ -210,15 +210,15 @@ def test_invalid_on_cpu(tmp_path, cuda_count_0):
         trainer.strategy.setup_environment()
 
 
-def test_fsdp_custom_mixed_precision():
+def test_custom_mixed_precision():
     """Test to ensure that passing a custom mixed precision config works."""
     config = MixedPrecision()
     strategy = FSDPStrategy(mixed_precision=config)
     assert strategy.mixed_precision_config == config
 
 
 @RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True)
-def test_fsdp_strategy_sync_batchnorm(tmp_path):
+def test_strategy_sync_batchnorm(tmp_path):
     """Test to ensure that sync_batchnorm works when using FSDP and GPU, and all stages can be run."""
     model = TestFSDPModel()
     trainer = Trainer(
@@ -234,7 +234,7 @@ def test_fsdp_strategy_sync_batchnorm(tmp_path):
 
 
 @RunIf(min_cuda_gpus=1, skip_windows=True)
-def test_fsdp_modules_without_parameters(tmp_path):
+def test_modules_without_parameters(tmp_path):
     """Test that TorchMetrics get moved to the device despite not having any parameters."""
 
     class MetricsModel(BoringModel):
@@ -266,7 +266,7 @@ def training_step(self, batch, batch_idx):
 @RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True)
 @pytest.mark.parametrize("precision", ["16-mixed", pytest.param("bf16-mixed", marks=RunIf(bf16_cuda=True))])
 @pytest.mark.parametrize("state_dict_type", ["sharded", "full"])
-def test_fsdp_strategy_checkpoint(state_dict_type, precision, tmp_path):
+def test_strategy_checkpoint(state_dict_type, precision, tmp_path):
     """Test to ensure that checkpoint is saved correctly when using a single GPU, and all stages can be run."""
     model = TestFSDPModel()
     strategy = FSDPStrategy(state_dict_type=state_dict_type)
@@ -286,7 +286,7 @@ def custom_auto_wrap_policy(
 
 @RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True)
 @pytest.mark.parametrize("wrap_min_params", [2, 1024, 100000000])
-def test_fsdp_strategy_full_state_dict(tmp_path, wrap_min_params):
+def test_strategy_full_state_dict(tmp_path, wrap_min_params):
     """Test to ensure that the full state dict is extracted when using FSDP strategy.
 
     Based on `wrap_min_params`, the model will be fully wrapped, half wrapped, and not wrapped at all.
@@ -342,7 +342,7 @@ def test_fsdp_strategy_full_state_dict(tmp_path, wrap_min_params):
         ),
     ],
 )
-def test_fsdp_checkpoint_multi_gpus(tmp_path, model, strategy, strategy_cfg):
+def test_checkpoint_multi_gpus(tmp_path, model, strategy, strategy_cfg):
     """Test to ensure that checkpoint is saved correctly when using multiple GPUs, and all stages can be run."""
     ck = ModelCheckpoint(save_last=True)
 
@@ -410,7 +410,7 @@ def configure_optimizers(self):
         trainer.fit(model)
 
 
-def test_fsdp_forbidden_precision_raises():
+def test_forbidden_precision_raises():
     with pytest.raises(TypeError, match="can only work with the `FSDPPrecision"):
         FSDPStrategy(precision_plugin=HalfPrecision())
 
@@ -419,7 +419,7 @@ def test_fsdp_forbidden_precision_raises():
         strategy.precision_plugin = HalfPrecision()
 
 
-def test_fsdp_activation_checkpointing():
+def test_activation_checkpointing():
     """Test that the FSDP strategy can apply activation checkpointing to the given layers."""
 
     class Block1(nn.Linear):
@@ -469,7 +469,7 @@ def __init__(self):
     apply_mock.assert_called_with(wrapped, checkpoint_wrapper_fn=ANY, **strategy._activation_checkpointing_kwargs)
 
 
-def test_fsdp_strategy_cpu_offload():
+def test_strategy_cpu_offload():
     """Test the different ways cpu offloading can be enabled."""
     # bool
     strategy = FSDPStrategy(cpu_offload=True)
@@ -481,7 +481,7 @@ def test_fsdp_strategy_cpu_offload():
     assert strategy.cpu_offload == config
 
 
-def test_fsdp_sharding_strategy():
+def test_sharding_strategy():
     """Test the different ways the sharding strategy can be set."""
     from torch.distributed.fsdp import ShardingStrategy
 
@@ -501,7 +501,7 @@ def test_fsdp_sharding_strategy():
 
 
 @pytest.mark.parametrize("sharding_strategy", ["HYBRID_SHARD", "_HYBRID_SHARD_ZERO2"])
-def test_fsdp_hybrid_sharding_strategy(sharding_strategy):
+def test_hybrid_sharding_strategy(sharding_strategy):
     """Test that the hybrid sharding strategies can only be used with automatic wrapping or a manually specified pg."""
     with pytest.raises(RuntimeError, match="The hybrid sharding strategy requires you to pass at least one of"):
         FSDPStrategy(sharding_strategy=sharding_strategy)
@@ -523,7 +523,7 @@ def test_fsdp_hybrid_sharding_strategy(sharding_strategy):
         FSDPStrategy(sharding_strategy=sharding_strategy, process_group=process_group, device_mesh=device_mesh)
 
 
-def test_fsdp_use_orig_params():
+def test_use_orig_params():
     """Test that Lightning enables `use_orig_params` automatically."""
     strategy = FSDPStrategy()
     assert strategy.kwargs["use_orig_params"]
@@ -548,7 +548,7 @@ def test_set_timeout(init_process_group_mock):
 
 
 @mock.patch("lightning.pytorch.strategies.fsdp._load_raw_module_state")
-def test_fsdp_strategy_load_optimizer_states_multiple(_, tmp_path):
+def test_strategy_load_optimizer_states_multiple(_, tmp_path):
     strategy = FSDPStrategy(parallel_devices=[torch.device("cpu")], state_dict_type="full")
     trainer = Trainer()
     trainer.state.fn = TrainerFn.FITTING
@@ -572,7 +572,7 @@ def test_fsdp_strategy_load_optimizer_states_multiple(_, tmp_path):
 
 @RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True)
 @pytest.mark.parametrize("wrap_min_params", [2, 1024, 100000000])
-def test_fsdp_strategy_save_optimizer_states(tmp_path, wrap_min_params):
+def test_strategy_save_optimizer_states(tmp_path, wrap_min_params):
     """Test to ensure that the full state dict and optimizer states is saved when using FSDP strategy.
 
     Based on `wrap_min_params`, the model will be fully wrapped, half wrapped, and not wrapped at all. If the model can
@@ -630,7 +630,7 @@ def test_fsdp_strategy_save_optimizer_states(tmp_path, wrap_min_params):
 
 @RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True)
 @pytest.mark.parametrize("wrap_min_params", [2, 1024, 100000000])
-def test_fsdp_strategy_load_optimizer_states(wrap_min_params, tmp_path):
+def test_strategy_load_optimizer_states(wrap_min_params, tmp_path):
     """Test to ensure that the full state dict and optimizer states can be load when using FSDP strategy.
 
     Based on `wrap_min_params`, the model will be fully wrapped, half wrapped, and not wrapped at all. If the DDP model
@@ -741,7 +741,7 @@ def test_save_checkpoint_storage_options(tmp_path):
 @mock.patch("lightning.pytorch.strategies.fsdp._get_sharded_state_dict_context")
 @mock.patch("lightning.fabric.plugins.io.torch_io._atomic_save")
 @mock.patch("lightning.pytorch.strategies.fsdp.shutil")
-def test_fsdp_save_checkpoint_path_exists(shutil_mock, torch_save_mock, __, ___, tmp_path):
+def test_save_checkpoint_path_exists(shutil_mock, torch_save_mock, __, ___, tmp_path):
     strategy = FSDPStrategy(state_dict_type="full")
 
     # state_dict_type='full', path exists, path is not a sharded checkpoint: error
@@ -757,16 +757,12 @@ def test_fsdp_save_checkpoint_path_exists(shutil_mock, torch_save_mock, __, ___,
     path.mkdir()
     (path / "meta.pt").touch()
     assert _is_sharded_checkpoint(path)
-    model = Mock(spec=FullyShardedDataParallel)
-    model.modules.return_value = [model]
     strategy.save_checkpoint(Mock(), filepath=path)
     shutil_mock.rmtree.assert_called_once_with(path)
 
     # state_dict_type='full', path exists, path is a file: no error (overwrite)
     path = tmp_path / "file.pt"
     path.touch()
-    model = Mock(spec=FullyShardedDataParallel)
-    model.modules.return_value = [model]
     torch_save_mock.reset_mock()
     strategy.save_checkpoint(Mock(), filepath=path)
     torch_save_mock.assert_called_once()
@@ -783,30 +779,26 @@ def test_fsdp_save_checkpoint_path_exists(shutil_mock, torch_save_mock, __, ___,
     path = tmp_path / "not-empty-2"
     path.mkdir()
     (path / "file").touch()
-    model = Mock(spec=FullyShardedDataParallel)
-    model.modules.return_value = [model]
     with save_mock:
         strategy.save_checkpoint({"state_dict": {}, "optimizer_states": {"": {}}}, filepath=path)
     assert (path / "file").exists()
 
     # state_dict_type='sharded', path exists, path is a file: no error (overwrite)
     path = tmp_path / "file-2.pt"
     path.touch()
-    model = Mock(spec=FullyShardedDataParallel)
-    model.modules.return_value = [model]
     with save_mock:
         strategy.save_checkpoint({"state_dict": {}, "optimizer_states": {"": {}}}, filepath=path)
     assert path.is_dir()
 
 
 @mock.patch("lightning.pytorch.strategies.fsdp.FSDPStrategy.broadcast", lambda _, x: x)
-def test_fsdp_save_checkpoint_unknown_state_dict_type(tmp_path):
+def test_save_checkpoint_unknown_state_dict_type(tmp_path):
     strategy = FSDPStrategy(state_dict_type="invalid")
     with pytest.raises(ValueError, match="Unknown state_dict_type"):
         strategy.save_checkpoint(checkpoint=Mock(), filepath=tmp_path)
 
 
-def test_fsdp_load_unknown_checkpoint_type(tmp_path):
+def test_load_unknown_checkpoint_type(tmp_path):
     """Test that the strategy validates the contents at the checkpoint path."""
     strategy = FSDPStrategy()
     strategy.model = Mock()
@@ -874,7 +866,7 @@ def test_save_load_sharded_state_dict(tmp_path):
 @mock.patch("lightning.pytorch.strategies.fsdp.torch.load")
 @mock.patch("lightning.pytorch.strategies.fsdp._lazy_load")
 @mock.patch("lightning.pytorch.strategies.fsdp._load_raw_module_state")
-def test_fsdp_lazy_load_full_state_dict(_, lazy_load_mock, torch_load_mock, tmp_path):
+def test_lazy_load_full_state_dict(_, lazy_load_mock, torch_load_mock, tmp_path):
     """Test that loading a single file (full state) is lazy to reduce peak CPU memory usage."""
     model = BoringModel()
     checkpoint = {"state_dict": model.state_dict()}
diff --git a/tests/tests_pytorch/strategies/test_model_parallel.py b/tests/tests_pytorch/strategies/test_model_parallel.py
diff --git a/tests/tests_pytorch/strategies/test_model_parallel_integration.py b/tests/tests_pytorch/strategies/test_model_parallel_integration.py