Commit 23495cc

awaelchli authored and lantiga committed
Fix state dict loading for ddp/dp in Fabric (#17997)
* fix state dict loading for ddp/dp
* test
* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci
* changelog
* update test
* move params to same device before equality test
* test strategy

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
(cherry picked from commit b14ddd9)
1 parent 3ba4ae7 commit 23495cc
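For context, the bug: a state dict saved before Fabric.setup() has plain keys ("weight", "bias"), while a DDP/DP-wrapped module expects them prefixed with "module.". Before this commit, Fabric.load() called load_state_dict() directly on the wrapper, so the keys no longer matched. A minimal sketch of the failing pattern (the two-process CPU launch and checkpoint name are illustrative assumptions, not taken from the diff):

import torch
from lightning.fabric import Fabric


def run(fabric):
    model = torch.nn.Linear(2, 2)
    # Saved before setup: plain keys such as "weight" and "bias".
    fabric.save("model.ckpt", {"model": model})

    wrapped = fabric.setup(model)  # DDP nests the model under `module.`
    # Previously this raised missing/unexpected key errors under strict=True;
    # with this commit the strategy unwraps the module before loading.
    fabric.load("model.ckpt", {"model": wrapped})


fabric = Fabric(accelerator="cpu", strategy="ddp_spawn", devices=2)
fabric.launch(run)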

File tree: 7 files changed, +108 −5 lines

src/lightning/fabric/CHANGELOG.md (3 additions, 0 deletions)

@@ -34,6 +34,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed an issue causing the `torch.set_float32_matmul_precision` info message to show multiple times ([#17960](https://github.com/Lightning-AI/lightning/pull/17960))


+- Fixed loading model state when `Fabric.load()` is called after `Fabric.setup()` ([#17997](https://github.com/Lightning-AI/lightning/pull/17997))
+
+
 ## [2.0.3] - 2023-06-07

 - Added support for `Callback` registration through entry points ([#17756](https://github.com/Lightning-AI/lightning/pull/17756))

src/lightning/fabric/strategies/ddp.py (7 additions, 0 deletions)

@@ -160,6 +160,13 @@ def get_module_state_dict(self, module: Module) -> Dict[str, Union[Any, Tensor]]:
             module = module.module
         return super().get_module_state_dict(module)

+    def load_module_state_dict(
+        self, module: Module, state_dict: Dict[str, Union[Any, Tensor]], strict: bool = True
+    ) -> None:
+        if isinstance(module, DistributedDataParallel):
+            module = module.module
+        super().load_module_state_dict(module=module, state_dict=state_dict, strict=strict)
+
     @classmethod
     def register_strategies(cls, strategy_registry: _StrategyRegistry) -> None:
         entries = (
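The unwrap above exists because wrapper modules nest the original model under a `module` attribute, which prefixes every state dict key. A quick illustration, using DataParallel as a stand-in since a real DistributedDataParallel needs an initialized process group:

import torch

model = torch.nn.Linear(2, 2)
wrapped = torch.nn.DataParallel(model)  # same nesting pattern as DDP

print(list(model.state_dict()))    # ['weight', 'bias']
print(list(wrapped.state_dict()))  # ['module.weight', 'module.bias']

# A plain state dict fails against the wrapper under strict=True but matches
# the inner module, which is exactly what the new hook targets:
wrapped.module.load_state_dict(model.state_dict(), strict=True)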

src/lightning/fabric/strategies/dp.py (7 additions, 0 deletions)

@@ -89,6 +89,13 @@ def get_module_state_dict(self, module: Module) -> Dict[str, Union[Any, Tensor]]:
             module = module.module
         return super().get_module_state_dict(module)

+    def load_module_state_dict(
+        self, module: Module, state_dict: Dict[str, Union[Any, Tensor]], strict: bool = True
+    ) -> None:
+        if isinstance(module, DataParallel):
+            module = module.module
+        super().load_module_state_dict(module=module, state_dict=state_dict, strict=strict)
+
     @classmethod
     def register_strategies(cls, strategy_registry: _StrategyRegistry) -> None:
         strategy_registry.register("dp", cls, description=cls.__class__.__name__)

src/lightning/fabric/strategies/strategy.py (7 additions, 2 deletions)

@@ -234,6 +234,12 @@ def get_module_state_dict(self, module: Module) -> Dict[str, Union[Any, Tensor]]:
         """Returns model state."""
         return module.state_dict()

+    def load_module_state_dict(
+        self, module: Module, state_dict: Dict[str, Union[Any, Tensor]], strict: bool = True
+    ) -> None:
+        """Loads the given state into the model."""
+        module.load_state_dict(state_dict, strict=strict)
+
     def get_optimizer_state(self, optimizer: Optimizer) -> Dict[str, Tensor]:
         """Returns state of an optimizer.

@@ -279,8 +285,7 @@ def load_checkpoint(
                 continue
             if isinstance(obj, _Stateful):
                 if isinstance(obj, Module):
-                    # TODO(fabric): Make strict loading configurable
-                    obj.load_state_dict(checkpoint.pop(name), strict=True)
+                    self.load_module_state_dict(module=obj, state_dict=checkpoint.pop(name), strict=True)
                 else:
                     obj.load_state_dict(checkpoint.pop(name))
             else:
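With the hook on the base class, load_checkpoint() now routes all module loading through load_module_state_dict(), and wrapper-aware subclasses only override the unwrapping step. A small sketch of the base behaviour (mirroring the tests below, which also use SingleDeviceStrategy):

import torch
from lightning.fabric.strategies import SingleDeviceStrategy

strategy = SingleDeviceStrategy()
src, dst = torch.nn.Linear(2, 2), torch.nn.Linear(2, 2)

# The base implementations delegate straight to the module's own methods.
state = strategy.get_module_state_dict(src)
strategy.load_module_state_dict(dst, state)

assert all(torch.equal(a, b) for a, b in zip(src.parameters(), dst.parameters()))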

tests/tests_fabric/strategies/test_ddp.py (10 additions, 3 deletions)

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from copy import deepcopy
 from unittest import mock
 from unittest.mock import MagicMock, Mock

@@ -83,7 +84,7 @@ def test_ddp_extra_kwargs(ddp_mock):


 def test_ddp_module_state_dict():
-    """Test that the module state dict gets retrieved without the prefixed wrapper keys from DDP."""
+    """Test that the module state dict can be retrieved and loaded without the prefixed wrapper keys from DDP."""

     class DistributedDataParallelMock(MagicMock):
         def __instancecheck__(self, instance):

@@ -94,12 +95,18 @@ def __instancecheck__(self, instance):

     # Without DDP applied (no setup call)
     original_module = torch.nn.Linear(2, 3)
-    assert strategy.get_module_state_dict(original_module).keys() == original_module.state_dict().keys()
+    original_state_dict = deepcopy(original_module.state_dict())
+    retrieved_state_dict = strategy.get_module_state_dict(original_module)
+    assert retrieved_state_dict.keys() == original_state_dict.keys()
+    strategy.load_module_state_dict(original_module, retrieved_state_dict)

     # With DDP applied (setup called)
     with mock.patch("lightning.fabric.strategies.ddp.DistributedDataParallel", DistributedDataParallelMock):
         wrapped_module = strategy.setup_module(original_module)
-        assert strategy.get_module_state_dict(wrapped_module).keys() == original_module.state_dict().keys()
+        retrieved_state_dict = strategy.get_module_state_dict(wrapped_module)
+        assert retrieved_state_dict.keys() == original_state_dict.keys()
+        strategy.load_module_state_dict(wrapped_module, retrieved_state_dict)
+        strategy.load_module_state_dict(wrapped_module, original_state_dict)


 @pytest.mark.parametrize(
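Note the MagicMock subclass overriding __instancecheck__: it lets isinstance(wrapped_module, DistributedDataParallel) succeed under the patch, so the unwrap branches in DDPStrategy are exercised without initializing a real process group.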
New file (63 additions, 0 deletions)

@@ -0,0 +1,63 @@
+# Copyright The Lightning AI team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from copy import deepcopy
+
+import pytest
+import torch
+
+from lightning.fabric import Fabric
+from tests_fabric.helpers.runif import RunIf
+
+
+@pytest.mark.parametrize(
+    "accelerator",
+    [
+        "cpu",
+        pytest.param("cuda", marks=RunIf(min_cuda_gpus=2)),
+    ],
+)
+def test_ddp_save_load(accelerator, tmp_path):
+    """Test that DDP model checkpoints can be saved and loaded successfully."""
+    fabric = Fabric(devices=2, accelerator=accelerator, strategy="ddp_spawn")
+    fabric.launch(_run_ddp_save_load, tmp_path)
+
+
+def _run_ddp_save_load(fabric, tmp_path):
+    fabric.seed_everything(0)
+
+    tmp_path = fabric.broadcast(tmp_path)
+
+    model = torch.nn.Linear(2, 2)
+    params_before = deepcopy(list(model.parameters()))
+
+    # Save
+    fabric.save(tmp_path / "saved_before_setup.ckpt", {"model": model})
+    wrapped_model = fabric.setup(model)
+    fabric.save(tmp_path / "saved_after_setup.ckpt", {"model": wrapped_model})
+
+    def assert_params_equal(params0, params1):
+        assert all(torch.equal(p0, p1.to(p0.device)) for p0, p1 in zip(params0, params1))
+
+    # Load
+    model = torch.nn.Linear(2, 2)
+    fabric.load(tmp_path / "saved_before_setup.ckpt", {"model": model})
+    assert_params_equal(params_before, model.parameters())
+    fabric.load(tmp_path / "saved_after_setup.ckpt", {"model": model})
+    assert_params_equal(params_before, model.parameters())
+
+    wrapped_model = fabric.setup(model)
+    fabric.load(tmp_path / "saved_before_setup.ckpt", {"model": wrapped_model})
+    assert_params_equal(params_before, wrapped_model.parameters())
+    fabric.load(tmp_path / "saved_after_setup.ckpt", {"model": wrapped_model})
+    assert_params_equal(params_before, wrapped_model.parameters())
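The p1.to(p0.device) in assert_params_equal corresponds to the "move params to same device before equality test" item in the commit message: on the CUDA variant, fabric.setup() moves the parameters to the GPU while params_before stays on the CPU.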

tests/tests_fabric/strategies/test_strategy.py (11 additions, 0 deletions)

@@ -66,6 +66,17 @@ def test_save_checkpoint_convert_stateful_objects(tmp_path):
     assert save_checkpoint_mock.call_args[1]["checkpoint"]["anything"] == expected["anything"]


+def test_load_module_state_dict():
+    """Test that `Strategy.load_module_state_dict()` calls `.load_state_dict()` on the module."""
+    strategy = SingleDeviceStrategy()  # surrogate class to test implementation in base class
+    module = Mock()
+    state_dict = Mock()
+    strategy.load_module_state_dict(module, state_dict)
+    module.load_state_dict.assert_called_with(state_dict, strict=True)
+    strategy.load_module_state_dict(module, state_dict, strict=False)
+    module.load_state_dict.assert_called_with(state_dict, strict=False)
+
+
 def test_load_checkpoint_out_of_place(tmp_path):
     """Test that one can load the full checkpoint into memory just like `torch.load()`."""
     strategy = SingleDeviceStrategy()  # surrogate class to test implementation in base class
