Commit 524a5b1

awaelchli authored and lantiga committed
Fix initialized weights resetting in Fabric.setup() when using FSDP (#19755)
1 parent 71b13c2 commit 524a5b1

3 files changed: +30 −4 lines changed

src/lightning/fabric/CHANGELOG.md

Lines changed: 9 additions & 0 deletions
@@ -5,6 +5,15 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 
+## [2.2.2] - 2024-04-11
+
+### Fixed
+
+- Fixed a KeyError when saving a FSDP sharded checkpoint and setting `save_weights_only=True` ([#19524](https://github.com/Lightning-AI/pytorch-lightning/pull/19524))
+- Fixed an issue causing a TypeError when using `torch.compile` as a decorator ([#19627](https://github.com/Lightning-AI/pytorch-lightning/pull/19627))
+- Fixed issue where some model methods couldn't be monkeypatched after being Fabric wrapped ([#19705](https://github.com/Lightning-AI/pytorch-lightning/pull/19705))
+- Fixed an issue causing weights to be reset in `Fabric.setup()` when using FSDP ([#19755](https://github.com/Lightning-AI/pytorch-lightning/pull/19755))
+
 ## [2.2.1] - 2024-03-04
 
 ### Fixed

src/lightning/fabric/utilities/device_dtype_mixin.py

Lines changed: 2 additions & 4 deletions
@@ -109,14 +109,12 @@ def half(self) -> Self:
 def _update_properties(
     root: torch.nn.Module, device: Optional[torch.device] = None, dtype: Optional[Union[str, torch.dtype]] = None
 ) -> None:
-    def apply_fn(module: Union[_DeviceDtypeModuleMixin, Module]) -> None:
+    for module in root.modules():
         if not isinstance(module, _DeviceDtypeModuleMixin):
-            return
+            continue
         # cannot use `module.to()` because we don't actually want to move the model in case there are multiple
         # devices types (such as partial meta parameters)
         if device is not None:
             module._device = device
         if dtype is not None:
             module._dtype = dtype
-
-    root.apply(apply_fn)
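
The fix replaces the nested `apply_fn` callback with a direct loop over `root.modules()`. Both approaches visit every submodule, but `root.apply(...)` dispatches through the module's own `apply()` method, which `FullyShardedDataParallel` overrides; avoiding that call is what prevents already-initialized weights from being reset during `Fabric.setup()` (per the commit title and the regression test below). A minimal, self-contained sketch of the difference, using a hypothetical `Tracked` toy module rather than FSDP:

```python
from torch import nn


class Tracked(nn.Module):
    """Toy module whose apply() records that it was called (stand-in for an override like FSDP's)."""

    def __init__(self) -> None:
        super().__init__()
        self.inner = nn.Linear(2, 2)
        self.apply_called = False

    def apply(self, fn):
        # Side effects could happen here, as they do in FullyShardedDataParallel.apply().
        self.apply_called = True
        return super().apply(fn)


root = Tracked()

# Old pattern: goes through the (possibly overridden) apply() hook.
root.apply(lambda module: None)
assert root.apply_called

# New pattern: plain iteration visits the same modules without ever calling apply().
root.apply_called = False
visited = [type(m).__name__ for m in root.modules()]
assert not root.apply_called
print(visited)  # ['Tracked', 'Linear']
```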

tests/tests_fabric/strategies/test_fsdp_integration.py

Lines changed: 19 additions & 0 deletions
@@ -667,3 +667,22 @@ def test_save_sharded_and_consolidate_and_load(tmp_path):
     model, optimizer = fabric.setup(model, optimizer)
     state = {"model": model, "optimizer": optimizer, "steps": 1}
     fabric.load(checkpoint_path_full, state)
+
+
+@RunIf(min_cuda_gpus=2, standalone=True)
+def test_no_call_to_apply(monkeypatch):
+    """Regression test to ensure we're not calling `FSDP.apply()` indirectly (see #19755)."""
+    monkeypatch.setattr(torch.distributed.fsdp.FullyShardedDataParallel, "apply", Mock())
+
+    fabric = Fabric(
+        accelerator="cuda",
+        strategy=FSDPStrategy(auto_wrap_policy=always_wrap_policy),
+        devices=2,
+    )
+    fabric.launch()
+
+    for setup_method in ("setup", "setup_module"):
+        model = BoringModel()
+        setup = getattr(fabric, setup_method)
+        model = setup(model)
+        model._forward_module.apply.assert_not_called()
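
For context, here is a sketch of the user-facing scenario this commit protects, assuming a toy `Linear` model and a two-GPU run; the script is illustrative and not part of the commit. Weights customized before `Fabric.setup()` are expected to survive FSDP wrapping.

```python
import torch
from lightning.fabric import Fabric
from lightning.fabric.strategies import FSDPStrategy


def main() -> None:
    fabric = Fabric(accelerator="cuda", devices=2, strategy=FSDPStrategy())
    fabric.launch()

    model = torch.nn.Linear(4, 4)
    # Custom initialization done *before* setup(); with this fix, wrapping the
    # model with FSDP inside setup() no longer resets these values.
    torch.nn.init.ones_(model.weight)

    model = fabric.setup(model)
    # ... training loop would go here ...


if __name__ == "__main__":
    main()
```

Like the regression test above, running this sketch requires at least two CUDA devices.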
