Commit b3275e0

Authored by dimitri-voytan, dvoytan-spark, awaelchli, and Borda
Sharded state dicts save correctly when save_weights_only=True (#19524)
Co-authored-by: Dimitri <[email protected]>
Co-authored-by: awaelchli <[email protected]>
Co-authored-by: Jirka Borovec <[email protected]>
1 parent 8549a93 commit b3275e0
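
For context: with `save_weights_only=True`, the checkpoint dict handed to the strategy contains no `optimizer_states` key, and the sharded FSDP save path previously popped that key unconditionally, raising a KeyError. A minimal sketch of the setup that used to fail, assuming a 2-GPU machine (`MyModel` is a hypothetical placeholder, not part of this commit):

    # Sketch of the configuration that triggered the KeyError before this fix.
    import lightning.pytorch as pl
    from lightning.pytorch.callbacks import ModelCheckpoint
    from lightning.pytorch.strategies import FSDPStrategy

    trainer = pl.Trainer(
        accelerator="gpu",
        devices=2,
        strategy=FSDPStrategy(state_dict_type="sharded"),
        # save_weights_only=True omits "optimizer_states" from the checkpoint dict
        callbacks=[ModelCheckpoint(save_weights_only=True)],
        max_epochs=1,
    )
    # trainer.fit(MyModel())  # before b3275e0, saving a sharded checkpoint here raised KeyError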

File tree

3 files changed: +10 -5 lines changed

src/lightning/pytorch/CHANGELOG.md

Lines changed: 2 additions & 1 deletion
@@ -42,7 +42,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
--
+- Fixed a KeyError when saving a FSDP sharded checkpoint and setting `save_weights_only=True` ([#19524](https://github.com/Lightning-AI/pytorch-lightning/pull/19524))
+
 
 -
 
src/lightning/pytorch/strategies/fsdp.py

Lines changed: 2 additions & 1 deletion
@@ -568,7 +568,8 @@ def save_checkpoint(
 
             converted_state = {"model": checkpoint.pop("state_dict")}
             converted_state.update({
-                f"optimizer_{idx}": optim_state for idx, optim_state in enumerate(checkpoint.pop("optimizer_states"))
+                f"optimizer_{idx}": optim_state
+                for idx, optim_state in enumerate(checkpoint.pop("optimizer_states", []))
             })
 
             _distributed_checkpoint_save(converted_state, path)
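
The fix is a one-liner: `dict.pop` with a default returns `[]` instead of raising when `optimizer_states` is absent (as it is when `save_weights_only=True`), so the comprehension simply contributes no optimizer entries. A standalone sketch of the behavior, with a toy checkpoint dict:

    # Toy reproduction of the fixed code path (checkpoint contents are illustrative).
    checkpoint = {"state_dict": {"weight": 1.0}}  # weights-only: no "optimizer_states" key

    converted_state = {"model": checkpoint.pop("state_dict")}
    converted_state.update({
        f"optimizer_{idx}": optim_state
        for idx, optim_state in enumerate(checkpoint.pop("optimizer_states", []))
    })
    assert list(converted_state) == ["model"]  # no optimizer shards to write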

tests/tests_pytorch/strategies/test_fsdp.py

Lines changed: 6 additions & 3 deletions
@@ -185,7 +185,8 @@ def _run_multiple_stages(trainer, model, model_path: Optional[str] = None):
     trainer.save_checkpoint(model_path.with_name("after-test"))
     trainer.save_checkpoint(model_path, weights_only=True)
 
-    _assert_save_equality(trainer, model_path, cls=model.__class__)
+    if not model_path.is_dir():  # TODO (@awaelchli): Add support for asserting equality of sharded checkpoints
+        _assert_save_equality(trainer, model_path, cls=model.__class__)
 
     with torch.inference_mode():
         # Test entry point
@@ -279,11 +280,13 @@ def training_step(self, batch, batch_idx):
 
 @RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True)
 @pytest.mark.parametrize("precision", ["16-mixed", pytest.param("bf16-mixed", marks=RunIf(bf16_cuda=True))])
-def test_fsdp_strategy_checkpoint(tmpdir, precision):
+@pytest.mark.parametrize("state_dict_type", ["sharded", "full"])
+def test_fsdp_strategy_checkpoint(state_dict_type, precision, tmpdir):
     """Test to ensure that checkpoint is saved correctly when using a single GPU, and all stages can be run."""
     model = TestFSDPModel()
+    strategy = FSDPStrategy(state_dict_type=state_dict_type)
     trainer = Trainer(
-        default_root_dir=tmpdir, accelerator="gpu", devices=2, strategy="fsdp", precision=precision, max_epochs=1
+        default_root_dir=tmpdir, accelerator="gpu", devices=2, strategy=strategy, precision=precision, max_epochs=1
     )
     _run_multiple_stages(trainer, model, os.path.join(tmpdir, "last.ckpt"))
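
Worth noting for the `is_dir()` guard above: with `state_dict_type="sharded"`, FSDP writes the checkpoint as a directory of per-rank shard files, while `"full"` produces a single consolidated file, which is why the single-file equality assertion is skipped for sharded saves. A small illustrative check (the function name is hypothetical, not part of the test suite):

    from pathlib import Path

    def assert_expected_checkpoint_layout(model_path: Path, state_dict_type: str) -> None:
        # Sharded checkpoints land on disk as a directory of shard files;
        # full checkpoints are one consolidated file.
        if state_dict_type == "sharded":
            assert model_path.is_dir()
        else:
            assert model_path.is_file()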
289292
