
Commit 3e6df2f

justusschock authored and lexierule committed
Remove todo, ensure we only check rank 0 for deepspeed warning (#9311)
1 parent f2c5f5b commit 3e6df2f

3 files changed: +64 -14 lines changed


pytorch_lightning/plugins/training_type/deepspeed.py

Lines changed: 14 additions & 13 deletions
@@ -35,7 +35,7 @@
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.imports import _DEEPSPEED_AVAILABLE
 from pytorch_lightning.utilities.types import LRSchedulerTypeTuple
-from pytorch_lightning.utilities.warnings import _warn, LightningDeprecationWarning
+from pytorch_lightning.utilities.warnings import _warn, LightningDeprecationWarning, warning_cache

 if _DEEPSPEED_AVAILABLE:
     import deepspeed
@@ -671,18 +671,19 @@ def save_checkpoint(self, checkpoint: Dict, filepath: str) -> None:
             checkpoint: The checkpoint state dictionary
             filepath: write-target file's path
         """
-        if self.world_size > 1 and self.zero_stage_3:
-            if self.save_full_weights:
-                # todo: expose this as general function in deepspeed
-                state_dict = self.deepspeed_engine._zero3_consolidated_fp16_state_dict()
-                if self.is_global_zero:
-                    # State dict keys will include reference to wrapper LightningDeepSpeedModule
-                    # Delete `module` prefix before saving.
-                    state_dict = {k.partition("module.")[2]: state_dict[k] for k in state_dict.keys()}
-                    checkpoint["state_dict"] = state_dict
-                    return super().save_checkpoint(checkpoint, filepath)
-                return
-
+        if self.zero_stage_3 and self._multi_device and self.is_global_zero:
+            warning_cache.warn(
+                "When saving the DeepSpeed Stage 3 checkpoint, "
+                "each worker will save a shard of the checkpoint within a directory. "
+                "If a single file is required after training, "
+                "see https://pytorch-lightning.readthedocs.io/en/latest/advanced/advanced_gpu.html#"
+                "deepspeed-zero-stage-3-single-file for instructions."
+            )
+        # Use deepspeed's internal checkpointing function to handle partitioned weights across processes
+        # dump states as a checkpoint dictionary object
+        _exclude_keys = ["state_dict", "optimizer_states", "lr_schedulers"]
+        checkpoint = {k: v for k, v in checkpoint.items() if k not in _exclude_keys}
+        self.deepspeed_engine.save_checkpoint(filepath, client_state=checkpoint)
         # Use deepspeed's internal checkpointing function to handle partitioned weights across processes
         # dump states as a checkpoint dictionary object
         save_dir = self._filepath_to_dir(filepath)
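The new path gates the message on self.is_global_zero and routes it through warning_cache, so it is raised once, on rank 0 only, instead of on every worker each time a checkpoint is saved. A minimal sketch of that pattern, for illustration only (this is not Lightning's actual WarningCache implementation, and the standalone save_checkpoint below is a simplified stand-in for the plugin method):

import warnings


class WarningCache:
    """Illustrative once-per-message warning cache (simplified)."""

    def __init__(self) -> None:
        self._seen = set()

    def warn(self, message: str) -> None:
        # Emit each distinct message at most once per process.
        if message not in self._seen:
            self._seen.add(message)
            warnings.warn(message, UserWarning)


warning_cache = WarningCache()


def save_checkpoint(zero_stage_3: bool, multi_device: bool, is_global_zero: bool) -> None:
    # Mirrors the gating added in the diff: only rank 0 of a multi-device
    # ZeRO Stage 3 run surfaces the sharded-checkpoint warning.
    if zero_stage_3 and multi_device and is_global_zero:
        warning_cache.warn("each worker will save a shard of the checkpoint within a directory.")

Because the cache keys on the message text, repeated saves within one run surface the warning a single time, which lines up with the new test's assert len(record) == 1 on rank zero.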

tests/plugins/test_deepspeed_plugin.py

Lines changed: 49 additions & 0 deletions
@@ -419,6 +419,55 @@ def test_deepspeed_fp32_works(tmpdir):
     trainer.fit(model)


+@RunIf(min_gpus=2, deepspeed=True, special=True)
+def test_deepspeed_stage_3_save_warning(tmpdir):
+    """Test to ensure that DeepSpeed Stage 3 gives a warning when saving on rank zero."""
+    model = BoringModel()
+    trainer = Trainer(
+        default_root_dir=tmpdir, plugins=[DeepSpeedPlugin(stage=3)], gpus=2, fast_dev_run=True, precision=16
+    )
+    trainer.fit(model)
+    checkpoint_path = os.path.join(tmpdir, "model.pt")
+    with pytest.warns(UserWarning) as record:
+        # both ranks need to call save checkpoint
+        trainer.save_checkpoint(checkpoint_path)
+    if trainer.is_global_zero:
+        assert len(record) == 1
+        match = "each worker will save a shard of the checkpoint within a directory."
+        assert match in str(record[0].message)
+
+
+@RunIf(min_gpus=1, deepspeed=True, special=True)
+def test_deepspeed_multigpu_single_file(tmpdir):
+    """Test to ensure that DeepSpeed loads from a single file checkpoint."""
+    model = BoringModel()
+    checkpoint_path = os.path.join(tmpdir, "model.pt")
+    trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True)
+    trainer.fit(model)
+    trainer.save_checkpoint(checkpoint_path)
+
+    trainer = Trainer(
+        default_root_dir=tmpdir, plugins=[DeepSpeedPlugin(stage=3)], gpus=1, fast_dev_run=True, precision=16
+    )
+    plugin = trainer.training_type_plugin
+    assert isinstance(plugin, DeepSpeedPlugin)
+    assert not plugin.load_full_weights
+    with pytest.raises(MisconfigurationException, match="DeepSpeed was unable to load the checkpoint."):
+        trainer.test(model, ckpt_path=checkpoint_path)
+
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        plugins=[DeepSpeedPlugin(stage=3, load_full_weights=True)],
+        gpus=1,
+        fast_dev_run=True,
+        precision=16,
+    )
+    plugin = trainer.training_type_plugin
+    assert isinstance(plugin, DeepSpeedPlugin)
+    assert plugin.load_full_weights
+    trainer.test(model, ckpt_path=checkpoint_path)
+
+
 class ModelParallelClassificationModel(LightningModule):
     def __init__(self, lr: float = 0.01, num_blocks: int = 5):
         super().__init__()
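The second test covers loading a single-file checkpoint into a Stage 3 run via load_full_weights=True. For the opposite direction described by the new warning, collapsing the sharded Stage 3 checkpoint directory into one file after training, DeepSpeed ships a consolidation helper; the sketch below assumes that utility is available under this name (check deepspeed.utils.zero_to_fp32, or the zero_to_fp32.py script DeepSpeed writes into the checkpoint directory, for the exact entry point in your installed version):

# Sketch: consolidate a sharded DeepSpeed ZeRO Stage 3 checkpoint into a single file.
# The import below is assumed to exist in the installed DeepSpeed release.
from deepspeed.utils.zero_to_fp32 import convert_zero_checkpoint_to_fp32_state_dict

# "checkpoint_dir" is the directory written by trainer.save_checkpoint(...) under Stage 3;
# "single_model.pt" is the consolidated output path (both names are placeholders).
convert_zero_checkpoint_to_fp32_state_dict("checkpoint_dir", "single_model.pt")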

tests/trainer/loops/test_training_loop.py

Lines changed: 1 addition & 1 deletion
@@ -193,7 +193,7 @@ def training_epoch_end(self, outputs) -> None:


 def test_batch_loop_releases_loss(tmpdir):
-    """Test that loss/graph is released so that it can be garbage collected before the next training step"""
+    """Test that loss/graph is released so that it can be garbage collected before the next training step."""

     class TestModel(BoringModel):
         def training_step(self, batch, batch_idx):
