Skip to content

Commit 7049f09

Browse files
Sean Naren authored and lexierule committed
Introduce parameter to fix deepspeed crash for RNNS (#9489)
1 parent 845a179 commit 7049f09

File tree

3 files changed

+63
-5
lines changed

3 files changed

+63
-5
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
1111
- Fixed `replace_sampler` missing the batch size under specific conditions ([#9367](https://github.com/PyTorchLightning/pytorch-lightning/pull/9367))
1212
- Pass init args to ShardedDataParallel ([#9483](https://github.com/PyTorchLightning/pytorch-lightning/pull/9483))
1313
- Fixed collision of user argument when using ShardedDDP ([#9512](https://github.com/PyTorchLightning/pytorch-lightning/pull/9512))
14+
- Fixed DeepSpeed crash for RNNs ([#9489](https://github.com/PyTorchLightning/pytorch-lightning/pull/9489))
1415

1516

1617
## [1.4.6] - 2021-09-07

pytorch_lightning/plugins/training_type/deepspeed.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@ def __init__(
123123
cpu_offload: bool = False,
124124
cpu_offload_params: bool = False,
125125
cpu_offload_use_pin_memory: bool = False,
126+
partition_module: bool = True,
126127
) -> None:
127128
"""
128129
Provides capabilities to run training using the DeepSpeed library,
@@ -254,6 +255,12 @@ def __init__(
254255
when using ZeRO Stage 3. This allows a single weight file to contain the entire model,
255256
rather than individual sharded weight files.
256257
Disable to save sharded states individually.
258+
259+
partition_module: When True, partitions the ``LightningModule`` across devices when using ZeRO Stage 3.
260+
This is the default behaviour to ensure that the entire module is appropriately initialized
261+
for DeepSpeed. When False we do not explicitly convert the model, which is fine if NO layers
262+
or ALL layers are defined in ``configure_sharded_model``. This is useful for layers such as
263+
``torch.nn.RNN`` which do internal logic when moving to device.
257264
"""
258265
if not _DEEPSPEED_AVAILABLE:
259266
raise MisconfigurationException(
@@ -314,6 +321,7 @@ def __init__(
314321

315322
self.remote_device = remote_device
316323
self.save_full_weights = save_full_weights
324+
self.partition_module = partition_module
317325

318326
# default FP16 parameters.
319327
self.loss_scale = loss_scale
@@ -375,7 +383,7 @@ def init_deepspeed(self):
375383
precision = self.lightning_module.trainer.accelerator.precision
376384
model = LightningDeepSpeedModule(pl_module=self.model, precision=precision)
377385

378-
if self.zero_stage_3:
386+
if self.zero_stage_3 and self.partition_module:
379387
# Ensure the entire model has been moved to the appropriate device
380388
dtype = torch.float16 if self.precision in (16, "mixed") else torch.float32
381389
deepspeed.zero.Init(

tests/plugins/test_deepspeed_plugin.py

Lines changed: 53 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -643,17 +643,66 @@ def on_train_batch_start(
643643

644644

645645
@RunIf(min_gpus=2, deepspeed=True, special=True)
646-
def test_deepspeed_multigpu_test(tmpdir, deepspeed_config):
647-
"""
648-
Test to ensure we can use DeepSpeed with just test using ZeRO Stage 3.
649-
"""
646+
def test_deepspeed_multigpu_test(tmpdir):
647+
"""Test to ensure we can use DeepSpeed with just test using ZeRO Stage 3."""
650648
model = ModelParallelBoringModel()
651649
trainer = Trainer(
652650
default_root_dir=tmpdir, plugins=[DeepSpeedPlugin(stage=3)], gpus=2, fast_dev_run=True, precision=16
653651
)
654652
trainer.test(model)
655653

656654

655+
@RunIf(min_gpus=1, deepspeed=True, special=True)
656+
def test_deepspeed_multigpu_partial_partition_parameters(tmpdir):
657+
"""Test to ensure that a module that defines a layer inside the ``__init__`` and ``configure_sharded_model``
658+
correctly converts all parameters to float16 when ``precision=16`` and runs successfully."""
659+
660+
class TestModel(ModelParallelBoringModel):
661+
def __init__(self):
662+
super().__init__()
663+
self.layer_2 = torch.nn.Linear(32, 32)
664+
665+
def configure_sharded_model(self) -> None:
666+
self.layer = torch.nn.Linear(32, 2)
667+
668+
def forward(self, x):
669+
x = self.layer_2(x)
670+
return self.layer(x)
671+
672+
def on_train_epoch_start(self) -> None:
673+
assert all([x.dtype == torch.float16 for x in self.parameters()])
674+
675+
model = TestModel()
676+
trainer = Trainer(
677+
default_root_dir=tmpdir, plugins=[DeepSpeedPlugin(stage=3)], gpus=1, fast_dev_run=True, precision=16
678+
)
679+
trainer.fit(model)
680+
681+
682+
@RunIf(min_gpus=1, deepspeed=True, special=True)
683+
def test_deepspeed_multigpu_test_rnn(tmpdir):
684+
"""Test to ensure that turning off explicit partitioning of the entire module for ZeRO Stage 3 works when
685+
training with certain layers which will crash with explicit partitioning."""
686+
687+
class TestModel(BoringModel):
688+
def __init__(self):
689+
super().__init__()
690+
self.rnn = torch.nn.GRU(32, 32)
691+
692+
def on_train_epoch_start(self) -> None:
693+
assert all([x.dtype == torch.float16 for x in self.parameters()])
694+
695+
model = TestModel()
696+
trainer = Trainer(
697+
default_root_dir=tmpdir,
698+
plugins=[DeepSpeedPlugin(stage=3, partition_module=False)],
699+
gpus=1,
700+
fast_dev_run=True,
701+
precision=16,
702+
)
703+
trainer.fit(model)
704+
705+
657706
@RunIf(deepspeed=True)
658707
@mock.patch("deepspeed.init_distributed", autospec=True)
659708
@pytest.mark.parametrize("platform", ["Linux", "Windows"])

0 commit comments

Comments (0)