
Commit a957d97

Sean Naren authored and lexierule committed
Pass args to ShardedDataParallel (#9483)
1 parent e17af8f commit a957d97
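
In user terms, extra keyword arguments given to the sharded training-type plugins are now forwarded to fairscale's ShardedDataParallel instead of being dropped. A rough usage sketch (not part of the commit; assumes fairscale is installed and a multi-GPU setup, and mirrors the reduce_fp16=True kwarg exercised by the new test below):

from pytorch_lightning import Trainer
from pytorch_lightning.plugins import DDPShardedPlugin

# Extra kwargs on the plugin are stored in _ddp_kwargs and forwarded to
# fairscale's ShardedDataParallel inside configure_ddp (see the diffs below).
plugin = DDPShardedPlugin(reduce_fp16=True)
trainer = Trainer(gpus=2, plugins=[plugin])
# trainer.fit(model)  # model: any LightningModule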

File tree

4 files changed, +25 -1 lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
@@ -9,6 +9,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed logging of nan parameters ([#9364](https://github.com/PyTorchLightning/pytorch-lightning/pull/9364))
 - Fixed `replace_sampler` missing the batch size under specific conditions ([#9367](https://github.com/PyTorchLightning/pytorch-lightning/pull/9367))
+- Pass init args to ShardedDataParallel ([#9483](https://github.com/PyTorchLightning/pytorch-lightning/pull/9483))
 
 
 ## [1.4.6] - 2021-09-07
@@ -30,6 +31,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed inspection of other args when a container is specified in `save_hyperparameters` ([#9125](https://github.com/PyTorchLightning/pytorch-lightning/pull/9125))
 - Fixed signature of `Timer.on_train_epoch_end` and `StochasticWeightAveraging.on_train_epoch_end` to prevent unwanted deprecation warnings ([#9347](https://github.com/PyTorchLightning/pytorch-lightning/pull/9347))
 
+
 ## [1.4.5] - 2021-08-31
 
 - Fixed reduction using `self.log(sync_dict=True, reduce_fx={mean,max})` ([#9142](https://github.com/PyTorchLightning/pytorch-lightning/pull/9142))

pytorch_lightning/plugins/training_type/sharded.py

Lines changed: 1 addition & 0 deletions
@@ -41,6 +41,7 @@ def configure_ddp(self):
             sharded_optimizer=self.lightning_module.trainer.optimizers,
             # For multi-node training, enabling bucketing will improve performance.
             reduce_buffer_size=self._REDUCE_BUFFER_SIZE_DEFAULT if self.num_nodes > 1 else 0,
+            **self._ddp_kwargs
         )
         setattr(self._model, "require_backward_grad_sync", False)
 
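For context (not part of the diff): self._ddp_kwargs is the dict of extra keyword arguments captured by the plugin's constructor, inherited from the DDP plugin base classes. A minimal, hypothetical sketch of the forwarding pattern this hunk implements, assuming fairscale is installed; the class and parameter names here are illustrative, not Lightning's:

from fairscale.nn.data_parallel import ShardedDataParallel


class KwargsForwardingSketch:
    """Illustrative stand-in for the sharded plugins, not the real implementation."""

    def __init__(self, **kwargs):
        # e.g. reduce_fp16=True, as in the new test further down
        self._ddp_kwargs = kwargs

    def configure_ddp(self, wrapped_module, optimizers):
        # Stored kwargs are passed through verbatim to fairscale's
        # ShardedDataParallel, which is what this commit adds.
        self._model = ShardedDataParallel(
            wrapped_module,
            sharded_optimizer=optimizers,
            **self._ddp_kwargs,
        )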

pytorch_lightning/plugins/training_type/sharded_spawn.py

Lines changed: 3 additions & 1 deletion
@@ -36,7 +36,9 @@ class DDPSpawnShardedPlugin(DDPSpawnPlugin):
     def configure_ddp(self):
         self._wrap_optimizers()
         self._model = ShardedDataParallel(
-            LightningShardedDataParallel(self.model), sharded_optimizer=self.lightning_module.trainer.optimizers
+            LightningShardedDataParallel(self.model),
+            sharded_optimizer=self.lightning_module.trainer.optimizers,
+            **self._ddp_kwargs
         )
         setattr(self._model, "require_backward_grad_sync", False)
 

tests/plugins/test_sharded_plugin.py

Lines changed: 19 additions & 0 deletions
@@ -249,3 +249,22 @@ def test_ddp_sharded_plugin_manual_optimization(tmpdir):
     model = ManualBoringModel()
     trainer = Trainer(default_root_dir=tmpdir, accelerator="ddp_sharded", fast_dev_run=2, gpus=2)
     trainer.fit(model)
+
+
+@RunIf(skip_windows=True, fairscale=True)
+@mock.patch("pytorch_lightning.plugins.DDPShardedPlugin._wrap_optimizers", autospec=True)
+@pytest.mark.parametrize("cls", [DDPShardedPlugin, DDPSpawnShardedPlugin])
+def test_custom_kwargs_sharded(tmpdir, cls):
+    """Tests to ensure that if custom kwargs are passed, they are set correctly."""
+    plugin = cls(reduce_fp16=True)
+
+    class_name = "sharded" if isinstance(plugin, DDPShardedPlugin) else "sharded_spawn"
+
+    with mock.patch.object(plugin, "_model", autospec=True):
+        with mock.patch(
+            f"pytorch_lightning.plugins.training_type.{class_name}.ShardedDataParallel", autospec=True
+        ) as mock_sharded:
+            plugin.configure_ddp()
+    args, kwargs = mock_sharded.call_args
+    assert "reduce_fp16" in kwargs
+    assert kwargs["reduce_fp16"]
