
Commit 70f6744

jerome-habana, pre-commit-ci[bot], and carmocca authored and committed
Enable all ddp params for hpu parallel strategy (#13067)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Carlos Mocholí <[email protected]>
1 parent 8d0266f · commit 70f6744

File tree: 4 files changed (+40 / -7 lines)

- CHANGELOG.md
- pytorch_lightning/strategies/ddp.py
- pytorch_lightning/strategies/hpu_parallel.py
- tests/accelerators/test_hpu.py

CHANGELOG.md
Lines changed: 3 additions & 0 deletions

@@ -7,6 +7,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ## [1.6.4] - 2022-06-01
 
+### Added
+
+- Added all DDP params to be exposed through hpu parallel strategy ([#13067](https://github.com/PyTorchLightning/pytorch-lightning/pull/13067))
 ### Fixed
 
 - Fixed an issue causing zero-division error for empty dataloaders ([#12885](https://github.com/PyTorchLightning/pytorch-lightning/pull/12885))
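
As a rough illustration of what this entry enables (not part of the diff, and assuming an HPU-enabled Lightning 1.6.4 install), DDP keyword arguments can now be passed straight through the HPU strategy:

from pytorch_lightning import Trainer
from pytorch_lightning.strategies import HPUParallelStrategy

# Extra keyword arguments are forwarded to the parent DDPStrategy and end up
# in the DistributedDataParallel constructor for the wrapped model.
strategy = HPUParallelStrategy(
    find_unused_parameters=False,  # skip the per-step unused-parameter search
    gradient_as_bucket_view=True,  # let gradients alias the bucket views
)
trainer = Trainer(accelerator="hpu", devices=8, strategy=strategy)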

pytorch_lightning/strategies/ddp.py
Lines changed: 1 addition & 1 deletion

@@ -204,7 +204,7 @@ def set_world_ranks(self) -> None:
         self.cluster_environment.set_world_size(self.num_nodes * self.num_processes)
         rank_zero_only.rank = self.cluster_environment.global_rank()
 
-    def pre_configure_ddp(self):
+    def pre_configure_ddp(self) -> None:
         # if unset, default `find_unused_parameters` `True`
         # Many models require setting this parameter to True, as there are corner cases
         # when not all parameter backward hooks are fired by the autograd engine even if require_grad is set to True.
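
The only change in this file is the added return annotation; the surrounding comments explain why `find_unused_parameters` defaults to True. A minimal sketch of that defaulting pattern (an illustrative assumption, not the file's verbatim body):

def pre_configure_ddp(self) -> None:
    # Default `find_unused_parameters` to True unless the caller set it explicitly:
    # some models never fire backward hooks for every parameter, and DDP would
    # otherwise wait on gradients that never arrive.
    self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True)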

pytorch_lightning/strategies/hpu_parallel.py
Lines changed: 16 additions & 4 deletions

@@ -13,14 +13,14 @@
 # limitations under the License.
 import logging
 import os
-from typing import Dict, List, Optional
+from typing import Any, Callable, Dict, List, Optional
 
-import torch
 import torch.distributed
 
 import pytorch_lightning as pl
 from pytorch_lightning.overrides import LightningDistributedModule
 from pytorch_lightning.overrides.torch_distributed import broadcast_object_list
+from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment
 from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO
 from pytorch_lightning.plugins.io.hpu_plugin import HPUCheckpointIO
 from pytorch_lightning.plugins.precision import PrecisionPlugin
@@ -45,9 +45,15 @@ def __init__(
         self,
         accelerator: Optional["pl.accelerators.accelerator.Accelerator"] = None,
         parallel_devices: Optional[List[torch.device]] = None,
+        cluster_environment: Optional[ClusterEnvironment] = None,
         checkpoint_io: Optional[CheckpointIO] = None,
         precision_plugin: Optional[PrecisionPlugin] = None,
+        ddp_comm_state: Optional[object] = None,
+        ddp_comm_hook: Optional[Callable] = None,
+        ddp_comm_wrapper: Optional[Callable] = None,
+        model_averaging_period: Optional[int] = None,
         process_group_backend: Optional[str] = "hccl",
+        **kwargs: Any,
     ) -> None:
 
         if not _HPU_AVAILABLE:
@@ -56,9 +62,15 @@ def __init__(
         super().__init__(
             accelerator=accelerator,
             parallel_devices=parallel_devices,
+            cluster_environment=cluster_environment,
             checkpoint_io=checkpoint_io or HPUCheckpointIO(),
             precision_plugin=precision_plugin,
+            ddp_comm_state=ddp_comm_state,
+            ddp_comm_hook=ddp_comm_hook,
+            ddp_comm_wrapper=ddp_comm_wrapper,
+            model_averaging_period=model_averaging_period,
             process_group_backend=process_group_backend,
+            **kwargs,
         )
 
     def setup_environment(self) -> None:
@@ -75,7 +87,7 @@ def setup_environment(self) -> None:
     def determine_ddp_device_ids(self) -> None:
         return None
 
-    def pre_configure_ddp(self):  # type: ignore
+    def _pre_configure_ddp(self) -> None:
         # if unset, default `find_unused_parameters` `True`
         # Many models require setting this parameter to True, as there are corner cases
         # when not all parameter backward hooks are fired by the autograd engine even if require_grad is set to True.
@@ -97,7 +109,7 @@ def configure_ddp(self) -> None:
         # DDP does not accept static graph as param with torch < 1.11
         if _TORCH_LESSER_EQUAL_1_10_2:
             log.detail(f"{self.__class__.__name__}: configuring DistributedDataParallel")
-            self.pre_configure_ddp()
+            self._pre_configure_ddp()
             self.model = self._setup_model(LightningDistributedModule(self.model))  # type: ignore
             if self.root_device.type == "hpu" and self._static_graph:
                 self._model._set_static_graph()  # type: ignore
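
With the constructor now mirroring DDPStrategy, communication-hook and averaging options can be supplied directly. A hedged sketch (an illustration, not from the diff; whether a particular hook is supported on the hccl backend depends on the Habana software stack):

from torch.distributed.algorithms.ddp_comm_hooks import default_hooks
from pytorch_lightning import Trainer
from pytorch_lightning.strategies import HPUParallelStrategy

# The new arguments are passed straight to DDPStrategy.__init__, so they behave
# the same as they do for the plain DDP strategy.
strategy = HPUParallelStrategy(
    ddp_comm_hook=default_hooks.fp16_compress_hook,  # compress gradients to fp16
    static_graph=True,                               # forwarded through **kwargs
)
trainer = Trainer(accelerator="hpu", devices=8, strategy=strategy)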

tests/accelerators/test_hpu.py
Lines changed: 20 additions & 2 deletions

@@ -196,7 +196,7 @@ def test_accelerator_auto_with_devices_hpu():
 
 
 @RunIf(hpu=True)
-def test_strategy_choice_hpu_plugin():
+def test_strategy_choice_hpu_strategy():
     trainer = Trainer(strategy=SingleHPUStrategy(device=torch.device("hpu")), accelerator="hpu", devices=1)
     assert isinstance(trainer.strategy, SingleHPUStrategy)
 
@@ -205,7 +205,7 @@ def test_strategy_choice_hpu_plugin():
 
 
 @RunIf(hpu=True)
-def test_strategy_choice_hpu_parallel_plugin():
+def test_strategy_choice_hpu_parallel_strategy():
     trainer = Trainer(
         strategy=HPUParallelStrategy(parallel_devices=[torch.device("hpu")] * 8), accelerator="hpu", devices=8
     )
@@ -240,3 +240,21 @@ def test_hpu_auto_device_count():
 def test_hpu_unsupported_device_type():
     with pytest.raises(MisconfigurationException, match="`devices` for `HPUAccelerator` must be int, string or None."):
         Trainer(accelerator="hpu", devices=[1])
+
+
+@RunIf(hpu=True)
+def test_strategy_params_with_hpu_parallel_strategy():
+    bucket_cap_mb = 100
+    gradient_as_bucket_view = True
+    static_graph = True
+    find_unused_parameters = True
+    strategy = HPUParallelStrategy(
+        bucket_cap_mb=bucket_cap_mb,
+        gradient_as_bucket_view=gradient_as_bucket_view,
+        static_graph=static_graph,
+        find_unused_parameters=find_unused_parameters,
+    )
+    assert strategy._ddp_kwargs["bucket_cap_mb"] == bucket_cap_mb
+    assert strategy._ddp_kwargs["gradient_as_bucket_view"] == gradient_as_bucket_view
+    assert strategy._ddp_kwargs["static_graph"] == static_graph
+    assert strategy._ddp_kwargs["find_unused_parameters"] == find_unused_parameters
