
Commit 70f6744

jerome-habana, pre-commit-ci[bot], and carmocca authored and committed
Enable all ddp params for hpu parallel strategy (#13067)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Carlos Mocholí <[email protected]>
1 parent 8d0266f · commit 70f6744

File tree: 4 files changed (+40 / -7 lines)

- CHANGELOG.md
- pytorch_lightning/strategies/ddp.py
- pytorch_lightning/strategies/hpu_parallel.py
- tests/accelerators/test_hpu.py

CHANGELOG.md
Lines changed: 3 additions & 0 deletions

@@ -7,6 +7,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ## [1.6.4] - 2022-06-01
 
+### Added
+
+- Added all DDP params to be exposed through hpu parallel strategy ([#13067](https://github.com/PyTorchLightning/pytorch-lightning/pull/13067))
 ### Fixed
 
 - Fixed an issue causing zero-division error for empty dataloaders ([#12885](https://github.com/PyTorchLightning/pytorch-lightning/pull/12885))
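
As a rough illustration of what this entry enables (not part of the diff, and assuming an HPU-enabled Lightning 1.6.4 install), DDP keyword arguments can now be passed straight through the HPU strategy:

from pytorch_lightning import Trainer
from pytorch_lightning.strategies import HPUParallelStrategy

# Extra keyword arguments are forwarded to the parent DDPStrategy and end up
# in the DistributedDataParallel constructor for the wrapped model.
strategy = HPUParallelStrategy(
    find_unused_parameters=False,  # skip the per-step unused-parameter search
    gradient_as_bucket_view=True,  # let gradients alias the bucket views
)
trainer = Trainer(accelerator="hpu", devices=8, strategy=strategy)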

pytorch_lightning/strategies/ddp.py
Lines changed: 1 addition & 1 deletion

@@ -204,7 +204,7 @@ def set_world_ranks(self) -> None:
         self.cluster_environment.set_world_size(self.num_nodes * self.num_processes)
         rank_zero_only.rank = self.cluster_environment.global_rank()
 
-    def pre_configure_ddp(self):
+    def pre_configure_ddp(self) -> None:
         # if unset, default `find_unused_parameters` `True`
         # Many models require setting this parameter to True, as there are corner cases
         # when not all parameter backward hooks are fired by the autograd engine even if require_grad is set to True.
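
The only change in this file is the added return annotation; the surrounding comments explain why `find_unused_parameters` defaults to True. A minimal sketch of that defaulting pattern (an illustrative assumption, not the file's verbatim body):

def pre_configure_ddp(self) -> None:
    # Default `find_unused_parameters` to True unless the caller set it explicitly:
    # some models never fire backward hooks for every parameter, and DDP would
    # otherwise wait on gradients that never arrive.
    self._ddp_kwargs["find_unused_parameters"] = self._ddp_kwargs.get("find_unused_parameters", True)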

pytorch_lightning/strategies/hpu_parallel.py
Lines changed: 16 additions & 4 deletions

@@ -13,14 +13,14 @@
 # limitations under the License.
 import logging
 import os
-from typing import Dict, List, Optional
+from typing import Any, Callable, Dict, List, Optional
 
-import torch
 import torch.distributed
 
 import pytorch_lightning as pl
 from pytorch_lightning.overrides import LightningDistributedModule
 from pytorch_lightning.overrides.torch_distributed import broadcast_object_list
+from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment
 from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO
 from pytorch_lightning.plugins.io.hpu_plugin import HPUCheckpointIO
 from pytorch_lightning.plugins.precision import PrecisionPlugin
@@ -45,9 +45,15 @@ def __init__(
         self,
         accelerator: Optional["pl.accelerators.accelerator.Accelerator"] = None,
         parallel_devices: Optional[List[torch.device]] = None,
+        cluster_environment: Optional[ClusterEnvironment] = None,
         checkpoint_io: Optional[CheckpointIO] = None,
         precision_plugin: Optional[PrecisionPlugin] = None,
+        ddp_comm_state: Optional[object] = None,
+        ddp_comm_hook: Optional[Callable] = None,
+        ddp_comm_wrapper: Optional[Callable] = None,
+        model_averaging_period: Optional[int] = None,
         process_group_backend: Optional[str] = "hccl",
+        **kwargs: Any,
     ) -> None:
 
         if not _HPU_AVAILABLE:
@@ -56,9 +62,15 @@ def __init__(
         super().__init__(
             accelerator=accelerator,
             parallel_devices=parallel_devices,
+            cluster_environment=cluster_environment,
             checkpoint_io=checkpoint_io or HPUCheckpointIO(),
             precision_plugin=precision_plugin,
+            ddp_comm_state=ddp_comm_state,
+            ddp_comm_hook=ddp_comm_hook,
+            ddp_comm_wrapper=ddp_comm_wrapper,
+            model_averaging_period=model_averaging_period,
             process_group_backend=process_group_backend,
+            **kwargs,
         )
 
     def setup_environment(self) -> None:
@@ -75,7 +87,7 @@ def setup_environment(self) -> None:
     def determine_ddp_device_ids(self) -> None:
         return None
 
-    def pre_configure_ddp(self):  # type: ignore
+    def _pre_configure_ddp(self) -> None:
         # if unset, default `find_unused_parameters` `True`
         # Many models require setting this parameter to True, as there are corner cases
         # when not all parameter backward hooks are fired by the autograd engine even if require_grad is set to True.
@@ -97,7 +109,7 @@ def configure_ddp(self) -> None:
         # DDP does not accept static graph as param with torch < 1.11
         if _TORCH_LESSER_EQUAL_1_10_2:
             log.detail(f"{self.__class__.__name__}: configuring DistributedDataParallel")
-            self.pre_configure_ddp()
+            self._pre_configure_ddp()
             self.model = self._setup_model(LightningDistributedModule(self.model))  # type: ignore
             if self.root_device.type == "hpu" and self._static_graph:
                 self._model._set_static_graph()  # type: ignore
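
With the constructor now mirroring DDPStrategy, communication-hook and averaging options can be supplied directly. A hedged sketch (an illustration, not from the diff; whether a particular hook is supported on the hccl backend depends on the Habana software stack):

from torch.distributed.algorithms.ddp_comm_hooks import default_hooks
from pytorch_lightning import Trainer
from pytorch_lightning.strategies import HPUParallelStrategy

# The new arguments are passed straight to DDPStrategy.__init__, so they behave
# the same as they do for the plain DDP strategy.
strategy = HPUParallelStrategy(
    ddp_comm_hook=default_hooks.fp16_compress_hook,  # compress gradients to fp16
    static_graph=True,                               # forwarded through **kwargs
)
trainer = Trainer(accelerator="hpu", devices=8, strategy=strategy)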

tests/accelerators/test_hpu.py
Lines changed: 20 additions & 2 deletions

@@ -196,7 +196,7 @@ def test_accelerator_auto_with_devices_hpu():
 
 
 @RunIf(hpu=True)
-def test_strategy_choice_hpu_plugin():
+def test_strategy_choice_hpu_strategy():
     trainer = Trainer(strategy=SingleHPUStrategy(device=torch.device("hpu")), accelerator="hpu", devices=1)
     assert isinstance(trainer.strategy, SingleHPUStrategy)
 
@@ -205,7 +205,7 @@ def test_strategy_choice_hpu_plugin():
 
 
 @RunIf(hpu=True)
-def test_strategy_choice_hpu_parallel_plugin():
+def test_strategy_choice_hpu_parallel_strategy():
     trainer = Trainer(
         strategy=HPUParallelStrategy(parallel_devices=[torch.device("hpu")] * 8), accelerator="hpu", devices=8
     )
@@ -240,3 +240,21 @@ def test_hpu_auto_device_count():
 def test_hpu_unsupported_device_type():
     with pytest.raises(MisconfigurationException, match="`devices` for `HPUAccelerator` must be int, string or None."):
         Trainer(accelerator="hpu", devices=[1])
+
+
+@RunIf(hpu=True)
+def test_strategy_params_with_hpu_parallel_strategy():
+    bucket_cap_mb = 100
+    gradient_as_bucket_view = True
+    static_graph = True
+    find_unused_parameters = True
+    strategy = HPUParallelStrategy(
+        bucket_cap_mb=bucket_cap_mb,
+        gradient_as_bucket_view=gradient_as_bucket_view,
+        static_graph=static_graph,
+        find_unused_parameters=find_unused_parameters,
+    )
+    assert strategy._ddp_kwargs["bucket_cap_mb"] == bucket_cap_mb
+    assert strategy._ddp_kwargs["gradient_as_bucket_view"] == gradient_as_bucket_view
+    assert strategy._ddp_kwargs["static_graph"] == static_graph
+    assert strategy._ddp_kwargs["find_unused_parameters"] == find_unused_parameters
