
Commit abe795e

JoostvDoorn, awaelchli, and Borda authored
Fix _module_available to detect horovod.torch properly (#12377)
Co-authored-by: Adrian Wälchli <[email protected]>
Co-authored-by: Jirka Borovec <[email protected]>
Co-authored-by: Jirka <[email protected]>
1 parent 31be799 commit abe795e

6 files changed: +72 -13 lines changed


CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -923,6 +923,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed initializing optimizers unnecessarily in `DDPFullyShardedStrategy` ([#12267](https://github.com/PyTorchLightning/pytorch-lightning/pull/12267))
 
+- Fixed check for horovod module ([#12377](https://github.com/PyTorchLightning/pytorch-lightning/pull/12377))
 
 - Fixed logging to loggers with multiple eval dataloaders ([#12454](https://github.com/PyTorchLightning/pytorch-lightning/pull/12454))

pytorch_lightning/trainer/connectors/accelerator_connector.py

Lines changed: 1 addition & 1 deletion
@@ -616,7 +616,7 @@ def _handle_horovod(self) -> None:
         hvd.init()
         if isinstance(self.accelerator, GPUAccelerator):
             # Horovod assigns one local GPU per process
-            self._parallel_devices = list(range(hvd.local_size()))
+            self._parallel_devices = [torch.device(f"cuda:{i}") for i in range(hvd.local_size())]
         else:
             self._parallel_devices = [torch.device("cpu")] * hvd.local_size()
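The GPU branch now stores `torch.device` objects, mirroring the CPU branch below it, instead of bare integer indices. A minimal sketch of the resulting list, assuming a hypothetical Horovod run with two local GPUs; the `local_size` stand-in is illustrative, not taken from the code:

import torch

# Stand-in for hvd.local_size(); the real value depends on the Horovod launch.
local_size = 2

# One torch.device per local GPU, matching the torch.device("cpu") entries in the CPU branch.
parallel_devices = [torch.device(f"cuda:{i}") for i in range(local_size)]
print(parallel_devices)  # [device(type='cuda', index=0), device(type='cuda', index=1)]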

pytorch_lightning/utilities/imports.py

Lines changed: 2 additions & 6 deletions
@@ -53,13 +53,9 @@ def _module_available(module_path: str) -> bool:
     if not _package_available(module_names[0]):
         return False
     try:
-        module = importlib.import_module(module_names[0])
-    except ImportError:
+        importlib.import_module(module_path)
+    except ModuleNotFoundError:
         return False
-    for name in module_names[1:]:
-        if not hasattr(module, name):
-            return False
-        module = getattr(module, name)
     return True
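The old helper imported only the top-level package and then walked attributes with `hasattr`, but a submodule is not an attribute of its parent package until it has been imported, so `horovod.torch` could be reported as missing even when installed. Importing the full dotted path checks the right thing. A minimal standalone sketch of the fixed behaviour, using the standard-library `xml.dom` package in place of `horovod.torch`; the simplified `module_available` below is illustrative and omits the `_package_available` pre-check:

import importlib

def module_available(module_path: str) -> bool:
    """Return True if the full dotted module path can be imported."""
    try:
        importlib.import_module(module_path)
    except ModuleNotFoundError:
        return False
    return True

import xml  # importing the package alone does not import its submodules

# In a fresh interpreter this is typically False, which is why the old
# attribute-walking check mis-reported importable submodules as unavailable.
print(hasattr(xml, "dom"))
print(module_available("xml.dom"))  # True: import_module resolves dotted paths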

tests/models/data/horovod/train_default_model.py

Lines changed: 3 additions & 2 deletions
@@ -42,9 +42,10 @@
 parser = argparse.ArgumentParser()
 parser.add_argument("--trainer-options", required=True)
 parser.add_argument("--on-gpu", action="store_true", default=False)
+parser.add_argument("--check-size", action="store_true", default=False)
 
 
-def run_test_from_config(trainer_options, on_gpu, check_size=True):
+def run_test_from_config(trainer_options, on_gpu, check_size):
     """Trains the default model with the given config."""
     set_random_main_port()
     reset_seed()
@@ -107,4 +108,4 @@ def training_epoch_end(self, outputs) -> None:
 
 if __name__ == "__main__":
     args = parser.parse_args()
-    run_test_from_config(json.loads(args.trainer_options), args.on_gpu)
+    run_test_from_config(json.loads(args.trainer_options), args.on_gpu, args.check_size)
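Since the `__main__` block now forwards `args.check_size`, note that argparse maps the new `--check-size` flag to the attribute name `check_size`. A small sketch, assuming the same parser definition as above; the sample argument values are made up:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--trainer-options", required=True)
parser.add_argument("--on-gpu", action="store_true", default=False)
parser.add_argument("--check-size", action="store_true", default=False)

# Dashes in option names become underscores in the parsed namespace.
args = parser.parse_args(["--trainer-options", "{}", "--check-size"])
assert args.check_size is True
assert args.on_gpu is False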

tests/models/test_horovod.py

Lines changed: 4 additions & 3 deletions
@@ -41,7 +41,6 @@
 
 
 @RunIf(min_gpus=1, horovod=True)
-@pytest.mark.xfail(reason="FIXME(@Borda): nccl is not available in the GPU image")
 def test_nccl_is_available_on_gpu_environment():
     from tests.helpers.runif import _HOROVOD_NCCL_AVAILABLE
 
@@ -71,6 +70,8 @@ def _run_horovod(trainer_options):
     ]
     if trainer_options.get("accelerator", "cpu") == "gpu":
         cmdline += ["--on-gpu"]
+    if devices == 2:
+        cmdline += ["--check-size"]
     exit_code = subprocess.call(" ".join(cmdline), shell=True, env=os.environ.copy())
     assert exit_code == 0
 
@@ -93,7 +94,7 @@ def test_horovod_cpu(tmpdir):
 @RunIf(horovod=True, skip_windows=True)
 def test_horovod_cpu_accumulate_grad_batches(tmpdir):
     trainer_options = dict(
-        default_root_dir=tmpdir,
+        default_root_dir=str(tmpdir),
         enable_progress_bar=False,
         max_epochs=1,
         limit_train_batches=4,
@@ -154,7 +155,7 @@ def test_horovod_multi_gpu(tmpdir):
 @RunIf(min_gpus=2, horovod_nccl=True, skip_windows=True)
 def test_horovod_multi_gpu_accumulate_grad_batches(tmpdir):
     trainer_options = dict(
-        default_root_dir=tmpdir,
+        default_root_dir=str(tmpdir),
         enable_progress_bar=False,
         max_epochs=1,
         limit_train_batches=4,
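The `str(tmpdir)` casts are presumably needed because `trainer_options` is JSON-encoded before being handed to `train_default_model.py` (which calls `json.loads` on `--trainer-options`), and pytest's `tmpdir` is a `py.path.local` object that the `json` module cannot encode. A minimal sketch with a hypothetical stand-in object:

import json

class FakeTmpdir:
    """Stand-in for pytest's py.path.local, for illustration only."""

    def __str__(self) -> str:
        return "/tmp/pytest-of-user/pytest-0/test_horovod0"

tmpdir = FakeTmpdir()

try:
    json.dumps({"default_root_dir": tmpdir})
except TypeError:
    pass  # arbitrary objects are not JSON serialisable

print(json.dumps({"default_root_dir": str(tmpdir)}))  # works once cast to str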

tests/utilities/test_imports.py

Lines changed: 61 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,16 @@
1313
# limitations under the License.
1414
import operator
1515

16-
from pytorch_lightning.utilities import _module_available
16+
from pytorch_lightning.utilities import (
17+
_APEX_AVAILABLE,
18+
_BAGUA_AVAILABLE,
19+
_DEEPSPEED_AVAILABLE,
20+
_FAIRSCALE_AVAILABLE,
21+
_HOROVOD_AVAILABLE,
22+
_module_available,
23+
_OMEGACONF_AVAILABLE,
24+
_POPTORCH_AVAILABLE,
25+
)
1726
from pytorch_lightning.utilities.imports import _compare_version
1827

1928

@@ -45,3 +54,54 @@ def test_compare_version(monkeypatch):
4554
assert not _compare_version("torch", operator.ge, "1.10.0.rc0")
4655
assert _compare_version("torch", operator.ge, "1.10.0", use_base_version=True)
4756
assert not _compare_version("torch", operator.ge, "1.10.0")
57+
58+
59+
def test_imports():
60+
try:
61+
import apex # noqa
62+
except ModuleNotFoundError:
63+
assert not _APEX_AVAILABLE
64+
else:
65+
assert _APEX_AVAILABLE
66+
67+
try:
68+
import bagua # noqa
69+
except ModuleNotFoundError:
70+
assert not _BAGUA_AVAILABLE
71+
else:
72+
assert _BAGUA_AVAILABLE
73+
74+
try:
75+
import deepspeed # noqa
76+
except ModuleNotFoundError:
77+
assert not _DEEPSPEED_AVAILABLE
78+
else:
79+
assert _DEEPSPEED_AVAILABLE
80+
81+
try:
82+
import fairscale.nn # noqa
83+
except ModuleNotFoundError:
84+
assert not _FAIRSCALE_AVAILABLE
85+
else:
86+
assert _FAIRSCALE_AVAILABLE
87+
88+
try:
89+
import horovod.torch # noqa
90+
except ModuleNotFoundError:
91+
assert not _HOROVOD_AVAILABLE
92+
else:
93+
assert _HOROVOD_AVAILABLE
94+
95+
try:
96+
import omegaconf # noqa
97+
except ModuleNotFoundError:
98+
assert not _OMEGACONF_AVAILABLE
99+
else:
100+
assert _OMEGACONF_AVAILABLE
101+
102+
try:
103+
import poptorch # noqa
104+
except ModuleNotFoundError:
105+
assert not _POPTORCH_AVAILABLE
106+
else:
107+
assert _POPTORCH_AVAILABLE
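Each block pairs a real import attempt with the corresponding availability flag. The horovod case is the one this commit fixes: assuming `_HOROVOD_AVAILABLE` is computed from `_module_available("horovod.torch")` at import time, the flag and a direct check should now agree with an actual `import horovod.torch`. A condensed sketch of that single case (the other flags follow the same pattern):

from pytorch_lightning.utilities import _HOROVOD_AVAILABLE, _module_available

def test_horovod_flag_matches_import():
    try:
        import horovod.torch  # noqa: F401
    except ModuleNotFoundError:
        assert not _HOROVOD_AVAILABLE
    else:
        assert _HOROVOD_AVAILABLE
    # With the fixed helper, the flag and a direct check agree either way.
    assert _HOROVOD_AVAILABLE == _module_available("horovod.torch")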
