
Commit abe795e

JoostvDoorn, awaelchli, and Borda authored
Fix _module_available to detect horovod.torch properly (#12377)
Co-authored-by: Adrian Wälchli <[email protected]>
Co-authored-by: Jirka Borovec <[email protected]>
Co-authored-by: Jirka <[email protected]>
1 parent 31be799 commit abe795e

6 files changed: +72 -13 lines changed


CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -923,6 +923,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed initializing optimizers unnecessarily in `DDPFullyShardedStrategy` ([#12267](https://github.com/PyTorchLightning/pytorch-lightning/pull/12267))
 
+- Fixed check for horovod module ([#12377](https://github.com/PyTorchLightning/pytorch-lightning/pull/12377))
 
 - Fixed logging to loggers with multiple eval dataloaders ([#12454](https://github.com/PyTorchLightning/pytorch-lightning/pull/12454))

pytorch_lightning/trainer/connectors/accelerator_connector.py

Lines changed: 1 addition & 1 deletion
@@ -616,7 +616,7 @@ def _handle_horovod(self) -> None:
         hvd.init()
         if isinstance(self.accelerator, GPUAccelerator):
             # Horovod assigns one local GPU per process
-            self._parallel_devices = list(range(hvd.local_size()))
+            self._parallel_devices = [torch.device(f"cuda:{i}") for i in range(hvd.local_size())]
         else:
             self._parallel_devices = [torch.device("cpu")] * hvd.local_size()
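The GPU branch now stores `torch.device` objects, mirroring the CPU branch below it, instead of bare integer indices. A minimal sketch of the resulting list, assuming a hypothetical Horovod run with two local GPUs; the `local_size` stand-in is illustrative, not taken from the code:

import torch

# Stand-in for hvd.local_size(); the real value depends on the Horovod launch.
local_size = 2

# One torch.device per local GPU, matching the torch.device("cpu") entries in the CPU branch.
parallel_devices = [torch.device(f"cuda:{i}") for i in range(local_size)]
print(parallel_devices)  # [device(type='cuda', index=0), device(type='cuda', index=1)]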

pytorch_lightning/utilities/imports.py

Lines changed: 2 additions & 6 deletions
@@ -53,13 +53,9 @@ def _module_available(module_path: str) -> bool:
     if not _package_available(module_names[0]):
         return False
     try:
-        module = importlib.import_module(module_names[0])
-    except ImportError:
+        importlib.import_module(module_path)
+    except ModuleNotFoundError:
         return False
-    for name in module_names[1:]:
-        if not hasattr(module, name):
-            return False
-        module = getattr(module, name)
     return True
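The old helper imported only the top-level package and then walked attributes with `hasattr`, but a submodule is not an attribute of its parent package until it has been imported, so `horovod.torch` could be reported as missing even when installed. Importing the full dotted path checks the right thing. A minimal standalone sketch of the fixed behaviour, using the standard-library `xml.dom` package in place of `horovod.torch`; the simplified `module_available` below is illustrative and omits the `_package_available` pre-check:

import importlib

def module_available(module_path: str) -> bool:
    """Return True if the full dotted module path can be imported."""
    try:
        importlib.import_module(module_path)
    except ModuleNotFoundError:
        return False
    return True

import xml  # importing the package alone does not import its submodules

# In a fresh interpreter this is typically False, which is why the old
# attribute-walking check mis-reported importable submodules as unavailable.
print(hasattr(xml, "dom"))
print(module_available("xml.dom"))  # True: import_module resolves dotted paths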

tests/models/data/horovod/train_default_model.py

Lines changed: 3 additions & 2 deletions
@@ -42,9 +42,10 @@
 parser = argparse.ArgumentParser()
 parser.add_argument("--trainer-options", required=True)
 parser.add_argument("--on-gpu", action="store_true", default=False)
+parser.add_argument("--check-size", action="store_true", default=False)
 
 
-def run_test_from_config(trainer_options, on_gpu, check_size=True):
+def run_test_from_config(trainer_options, on_gpu, check_size):
     """Trains the default model with the given config."""
     set_random_main_port()
     reset_seed()
@@ -107,4 +108,4 @@ def training_epoch_end(self, outputs) -> None:
 
 if __name__ == "__main__":
     args = parser.parse_args()
-    run_test_from_config(json.loads(args.trainer_options), args.on_gpu)
+    run_test_from_config(json.loads(args.trainer_options), args.on_gpu, args.check_size)
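Since the `__main__` block now forwards `args.check_size`, note that argparse maps the new `--check-size` flag to the attribute name `check_size`. A small sketch, assuming the same parser definition as above; the sample argument values are made up:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--trainer-options", required=True)
parser.add_argument("--on-gpu", action="store_true", default=False)
parser.add_argument("--check-size", action="store_true", default=False)

# Dashes in option names become underscores in the parsed namespace.
args = parser.parse_args(["--trainer-options", "{}", "--check-size"])
assert args.check_size is True
assert args.on_gpu is False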

tests/models/test_horovod.py

Lines changed: 4 additions & 3 deletions
@@ -41,7 +41,6 @@
 
 
 @RunIf(min_gpus=1, horovod=True)
-@pytest.mark.xfail(reason="FIXME(@Borda): nccl is not available in the GPU image")
 def test_nccl_is_available_on_gpu_environment():
     from tests.helpers.runif import _HOROVOD_NCCL_AVAILABLE
 
@@ -71,6 +70,8 @@ def _run_horovod(trainer_options):
     ]
     if trainer_options.get("accelerator", "cpu") == "gpu":
         cmdline += ["--on-gpu"]
+    if devices == 2:
+        cmdline += ["--check-size"]
     exit_code = subprocess.call(" ".join(cmdline), shell=True, env=os.environ.copy())
     assert exit_code == 0
 
@@ -93,7 +94,7 @@ def test_horovod_cpu(tmpdir):
 @RunIf(horovod=True, skip_windows=True)
 def test_horovod_cpu_accumulate_grad_batches(tmpdir):
     trainer_options = dict(
-        default_root_dir=tmpdir,
+        default_root_dir=str(tmpdir),
         enable_progress_bar=False,
         max_epochs=1,
         limit_train_batches=4,
@@ -154,7 +155,7 @@ def test_horovod_multi_gpu(tmpdir):
 @RunIf(min_gpus=2, horovod_nccl=True, skip_windows=True)
 def test_horovod_multi_gpu_accumulate_grad_batches(tmpdir):
     trainer_options = dict(
-        default_root_dir=tmpdir,
+        default_root_dir=str(tmpdir),
         enable_progress_bar=False,
         max_epochs=1,
         limit_train_batches=4,
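The `str(tmpdir)` casts are presumably needed because `trainer_options` is JSON-encoded before being handed to `train_default_model.py` (which calls `json.loads` on `--trainer-options`), and pytest's `tmpdir` is a `py.path.local` object that the `json` module cannot encode. A minimal sketch with a hypothetical stand-in object:

import json

class FakeTmpdir:
    """Stand-in for pytest's py.path.local, for illustration only."""

    def __str__(self) -> str:
        return "/tmp/pytest-of-user/pytest-0/test_horovod0"

tmpdir = FakeTmpdir()

try:
    json.dumps({"default_root_dir": tmpdir})
except TypeError:
    pass  # arbitrary objects are not JSON serialisable

print(json.dumps({"default_root_dir": str(tmpdir)}))  # works once cast to str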

tests/utilities/test_imports.py

Lines changed: 61 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,16 @@
1313
# limitations under the License.
1414
import operator
1515

16-
from pytorch_lightning.utilities import _module_available
16+
from pytorch_lightning.utilities import (
17+
_APEX_AVAILABLE,
18+
_BAGUA_AVAILABLE,
19+
_DEEPSPEED_AVAILABLE,
20+
_FAIRSCALE_AVAILABLE,
21+
_HOROVOD_AVAILABLE,
22+
_module_available,
23+
_OMEGACONF_AVAILABLE,
24+
_POPTORCH_AVAILABLE,
25+
)
1726
from pytorch_lightning.utilities.imports import _compare_version
1827

1928

@@ -45,3 +54,54 @@ def test_compare_version(monkeypatch):
4554
assert not _compare_version("torch", operator.ge, "1.10.0.rc0")
4655
assert _compare_version("torch", operator.ge, "1.10.0", use_base_version=True)
4756
assert not _compare_version("torch", operator.ge, "1.10.0")
57+
58+
59+
def test_imports():
60+
try:
61+
import apex # noqa
62+
except ModuleNotFoundError:
63+
assert not _APEX_AVAILABLE
64+
else:
65+
assert _APEX_AVAILABLE
66+
67+
try:
68+
import bagua # noqa
69+
except ModuleNotFoundError:
70+
assert not _BAGUA_AVAILABLE
71+
else:
72+
assert _BAGUA_AVAILABLE
73+
74+
try:
75+
import deepspeed # noqa
76+
except ModuleNotFoundError:
77+
assert not _DEEPSPEED_AVAILABLE
78+
else:
79+
assert _DEEPSPEED_AVAILABLE
80+
81+
try:
82+
import fairscale.nn # noqa
83+
except ModuleNotFoundError:
84+
assert not _FAIRSCALE_AVAILABLE
85+
else:
86+
assert _FAIRSCALE_AVAILABLE
87+
88+
try:
89+
import horovod.torch # noqa
90+
except ModuleNotFoundError:
91+
assert not _HOROVOD_AVAILABLE
92+
else:
93+
assert _HOROVOD_AVAILABLE
94+
95+
try:
96+
import omegaconf # noqa
97+
except ModuleNotFoundError:
98+
assert not _OMEGACONF_AVAILABLE
99+
else:
100+
assert _OMEGACONF_AVAILABLE
101+
102+
try:
103+
import poptorch # noqa
104+
except ModuleNotFoundError:
105+
assert not _POPTORCH_AVAILABLE
106+
else:
107+
assert _POPTORCH_AVAILABLE
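Each block pairs a real import attempt with the corresponding availability flag. The horovod case is the one this commit fixes: assuming `_HOROVOD_AVAILABLE` is computed from `_module_available("horovod.torch")` at import time, the flag and a direct check should now agree with an actual `import horovod.torch`. A condensed sketch of that single case (the other flags follow the same pattern):

from pytorch_lightning.utilities import _HOROVOD_AVAILABLE, _module_available

def test_horovod_flag_matches_import():
    try:
        import horovod.torch  # noqa: F401
    except ModuleNotFoundError:
        assert not _HOROVOD_AVAILABLE
    else:
        assert _HOROVOD_AVAILABLE
    # With the fixed helper, the flag and a direct check agree either way.
    assert _HOROVOD_AVAILABLE == _module_available("horovod.torch")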
