
Commit db5a7db

carmocca and awaelchli authored and committed
Restore support for builds without distributed (#18859)
Co-authored-by: Adrian Wälchli <[email protected]> (cherry picked from commit 78ad390)
1 parent c71970c commit db5a7db
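
The fix boils down to never calling `torch.distributed.is_initialized()` without first checking `torch.distributed.is_available()`. On PyTorch builds compiled without distributed support (the default for macOS builds from source, or any build with `USE_DISTRIBUTED=0`), `is_initialized` is not defined at all, so the unguarded call raises `AttributeError`. A minimal sketch of the guarded pattern this commit applies throughout (illustrative, not the exact Lightning code):

    import torch.distributed

    def distributed_is_initialized() -> bool:
        # `is_available()` must be checked first and short-circuit: on builds
        # without distributed support, `is_initialized` does not even exist.
        return torch.distributed.is_available() and torch.distributed.is_initialized()

    if distributed_is_initialized():
        torch.distributed.barrier()  # only reached when a process group exists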

File tree

13 files changed: +35 −19 lines


src/lightning/fabric/plugins/collectives/torch_collective.py

Lines changed: 1 addition & 1 deletion
@@ -167,7 +167,7 @@ def is_available(cls) -> bool:
 
     @classmethod
     def is_initialized(cls) -> bool:
-        return dist.is_initialized()
+        return cls.is_available() and dist.is_initialized()
 
     @classmethod
     def init_group(cls, **kwargs: Any) -> None:
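
With this change, `TorchCollective.is_initialized()` short-circuits on `is_available()` instead of touching the conditionally defined `dist.is_initialized`. A sketch of the expected behavior on a hypothetical build without distributed support:

    from lightning.fabric.plugins.collectives import TorchCollective

    # On a PyTorch build with USE_DISTRIBUTED=0, is_available() is False, so
    # is_initialized() now returns False instead of raising AttributeError.
    if not TorchCollective.is_available():
        assert TorchCollective.is_initialized() is False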

src/lightning/fabric/strategies/ddp.py

Lines changed: 3 additions & 2 deletions
@@ -33,6 +33,7 @@
 from lightning.fabric.strategies.strategy import TBroadcast, _BackwardSyncControl
 from lightning.fabric.utilities.distributed import (
     ReduceOp,
+    _distributed_is_initialized,
     _get_default_process_group_backend_for_device,
     _init_dist_connection,
     _sync_ddp_if_available,
@@ -143,15 +144,15 @@ def all_reduce(
         return tensor
 
     def barrier(self, *args: Any, **kwargs: Any) -> None:
-        if not torch.distributed.is_initialized():
+        if not _distributed_is_initialized():
             return
         if torch.distributed.get_backend() == "nccl":
             torch.distributed.barrier(device_ids=self._determine_ddp_device_ids())
         else:
             torch.distributed.barrier()
 
     def broadcast(self, obj: TBroadcast, src: int = 0) -> TBroadcast:
-        if not torch.distributed.is_initialized():
+        if not _distributed_is_initialized():
             return obj
 
         obj = [obj]
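
The guarded checks make Fabric's DDP collectives degenerate to no-ops when no process group exists, which is what a single-process run on a distributed-free build needs. A usage sketch, assuming `strategy` is an already set-up Fabric `DDPStrategy` instance:

    # With no initialized process group, both calls return early:
    strategy.barrier()             # no-op instead of raising
    obj = strategy.broadcast(obj)  # returns `obj` unchanged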

src/lightning/fabric/strategies/fsdp.py

Lines changed: 3 additions & 2 deletions
@@ -54,6 +54,7 @@
 )
 from lightning.fabric.utilities.distributed import (
     ReduceOp,
+    _distributed_is_initialized,
     _get_default_process_group_backend_for_device,
     _init_dist_connection,
     _sync_ddp_if_available,
@@ -355,15 +356,15 @@ def all_reduce(
         return tensor
 
     def barrier(self, *args: Any, **kwargs: Any) -> None:
-        if not torch.distributed.is_initialized():
+        if not _distributed_is_initialized():
             return
         if torch.distributed.get_backend() == "nccl":
             torch.distributed.barrier(device_ids=[self.root_device.index])
         else:
             torch.distributed.barrier()
 
     def broadcast(self, obj: TBroadcast, src: int = 0) -> TBroadcast:
-        if not torch.distributed.is_initialized():
+        if not _distributed_is_initialized():
             return obj
 
         obj = [obj]

src/lightning/fabric/utilities/distributed.py

Lines changed: 9 additions & 2 deletions
@@ -167,7 +167,7 @@ def _sync_ddp_if_available(
         reduced value
 
     """
-    if torch.distributed.is_initialized():
+    if _distributed_is_initialized():
        return _sync_ddp(result, group=group, reduce_op=reduce_op)
     return result
 
@@ -244,7 +244,7 @@ def _all_gather_ddp_if_available(
         A tensor of shape (world_size, batch, ...)
 
     """
-    if not torch.distributed.is_initialized():
+    if not _distributed_is_initialized():
         return tensor
 
     from torch.distributed.nn.functional import all_gather
@@ -373,3 +373,10 @@ def _set_num_threads_if_needed(num_processes: int = 1) -> None:
     num_threads = _suggested_max_num_threads(num_processes)
     torch.set_num_threads(num_threads)
     os.environ["OMP_NUM_THREADS"] = str(num_threads)
+
+
+def _distributed_is_initialized() -> bool:
+    # `is_initialized` is only defined conditionally
+    # https://github.com/pytorch/pytorch/blob/v2.1.0/torch/distributed/__init__.py#L25
+    # this might happen to MacOS builds from source (default) or any build from source that sets `USE_DISTRIBUTED=0`
+    return torch.distributed.is_available() and torch.distributed.is_initialized()
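
The new `_distributed_is_initialized` helper is what every call site in this commit switches to. It is private API, so the snippet below is only an illustration of how the same guard can be used in user code, for example in test teardown:

    import torch.distributed
    from lightning.fabric.utilities.distributed import _distributed_is_initialized

    def cleanup() -> None:
        # Safe on any build, including those compiled with USE_DISTRIBUTED=0.
        if _distributed_is_initialized():
            torch.distributed.destroy_process_group()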

src/lightning/pytorch/loops/utilities.py

Lines changed: 2 additions & 1 deletion
@@ -20,6 +20,7 @@
 from torch import Tensor
 
 import lightning.pytorch as pl
+from lightning.fabric.utilities.distributed import _distributed_is_initialized
 from lightning.fabric.utilities.imports import _TORCH_EQUAL_2_0, _TORCH_GREATER_EQUAL_1_13
 from lightning.fabric.utilities.warnings import PossibleUserWarning
 from lightning.pytorch.accelerators.xla import XLAAccelerator
@@ -160,7 +161,7 @@ def _decorator(self: _Loop, *args: Any, **kwargs: Any) -> Any:
         if not hasattr(self, "inference_mode"):
             raise TypeError(f"`{type(self).__name__}.inference_mode` needs to be defined")
         context_manager: Type[ContextManager]
-        if dist.is_available() and dist.is_initialized() and dist.get_backend() == "gloo":  # noqa: SIM114
+        if _distributed_is_initialized() and dist.get_backend() == "gloo":  # noqa: SIM114
             # gloo backend does not work properly.
             # https://github.com/Lightning-AI/lightning/pull/12715/files#r854569110
             # TODO: explore why and possibly open an issue in PyTorch repository
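
For context, the branch above picks the evaluation context manager; a simplified sketch of that selection (an assumption-heavy reduction, not the exact Lightning decorator):

    import torch
    import torch.distributed as dist
    from lightning.fabric.utilities.distributed import _distributed_is_initialized

    def evaluation_context(use_inference_mode: bool):
        # Fall back to no_grad on the gloo backend, where inference_mode is
        # known to misbehave (see the linked PR discussion above).
        if _distributed_is_initialized() and dist.get_backend() == "gloo":
            return torch.no_grad()
        return torch.inference_mode() if use_inference_mode else torch.no_grad()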

src/lightning/pytorch/strategies/ddp.py

Lines changed: 3 additions & 2 deletions
@@ -28,6 +28,7 @@
 from lightning.fabric.plugins.collectives.torch_collective import default_pg_timeout
 from lightning.fabric.strategies import _StrategyRegistry
 from lightning.fabric.utilities.distributed import (
+    _distributed_is_initialized,
     _get_default_process_group_backend_for_device,
     _init_dist_connection,
     _sync_ddp_if_available,
@@ -282,7 +283,7 @@ def determine_ddp_device_ids(self) -> Optional[List[int]]:
         return [self.root_device.index]
 
     def barrier(self, *args: Any, **kwargs: Any) -> None:
-        if not torch.distributed.is_initialized():
+        if not _distributed_is_initialized():
             return
 
         if torch.distributed.get_backend() == "nccl":
@@ -291,7 +292,7 @@ def barrier(self, *args: Any, **kwargs: Any) -> None:
             torch.distributed.barrier()
 
     def broadcast(self, obj: TBroadcast, src: int = 0) -> TBroadcast:
-        if not torch.distributed.is_initialized():
+        if not _distributed_is_initialized():
             return obj
 
         obj = [obj]

src/lightning/pytorch/strategies/fsdp.py

Lines changed: 3 additions & 2 deletions
@@ -43,6 +43,7 @@
     _setup_activation_checkpointing,
 )
 from lightning.fabric.utilities.distributed import (
+    _distributed_is_initialized,
     _get_default_process_group_backend_for_device,
     _init_dist_connection,
     _sync_ddp_if_available,
@@ -382,15 +383,15 @@ def model_sharded_context(self) -> Generator[None, None, None]:
         yield
 
     def barrier(self, name: Optional[str] = None) -> None:
-        if not torch.distributed.is_initialized():
+        if not _distributed_is_initialized():
             return
         if torch.distributed.get_backend() == "nccl":
             torch.distributed.barrier(device_ids=self._determine_device_ids())
         else:
             torch.distributed.barrier()
 
     def broadcast(self, obj: TBroadcast, src: int = 0) -> TBroadcast:
-        if not torch.distributed.is_initialized():
+        if not _distributed_is_initialized():
             return obj
 
         obj = [obj]

src/lightning/pytorch/trainer/connectors/logger_connector/result.py

Lines changed: 2 additions & 1 deletion
@@ -23,6 +23,7 @@
 
 from lightning.fabric.utilities import move_data_to_device
 from lightning.fabric.utilities.apply_func import convert_tensors_to_scalars
+from lightning.fabric.utilities.distributed import _distributed_is_initialized
 from lightning.fabric.utilities.imports import _TORCH_EQUAL_2_0, _TORCH_GREATER_EQUAL_2_0
 from lightning.pytorch.utilities.data import extract_batch_size
 from lightning.pytorch.utilities.exceptions import MisconfigurationException
@@ -425,7 +426,7 @@ def _get_cache(result_metric: _ResultMetric, on_step: bool) -> Optional[Tensor]:
         elif not on_step and result_metric.meta.on_epoch:
             if result_metric._computed is None:
                 should = result_metric.meta.sync.should
-                if not should and result_metric.is_tensor and torch.distributed.is_initialized():
+                if not should and result_metric.is_tensor and _distributed_is_initialized():
                     warning_cache.warn(
                         f"It is recommended to use `self.log({result_metric.meta.name!r}, ..., sync_dist=True)`"
                         " when logging on epoch level in distributed setting to accumulate the metric across"

tests/tests_fabric/conftest.py

Lines changed: 2 additions & 1 deletion
@@ -19,6 +19,7 @@
 import lightning.fabric
 import pytest
 import torch.distributed
+from lightning.fabric.utilities.distributed import _distributed_is_initialized
 
 
 @pytest.fixture(autouse=True)
@@ -71,7 +72,7 @@ def restore_env_variables():
 def teardown_process_group():
     """Ensures that the distributed process group gets closed before the next test runs."""
     yield
-    if torch.distributed.is_available() and torch.distributed.is_initialized():
+    if _distributed_is_initialized():
         torch.distributed.destroy_process_group()
 
 

tests/tests_fabric/plugins/collectives/test_single_device.py

Lines changed: 0 additions & 1 deletion
@@ -10,7 +10,6 @@ def test_can_instantiate_without_args():
 
 def test_create_group():
     collective = SingleDeviceCollective()
-    assert collective.is_available()
     assert collective.is_initialized()
 
     with pytest.raises(RuntimeError, match=r"SingleDeviceCollective` does not own a group"):

0 commit comments
