Skip to content

Commit 33941cf

Browse files
awaelchli, justusschock, Borda
authored and committed
Fix device parser logic to avoid creating CUDA context (#14319)
* let environment disable forking
* add helper function and error messages
* tests
* changelog

Co-authored-by: Justus Schock <[email protected]>
Co-authored-by: Jirka Borovec <[email protected]>
1 parent 919ce81 commit 33941cf

File tree

6 files changed

+40
-3
lines changed

6 files changed

+40
-3
lines changed

src/pytorch_lightning/CHANGELOG.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,17 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
66

77
## [1.7.4] - 2022-08-30
88

9+
### Added
10+
11+
- Added an environment variable `PL_DISABLE_FORK` that can be used to disable all forking in the Trainer ([#14319](https://github.com/Lightning-AI/lightning/issues/14319))
12+
913
### Fixed
1014

1115
- Fixed `LightningDataModule` hparams parsing ([#12806](https://github.com/PyTorchLightning/pytorch-lightning/pull/12806))
1216
- Reset epoch progress with batch size scaler ([#13846](https://github.com/Lightning-AI/lightning/pull/13846))
1317
- Fixed restoring the trainer after using `lr_find()` so that the correct LR schedule is used for the actual training ([#14113](https://github.com/Lightning-AI/lightning/pull/14113))
1418

1519

16-
1720
## [1.7.3] - 2022-08-25
1821

1922
### Fixed

src/pytorch_lightning/strategies/launchers/multiprocessing.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,10 @@ def __init__(self, strategy: Strategy, start_method: Literal["spawn", "fork", "f
6666
f"The start method '{self._start_method}' is not available on this platform. Available methods are:"
6767
f" {', '.join(mp.get_all_start_methods())}"
6868
)
69+
if start_method in ("fork", "forkserver") and _is_forking_disabled():
70+
raise ValueError(
71+
"Forking is disabled in this environment by `PL_DISABLE_FORK=1`. Choose a different start method."
72+
)
6973

7074
@property
7175
def is_interactive_compatible(self) -> bool:
@@ -270,3 +274,8 @@ def restore(self) -> None:
270274
torch.use_deterministic_algorithms(self.use_deterministic_algorithms)
271275
torch.backends.cudnn.benchmark = self.cudnn_benchmark
272276
_set_rng_states(self.rng_states)
277+
278+
279+
def _is_forking_disabled() -> bool:
280+
"""Returns whether forking is disabled through the environment variable ``PL_DISABLE_FORK``."""
281+
return bool(int(os.environ.get("PL_DISABLE_FORK", "0")))

src/pytorch_lightning/trainer/connectors/accelerator_connector.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@
7474
TPUSpawnStrategy,
7575
)
7676
from pytorch_lightning.strategies.ddp_spawn import _DDP_FORK_ALIASES
77+
from pytorch_lightning.strategies.launchers.multiprocessing import _is_forking_disabled
7778
from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus
7879
from pytorch_lightning.utilities import (
7980
_StrategyType,
@@ -637,6 +638,10 @@ def _check_strategy_and_fallback(self) -> None:
637638
f"You selected `Trainer(strategy='{strategy_flag}')` but process forking is not supported on this"
638639
f" platform. We recommend `Trainer(strategy='ddp_spawn')` instead."
639640
)
641+
if strategy_flag in _DDP_FORK_ALIASES and _is_forking_disabled():
642+
raise ValueError(
643+
"Forking is disabled in this environment by `PL_DISABLE_FORK=1`. Choose a different strategy."
644+
)
640645
if strategy_flag:
641646
self._strategy_flag = strategy_flag
642647

src/pytorch_lightning/utilities/device_parser.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import torch.cuda
1919

2020
from pytorch_lightning.plugins.environments import TorchElasticEnvironment
21+
from pytorch_lightning.strategies.launchers.multiprocessing import _is_forking_disabled
2122
from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus
2223
from pytorch_lightning.utilities.exceptions import MisconfigurationException
2324
from pytorch_lightning.utilities.types import _DEVICE
@@ -340,7 +341,7 @@ def num_cuda_devices() -> int:
340341
Unlike :func:`torch.cuda.device_count`, this function will do its best not to create a CUDA context for fork
341342
support, if the platform allows it.
342343
"""
343-
if "fork" not in torch.multiprocessing.get_all_start_methods():
344+
if "fork" not in torch.multiprocessing.get_all_start_methods() or _is_forking_disabled():
344345
return torch.cuda.device_count()
345346
with multiprocessing.get_context("fork").Pool(1) as pool:
346347
return pool.apply(torch.cuda.device_count)
@@ -352,7 +353,7 @@ def is_cuda_available() -> bool:
352353
Unlike :func:`torch.cuda.is_available`, this function will do its best not to create a CUDA context for fork
353354
support, if the platform allows it.
354355
"""
355-
if "fork" not in torch.multiprocessing.get_all_start_methods():
356+
if "fork" not in torch.multiprocessing.get_all_start_methods() or _is_forking_disabled():
356357
return torch.cuda.is_available()
357358
with multiprocessing.get_context("fork").Pool(1) as pool:
358359
return pool.apply(torch.cuda.is_available)

tests/tests_pytorch/strategies/launchers/test_multiprocessing.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,15 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14+
import os
1415
from unittest import mock
1516
from unittest.mock import ANY, Mock
1617

1718
import pytest
1819
import torch
1920

2021
from pytorch_lightning.strategies.launchers.multiprocessing import _GlobalStateSnapshot, _MultiProcessingLauncher
22+
from tests_pytorch.helpers.runif import RunIf
2123

2224

2325
@mock.patch("pytorch_lightning.strategies.launchers.multiprocessing.mp.get_all_start_methods", return_value=[])
@@ -26,6 +28,14 @@ def test_multiprocessing_launcher_forking_on_unsupported_platform(_):
2628
_MultiProcessingLauncher(strategy=Mock(), start_method="fork")
2729

2830

31+
@RunIf(skip_windows=True)
32+
@pytest.mark.parametrize("start_method", ["fork", "forkserver"])
33+
@mock.patch.dict(os.environ, {"PL_DISABLE_FORK": "1"}, clear=True)
34+
def test_multiprocessing_launcher_disabled_forking(start_method):
35+
with pytest.raises(ValueError, match="Forking is disabled in this environment"):
36+
_MultiProcessingLauncher(strategy=Mock(), start_method=start_method)
37+
38+
2939
@pytest.mark.parametrize("start_method", ["spawn", "fork"])
3040
@mock.patch("pytorch_lightning.strategies.launchers.multiprocessing.mp")
3141
def test_multiprocessing_launcher_start_method(mp_mock, start_method):

tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -808,3 +808,12 @@ def test_accelerator_specific_checkpoint_io(*_):
808808
def test_ddp_fork_on_unsupported_platform(_, strategy):
809809
with pytest.raises(ValueError, match="process forking is not supported on this platform"):
810810
Trainer(strategy=strategy)
811+
812+
813+
@RunIf(skip_windows=True)
814+
@pytest.mark.parametrize("strategy", _DDP_FORK_ALIASES)
815+
@mock.patch.dict(os.environ, {"PL_DISABLE_FORK": "1"}, clear=True)
816+
def test_strategy_choice_ddp_spawn_in_interactive_when_fork_disabled(strategy):
817+
"""Test there is an error when forking is disabled via the environment variable and the user requests fork."""
818+
with pytest.raises(ValueError, match="Forking is disabled in this environment"):
819+
Trainer(devices=2, strategy=strategy)

0 commit comments

Comments
 (0)