Support auto_select_gpus with accelerator and devices api (#12608)

kaushikb11 · awaelchli · lexierule · commit 2187f309c154 · 2022-04-13T14:14:29.000-04:00
Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -94,6 +94,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Raise `MisconfigurationException` when the accelerator is available but the user passes invalid `([]/0/"0")` values to the `devices` flag ([#12708](https://github.com/PyTorchLightning/pytorch-lightning/pull/12708))
 
 
+- Support `auto_select_gpus` with the accelerator and devices API ([#12608](https://github.com/PyTorchLightning/pytorch-lightning/pull/12608))
+
+
 ## [1.6.0] - 2022-03-29
 
 ### Added
diff --git a/pytorch_lightning/lite/lite.py b/pytorch_lightning/lite/lite.py
@@ -38,7 +38,6 @@
     _update_dataloader,
     has_iterable_dataset,
 )
-from pytorch_lightning.utilities.device_parser import _parse_devices
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.seed import seed_everything
 
@@ -80,7 +79,6 @@ def __init__(
     ) -> None:
         self._check_accelerator_support(accelerator)
         self._check_strategy_support(strategy)
-        gpu_ids, tpu_cores = _parse_devices(gpus=gpus, auto_select_gpus=False, tpu_cores=tpu_cores)
         self._accelerator_connector = AcceleratorConnector(
             num_processes=None,
             devices=devices,
@@ -89,7 +87,6 @@ def __init__(
             accelerator=accelerator,
             strategy=strategy,
             gpus=gpus,
-            gpu_ids=gpu_ids,
             num_nodes=num_nodes,
             sync_batchnorm=False,  # TODO: add support?
             benchmark=False,
@@ -99,6 +96,7 @@ def __init__(
             amp_type="native",
             amp_level=None,
             plugins=plugins,
+            auto_select_gpus=False,
         )
         self._strategy = self._accelerator_connector.strategy
         self._accelerator = self._strategy.accelerator
diff --git a/pytorch_lightning/trainer/connectors/accelerator_connector.py b/pytorch_lightning/trainer/connectors/accelerator_connector.py
@@ -69,6 +69,7 @@
     StrategyRegistry,
     TPUSpawnStrategy,
 )
+from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus
 from pytorch_lightning.utilities import (
     _StrategyType,
     AMPType,
@@ -102,11 +103,11 @@ def __init__(
         benchmark: Optional[bool] = None,
         replace_sampler_ddp: bool = True,
         deterministic: bool = False,
+        auto_select_gpus: bool = False,
         num_processes: Optional[int] = None,  # deprecated
-        tpu_cores: Optional[Union[List[int], int]] = None,  # deprecated
+        tpu_cores: Optional[Union[List[int], str, int]] = None,  # deprecated
         ipus: Optional[int] = None,  # deprecated
         gpus: Optional[Union[List[int], str, int]] = None,  # deprecated
-        gpu_ids: Optional[List[int]] = None,  # TODO can be removed
     ) -> None:
         """The AcceleratorConnector parses several Trainer arguments and instantiates the Strategy including other
         components such as the Accelerator and Precision plugins.
@@ -173,6 +174,7 @@ def __init__(
         self.checkpoint_io: Optional[CheckpointIO] = None
         self._amp_type_flag: Optional[LightningEnum] = None
         self._amp_level_flag: Optional[str] = amp_level
+        self._auto_select_gpus: bool = auto_select_gpus
 
         self._check_config_and_set_final_flags(
             strategy=strategy,
@@ -408,7 +410,7 @@ def _check_device_config_and_set_final_flags(
         num_processes: Optional[int],
         gpus: Optional[Union[List[int], str, int]],
         ipus: Optional[int],
-        tpu_cores: Optional[Union[List[int], int]],
+        tpu_cores: Optional[Union[List[int], str, int]],
     ) -> None:
         self._num_nodes_flag = int(num_nodes) if num_nodes is not None else 1
         self._devices_flag = devices
@@ -521,6 +523,8 @@ def _set_parallel_devices_and_init_accelerator(self) -> None:
         self._gpus = self._devices_flag if not self._gpus else self._gpus
         self._tpu_cores = self._devices_flag if not self._tpu_cores else self._tpu_cores
 
+        self._set_devices_flag_if_auto_select_gpus_passed()
+
         self._devices_flag = self.accelerator.parse_devices(self._devices_flag)
         if not self._parallel_devices:
             self._parallel_devices = self.accelerator.get_parallel_devices(self._devices_flag)
@@ -529,6 +533,11 @@ def _set_devices_flag_if_auto_passed(self) -> None:
         if self._devices_flag == "auto" or self._devices_flag is None:
             self._devices_flag = self.accelerator.auto_device_count()
 
+    def _set_devices_flag_if_auto_select_gpus_passed(self) -> None:
+        if self._auto_select_gpus and isinstance(self._gpus, int) and isinstance(self.accelerator, GPUAccelerator):
+            self._devices_flag = pick_multiple_gpus(self._gpus)
+            log.info(f"Auto select gpus: {self._devices_flag}")
+
     def _choose_and_init_cluster_environment(self) -> ClusterEnvironment:
         if isinstance(self._cluster_environment_flag, ClusterEnvironment):
             return self._cluster_environment_flag
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
@@ -22,7 +22,7 @@
 from copy import deepcopy
 from datetime import timedelta
 from pathlib import Path
-from typing import Any, Callable, cast, Dict, Iterable, List, Optional, Tuple, Type, Union
+from typing import Any, Callable, cast, Dict, Iterable, List, Optional, Type, Union
 from weakref import proxy
 
 import torch
@@ -81,7 +81,6 @@
     _IPU_AVAILABLE,
     _TPU_AVAILABLE,
     AMPType,
-    device_parser,
     GradClipAlgorithmType,
     parsing,
 )
@@ -225,7 +224,7 @@ def __init__(
                 a power search or `binsearch` that estimates the batch size through a binary search.
                 Default: ``False``.
 
-            auto_select_gpus: If enabled and ``gpus`` is an integer, pick available
+            auto_select_gpus: If enabled and ``gpus`` or ``devices`` is an integer, pick available
                 gpus automatically. This is especially useful when
                 GPUs are configured to be in "exclusive mode", such
                 that only one process at a time can access them.
@@ -478,8 +477,6 @@ def __init__(
         log.detail(f"{self.__class__.__name__}: Initializing trainer with parameters: {locals()}")
         self.state = TrainerState()
 
-        gpu_ids, tpu_cores = self._parse_devices(gpus, auto_select_gpus, tpu_cores)
-
         # init connectors
         self._data_connector = DataConnector(self, multiple_trainloader_mode)
 
@@ -491,12 +488,12 @@ def __init__(
             accelerator=accelerator,
             strategy=strategy,
             gpus=gpus,
-            gpu_ids=gpu_ids,
             num_nodes=num_nodes,
             sync_batchnorm=sync_batchnorm,
             benchmark=benchmark,
             replace_sampler_ddp=replace_sampler_ddp,
             deterministic=deterministic,
+            auto_select_gpus=auto_select_gpus,
             precision=precision,
             amp_type=amp_backend,
             amp_level=amp_level,
@@ -1770,14 +1767,6 @@ def _call_strategy_hook(
 
         return output
 
-    @staticmethod
-    def _parse_devices(
-        gpus: Optional[Union[List[int], str, int]],
-        auto_select_gpus: bool,
-        tpu_cores: Optional[Union[List[int], str, int]],
-    ) -> Tuple[Optional[List[int]], Optional[Union[List[int], int]]]:
-        return device_parser._parse_devices(gpus, auto_select_gpus, tpu_cores)
-
     @staticmethod
     def _log_api_event(event: str) -> None:
         torch._C._log_api_usage_once("lightning.trainer." + event)
diff --git a/pytorch_lightning/tuner/auto_gpu_select.py b/pytorch_lightning/tuner/auto_gpu_select.py
@@ -22,15 +22,19 @@ def pick_multiple_gpus(nb: int) -> List[int]:
     """
     Raises:
         MisconfigurationException:
-            If ``gpus`` is set to 0, when ``auto_select_gpus=True``.
+            If ``gpus`` or ``devices`` is set to 0, when ``auto_select_gpus=True``, or when the requested number is
+            higher than the number of GPUs available on the machine.
     """
     if nb == 0:
         raise MisconfigurationException(
             "auto_select_gpus=True, gpus=0 is not a valid configuration."
             " Please select a valid number of GPU resources when using auto_select_gpus."
         )
 
-    nb = torch.cuda.device_count() if nb == -1 else nb
+    num_gpus = torch.cuda.device_count()
+    if nb > num_gpus:
+        raise MisconfigurationException(f"You requested {nb} GPUs but your machine only has {num_gpus} GPUs.")
+    nb = num_gpus if nb == -1 else nb
 
     picked: List[int] = []
     for _ in range(nb):
diff --git a/tests/accelerators/test_accelerator_connector.py b/tests/accelerators/test_accelerator_connector.py
@@ -494,15 +494,19 @@ def test_accelerator_cpu(_):
     trainer = Trainer(accelerator="cpu")
     assert isinstance(trainer.accelerator, CPUAccelerator)
 
-    with pytest.raises(MisconfigurationException, match="You requested gpu:"):
-        trainer = Trainer(gpus=1)
     with pytest.raises(
         MisconfigurationException,
         match="GPUAccelerator can not run on your system since the accelerator is not available.",
     ):
-        trainer = Trainer(accelerator="gpu")
-    with pytest.raises(MisconfigurationException, match="You requested gpu:"):
-        trainer = Trainer(accelerator="cpu", gpus=1)
+        with pytest.deprecated_call(match=r"is deprecated in v1.7 and will be removed"):
+            Trainer(gpus=1)
+    with pytest.raises(
+        MisconfigurationException,
+        match="GPUAccelerator can not run on your system since the accelerator is not available.",
+    ):
+        Trainer(accelerator="gpu")
+
+    Trainer(accelerator="cpu", gpus=1)
 
 
 @RunIf(min_gpus=1)
diff --git a/tests/trainer/properties/test_auto_gpu_select.py b/tests/trainer/properties/test_auto_gpu_select.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import re
+from unittest import mock
 
 import pytest
 import torch
@@ -22,26 +23,6 @@
 from tests.helpers.runif import RunIf
 
 
-# TODO: add pytest.deprecated_call @daniellepintz
-@RunIf(min_gpus=2)
-@pytest.mark.parametrize(
-    ["auto_select_gpus", "gpus", "expected_error"],
-    [(True, 0, MisconfigurationException), (True, -1, None), (False, 0, None), (False, -1, None)],
-)
-def test_trainer_with_gpus_options_combination_at_available_gpus_env(auto_select_gpus, gpus, expected_error):
-    if expected_error:
-        with pytest.raises(
-            expected_error,
-            match=re.escape(
-                "auto_select_gpus=True, gpus=0 is not a valid configuration."
-                " Please select a valid number of GPU resources when using auto_select_gpus."
-            ),
-        ):
-            Trainer(auto_select_gpus=auto_select_gpus, gpus=gpus)
-    else:
-        Trainer(auto_select_gpus=auto_select_gpus, gpus=gpus)
-
-
 @RunIf(min_gpus=2)
 @pytest.mark.parametrize(
     ["nb", "expected_gpu_idxs", "expected_error"],
@@ -59,3 +40,23 @@ def test_pick_multiple_gpus(nb, expected_gpu_idxs, expected_error):
             pick_multiple_gpus(nb)
     else:
         assert expected_gpu_idxs == pick_multiple_gpus(nb)
+
+
+@mock.patch("torch.cuda.device_count", return_value=1)
+def test_pick_multiple_gpus_more_than_available(*_):
+    with pytest.raises(MisconfigurationException, match="You requested 3 GPUs but your machine only has 1 GPUs"):
+        pick_multiple_gpus(3)
+
+
+@mock.patch("torch.cuda.device_count", return_value=2)
+@mock.patch("pytorch_lightning.trainer.connectors.accelerator_connector.pick_multiple_gpus", return_value=[1])
+def test_auto_select_gpus(*_):
+
+    trainer = Trainer(auto_select_gpus=True, accelerator="gpu", devices=1)
+    assert trainer.num_devices == 1
+    assert trainer.device_ids == [1]
+
+    trainer = Trainer(auto_select_gpus=True, gpus=1)
+
+    assert trainer.num_devices == 1
+    assert trainer.device_ids == [1]
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
@@ -1105,8 +1105,8 @@ def test_gpu_choice(tmpdir):
     num_gpus = torch.cuda.device_count()
     Trainer(**trainer_options, accelerator="gpu", devices=num_gpus, auto_select_gpus=True)
 
-    with pytest.raises(MisconfigurationException, match=r".*But your machine only has.*"):
-        Trainer(**trainer_options, accelerator="gpu", devices=num_gpus + 1, auto_select_gpus=True)
+    with pytest.raises(MisconfigurationException, match=r".*but your machine only has.*"):
+        Trainer(accelerator="gpu", devices=num_gpus + 1, auto_select_gpus=True)
 
 
 @pytest.mark.parametrize("limit_val_batches", [0.0, 1, 1.0, 0.5, 5])