Skip to content

Commit 2187f30

Browse files
kaushikb11 and awaelchli
authored and committed
Support auto_select_gpus with accelerator and devices api (#12608)
Co-authored-by: Adrian Wälchli <[email protected]>
1 parent d95447a commit 2187f30

File tree

8 files changed

+57
-49
lines changed

8 files changed

+57
-49
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -94,6 +94,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
9494
- Raise `MisconfigurationException` when the accelerator is available but the user passes invalid `([]/0/"0")` values to the `devices` flag ([#12708](https://github.com/PyTorchLightning/pytorch-lightning/pull/12708))
9595

9696

97+
- Support `auto_select_gpus` with the accelerator and devices API ([#12608](https://github.com/PyTorchLightning/pytorch-lightning/pull/12608))
98+
99+
97100
## [1.6.0] - 2022-03-29
98101

99102
### Added

pytorch_lightning/lite/lite.py

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -38,7 +38,6 @@
3838
_update_dataloader,
3939
has_iterable_dataset,
4040
)
41-
from pytorch_lightning.utilities.device_parser import _parse_devices
4241
from pytorch_lightning.utilities.exceptions import MisconfigurationException
4342
from pytorch_lightning.utilities.seed import seed_everything
4443

@@ -80,7 +79,6 @@ def __init__(
8079
) -> None:
8180
self._check_accelerator_support(accelerator)
8281
self._check_strategy_support(strategy)
83-
gpu_ids, tpu_cores = _parse_devices(gpus=gpus, auto_select_gpus=False, tpu_cores=tpu_cores)
8482
self._accelerator_connector = AcceleratorConnector(
8583
num_processes=None,
8684
devices=devices,
@@ -89,7 +87,6 @@ def __init__(
8987
accelerator=accelerator,
9088
strategy=strategy,
9189
gpus=gpus,
92-
gpu_ids=gpu_ids,
9390
num_nodes=num_nodes,
9491
sync_batchnorm=False, # TODO: add support?
9592
benchmark=False,
@@ -99,6 +96,7 @@ def __init__(
9996
amp_type="native",
10097
amp_level=None,
10198
plugins=plugins,
99+
auto_select_gpus=False,
102100
)
103101
self._strategy = self._accelerator_connector.strategy
104102
self._accelerator = self._strategy.accelerator

pytorch_lightning/trainer/connectors/accelerator_connector.py

Lines changed: 12 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -69,6 +69,7 @@
6969
StrategyRegistry,
7070
TPUSpawnStrategy,
7171
)
72+
from pytorch_lightning.tuner.auto_gpu_select import pick_multiple_gpus
7273
from pytorch_lightning.utilities import (
7374
_StrategyType,
7475
AMPType,
@@ -102,11 +103,11 @@ def __init__(
102103
benchmark: Optional[bool] = None,
103104
replace_sampler_ddp: bool = True,
104105
deterministic: bool = False,
106+
auto_select_gpus: bool = False,
105107
num_processes: Optional[int] = None, # deprecated
106-
tpu_cores: Optional[Union[List[int], int]] = None, # deprecated
108+
tpu_cores: Optional[Union[List[int], str, int]] = None, # deprecated
107109
ipus: Optional[int] = None, # deprecated
108110
gpus: Optional[Union[List[int], str, int]] = None, # deprecated
109-
gpu_ids: Optional[List[int]] = None, # TODO can be removed
110111
) -> None:
111112
"""The AcceleratorConnector parses several Trainer arguments and instantiates the Strategy including other
112113
components such as the Accelerator and Precision plugins.
@@ -173,6 +174,7 @@ def __init__(
173174
self.checkpoint_io: Optional[CheckpointIO] = None
174175
self._amp_type_flag: Optional[LightningEnum] = None
175176
self._amp_level_flag: Optional[str] = amp_level
177+
self._auto_select_gpus: bool = auto_select_gpus
176178

177179
self._check_config_and_set_final_flags(
178180
strategy=strategy,
@@ -408,7 +410,7 @@ def _check_device_config_and_set_final_flags(
408410
num_processes: Optional[int],
409411
gpus: Optional[Union[List[int], str, int]],
410412
ipus: Optional[int],
411-
tpu_cores: Optional[Union[List[int], int]],
413+
tpu_cores: Optional[Union[List[int], str, int]],
412414
) -> None:
413415
self._num_nodes_flag = int(num_nodes) if num_nodes is not None else 1
414416
self._devices_flag = devices
@@ -521,6 +523,8 @@ def _set_parallel_devices_and_init_accelerator(self) -> None:
521523
self._gpus = self._devices_flag if not self._gpus else self._gpus
522524
self._tpu_cores = self._devices_flag if not self._tpu_cores else self._tpu_cores
523525

526+
self._set_devices_flag_if_auto_select_gpus_passed()
527+
524528
self._devices_flag = self.accelerator.parse_devices(self._devices_flag)
525529
if not self._parallel_devices:
526530
self._parallel_devices = self.accelerator.get_parallel_devices(self._devices_flag)
@@ -529,6 +533,11 @@ def _set_devices_flag_if_auto_passed(self) -> None:
529533
if self._devices_flag == "auto" or self._devices_flag is None:
530534
self._devices_flag = self.accelerator.auto_device_count()
531535

536+
def _set_devices_flag_if_auto_select_gpus_passed(self) -> None:
537+
if self._auto_select_gpus and isinstance(self._gpus, int) and isinstance(self.accelerator, GPUAccelerator):
538+
self._devices_flag = pick_multiple_gpus(self._gpus)
539+
log.info(f"Auto select gpus: {self._devices_flag}")
540+
532541
def _choose_and_init_cluster_environment(self) -> ClusterEnvironment:
533542
if isinstance(self._cluster_environment_flag, ClusterEnvironment):
534543
return self._cluster_environment_flag

pytorch_lightning/trainer/trainer.py

Lines changed: 3 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@
2222
from copy import deepcopy
2323
from datetime import timedelta
2424
from pathlib import Path
25-
from typing import Any, Callable, cast, Dict, Iterable, List, Optional, Tuple, Type, Union
25+
from typing import Any, Callable, cast, Dict, Iterable, List, Optional, Type, Union
2626
from weakref import proxy
2727

2828
import torch
@@ -81,7 +81,6 @@
8181
_IPU_AVAILABLE,
8282
_TPU_AVAILABLE,
8383
AMPType,
84-
device_parser,
8584
GradClipAlgorithmType,
8685
parsing,
8786
)
@@ -225,7 +224,7 @@ def __init__(
225224
a power search or `binsearch` that estimates the batch size through a binary search.
226225
Default: ``False``.
227226
228-
auto_select_gpus: If enabled and ``gpus`` is an integer, pick available
227+
auto_select_gpus: If enabled and ``gpus`` or ``devices`` is an integer, pick available
229228
gpus automatically. This is especially useful when
230229
GPUs are configured to be in "exclusive mode", such
231230
that only one process at a time can access them.
@@ -478,8 +477,6 @@ def __init__(
478477
log.detail(f"{self.__class__.__name__}: Initializing trainer with parameters: {locals()}")
479478
self.state = TrainerState()
480479

481-
gpu_ids, tpu_cores = self._parse_devices(gpus, auto_select_gpus, tpu_cores)
482-
483480
# init connectors
484481
self._data_connector = DataConnector(self, multiple_trainloader_mode)
485482

@@ -491,12 +488,12 @@ def __init__(
491488
accelerator=accelerator,
492489
strategy=strategy,
493490
gpus=gpus,
494-
gpu_ids=gpu_ids,
495491
num_nodes=num_nodes,
496492
sync_batchnorm=sync_batchnorm,
497493
benchmark=benchmark,
498494
replace_sampler_ddp=replace_sampler_ddp,
499495
deterministic=deterministic,
496+
auto_select_gpus=auto_select_gpus,
500497
precision=precision,
501498
amp_type=amp_backend,
502499
amp_level=amp_level,
@@ -1770,14 +1767,6 @@ def _call_strategy_hook(
17701767

17711768
return output
17721769

1773-
@staticmethod
1774-
def _parse_devices(
1775-
gpus: Optional[Union[List[int], str, int]],
1776-
auto_select_gpus: bool,
1777-
tpu_cores: Optional[Union[List[int], str, int]],
1778-
) -> Tuple[Optional[List[int]], Optional[Union[List[int], int]]]:
1779-
return device_parser._parse_devices(gpus, auto_select_gpus, tpu_cores)
1780-
17811770
@staticmethod
17821771
def _log_api_event(event: str) -> None:
17831772
torch._C._log_api_usage_once("lightning.trainer." + event)

pytorch_lightning/tuner/auto_gpu_select.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -22,15 +22,19 @@ def pick_multiple_gpus(nb: int) -> List[int]:
2222
"""
2323
Raises:
2424
MisconfigurationException:
25-
If ``gpus`` is set to 0, when ``auto_select_gpus=True``.
25+
If ``gpus`` or ``devices`` is set to 0, when ``auto_select_gpus=True``, or when the requested number is
26+
higher than the number of GPUs available on the machine.
2627
"""
2728
if nb == 0:
2829
raise MisconfigurationException(
2930
"auto_select_gpus=True, gpus=0 is not a valid configuration."
3031
" Please select a valid number of GPU resources when using auto_select_gpus."
3132
)
3233

33-
nb = torch.cuda.device_count() if nb == -1 else nb
34+
num_gpus = torch.cuda.device_count()
35+
if nb > num_gpus:
36+
raise MisconfigurationException(f"You requested {nb} GPUs but your machine only has {num_gpus} GPUs.")
37+
nb = num_gpus if nb == -1 else nb
3438

3539
picked: List[int] = []
3640
for _ in range(nb):

tests/accelerators/test_accelerator_connector.py

Lines changed: 9 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -494,15 +494,19 @@ def test_accelerator_cpu(_):
494494
trainer = Trainer(accelerator="cpu")
495495
assert isinstance(trainer.accelerator, CPUAccelerator)
496496

497-
with pytest.raises(MisconfigurationException, match="You requested gpu:"):
498-
trainer = Trainer(gpus=1)
499497
with pytest.raises(
500498
MisconfigurationException,
501499
match="GPUAccelerator can not run on your system since the accelerator is not available.",
502500
):
503-
trainer = Trainer(accelerator="gpu")
504-
with pytest.raises(MisconfigurationException, match="You requested gpu:"):
505-
trainer = Trainer(accelerator="cpu", gpus=1)
501+
with pytest.deprecated_call(match=r"is deprecated in v1.7 and will be removed"):
502+
Trainer(gpus=1)
503+
with pytest.raises(
504+
MisconfigurationException,
505+
match="GPUAccelerator can not run on your system since the accelerator is not available.",
506+
):
507+
Trainer(accelerator="gpu")
508+
509+
Trainer(accelerator="cpu", gpus=1)
506510

507511

508512
@RunIf(min_gpus=1)

tests/trainer/properties/test_auto_gpu_select.py

Lines changed: 21 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414
import re
15+
from unittest import mock
1516

1617
import pytest
1718
import torch
@@ -22,26 +23,6 @@
2223
from tests.helpers.runif import RunIf
2324

2425

25-
# TODO: add pytest.deprecated_call @daniellepintz
26-
@RunIf(min_gpus=2)
27-
@pytest.mark.parametrize(
28-
["auto_select_gpus", "gpus", "expected_error"],
29-
[(True, 0, MisconfigurationException), (True, -1, None), (False, 0, None), (False, -1, None)],
30-
)
31-
def test_trainer_with_gpus_options_combination_at_available_gpus_env(auto_select_gpus, gpus, expected_error):
32-
if expected_error:
33-
with pytest.raises(
34-
expected_error,
35-
match=re.escape(
36-
"auto_select_gpus=True, gpus=0 is not a valid configuration."
37-
" Please select a valid number of GPU resources when using auto_select_gpus."
38-
),
39-
):
40-
Trainer(auto_select_gpus=auto_select_gpus, gpus=gpus)
41-
else:
42-
Trainer(auto_select_gpus=auto_select_gpus, gpus=gpus)
43-
44-
4526
@RunIf(min_gpus=2)
4627
@pytest.mark.parametrize(
4728
["nb", "expected_gpu_idxs", "expected_error"],
@@ -59,3 +40,23 @@ def test_pick_multiple_gpus(nb, expected_gpu_idxs, expected_error):
5940
pick_multiple_gpus(nb)
6041
else:
6142
assert expected_gpu_idxs == pick_multiple_gpus(nb)
43+
44+
45+
@mock.patch("torch.cuda.device_count", return_value=1)
46+
def test_pick_multiple_gpus_more_than_available(*_):
47+
with pytest.raises(MisconfigurationException, match="You requested 3 GPUs but your machine only has 1 GPUs"):
48+
pick_multiple_gpus(3)
49+
50+
51+
@mock.patch("torch.cuda.device_count", return_value=2)
52+
@mock.patch("pytorch_lightning.trainer.connectors.accelerator_connector.pick_multiple_gpus", return_value=[1])
53+
def test_auto_select_gpus(*_):
54+
55+
trainer = Trainer(auto_select_gpus=True, accelerator="gpu", devices=1)
56+
assert trainer.num_devices == 1
57+
assert trainer.device_ids == [1]
58+
59+
trainer = Trainer(auto_select_gpus=True, gpus=1)
60+
61+
assert trainer.num_devices == 1
62+
assert trainer.device_ids == [1]

tests/trainer/test_trainer.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1105,8 +1105,8 @@ def test_gpu_choice(tmpdir):
11051105
num_gpus = torch.cuda.device_count()
11061106
Trainer(**trainer_options, accelerator="gpu", devices=num_gpus, auto_select_gpus=True)
11071107

1108-
with pytest.raises(MisconfigurationException, match=r".*But your machine only has.*"):
1109-
Trainer(**trainer_options, accelerator="gpu", devices=num_gpus + 1, auto_select_gpus=True)
1108+
with pytest.raises(MisconfigurationException, match=r".*but your machine only has.*"):
1109+
Trainer(accelerator="gpu", devices=num_gpus + 1, auto_select_gpus=True)
11101110

11111111

11121112
@pytest.mark.parametrize("limit_val_batches", [0.0, 1, 1.0, 0.5, 5])

0 commit comments

Comments
 (0)