Skip to content
This repository was archived by the owner on Nov 3, 2023. It is now read-only.

Commit c3b13a7

Browse files
authored
Tune improvements and Cleanup (#69)
* wip * update * updates * add test * try * add back * fix
1 parent 4dcbdf0 commit c3b13a7

File tree

8 files changed

+79
-24
lines changed

8 files changed

+79
-24
lines changed

.github/workflows/test.yaml

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,6 @@ jobs:
3737
python -m pip install codecov
3838
python -m pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
3939
if [ -f requirements-test.txt ]; then python -m pip install -r requirements-test.txt; fi
40-
HOROVOD_WITH_GLOO=1 HOROVOD_WITHOUT_MPI=1 HOROVOD_WITHOUT_MXNET=1 pip install git+https://github.com/horovod/horovod.git
4140
- name: Install package
4241
run: |
4342
python -m pip install -e .
@@ -124,7 +123,6 @@ jobs:
124123
python -m pip install codecov
125124
python -m pip install -U ray
126125
if [ -f requirements-test.txt ]; then python -m pip install -r requirements-test.txt; fi
127-
HOROVOD_WITH_GLOO=1 HOROVOD_WITHOUT_MPI=1 HOROVOD_WITHOUT_MXNET=1 pip install -U git+https://github.com/horovod/horovod.git
128126
- name: Install package
129127
run: |
130128
python -m pip install -e .
@@ -193,4 +191,35 @@ jobs:
193191
pushd ray_lightning/tests
194192
echo "running examples with Ray Client 1" && python -m pytest -v --durations=0 -x test_client.py
195193
echo "running examples with Ray Client 2" && python -m pytest -v --durations=0 -x test_client_2.py
196-
echo "running examples with Ray Client 3" && python -m pytest -v --durations=0 -x test_client_3.py
194+
echo "running examples with Ray Client 3" && python -m pytest -v --durations=0 -x test_client_3.py
195+
196+
test_linux_compat:
197+
# Test compatibility when optional libraries are not installed.
198+
runs-on: ubuntu-latest
199+
timeout-minutes: 40
200+
steps:
201+
- uses: actions/checkout@v2
202+
- name: Set up Python 3.7
203+
uses: actions/setup-python@v2
204+
with:
205+
python-version: 3.7
206+
- name: Install dependencies
207+
run: |
208+
python -m pip install --upgrade pip
209+
python -m pip install --upgrade setuptools
210+
python -m pip install codecov
211+
python -m pip install -U ray
212+
if [ -f requirements-test.txt ]; then python -m pip install -r requirements-test.txt; fi
213+
HOROVOD_WITH_GLOO=1 HOROVOD_WITHOUT_MPI=1 HOROVOD_WITHOUT_MXNET=1 pip install -U git+https://github.com/horovod/horovod.git
214+
- name: Uninstall unavailable dependencies
215+
run: |
216+
# Uninstall Tune
217+
pip uninstall -y tabulate
218+
- name: Install package
219+
run: |
220+
python -m pip install -e .
221+
- name: Test with Pytest
222+
run: |
223+
pushd ray_lightning/tests
224+
python -m pytest -v --durations=0 -x test_ddp.py
225+
python -m pytest -v --durations=0 -x test_horovod.py

README.md

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,10 @@ You no longer have to set environment variables or configurations and run your t
8686
Example using `ray_lightning` with Tune:
8787

8888
```python
89+
from ray import tune
90+
8991
from ray_lightning import RayPlugin
90-
from ray_lightning.tune import TuneReportCallback
92+
from ray_lightning.tune import TuneReportCallback, get_tune_ddp_resources
9193

9294
def train_mnist(config):
9395

@@ -111,17 +113,14 @@ config = {
111113
"batch_size": tune.choice([32, 64, 128]),
112114
}
113115

114-
# Make sure to specify how many actors each training run will create via the "extra_cpu" field.
116+
# Make sure to pass in ``resources_per_trial`` using the ``get_tune_ddp_resources`` utility.
115117
analysis = tune.run(
116118
train_mnist,
117119
metric="loss",
118120
mode="min",
119121
config=config,
120122
num_samples=num_samples,
121-
resources_per_trial={
122-
"cpu": 1,
123-
"extra_cpu": 4
124-
},
123+
resources_per_trial=get_tune_ddp_resources(num_workers=4),
125124
name="tune_mnist")
126125

127126
print("Best hyperparameters found were: ", analysis.best_config)

ray_lightning/examples/ray_ddp_example.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
import ray
1212
from ray import tune
13-
from ray_lightning.tune import TuneReportCallback
13+
from ray_lightning.tune import TuneReportCallback, get_tune_ddp_resources
1414
from ray_lightning import RayPlugin
1515
from ray_lightning.tests.utils import LightningMNISTClassifier
1616

@@ -108,11 +108,8 @@ def tune_mnist(data_dir,
108108
mode="min",
109109
config=config,
110110
num_samples=num_samples,
111-
resources_per_trial={
112-
"cpu": 1,
113-
"extra_cpu": num_workers,
114-
"extra_gpu": num_workers * int(use_gpu)
115-
},
111+
resources_per_trial=get_tune_ddp_resources(
112+
num_workers=num_workers, use_gpu=use_gpu),
116113
name="tune_mnist")
117114

118115
print("Best hyperparameters found were: ", analysis.best_config)

ray_lightning/examples/ray_ddp_tune.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import pytorch_lightning as pl
88
import ray
99
from ray import tune
10-
from ray_lightning.tune import TuneReportCallback
10+
from ray_lightning.tune import TuneReportCallback, get_tune_ddp_resources
1111
from ray_lightning import RayPlugin
1212
from ray_lightning.tests.utils import LightningMNISTClassifier
1313

@@ -71,11 +71,8 @@ def tune_mnist(data_dir,
7171
mode="min",
7272
config=config,
7373
num_samples=num_samples,
74-
resources_per_trial={
75-
"cpu": 1,
76-
"extra_cpu": num_workers,
77-
"extra_gpu": num_workers * int(use_gpu)
78-
},
74+
resources_per_trial=get_tune_ddp_resources(
75+
num_workers=num_workers, use_gpu=use_gpu),
7976
name="tune_mnist")
8077

8178
print("Best hyperparameters found were: ", analysis.best_config)

ray_lightning/ray_ddp.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
import io
2+
import socket
3+
from contextlib import closing
24
from typing import Callable, Dict, List, Union, Any
35

46
import os
@@ -12,7 +14,6 @@
1214
from pytorch_lightning.utilities import rank_zero_only
1315

1416
import ray
15-
from ray.util.sgd.utils import find_free_port
1617
from ray.util.queue import Queue
1718

1819
from ray_lightning.session import init_session
@@ -21,6 +22,13 @@
2122
from ray_lightning.ray_environment import RayEnvironment
2223

2324

25+
def find_free_port():
    """Return an OS-assigned free TCP port number.

    Binds an ephemeral socket to port 0 so the kernel picks an unused
    port, reads the chosen port back, and closes the socket.

    Returns:
        int: A port number that was free at the time of the call. Note
        the port is only *likely* still free afterwards — another
        process may bind it before the caller does (inherent TOCTOU
        race; acceptable for its use in picking a DDP init port).
    """
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
        # SO_REUSEADDR must be set *before* bind() to influence the
        # bind; the original set it afterwards, where it had no effect.
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        s.bind(("", 0))
        return s.getsockname()[1]
30+
31+
2432
@ray.remote
2533
class RayExecutor:
2634
"""A class to execute any arbitrary function remotely."""
@@ -39,7 +47,7 @@ def set_env_vars(self, keys: List[str], values: List[str]):
3947

4048
def get_node_ip(self):
4149
"""Returns the IP address of the node that this Ray actor is on."""
42-
return ray.services.get_node_ip_address()
50+
return ray.util.get_node_ip_address()
4351

4452
def execute(self, fn: Callable, *args, **kwargs):
4553
"""Execute the provided function and return the result."""

ray_lightning/ray_horovod.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ def __init__(self,
7474
num_hosts: int = 1,
7575
num_slots: int = 1,
7676
use_gpu: bool = False):
77+
7778
if not HOROVOD_AVAILABLE:
7879
raise RuntimeError("Please install Horovod to use this plugin.")
7980
if not ray.is_initialized():

ray_lightning/tune.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from pytorch_lightning import Trainer, LightningModule
77

88
from ray_lightning.session import put_queue, get_actor_rank
9+
from ray_lightning.util import Unavailable
910

1011
try:
1112
from ray import tune
@@ -14,15 +15,31 @@
1415
TUNE_INSTALLED = True
1516
except ImportError:
1617
tune = None
17-
TuneCallback = object
18+
TuneCallback = Unavailable
1819

1920
def is_session_enabled():
2021
return False
2122

23+
get_tune_ddp_resources = Unavailable
24+
2225
TUNE_INSTALLED = False
2326

2427
if TUNE_INSTALLED:
2528

29+
def get_tune_ddp_resources(num_workers: int = 1,
                           cpus_per_worker: int = 1,
                           use_gpu: bool = False) -> "PlacementGroupFactory":
    """Return the Ray Tune ``PlacementGroupFactory`` for a DDP trial.

    Reserves one head bundle of 1 CPU for the Tune trial driver plus
    one bundle per training worker.

    Args:
        num_workers: Number of training-worker bundles to reserve.
        cpus_per_worker: CPUs reserved per worker bundle.
        use_gpu: If True, each worker bundle also reserves 1 GPU.

    Returns:
        A ``PlacementGroupFactory`` suitable for ``tune.run``'s
        ``resources_per_trial`` argument. (The original annotation of
        ``Dict[str, int]`` was incorrect — no dict is returned.)
    """
    # Imported lazily so the enclosing module stays importable when
    # Tune is not installed.
    from ray.tune import PlacementGroupFactory

    head_bundle = {"CPU": 1}
    child_bundle = {"CPU": cpus_per_worker, "GPU": int(use_gpu)}
    # Copy per worker so later mutation of one bundle cannot alias others.
    child_bundles = [child_bundle.copy() for _ in range(num_workers)]
    bundles = [head_bundle] + child_bundles
    # PACK places bundles on as few nodes as possible.
    return PlacementGroupFactory(bundles, strategy="PACK")
42+
2643
class TuneReportCallback(TuneCallback):
2744
"""Distributed PyTorch Lightning to Ray Tune reporting callback
2845
@@ -197,3 +214,8 @@ def __init__(
197214
def _handle(self, trainer: Trainer, pl_module: LightningModule):
198215
self._checkpoint._handle(trainer, pl_module)
199216
self._report._handle(trainer, pl_module)
217+
218+
else:
219+
# If Tune is not installed.
220+
TuneReportCallback = Unavailable
221+
TuneReportCheckpointCallback = Unavailable

ray_lightning/util.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ def setup(self, trainer: Trainer, model: LightningModule) -> None:
1717
return super(GPUAccelerator, self).setup(trainer, model)
1818

1919
def on_train_start(self) -> None:
20+
if "cuda" not in str(self.root_device):
21+
raise RuntimeError("GPUs were requested but are not available.")
2022
super(DelayedGPUAccelerator, self).on_train_start()
2123

2224

0 commit comments

Comments
 (0)