
Commit 07c5d22

cleaning up demos (#313)
1 parent f7d7624 commit 07c5d22

9 files changed: 205 additions & 110 deletions (+205 −110 lines)

examples/basic_examples/gpu_template.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -51,14 +51,14 @@ def main(hparams):
     # gpu args
     parent_parser.add_argument(
         '--gpus',
-        type=str,
-        default='-1',
-        help='any integer (number of GPUs to use) or -1 for all'
+        type=int,
+        default=2,
+        help='how many gpus'
     )
     parent_parser.add_argument(
         '--distributed_backend',
         type=str,
-        default=None,
+        default='dp',
         help='supports three options dp, ddp, ddp2'
     )
     parent_parser.add_argument(
```
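
The new defaults above feed the `Trainer` directly. Below is a minimal, self-contained sketch of how the parsed values would typically be forwarded; the wiring is illustrative and the template's actual `main()` may differ.

```python
# Illustrative sketch only: assumes a machine with the requested GPUs available.
from argparse import ArgumentParser

from pytorch_lightning import Trainer


def build_trainer(hparams):
    # --gpus is now an int (e.g. 2) and --distributed_backend defaults to 'dp'
    return Trainer(
        gpus=hparams.gpus,
        distributed_backend=hparams.distributed_backend,
    )


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--gpus', type=int, default=2, help='how many gpus')
    parser.add_argument('--distributed_backend', type=str, default='dp',
                        help='supports three options dp, ddp, ddp2')
    build_trainer(parser.parse_args())
```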

examples/basic_examples/lightning_module_template.py

Lines changed: 13 additions & 21 deletions
```diff
@@ -8,7 +8,7 @@
 import torchvision.transforms as transforms
 import torch
 import torch.nn.functional as F
-from test_tube import HyperOptArgumentParser
+from argparse import ArgumentParser
 from torch import optim
 from torch.utils.data import DataLoader
 from torch.utils.data.distributed import DistributedSampler
@@ -95,7 +95,7 @@ def training_step(self, batch, batch_idx):
         loss_val = self.loss(y, y_hat)
 
         # in DP mode (default) make sure if result is scalar, there's another dim in the beginning
-        if self.trainer.use_dp:
+        if self.trainer.use_dp or self.trainer.use_ddp2:
             loss_val = loss_val.unsqueeze(0)
 
         output = OrderedDict({
@@ -126,7 +126,7 @@ def validation_step(self, batch, batch_idx):
             val_acc = val_acc.cuda(loss_val.device.index)
 
         # in DP mode (default) make sure if result is scalar, there's another dim in the beginning
-        if self.trainer.use_dp:
+        if self.trainer.use_dp or self.trainer.use_ddp2:
             loss_val = loss_val.unsqueeze(0)
             val_acc = val_acc.unsqueeze(0)
 
@@ -168,7 +168,7 @@ def validation_end(self, outputs):
         val_loss_mean /= len(outputs)
         val_acc_mean /= len(outputs)
         tqdm_dict = {'val_loss': val_loss_mean, 'val_acc': val_acc_mean}
-        result = {'progress_bar': tqdm_dict}
+        result = {'progress_bar': tqdm_dict, 'logs': tqdm_dict}
         return result
 
     # ---------------------
@@ -190,20 +190,20 @@ def __dataloader(self, train):
         dataset = MNIST(root=self.hparams.data_root, train=train,
                         transform=transform, download=True)
 
-        # when using multi-node (ddp) we need to add the datasampler
+        # when using multi-node (ddp) we need to add the datasampler
        train_sampler = None
         batch_size = self.hparams.batch_size
 
         if self.use_ddp:
-            train_sampler = DistributedSampler(dataset, rank=self.trainer.proc_rank)
-            batch_size = batch_size // self.trainer.world_size  # scale batch size
+            train_sampler = DistributedSampler(dataset)
 
         should_shuffle = train_sampler is None
         loader = DataLoader(
             dataset=dataset,
             batch_size=batch_size,
             shuffle=should_shuffle,
-            sampler=train_sampler
+            sampler=train_sampler,
+            num_workers=0
         )
 
         return loader
@@ -231,7 +231,7 @@ def add_model_specific_args(parent_parser, root_dir): # pragma: no cover
         :param root_dir:
         :return:
         """
-        parser = HyperOptArgumentParser(strategy=parent_parser.strategy, parents=[parent_parser])
+        parser = ArgumentParser(parents=[parent_parser])
 
         # param overwrites
         # parser.set_defaults(gradient_clip_val=5.0)
@@ -241,21 +241,13 @@ def add_model_specific_args(parent_parser, root_dir): # pragma: no cover
         parser.add_argument('--out_features', default=10, type=int)
         # use 500 for CPU, 50000 for GPU to see speed difference
         parser.add_argument('--hidden_dim', default=50000, type=int)
-        parser.opt_list('--drop_prob', default=0.2, options=[0.2, 0.5], type=float, tunable=True)
-        parser.opt_list('--learning_rate', default=0.001 * 8, type=float,
-                        options=[0.0001, 0.0005, 0.001],
-                        tunable=True)
+        parser.add_argument('--drop_prob', default=0.2, type=float)
+        parser.add_argument('--learning_rate', default=0.001, type=float)
 
         # data
         parser.add_argument('--data_root', default=os.path.join(root_dir, 'mnist'), type=str)
 
         # training params (opt)
-        parser.opt_list('--optimizer_name', default='adam', type=str,
-                        options=['adam'], tunable=False)
-
-        # if using 2 nodes with 4 gpus each the batch size here
-        # (256) will be 256 / (2*8) = 16 per gpu
-        parser.opt_list('--batch_size', default=256 * 8, type=int,
-                        options=[32, 64, 128, 256], tunable=False,
-                        help='batch size will be divided over all gpus being used across all nodes')
+        parser.add_argument('--optimizer_name', default='adam', type=str)
+        parser.add_argument('--batch_size', default=64, type=int)
         return parser
```
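
With `HyperOptArgumentParser` removed, `add_model_specific_args` now composes with a plain `argparse` parent parser, which is how the multi-node demos below consume it. A minimal usage sketch, assuming the `examples` package is importable from the repository root:

```python
# Sketch of the plain-argparse flow used by the demos in this commit.
import os
from argparse import ArgumentParser

from examples.basic_examples.lightning_module_template import LightningTemplateModel

root_dir = os.path.dirname(os.path.realpath(__file__))

# the parent parser holds trainer-level args; add_help=False avoids '-h' clashes
parent_parser = ArgumentParser(add_help=False)

# the LightningModule appends its own args (drop_prob, learning_rate, batch_size, ...)
parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)
hparams = parser.parse_args()

model = LightningTemplateModel(hparams)
```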

examples/multi_node_examples/README.md

Lines changed: 12 additions & 2 deletions
````diff
@@ -4,7 +4,17 @@ To run this demo which launches a single job that trains on 2 nodes (2 gpus per
 
 1. Log into the jumphost node of your SLURM-managed cluster.
 2. Create a conda environment with Lightning and a GPU PyTorch version.
-3. Submit this script.
+3. Choose a script to submit
+
+#### DDP
+Submit this job to run with distributedDataParallel (2 nodes, 2 gpus each)
+```bash
+sbatch ddp_job_submit.sh YourEnv
+```
+
+#### DDP2
+Submit this job to run with a different implementation of distributedDataParallel.
+In this version, each node acts like DataParallel but syncs across nodes like DDP.
 ```bash
-sbatch job_submit.sh --env=YourEnv
+sbatch ddp2_job_submit.sh YourEnv
 ```
````
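
On the Python side, the only difference between the two submissions is the `distributed_backend` passed to the `Trainer`, as the demo scripts below show. A minimal sketch for the 2-node, 2-GPU-per-node setup described above:

```python
from pytorch_lightning import Trainer

# DDP: one process per GPU, gradients synchronized across all processes
ddp_trainer = Trainer(gpus=2, nb_gpu_nodes=2, distributed_backend='ddp')

# DDP2: each node behaves like DataParallel, while nodes sync like DDP
ddp2_trainer = Trainer(gpus=2, nb_gpu_nodes=2, distributed_backend='ddp2')
```
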
examples/multi_node_examples/ddp2_job_submit.sh

Lines changed: 27 additions & 0 deletions
```diff
@@ -0,0 +1,27 @@
+#!/bin/bash -l
+
+# SLURM SUBMIT SCRIPT
+#SBATCH --nodes=2
+#SBATCH --gres=gpu:2
+#SBATCH --ntasks-per-node=1
+#SBATCH --mem=0
+#SBATCH --time=0-02:00:00
+
+# activate conda env
+source activate $1
+
+# -------------------------
+# debugging flags (optional)
+export NCCL_DEBUG=INFO
+export PYTHONFAULTHANDLER=1
+
+# on your cluster you might need these:
+# set the network interface
+# export NCCL_SOCKET_IFNAME=^docker0,lo
+
+# might need the latest cuda
+# module load NCCL/2.4.7-1-cuda.10.0
+# -------------------------
+
+# run script from above
+srun python3 multi_node_ddp2_demo.py
```

examples/multi_node_examples/job_submit.sh renamed to examples/multi_node_examples/ddp_job_submit.sh

Lines changed: 4 additions & 4 deletions
```diff
@@ -8,12 +8,12 @@
 #SBATCH --time=0-02:00:00
 
 # activate conda env
-source activate $env
+source activate $1
 
 # -------------------------
 # debugging flags (optional)
-# export NCCL_DEBUG=INFO
-# export PYTHONFAULTHANDLER=1
+export NCCL_DEBUG=INFO
+export PYTHONFAULTHANDLER=1
 
 # on your cluster you might need these:
 # set the network interface
@@ -24,4 +24,4 @@ source activate $env
 # -------------------------
 
 # run script from above
-srun python multi_node_demo.py
+srun python3 multi_node_ddp_demo.py
```
examples/multi_node_examples/multi_node_ddp2_demo.py

Lines changed: 55 additions & 0 deletions
```diff
@@ -0,0 +1,55 @@
+"""
+Multi-node example (GPU)
+"""
+import os
+import numpy as np
+import torch
+
+from argparse import ArgumentParser
+from pytorch_lightning import Trainer
+from examples.basic_examples.lightning_module_template import LightningTemplateModel
+
+SEED = 2334
+torch.manual_seed(SEED)
+np.random.seed(SEED)
+
+
+def main(hparams):
+    """
+    Main training routine specific for this project
+    :param hparams:
+    :return:
+    """
+    # ------------------------
+    # 1 INIT LIGHTNING MODEL
+    # ------------------------
+    model = LightningTemplateModel(hparams)
+
+    # ------------------------
+    # 2 INIT TRAINER
+    # ------------------------
+    trainer = Trainer(
+        gpus=2,
+        nb_gpu_nodes=2,
+        distributed_backend='ddp2'
+    )
+
+    # ------------------------
+    # 3 START TRAINING
+    # ------------------------
+    trainer.fit(model)
+
+
+if __name__ == '__main__':
+
+    root_dir = os.path.dirname(os.path.realpath(__file__))
+    parent_parser = ArgumentParser(add_help=False)
+
+    # each LightningModule defines arguments relevant to it
+    parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)
+    hyperparams = parser.parse_args()
+
+    # ---------------------
+    # RUN TRAINING
+    # ---------------------
+    main(hyperparams)
```

examples/multi_node_examples/multi_node_demo.py renamed to examples/multi_node_examples/multi_node_ddp_demo.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -30,7 +30,8 @@ def main(hparams):
     # ------------------------
     trainer = Trainer(
         gpus=2,
-        nb_gpu_nodes=2
+        nb_gpu_nodes=2,
+        distributed_backend='ddp'
     )
 
     # ------------------------
```

pytorch_lightning/root_module/memory.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -35,7 +35,7 @@ def get_variable_sizes(self):
         out_sizes = []
         input_ = self.model.example_input_array
 
-        if self.model.use_ddp or self.model.use_dp or self.model.single_gpu:
+        if self.model.on_gpu:
             input_ = input_.cuda(0)
 
         if self.model.trainer.use_amp:
```
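
The consolidated `on_gpu` flag presumably covers every GPU mode the old three-way check enumerated (and, by extension, the new ddp2 path). A hypothetical sketch of that equivalence, not pytorch_lightning's actual implementation:

```python
# Hypothetical illustration only -- not the library's real code.
class DeviceFlagsSketch:
    def __init__(self, use_ddp=False, use_dp=False, single_gpu=False):
        self.use_ddp = use_ddp
        self.use_dp = use_dp
        self.single_gpu = single_gpu

    @property
    def on_gpu(self):
        # a single flag standing in for the removed three-way check
        return self.use_ddp or self.use_dp or self.single_gpu


assert DeviceFlagsSketch(use_dp=True).on_gpu
assert not DeviceFlagsSketch().on_gpu
```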
