
Commit 07c5d22

cleaning up demos (#313)
1 parent f7d7624 commit 07c5d22

9 files changed: 205 additions & 110 deletions (+205 −110 lines)

examples/basic_examples/gpu_template.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -51,14 +51,14 @@ def main(hparams):
     # gpu args
     parent_parser.add_argument(
         '--gpus',
-        type=str,
-        default='-1',
-        help='any integer (number of GPUs to use) or -1 for all'
+        type=int,
+        default=2,
+        help='how many gpus'
     )
     parent_parser.add_argument(
         '--distributed_backend',
         type=str,
-        default=None,
+        default='dp',
         help='supports three options dp, ddp, ddp2'
     )
     parent_parser.add_argument(
```
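
The new defaults above feed the `Trainer` directly. Below is a minimal, self-contained sketch of how the parsed values would typically be forwarded; the wiring is illustrative and the template's actual `main()` may differ.

```python
# Illustrative sketch only: assumes a machine with the requested GPUs available.
from argparse import ArgumentParser

from pytorch_lightning import Trainer


def build_trainer(hparams):
    # --gpus is now an int (e.g. 2) and --distributed_backend defaults to 'dp'
    return Trainer(
        gpus=hparams.gpus,
        distributed_backend=hparams.distributed_backend,
    )


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('--gpus', type=int, default=2, help='how many gpus')
    parser.add_argument('--distributed_backend', type=str, default='dp',
                        help='supports three options dp, ddp, ddp2')
    build_trainer(parser.parse_args())
```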

examples/basic_examples/lightning_module_template.py

Lines changed: 13 additions & 21 deletions
```diff
@@ -8,7 +8,7 @@
 import torchvision.transforms as transforms
 import torch
 import torch.nn.functional as F
-from test_tube import HyperOptArgumentParser
+from argparse import ArgumentParser
 from torch import optim
 from torch.utils.data import DataLoader
 from torch.utils.data.distributed import DistributedSampler
@@ -95,7 +95,7 @@ def training_step(self, batch, batch_idx):
         loss_val = self.loss(y, y_hat)
 
         # in DP mode (default) make sure if result is scalar, there's another dim in the beginning
-        if self.trainer.use_dp:
+        if self.trainer.use_dp or self.trainer.use_ddp2:
             loss_val = loss_val.unsqueeze(0)
 
         output = OrderedDict({
@@ -126,7 +126,7 @@ def validation_step(self, batch, batch_idx):
             val_acc = val_acc.cuda(loss_val.device.index)
 
         # in DP mode (default) make sure if result is scalar, there's another dim in the beginning
-        if self.trainer.use_dp:
+        if self.trainer.use_dp or self.trainer.use_ddp2:
             loss_val = loss_val.unsqueeze(0)
             val_acc = val_acc.unsqueeze(0)
 
@@ -168,7 +168,7 @@ def validation_end(self, outputs):
         val_loss_mean /= len(outputs)
         val_acc_mean /= len(outputs)
         tqdm_dict = {'val_loss': val_loss_mean, 'val_acc': val_acc_mean}
-        result = {'progress_bar': tqdm_dict}
+        result = {'progress_bar': tqdm_dict, 'logs': tqdm_dict}
         return result
 
     # ---------------------
@@ -190,20 +190,20 @@ def __dataloader(self, train):
         dataset = MNIST(root=self.hparams.data_root, train=train,
                         transform=transform, download=True)
 
-        # when using multi-node (ddp) we need to add the datasampler
+        # when using multi-node (ddp) we need to add the datasampler
        train_sampler = None
         batch_size = self.hparams.batch_size
 
         if self.use_ddp:
-            train_sampler = DistributedSampler(dataset, rank=self.trainer.proc_rank)
-            batch_size = batch_size // self.trainer.world_size  # scale batch size
+            train_sampler = DistributedSampler(dataset)
 
         should_shuffle = train_sampler is None
         loader = DataLoader(
             dataset=dataset,
             batch_size=batch_size,
             shuffle=should_shuffle,
-            sampler=train_sampler
+            sampler=train_sampler,
+            num_workers=0
         )
 
         return loader
@@ -231,7 +231,7 @@ def add_model_specific_args(parent_parser, root_dir): # pragma: no cover
         :param root_dir:
         :return:
         """
-        parser = HyperOptArgumentParser(strategy=parent_parser.strategy, parents=[parent_parser])
+        parser = ArgumentParser(parents=[parent_parser])
 
         # param overwrites
         # parser.set_defaults(gradient_clip_val=5.0)
@@ -241,21 +241,13 @@ def add_model_specific_args(parent_parser, root_dir): # pragma: no cover
         parser.add_argument('--out_features', default=10, type=int)
         # use 500 for CPU, 50000 for GPU to see speed difference
         parser.add_argument('--hidden_dim', default=50000, type=int)
-        parser.opt_list('--drop_prob', default=0.2, options=[0.2, 0.5], type=float, tunable=True)
-        parser.opt_list('--learning_rate', default=0.001 * 8, type=float,
-                        options=[0.0001, 0.0005, 0.001],
-                        tunable=True)
+        parser.add_argument('--drop_prob', default=0.2, type=float)
+        parser.add_argument('--learning_rate', default=0.001, type=float)
 
         # data
         parser.add_argument('--data_root', default=os.path.join(root_dir, 'mnist'), type=str)
 
         # training params (opt)
-        parser.opt_list('--optimizer_name', default='adam', type=str,
-                        options=['adam'], tunable=False)
-
-        # if using 2 nodes with 4 gpus each the batch size here
-        # (256) will be 256 / (2*8) = 16 per gpu
-        parser.opt_list('--batch_size', default=256 * 8, type=int,
-                        options=[32, 64, 128, 256], tunable=False,
-                        help='batch size will be divided over all gpus being used across all nodes')
+        parser.add_argument('--optimizer_name', default='adam', type=str)
+        parser.add_argument('--batch_size', default=64, type=int)
         return parser
```
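
With `HyperOptArgumentParser` removed, `add_model_specific_args` now composes with a plain `argparse` parent parser, which is how the multi-node demos below consume it. A minimal usage sketch, assuming the `examples` package is importable from the repository root:

```python
# Sketch of the plain-argparse flow used by the demos in this commit.
import os
from argparse import ArgumentParser

from examples.basic_examples.lightning_module_template import LightningTemplateModel

root_dir = os.path.dirname(os.path.realpath(__file__))

# the parent parser holds trainer-level args; add_help=False avoids '-h' clashes
parent_parser = ArgumentParser(add_help=False)

# the LightningModule appends its own args (drop_prob, learning_rate, batch_size, ...)
parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)
hparams = parser.parse_args()

model = LightningTemplateModel(hparams)
```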

examples/multi_node_examples/README.md

Lines changed: 12 additions & 2 deletions
````diff
@@ -4,7 +4,17 @@ To run this demo which launches a single job that trains on 2 nodes (2 gpus per
 
 1. Log into the jumphost node of your SLURM-managed cluster.
 2. Create a conda environment with Lightning and a GPU PyTorch version.
-3. Submit this script.
+3. Choose a script to submit
+
+#### DDP
+Submit this job to run with distributedDataParallel (2 nodes, 2 gpus each)
+```bash
+sbatch ddp_job_submit.sh YourEnv
+```
+
+#### DDP2
+Submit this job to run with a different implementation of distributedDataParallel.
+In this version, each node acts like DataParallel but syncs across nodes like DDP.
 ```bash
-sbatch job_submit.sh --env=YourEnv
+sbatch ddp2_job_submit.sh YourEnv
 ```
````
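
On the Python side, the only difference between the two submissions is the `distributed_backend` passed to the `Trainer`, as the demo scripts below show. A minimal sketch for the 2-node, 2-GPU-per-node setup described above:

```python
from pytorch_lightning import Trainer

# DDP: one process per GPU, gradients synchronized across all processes
ddp_trainer = Trainer(gpus=2, nb_gpu_nodes=2, distributed_backend='ddp')

# DDP2: each node behaves like DataParallel, while nodes sync like DDP
ddp2_trainer = Trainer(gpus=2, nb_gpu_nodes=2, distributed_backend='ddp2')
```
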
examples/multi_node_examples/ddp2_job_submit.sh

Lines changed: 27 additions & 0 deletions
```diff
@@ -0,0 +1,27 @@
+#!/bin/bash -l
+
+# SLURM SUBMIT SCRIPT
+#SBATCH --nodes=2
+#SBATCH --gres=gpu:2
+#SBATCH --ntasks-per-node=1
+#SBATCH --mem=0
+#SBATCH --time=0-02:00:00
+
+# activate conda env
+source activate $1
+
+# -------------------------
+# debugging flags (optional)
+export NCCL_DEBUG=INFO
+export PYTHONFAULTHANDLER=1
+
+# on your cluster you might need these:
+# set the network interface
+# export NCCL_SOCKET_IFNAME=^docker0,lo
+
+# might need the latest cuda
+# module load NCCL/2.4.7-1-cuda.10.0
+# -------------------------
+
+# run script from above
+srun python3 multi_node_ddp2_demo.py
```

examples/multi_node_examples/job_submit.sh renamed to examples/multi_node_examples/ddp_job_submit.sh

Lines changed: 4 additions & 4 deletions
```diff
@@ -8,12 +8,12 @@
 #SBATCH --time=0-02:00:00
 
 # activate conda env
-source activate $env
+source activate $1
 
 # -------------------------
 # debugging flags (optional)
-# export NCCL_DEBUG=INFO
-# export PYTHONFAULTHANDLER=1
+export NCCL_DEBUG=INFO
+export PYTHONFAULTHANDLER=1
 
 # on your cluster you might need these:
 # set the network interface
@@ -24,4 +24,4 @@ source activate $env
 # -------------------------
 
 # run script from above
-srun python multi_node_demo.py
+srun python3 multi_node_ddp_demo.py
```
examples/multi_node_examples/multi_node_ddp2_demo.py

Lines changed: 55 additions & 0 deletions
```diff
@@ -0,0 +1,55 @@
+"""
+Multi-node example (GPU)
+"""
+import os
+import numpy as np
+import torch
+
+from argparse import ArgumentParser
+from pytorch_lightning import Trainer
+from examples.basic_examples.lightning_module_template import LightningTemplateModel
+
+SEED = 2334
+torch.manual_seed(SEED)
+np.random.seed(SEED)
+
+
+def main(hparams):
+    """
+    Main training routine specific for this project
+    :param hparams:
+    :return:
+    """
+    # ------------------------
+    # 1 INIT LIGHTNING MODEL
+    # ------------------------
+    model = LightningTemplateModel(hparams)
+
+    # ------------------------
+    # 2 INIT TRAINER
+    # ------------------------
+    trainer = Trainer(
+        gpus=2,
+        nb_gpu_nodes=2,
+        distributed_backend='ddp2'
+    )
+
+    # ------------------------
+    # 3 START TRAINING
+    # ------------------------
+    trainer.fit(model)
+
+
+if __name__ == '__main__':
+
+    root_dir = os.path.dirname(os.path.realpath(__file__))
+    parent_parser = ArgumentParser(add_help=False)
+
+    # each LightningModule defines arguments relevant to it
+    parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)
+    hyperparams = parser.parse_args()
+
+    # ---------------------
+    # RUN TRAINING
+    # ---------------------
+    main(hyperparams)
```

examples/multi_node_examples/multi_node_demo.py renamed to examples/multi_node_examples/multi_node_ddp_demo.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -30,7 +30,8 @@ def main(hparams):
     # ------------------------
     trainer = Trainer(
         gpus=2,
-        nb_gpu_nodes=2
+        nb_gpu_nodes=2,
+        distributed_backend='ddp'
     )
 
     # ------------------------
```

pytorch_lightning/root_module/memory.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -35,7 +35,7 @@ def get_variable_sizes(self):
         out_sizes = []
         input_ = self.model.example_input_array
 
-        if self.model.use_ddp or self.model.use_dp or self.model.single_gpu:
+        if self.model.on_gpu:
             input_ = input_.cuda(0)
 
         if self.model.trainer.use_amp:
```
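
The consolidated `on_gpu` flag presumably covers every GPU mode the old three-way check enumerated (and, by extension, the new ddp2 path). A hypothetical sketch of that equivalence, not pytorch_lightning's actual implementation:

```python
# Hypothetical illustration only -- not the library's real code.
class DeviceFlagsSketch:
    def __init__(self, use_ddp=False, use_dp=False, single_gpu=False):
        self.use_ddp = use_ddp
        self.use_dp = use_dp
        self.single_gpu = single_gpu

    @property
    def on_gpu(self):
        # a single flag standing in for the removed three-way check
        return self.use_ddp or self.use_dp or self.single_gpu


assert DeviceFlagsSketch(use_dp=True).on_gpu
assert not DeviceFlagsSketch().on_gpu
```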
