@@ -1,4 +1,4 @@
-from typing import Callable, Dict
+from typing import Callable, Dict, List

 import os
 from collections import defaultdict
@@ -7,7 +7,7 @@
 import torch
 from pytorch_lightning.plugins import DDPSpawnPlugin
 from pytorch_lightning import _logger as log, LightningModule
-from ray.util.sgd.torch.utils import setup_address
+from ray.util.sgd.utils import find_free_port

 from ray_lightning.session import init_session
 from ray_lightning.util import process_results, Queue
@@ -20,7 +20,15 @@ class RayExecutor:

     def set_env_var(self, key: str, value: str):
         """Set an environment variable with the provided values."""
-        os.environ[key] = value
+        if value is not None:
+            value = str(value)
+            os.environ[key] = value
+
+    def set_env_vars(self, keys: List[str], values: List[str]):
+        """Sets multiple env vars with the provided values."""
+        assert len(keys) == len(values)
+        for key, value in zip(keys, values):
+            self.set_env_var(key, value)

     def get_node_ip(self):
         """Returns the IP address of the node that this Ray actor is on."""
@@ -137,16 +145,19 @@ def start_training(self, trainer):
         receive intermediate results, and process those results. Finally
         retrieve the training results from the rank 0 worker and return."""

-        if "PL_GLOBAL_SEED" in os.environ:
-            seed = os.environ["PL_GLOBAL_SEED"]
-            ray.get([
-                w.set_env_var.remote("PL_GLOBAL_SEED", seed)
-                for w in self.workers
-            ])
+        # Get rank 0 worker address and port for DDP connection.
+        os.environ["MASTER_ADDR"] = ray.get(
+            self.workers[0].get_node_ip.remote())
+        os.environ["MASTER_PORT"] = str(
+            ray.get(self.workers[0].execute.remote(find_free_port)))

-        # Get the rank 0 address for DDP connection.
-        self.ddp_address = ray.get(
-            self.workers[0].execute.remote(setup_address))
+        # Set environment variables for remote workers.
+        keys = [
+            "PL_GLOBAL_SEED", "PL_TORCH_DISTRIBUTED_BACKEND", "MASTER_ADDR",
+            "MASTER_PORT"
+        ]
+        values = [os.getenv(k) for k in keys]
+        ray.get([w.set_env_vars.remote(keys, values) for w in self.workers])

         self.global_to_local = self.get_local_ranks()

@@ -235,14 +246,15 @@ def init_ddp_connection(self,
                             world_size: int,
                             is_slurm_managing_tasks: bool = False) -> None:
         """Process group creation to be executed on each remote worker."""
-        torch_backend = "nccl" if self.use_gpu else "gloo"
+        torch_backend = os.getenv("PL_TORCH_DISTRIBUTED_BACKEND")
+        if torch_backend is None:
+            torch_backend = "nccl" if self.use_gpu else "gloo"

         if not torch.distributed.is_initialized():
             log.info(f"initializing ddp: GLOBAL_RANK: {global_rank}, MEMBER:"
                      f" {global_rank + 1}/{world_size}")
             torch.distributed.init_process_group(
                 backend=torch_backend,
-                init_method=self.ddp_address,
                 rank=global_rank,
                 world_size=world_size,
             )
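For context, the sketch below (illustrative only, not part of this diff; the `_example_worker` helper is hypothetical) shows the rendezvous pattern the change relies on: when `init_process_group` is called without an explicit `init_method`, PyTorch falls back to the `env://` rendezvous and reads `MASTER_ADDR`/`MASTER_PORT` from the environment, which is why propagating those variables to every worker replaces the old `setup_address`-based `init_method`, and why `PL_TORCH_DISTRIBUTED_BACKEND` lets a user override the backend.

```python
import os

import torch
import torch.distributed as dist


def _example_worker(global_rank: int, world_size: int) -> None:
    """Hypothetical worker-side setup, for illustration only."""
    # Assumes MASTER_ADDR and MASTER_PORT were already propagated into this
    # process's environment (e.g. by the set_env_vars() calls above).
    backend = os.getenv("PL_TORCH_DISTRIBUTED_BACKEND")
    if backend is None:
        backend = "nccl" if torch.cuda.is_available() else "gloo"
    # With no init_method argument, torch.distributed falls back to the
    # env:// rendezvous and reads MASTER_ADDR / MASTER_PORT from the
    # environment, so no explicit address string is needed.
    dist.init_process_group(
        backend=backend,
        rank=global_rank,
        world_size=world_size,
    )
```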