Skip to content
This repository was archived by the owner on Nov 3, 2023. It is now read-only.

Commit 41e3491

Browse files
authored
Support PTL 1.4 (#58)
* support 1.3.8
* increase timeout
* upgrade
* fix failing test
* split test suite
* fix
* bump up timeouts more
1 parent 4300391 commit 41e3491

File tree

6 files changed

+74
-79
lines changed

6 files changed

+74
-79
lines changed

.github/workflows/test.yaml

Lines changed: 57 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,10 @@ jobs:
2020
- name: Run format script
2121
run: |
2222
./format.sh --all
23-
test_linux_ray_master:
23+
24+
test_linux_ray_master_1:
2425
runs-on: ubuntu-latest
25-
timeout-minutes: 25
26+
timeout-minutes: 40
2627
steps:
2728
- uses: actions/checkout@v2
2829
- name: Set up Python 3.7
@@ -44,13 +45,37 @@ jobs:
4445
run: |
4546
pushd ray_lightning/tests
4647
python -m pytest -v --durations=0 -x test_ddp.py
48+
python -m pytest -v --durations=0 -x test_ddp_sharded.py
49+
50+
test_linux_ray_master_2:
51+
runs-on: ubuntu-latest
52+
timeout-minutes: 40
53+
steps:
54+
- uses: actions/checkout@v2
55+
- name: Set up Python 3.7
56+
uses: actions/setup-python@v2
57+
with:
58+
python-version: 3.7
59+
- name: Install dependencies
60+
run: |
61+
python -m pip install --upgrade pip
62+
python -m pip install --upgrade setuptools
63+
python -m pip install codecov
64+
python -m pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
65+
if [ -f requirements-test.txt ]; then python -m pip install -r requirements-test.txt; fi
66+
HOROVOD_WITH_GLOO=1 HOROVOD_WITHOUT_MPI=1 HOROVOD_WITHOUT_MXNET=1 pip install git+https://github.com/horovod/horovod.git
67+
- name: Install package
68+
run: |
69+
python -m pip install -e .
70+
- name: Test with Pytest
71+
run: |
72+
pushd ray_lightning/tests
4773
python -m pytest -v --durations=0 -x test_horovod.py
4874
python -m pytest -v --durations=0 -x test_tune.py
49-
python -m pytest -v --durations=0 -x test_ddp_sharded.py
5075
5176
test_linux_ray_master_examples:
5277
runs-on: ubuntu-latest
53-
timeout-minutes: 25
78+
timeout-minutes: 40
5479
steps:
5580
- uses: actions/checkout@v2
5681
- name: Set up Python 3.7
@@ -83,9 +108,9 @@ jobs:
83108
echo "running examples with Ray Client 3" && python -m pytest -v --durations=0 -x test_client_3.py
84109
85110
86-
test_linux_ray_release:
111+
test_linux_ray_release_1:
87112
runs-on: ubuntu-latest
88-
timeout-minutes: 25
113+
timeout-minutes: 40
89114
steps:
90115
- uses: actions/checkout@v2
91116
- name: Set up Python 3.7
@@ -107,14 +132,38 @@ jobs:
107132
run: |
108133
pushd ray_lightning/tests
109134
python -m pytest -v --durations=0 -x test_ddp.py
135+
python -m pytest -v --durations=0 -x test_ddp_sharded.py
136+
137+
test_linux_ray_release_2:
138+
runs-on: ubuntu-latest
139+
timeout-minutes: 40
140+
steps:
141+
- uses: actions/checkout@v2
142+
- name: Set up Python 3.7
143+
uses: actions/setup-python@v2
144+
with:
145+
python-version: 3.7
146+
- name: Install dependencies
147+
run: |
148+
python -m pip install --upgrade pip
149+
python -m pip install --upgrade setuptools
150+
python -m pip install codecov
151+
python -m pip install -U ray
152+
if [ -f requirements-test.txt ]; then python -m pip install -r requirements-test.txt; fi
153+
HOROVOD_WITH_GLOO=1 HOROVOD_WITHOUT_MPI=1 HOROVOD_WITHOUT_MXNET=1 pip install -U git+https://github.com/horovod/horovod.git
154+
- name: Install package
155+
run: |
156+
python -m pip install -e .
157+
- name: Test with Pytest
158+
run: |
159+
pushd ray_lightning/tests
110160
python -m pytest -v --durations=0 -x test_horovod.py
111161
python -m pytest -v --durations=0 -x test_tune.py
112-
python -m pytest -v --durations=0 -x test_ddp_sharded.py
113162
114163
115164
test_linux_ray_release_examples:
116165
runs-on: ubuntu-latest
117-
timeout-minutes: 25
166+
timeout-minutes: 40
118167
steps:
119168
- uses: actions/checkout@v2
120169
- name: Set up Python 3.7

ray_lightning/ray_ddp.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -216,7 +216,7 @@ def start_training(self, trainer):
216216
trainer.optimizers = []
217217
return results
218218

219-
def start_testing(self, trainer):
219+
def start_evaluating(self, trainer):
220220
results = self.execution_loop(trainer, tune_enabled=False)
221221
return results
222222

ray_lightning/ray_ddp_sharded.py

Lines changed: 3 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -1,66 +1,7 @@
1-
from typing import Optional
2-
3-
import torch
4-
from torch.optim import Optimizer
5-
6-
from pytorch_lightning import LightningModule
7-
from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, rank_zero_only
1+
from pytorch_lightning.plugins import DDPSpawnShardedPlugin
82

93
from ray_lightning import RayPlugin
104

11-
if _FAIRSCALE_AVAILABLE:
12-
from fairscale.nn.data_parallel.sharded_ddp import ShardedDataParallel
13-
from fairscale.optim import OSS
14-
15-
from pytorch_lightning.overrides.fairscale import \
16-
LightningShardedDataParallel, unwrap_lightning_module_sharded
17-
18-
19-
class RayShardedPlugin(RayPlugin):
20-
def configure_ddp(self):
21-
self._wrap_optimizers()
22-
self._model = ShardedDataParallel(
23-
LightningShardedDataParallel(self.model),
24-
sharded_optimizer=self.lightning_module.trainer.optimizers)
25-
setattr(self._model, "require_backward_grad_sync", False)
26-
27-
def _reinit_optimizers_with_oss(self):
28-
optimizers = self.lightning_module.trainer.optimizers
29-
for x, optimizer in enumerate(optimizers):
30-
if not isinstance(optimizer, OSS):
31-
optim_class = type(optimizer)
32-
zero_optimizer = OSS(
33-
params=optimizer.param_groups,
34-
optim=optim_class,
35-
**optimizer.defaults)
36-
optimizers[x] = zero_optimizer
37-
del optimizer
38-
trainer = self.lightning_module.trainer
39-
trainer.optimizers = optimizers
40-
41-
def _wrap_optimizers(self):
42-
trainer = self.model.trainer
43-
if trainer.testing:
44-
return
45-
self._reinit_optimizers_with_oss()
46-
47-
def optimizer_state(self, optimizer: "OSS") -> Optional[dict]:
48-
if isinstance(optimizer, OSS):
49-
optimizer.consolidate_state_dict()
50-
return self._optim_state_dict(optimizer)
51-
52-
@rank_zero_only
53-
def _optim_state_dict(self, optimizer):
54-
"""Retrieves state dict only on rank 0."""
55-
return optimizer.state_dict()
56-
57-
@property
58-
def lightning_module(self) -> LightningModule:
59-
return unwrap_lightning_module_sharded(self._model)
60-
61-
def pre_backward(self, closure_loss: torch.Tensor, should_accumulate: bool,
62-
optimizer: Optimizer, opt_idx: int):
63-
pass
645

65-
def post_training_step(self):
66-
pass
6+
class RayShardedPlugin(RayPlugin, DDPSpawnShardedPlugin):
7+
pass

ray_lightning/tests/test_ddp.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -154,19 +154,22 @@ def test_early_stop(tmpdir, ray_start_2_cpus):
154154
"""Tests if early stopping callback works correctly."""
155155
model = BoringModel()
156156
plugin = RayPlugin(num_workers=1, use_gpu=False)
157-
early_stop = EarlyStopping(monitor="val_loss", patience=2, verbose=True)
157+
patience = 2
158+
early_stop = EarlyStopping(
159+
monitor="val_loss", patience=patience, verbose=True)
158160
trainer = get_trainer(
159161
tmpdir,
160162
max_epochs=500,
161163
plugins=[plugin],
162164
callbacks=[early_stop],
165+
num_sanity_val_steps=0,
163166
limit_train_batches=1.0,
164167
limit_val_batches=1.0,
165168
progress_bar_refresh_rate=1)
166169
trainer.fit(model)
167170
trained_model = BoringModel.load_from_checkpoint(
168171
trainer.checkpoint_callback.best_model_path)
169-
assert trained_model.val_epoch == 2, trained_model.val_epoch
172+
assert trained_model.val_epoch == patience + 1, trained_model.val_epoch
170173

171174

172175
def test_unused_parameters(tmpdir, ray_start_2_cpus):

ray_lightning/tests/utils.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -153,30 +153,32 @@ def get_trainer(dir,
153153
limit_val_batches: int = 10,
154154
progress_bar_refresh_rate: int = 0,
155155
callbacks: Optional[List[Callback]] = None,
156-
checkpoint_callback: bool = True) -> Trainer:
156+
checkpoint_callback: bool = True,
157+
**trainer_kwargs) -> Trainer:
157158
"""Returns a Pytorch Lightning Trainer with the provided arguments."""
158159
callbacks = [] if not callbacks else callbacks
159160
trainer = pl.Trainer(
160161
default_root_dir=dir,
161162
gpus=1 if use_gpu else 0,
163+
callbacks=callbacks,
164+
plugins=plugins,
162165
max_epochs=max_epochs,
163166
limit_train_batches=limit_train_batches,
164167
limit_val_batches=limit_val_batches,
165168
progress_bar_refresh_rate=progress_bar_refresh_rate,
166169
checkpoint_callback=checkpoint_callback,
167-
callbacks=callbacks,
168-
plugins=plugins)
170+
**trainer_kwargs)
169171
return trainer
170172

171173

172174
def train_test(trainer: Trainer, model: LightningModule):
173175
"""Checks if training the provided model updates its weights."""
174176
initial_values = torch.tensor(
175177
[torch.sum(torch.abs(x)) for x in model.parameters()])
176-
result = trainer.fit(model)
178+
trainer.fit(model)
177179
post_train_values = torch.tensor(
178180
[torch.sum(torch.abs(x)) for x in model.parameters()])
179-
assert result == 1, "trainer failed"
181+
assert trainer.state.finished, f"Trainer failed with {trainer.state}"
180182
# Check that the model is actually changed post-training.
181183
assert torch.norm(initial_values - post_train_values) > 0.1
182184

requirements-test.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ flake8-comprehensions
44
flake8-quotes
55
yapf==0.23.0
66
pytest
7-
pytorch-lightning==1.2.10
7+
pytorch-lightning==1.4.1
88
lightning-bolts==0.3.3
99
ray[tune]
1010
torch==1.8.1

0 commit comments

Comments (0)