This repository was archived by the owner on Nov 3, 2023. It is now read-only.

Commit afa2389

Fairscale Integration (#42)

* wip
* pass in ddp_kwargs
* compat
* format
* fixes
* updates
* example
* readme
* remove hvd.init
* fix merge
* upgrade flake8

1 parent: 0a47405

11 files changed: +425 −35 lines

.github/workflows/test.yaml

Lines changed: 3 additions & 1 deletion

```diff
@@ -16,7 +16,7 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         python -m pip install codecov
-        python -m pip install -U yapf==0.23.0 flake8==3.7.7 flake8-comprehensions flake8-quotes==2.0.0
+        python -m pip install -U yapf==0.23.0 flake8==3.9.1 flake8-comprehensions flake8-quotes
     - name: Run format script
       run: |
         ./format.sh --all
@@ -46,6 +46,7 @@ jobs:
         python -m pytest -v --durations=0 -x test_ddp.py
         python -m pytest -v --durations=0 -x test_horovod.py
         python -m pytest -v --durations=0 -x test_tune.py
+        python -m pytest -v --durations=0 -x test_ddp_sharded.py
 
   test_linux_ray_master_examples:
     runs-on: ubuntu-latest
@@ -102,6 +103,7 @@ jobs:
         python -m pytest -v --durations=0 -x test_ddp.py
         python -m pytest -v --durations=0 -x test_horovod.py
         python -m pytest -v --durations=0 -x test_tune.py
+        python -m pytest -v --durations=0 -x test_ddp_sharded.py
 
 
   test_linux_ray_release_examples:
```

README.md

Lines changed: 21 additions & 1 deletion

````diff
@@ -39,7 +39,7 @@ Because Ray is used to launch processes, instead of the same script being called
 Or if you prefer to use Horovod as the distributed training protocol, use the `HorovodRayPlugin` instead.
 
 ```python
-import pytorch_lightning as ptl
+import pytorch_lightning as pl
 from ray_lightning import HorovodRayPlugin
 
 # Create your PyTorch Lightning model here.
@@ -54,12 +54,32 @@ trainer = pl.Trainer(..., gpus=1, plugins=[plugin])
 trainer.fit(ptl_model)
 ```
 
+## Model Parallel Sharded Training on Ray
+The `RayShardedPlugin` integrates with [FairScale](https://github.com/facebookresearch/fairscale) to provide sharded DDP training on a Ray cluster.
+With sharded training, leverage the scalability of data parallel training while drastically reducing memory usage when training large models.
+
+```python
+import pytorch_lightning as pl
+from ray_lightning import RayShardedPlugin
+
+# Create your PyTorch Lightning model here.
+ptl_model = MNISTClassifier(...)
+plugin = RayShardedPlugin(num_workers=4, cpus_per_worker=1, use_gpu=True)
+
+# If using GPUs, set the ``gpus`` arg to a value > 0.
+# The actual number of GPUs is determined by ``num_workers``.
+trainer = pl.Trainer(..., gpus=1, plugins=[plugin])
+trainer.fit(ptl_model)
+```
+See the [Pytorch Lightning docs](https://pytorch-lightning.readthedocs.io/en/stable/advanced/multi_gpu.html#sharded-training) for more information on sharded training.
+
 ## Multi-node Distributed Training
 Using the same examples above, you can run distributed training on a multi-node cluster with just 2 simple steps.
 1) [Use Ray's cluster launcher](https://docs.ray.io/en/master/cluster/launcher.html) to start a Ray cluster- `ray up my_cluster_config.yaml`.
 2) [Execute your Python script on the Ray cluster](https://docs.ray.io/en/master/cluster/commands.html#running-ray-scripts-on-the-cluster-ray-submit)- `ray submit my_cluster_config.yaml train.py`. This will `rsync` your training script to the head node, and execute it on the Ray cluster.
 
 You no longer have to set environment variables or configurations and run your training script on every single node.
+
 ## Hyperparameter Tuning with Ray Tune
 `ray_lightning` also integrates with Ray Tune to provide distributed hyperparameter tuning for your distributed model training. You can run multiple PyTorch Lightning training runs in parallel, each with a different hyperparameter configuration, and each training run parallelized by itself. All you have to do is move your training code to a function, pass the function to tune.run, and make sure to add the appropriate callback (Either `TuneReportCallback` or `TuneReportCheckpointCallback`) to your PyTorch Lightning Trainer.
 
````
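For reference, a minimal sketch of the Tune workflow described in the last README paragraph above. It assumes the `TuneReportCallback` from `ray.tune.integration.pytorch_lightning`, reuses the placeholder `MNISTClassifier` model and a hypothetical `ptl/val_loss` metric name from the README snippets, and omits per-trial resource reservation for the Ray worker actors; it is shown with `RayPlugin`, and `RayShardedPlugin` would drop in the same way.

```python
from ray import tune
from ray.tune.integration.pytorch_lightning import TuneReportCallback
import pytorch_lightning as pl
from ray_lightning import RayPlugin


def train_mnist(config):
    # `MNISTClassifier` is the same placeholder model used in the README
    # snippets; "ptl/val_loss" stands in for whatever metric it logs.
    model = MNISTClassifier(config)
    trainer = pl.Trainer(
        max_epochs=4,
        callbacks=[
            TuneReportCallback({"loss": "ptl/val_loss"}, on="validation_end")
        ],
        plugins=[RayPlugin(num_workers=2, use_gpu=False)])
    trainer.fit(model)


# Each Tune trial runs `train_mnist` with a sampled config; the callback
# reports the logged metric back to Tune at the end of every validation epoch.
analysis = tune.run(
    train_mnist,
    metric="loss",
    mode="min",
    config={"lr": tune.loguniform(1e-4, 1e-1)},
    num_samples=4)
print(analysis.best_config)
```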
Lines changed: 134 additions & 0 deletions

```python
import os
import tempfile
import time

import ray
import torch
from pl_bolts.datamodules import MNISTDataModule
from pl_bolts.models.vision import ImageGPT

import pytorch_lightning as pl
from pytorch_lightning import Callback

from ray_lightning import RayShardedPlugin


class CUDACallback(Callback):
    def on_train_epoch_start(self, trainer, pl_module):
        # Reset the memory use counter
        torch.cuda.reset_peak_memory_stats(trainer.root_gpu)
        torch.cuda.synchronize(trainer.root_gpu)
        self.start_time = time.time()

    def on_train_epoch_end(self, trainer, pl_module, outputs):
        torch.cuda.synchronize(trainer.root_gpu)
        max_memory = torch.cuda.max_memory_allocated(trainer.root_gpu) / 2**20
        epoch_time = time.time() - self.start_time

        max_memory = torch.tensor(
            max_memory, dtype=torch.int, device=trainer.root_gpu)
        epoch_time = torch.tensor(
            epoch_time, dtype=torch.int, device=trainer.root_gpu)

        torch.distributed.all_reduce(
            max_memory, op=torch.distributed.ReduceOp.SUM)
        torch.distributed.all_reduce(
            epoch_time, op=torch.distributed.ReduceOp.SUM)

        world_size = torch.distributed.get_world_size()

        print(
            f"Average Epoch time: {epoch_time.item() / float(world_size):.2f} "
            f"seconds")
        print(
            f"Average Peak memory {max_memory.item() / float(world_size):.2f}"
            f"MiB")


def train(data_dir, num_workers, use_gpu, batch_size, embed_dim, max_epochs,
          max_steps):
    # Make sure data is downloaded on all nodes.
    def download_data():
        from filelock import FileLock
        with FileLock(os.path.join(data_dir, ".lock")):
            MNISTDataModule(data_dir=data_dir).prepare_data()

    plugin = RayShardedPlugin(
        num_workers=num_workers, use_gpu=use_gpu, init_hook=download_data)

    dm = MNISTDataModule(data_dir, batch_size=batch_size)

    model = ImageGPT(
        embed_dim=embed_dim, layers=16, heads=4, vocab_size=32, num_pixels=28)

    trainer = pl.Trainer(
        max_epochs=max_epochs,
        gpus=int(use_gpu),
        precision=16 if use_gpu else 32,
        callbacks=[CUDACallback()] if use_gpu else [],
        plugins=plugin,
        max_steps=max_steps)

    trainer.fit(model, dm)


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--num-workers",
        type=int,
        help="Number of training workers to use.",
        default=1)
    parser.add_argument(
        "--use-gpu", action="store_true", help="Use GPU for training.")
    parser.add_argument(
        "--num-epochs",
        type=int,
        default=10,
        help="Number of epochs to train for.")
    parser.add_argument(
        "--batch-size",
        type=int,
        default=4,
        help="Batch size to use for training.")
    parser.add_argument(
        "--embed-dim",
        type=int,
        default=2048,
        help="Number of embedding dimensions for ImageGPT model.")
    parser.add_argument(
        "--smoke-test", action="store_true", help="Finish quickly for testing")
    parser.add_argument(
        "--address",
        required=False,
        type=str,
        help="the address to use for Ray")
    args, _ = parser.parse_known_args()

    if args.smoke_test:
        ray.init(num_cpus=2)
    else:
        ray.init(address=args.address)

    data_dir = os.path.join(tempfile.gettempdir(), "mnist_data_")

    if args.smoke_test:
        train(
            data_dir=data_dir,
            num_workers=2,
            use_gpu=False,
            batch_size=32,
            embed_dim=16,
            max_epochs=1,
            max_steps=1)
    else:
        train(
            data_dir=data_dir,
            num_workers=args.num_workers,
            use_gpu=args.use_gpu,
            batch_size=args.batch_size,
            embed_dim=args.embed_dim,
            max_epochs=args.num_epochs,
            max_steps=None)
```

ray_lightning/__init__.py

Lines changed: 2 additions & 1 deletion

```diff
@@ -1,4 +1,5 @@
 from ray_lightning.ray_ddp import RayPlugin
 from ray_lightning.ray_horovod import HorovodRayPlugin
+from ray_lightning.ray_ddp_sharded import RayShardedPlugin
 
-__all__ = ["RayPlugin", "HorovodRayPlugin"]
+__all__ = ["RayPlugin", "HorovodRayPlugin", "RayShardedPlugin"]
```

ray_lightning/ray_ddp.py

Lines changed: 39 additions & 18 deletions

```diff
@@ -146,15 +146,7 @@ def get_local_ranks(self) -> Dict[int, int]:
             rank_counter_dict[ip] += 1
         return global_to_local
 
-    def start_training(self, trainer):
-        """Main training loop.
-
-        Sets up the torch.distributed process group for each training
-        worker. Then trigger remote training via ``train_remote`` on each
-        worker. If using with Ray Tune, create a communication queue to
-        revieve intermediate results, and process those results. Finally
-        retrieve the training results from the rank 0 worker and return."""
-
+    def _setup_env_vars(self):
         # Get rank 0 worker address and port for DDP connection.
         os.environ["MASTER_ADDR"] = ray.get(
             self.workers[0].get_node_ip.remote())
@@ -169,6 +161,19 @@ def start_training(self, trainer):
         values = [os.getenv(k) for k in keys]
         ray.get([w.set_env_vars.remote(keys, values) for w in self.workers])
 
+    def execution_loop(self, trainer, tune_enabled: bool = True):
+        """Main execution loop for training, testing, & prediction.
+
+        Sets up the torch.distributed process group for each
+        worker. Then trigger remote training/testing/eval via
+        ``train_remote`` on each worker. If using with Ray Tune, create a
+        communication queue to retrieve intermediate results, and process
+        those results. Finally retrieve the training results from the rank 0
+        worker and return."""
+
+        # Sets environment variables for all workers.
+        self._setup_env_vars()
+
         self.global_to_local = self.get_local_ranks()
 
         model = self._model
@@ -177,12 +182,12 @@ def start_training(self, trainer):
         self._model = None
 
         queue = None
-        if TUNE_INSTALLED and is_session_enabled():
+        if tune_enabled and TUNE_INSTALLED and is_session_enabled():
             # Create communication queue and send to all the workers.
             queue = Queue(actor_options={"num_cpus": 0})
 
         futures = [
-            self.workers[i].execute.remote(self.train_remote, model_ref, i,
+            self.workers[i].execute.remote(self.execute_remote, model_ref, i,
                                            queue)
             for i in range(self.num_workers)
         ]
@@ -195,7 +200,7 @@ def start_training(self, trainer):
         self._model = model
         self._model.load_state_dict(state_dict)
         if self.lightning_module.trainer.checkpoint_callback:
-            self.lightning_module.trainer.checkpoint_callback\
+            self.lightning_module.trainer.checkpoint_callback \
                 .best_model_path = best_path
 
         if queue:
@@ -204,6 +209,21 @@ def start_training(self, trainer):
 
         return results
 
+    def start_training(self, trainer):
+        results = self.execution_loop(trainer, tune_enabled=True)
+        # reset optimizers, since main process is never used for training and
+        # thus does not have a valid optim state.
+        trainer.optimizers = []
+        return results
+
+    def start_testing(self, trainer):
+        results = self.execution_loop(trainer, tune_enabled=False)
+        return results
+
+    def start_predicting(self, trainer):
+        results = self.execution_loop(trainer, tune_enabled=False)
+        return results
+
     def post_dispatch(self):
         """Shutdown the DDP process group and all the Ray actors. """
 
@@ -220,18 +240,19 @@ def shutdown_remote():
 
     # All methods below are only executed in remote Ray workers.
 
-    def train_remote(self,
-                     model: LightningModule,
-                     global_rank: int,
-                     queue: Queue = None):
-        """Training function to be executed on each remote worker."""
+    def execute_remote(self,
+                       model: LightningModule,
+                       global_rank: int,
+                       queue: Queue = None):
+        """Train/test/eval function to be executed on each remote worker."""
         assert isinstance(self, RayPlugin)
         # This method should be executed remotely in each worker.
         self._model = model
         self.lightning_module.trainer.accelerator_connector\
             ._training_type_plugin = self
         self.lightning_module.trainer.accelerator.training_type_plugin = self
         self.cluster_environment.set_global_rank(global_rank)
+        self.cluster_environment.set_remote_execution(True)
 
         if queue is not None:
             # Initialize session.
@@ -272,7 +293,7 @@ def init_ddp_connection(self,
     def set_world_ranks(self, process_idx: int = 0):
         """Set the appropriate rank attribues for the trainer."""
         assert self.cluster_environment is not None
-        if self.global_rank is not None:
+        if self.cluster_environment.is_remote():
             self._local_rank = self.global_to_local[self.global_rank]
             self.cluster_environment.set_global_rank(self.global_rank)
         self.cluster_environment.set_world_size(self.num_workers)
```
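The `execute_remote` and `set_world_ranks` changes above rely on two cluster-environment methods, `set_remote_execution()` and `is_remote()`, whose implementation lives in a file not shown in this diff. A minimal sketch of what that flag amounts to, using a hypothetical standalone class rather than the package's actual Ray cluster environment:

```python
class RemoteExecutionFlag:
    """Hypothetical stand-in for the Ray cluster environment's remote flag."""

    def __init__(self):
        self._remote = False

    def set_remote_execution(self, remote: bool) -> None:
        # Called from ``execute_remote`` on each Ray worker, so the plugin
        # can tell driver-side setup apart from worker-side setup.
        self._remote = remote

    def is_remote(self) -> bool:
        # Checked in ``set_world_ranks``: local and global ranks are only
        # resolved once the code is actually running on a remote worker.
        return self._remote


env = RemoteExecutionFlag()
assert not env.is_remote()    # on the driver process
env.set_remote_execution(True)
assert env.is_remote()        # on a worker, after execute_remote runs
```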

ray_lightning/ray_ddp_sharded.py

Lines changed: 66 additions & 0 deletions

```python
from typing import Optional

import torch
from torch.optim import Optimizer

from pytorch_lightning import LightningModule
from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, rank_zero_only

from ray_lightning import RayPlugin

if _FAIRSCALE_AVAILABLE:
    from fairscale.nn.data_parallel.sharded_ddp import ShardedDataParallel
    from fairscale.optim import OSS

    from pytorch_lightning.overrides.fairscale import \
        LightningShardedDataParallel, unwrap_lightning_module_sharded


class RayShardedPlugin(RayPlugin):
    def configure_ddp(self):
        self._wrap_optimizers()
        self._model = ShardedDataParallel(
            LightningShardedDataParallel(self.model),
            sharded_optimizer=self.lightning_module.trainer.optimizers)
        setattr(self._model, "require_backward_grad_sync", False)

    def _reinit_optimizers_with_oss(self):
        optimizers = self.lightning_module.trainer.optimizers
        for x, optimizer in enumerate(optimizers):
            if not isinstance(optimizer, OSS):
                optim_class = type(optimizer)
                zero_optimizer = OSS(
                    params=optimizer.param_groups,
                    optim=optim_class,
                    **optimizer.defaults)
                optimizers[x] = zero_optimizer
                del optimizer
        trainer = self.lightning_module.trainer
        trainer.optimizers = optimizers

    def _wrap_optimizers(self):
        trainer = self.model.trainer
        if trainer.testing:
            return
        self._reinit_optimizers_with_oss()

    def optimizer_state(self, optimizer: "OSS") -> Optional[dict]:
        if isinstance(optimizer, OSS):
            optimizer.consolidate_state_dict()
        return self._optim_state_dict(optimizer)

    @rank_zero_only
    def _optim_state_dict(self, optimizer):
        """Retrieves state dict only on rank 0."""
        return optimizer.state_dict()

    @property
    def lightning_module(self) -> LightningModule:
        return unwrap_lightning_module_sharded(self._model)

    def pre_backward(self, closure_loss: torch.Tensor, should_accumulate: bool,
                     optimizer: Optimizer, opt_idx: int):
        pass

    def post_training_step(self):
        pass
```
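To make the `_reinit_optimizers_with_oss` rewrap above concrete, here is a standalone sketch of the same pattern, assuming FairScale is installed and using a throwaway single-process gloo group so `OSS` can query the process group; the real plugin performs this inside the DDP process group already set up on the Ray workers.

```python
import os

import torch
import torch.distributed as dist
from fairscale.optim import OSS

# Single-process process group so OSS can resolve world size and rank.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("gloo", rank=0, world_size=1)

model = torch.nn.Linear(8, 2)
base_optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)

# The same rewrap the plugin performs: replace the plain optimizer with an
# OSS optimizer that keeps the original optimizer class and defaults but
# shards the optimizer state across ranks.
sharded_optimizer = OSS(
    params=base_optimizer.param_groups,
    optim=type(base_optimizer),
    **base_optimizer.defaults)

loss = model(torch.randn(4, 8)).sum()
loss.backward()
sharded_optimizer.step()

dist.destroy_process_group()
```

Keeping the original optimizer's class and defaults means every rank applies the same update rule, while each rank only holds the optimizer state for its own shard of the parameters.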
