This repository was archived by the owner on Nov 3, 2023. It is now read-only.

Commit 4dcbdf0

Custom Accelerator so driver doesn't require GPU (#67)
* wip
* wip
* support 1.3.8
* increase timeout
* upgrade
* fix failing test
* wip
* update
* add comment
* fix
* remove server address
1 parent 41e3491 commit 4dcbdf0

File tree: 8 files changed (+71, −115 lines)


README.md

Lines changed: 6 additions & 6 deletions
@@ -25,9 +25,9 @@ from ray_lightning import RayPlugin
 ptl_model = MNISTClassifier(...)
 plugin = RayPlugin(num_workers=4, cpus_per_worker=1, use_gpu=True)
 
-# If using GPUs, set the ``gpus`` arg to a value > 0.
+# Don't set ``gpus`` in the ``Trainer``.
 # The actual number of GPUs is determined by ``num_workers``.
-trainer = pl.Trainer(..., gpus=1, plugins=[plugin])
+trainer = pl.Trainer(..., plugins=[plugin])
 trainer.fit(ptl_model)
 ```
 
@@ -48,9 +48,9 @@ ptl_model = MNISTClassifier(...)
 # 2 nodes, 4 workers per node, each using 1 CPU and 1 GPU.
 plugin = HorovodRayPlugin(num_hosts=2, num_slots=4, use_gpu=True)
 
-# If using GPUs, set the ``gpus`` arg to a value > 0.
+# Don't set ``gpus`` in the ``Trainer``.
 # The actual number of GPUs is determined by ``num_slots``.
-trainer = pl.Trainer(..., gpus=1, plugins=[plugin])
+trainer = pl.Trainer(..., plugins=[plugin])
 trainer.fit(ptl_model)
 ```
 
@@ -66,9 +66,9 @@ from ray_lightning import RayShardedPlugin
 ptl_model = MNISTClassifier(...)
 plugin = RayShardedPlugin(num_workers=4, cpus_per_worker=1, use_gpu=True)
 
-# If using GPUs, set the ``gpus`` arg to a value > 0.
+# Don't set ``gpus`` in the ``Trainer``.
 # The actual number of GPUs is determined by ``num_workers``.
-trainer = pl.Trainer(..., gpus=1, plugins=[plugin])
+trainer = pl.Trainer(..., plugins=[plugin])
 trainer.fit(ptl_model)
 ```
 
 See the [Pytorch Lightning docs](https://pytorch-lightning.readthedocs.io/en/stable/advanced/multi_gpu.html#sharded-training) for more information on sharded training.
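Taken together, the README change means the ``Trainer`` no longer takes a ``gpus`` argument at all; the plugin is the single source of truth for resources. Below is a self-contained sketch of the updated usage, with a toy ``LightningModule`` standing in for the README's ``MNISTClassifier`` (the model, data, and worker counts here are illustrative assumptions, not part of the commit):

```python
import torch
import pytorch_lightning as pl
from torch.utils.data import DataLoader, TensorDataset

from ray_lightning import RayPlugin


class ToyModel(pl.LightningModule):
    """Minimal stand-in for the README's MNISTClassifier."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 1)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return torch.nn.functional.mse_loss(self.layer(x), y)

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)


if __name__ == "__main__":
    dataset = TensorDataset(torch.randn(64, 32), torch.randn(64, 1))
    train_loader = DataLoader(dataset, batch_size=8)

    # The plugin decides where training runs; with this commit the driver
    # process can be a CPU-only machine even when use_gpu=True.
    plugin = RayPlugin(num_workers=2, cpus_per_worker=1, use_gpu=False)

    # Note: no ``gpus`` argument on the Trainer anymore.
    trainer = pl.Trainer(max_epochs=1, plugins=[plugin])
    trainer.fit(ToyModel(), train_loader)
```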

ray_lightning/examples/ray_ddp_example.py

Lines changed: 0 additions & 9 deletions
@@ -72,7 +72,6 @@ def train_mnist(config,
 
     trainer = pl.Trainer(
         max_epochs=num_epochs,
-        gpus=int(use_gpu),
         callbacks=callbacks,
         plugins=[RayPlugin(num_workers=num_workers, use_gpu=use_gpu)],
         **trainer_kwargs)
@@ -111,7 +110,6 @@ def tune_mnist(data_dir,
         num_samples=num_samples,
         resources_per_trial={
             "cpu": 1,
-            "gpu": int(use_gpu),
             "extra_cpu": num_workers,
             "extra_gpu": num_workers * int(use_gpu)
         },
@@ -152,11 +150,6 @@ def tune_mnist(data_dir,
         required=False,
         type=str,
         help="the address to use for Ray")
-    parser.add_argument(
-        "--server-address",
-        required=False,
-        type=str,
-        help="If using Ray Client, the address of the server to connect to. ")
     args, _ = parser.parse_known_args()
 
     num_epochs = 1 if args.smoke_test else args.num_epochs
@@ -166,8 +159,6 @@ def tune_mnist(data_dir,
 
     if args.smoke_test:
         ray.init(num_cpus=2)
-    elif args.server_address:
-        ray.util.connect(args.server_address)
     else:
         ray.init(address=args.address)
 

ray_lightning/examples/ray_ddp_sharded_example.py

Lines changed: 0 additions & 1 deletion
@@ -63,7 +63,6 @@ def download_data():
 
     trainer = pl.Trainer(
         max_epochs=max_epochs,
-        gpus=int(use_gpu),
         precision=16 if use_gpu else 32,
         callbacks=[CUDACallback()] if use_gpu else [],
         plugins=plugin,

ray_lightning/examples/ray_ddp_tune.py

Lines changed: 0 additions & 9 deletions
@@ -30,7 +30,6 @@ def download_data():
 
     trainer = pl.Trainer(
         max_epochs=num_epochs,
-        gpus=int(use_gpu),
         callbacks=callbacks,
         progress_bar_refresh_rate=0,
         plugins=[
@@ -74,7 +73,6 @@ def tune_mnist(data_dir,
         num_samples=num_samples,
         resources_per_trial={
             "cpu": 1,
-            "gpu": int(use_gpu),
             "extra_cpu": num_workers,
             "extra_gpu": num_workers * int(use_gpu)
         },
@@ -111,11 +109,6 @@ def tune_mnist(data_dir,
         required=False,
         type=str,
         help="the address to use for Ray")
-    parser.add_argument(
-        "--server-address",
-        required=False,
-        type=str,
-        help="If using Ray Client, the address of the server to connect to. ")
     args, _ = parser.parse_known_args()
 
     num_epochs = 1 if args.smoke_test else args.num_epochs
@@ -125,8 +118,6 @@ def tune_mnist(data_dir,
 
     if args.smoke_test:
         ray.init(num_cpus=2)
-    elif args.server_address:
-        ray.util.connect(args.server_address)
     else:
         ray.init(address=args.address)
 

ray_lightning/examples/ray_horovod_example.py

Lines changed: 0 additions & 8 deletions
@@ -73,7 +73,6 @@ def train_mnist(config,
 
     trainer = pl.Trainer(
         max_epochs=num_epochs,
-        gpus=int(use_gpu),
         callbacks=callbacks,
         plugins=[
             HorovodRayPlugin(
@@ -174,11 +173,6 @@ def tune_mnist(data_dir,
         required=False,
         type=str,
         help="the address to use for Ray")
-    parser.add_argument(
-        "--server-address",
-        required=False,
-        type=str,
-        help="If using Ray Client, the address of the server to connect to. ")
     args, _ = parser.parse_known_args()
 
     num_epochs = 1 if args.smoke_test else args.num_epochs
@@ -189,8 +183,6 @@ def tune_mnist(data_dir,
 
     if args.smoke_test:
         ray.init(num_cpus=2)
-    elif args.server_address:
-        ray.util.connect(args.server_address)
     else:
         ray.init(address=args.address)
 

ray_lightning/ray_ddp.py

Lines changed: 44 additions & 5 deletions
@@ -1,17 +1,22 @@
+import io
 from typing import Callable, Dict, List, Union, Any
 
 import os
 from collections import defaultdict
 
-import ray
 import torch
+
+from pytorch_lightning.accelerators import CPUAccelerator
 from pytorch_lightning.plugins import DDPSpawnPlugin
 from pytorch_lightning import _logger as log, LightningModule
 from pytorch_lightning.utilities import rank_zero_only
+
+import ray
 from ray.util.sgd.utils import find_free_port
+from ray.util.queue import Queue
 
 from ray_lightning.session import init_session
-from ray_lightning.util import process_results, Queue
+from ray_lightning.util import process_results
 from ray_lightning.tune import TUNE_INSTALLED, is_session_enabled
 from ray_lightning.ray_environment import RayEnvironment
 
@@ -161,6 +166,15 @@ def _setup_env_vars(self):
         values = [os.getenv(k) for k in keys]
         ray.get([w.set_env_vars.remote(keys, values) for w in self.workers])
 
+    def _load_state_stream(self, state_stream):
+        _buffer = io.BytesIO(state_stream)
+        to_gpu = self.use_gpu and torch.cuda.is_available()
+        state_dict = torch.load(
+            _buffer,
+            map_location=("cpu" if not to_gpu
+                          else lambda storage, loc: storage.cuda()))
+        return state_dict
+
     def execution_loop(self, trainer, tune_enabled: bool = True):
         """Main execution loop for training, testing, & prediction.
 
@@ -194,7 +208,8 @@ def execution_loop(self, trainer, tune_enabled: bool = True):
 
         results = process_results(futures, queue)
         # Get the results, checkpoint path, and model weights from worker 0.
-        results, best_path, state_dict = results[0]
+        results, best_path, state_stream = results[0]
+        state_dict = self._load_state_stream(state_stream)
         # Set the state for PTL using the output from remote training.
         self._results = results
         self._model = model
@@ -209,6 +224,24 @@ def execution_loop(self, trainer, tune_enabled: bool = True):
 
         return results
 
+    def setup_environment(self) -> None:
+        # Swap out the accelerator if necessary.
+        # This is needed to support CPU head with GPU workers or Ray Client.
+        current_accelerator = self.lightning_module.trainer.accelerator
+        if self.use_gpu and isinstance(current_accelerator, CPUAccelerator):
+            from weakref import proxy
+            from ray_lightning.util import DelayedGPUAccelerator
+            precision_plugin = current_accelerator.precision_plugin
+            new_accelerator = DelayedGPUAccelerator(
+                precision_plugin=precision_plugin, training_type_plugin=self)
+            self.lightning_module.trainer.accelerator_connector\
+                ._training_type_plugin = \
+                proxy(new_accelerator.training_type_plugin)
+            self.lightning_module.trainer.accelerator_connector\
+                ._precision_plugin = proxy(new_accelerator.precision_plugin)
+            self.lightning_module.trainer.accelerator_connector.accelerator \
+                = new_accelerator
+
     def start_training(self, trainer):
         results = self.execution_loop(trainer, tune_enabled=True)
         # reset optimizers, since main process is never used for training and
@@ -268,7 +301,7 @@ def execute_remote(self,
             mp_queue=None)
         # Only need results from worker 0.
         if self.global_rank == 0:
-            return self.results, self.best_model_path, self.model_state_dict
+            return self.results, self.best_model_path, self.model_state_stream
         else:
             return None
 
@@ -307,12 +340,18 @@ def root_device(self):
         else:
             return torch.device("cpu")
 
+    def _to_state_stream(self, model_state_dict):
+        _buffer = io.BytesIO()
+        torch.save(model_state_dict, _buffer)
+        return _buffer.getvalue()
+
     def transfer_distrib_spawn_state_on_fit_end(self, results):
         """Sets the training output as attributes so it can be retrieved."""
         if self.global_rank == 0:
             # Save training results as attributes.
             self._results = results
-            self.model_state_dict = self.lightning_module.state_dict()
+            self.model_state_stream = \
+                self._to_state_stream(self.lightning_module.state_dict())
             best_model_path = None
             if self.lightning_module.trainer.checkpoint_callback is not None:
                 best_model_path = \
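The new `_to_state_stream` / `_load_state_stream` pair means worker 0 now ships the trained weights back to the driver as a serialized byte stream rather than a live `state_dict`, so the driver can materialize the tensors on CPU even when the workers trained on GPU. A minimal standalone sketch of that round trip (the `torch.nn.Linear` model and the free-standing function names are illustrative only, not part of the plugin):

```python
import io

import torch


def to_state_stream(model_state_dict):
    # Same idea as RayPlugin._to_state_stream: dump the weights to raw bytes.
    buffer = io.BytesIO()
    torch.save(model_state_dict, buffer)
    return buffer.getvalue()


def load_state_stream(state_stream, to_gpu=False):
    # Same idea as RayPlugin._load_state_stream: rebuild the state dict on
    # the receiving side, mapping to CPU unless a GPU is wanted and available.
    buffer = io.BytesIO(state_stream)
    use_cuda = to_gpu and torch.cuda.is_available()
    map_location = ((lambda storage, loc: storage.cuda())
                    if use_cuda else "cpu")
    return torch.load(buffer, map_location=map_location)


if __name__ == "__main__":
    model = torch.nn.Linear(4, 2)  # toy model for illustration
    stream = to_state_stream(model.state_dict())      # what the worker returns
    model.load_state_dict(load_state_stream(stream))  # what the driver does
```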

ray_lightning/ray_horovod.py

Lines changed: 4 additions & 2 deletions
@@ -1,12 +1,14 @@
-import ray
 import torch
 from pytorch_lightning import LightningModule
 from pytorch_lightning.plugins import HorovodPlugin
 from pytorch_lightning.utilities import rank_zero_only
+
+import ray
 from ray import ObjectRef
+from ray.util.queue import Queue
 
 from ray_lightning.session import init_session
-from ray_lightning.util import process_results, Queue, Unavailable
+from ray_lightning.util import process_results, Unavailable
 from ray_lightning.tune import TUNE_INSTALLED, is_session_enabled
 
 try:
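Both plugins now import `Queue` directly from `ray.util.queue` instead of the backported copy that `ray_lightning/util.py` carried for Ray < 1.2 (removed in the `util.py` diff below). A quick sketch of the built-in queue passing a value from a Ray task back to the driver, assuming a Ray version where `ray.util.queue.Queue` is available:

```python
import ray
from ray.util.queue import Queue  # built into Ray since the 1.2 release


@ray.remote
def worker(queue):
    # e.g. a training worker reporting an intermediate result to the driver
    queue.put("result-from-worker")


if __name__ == "__main__":
    ray.init(num_cpus=2)
    queue = Queue(maxsize=8)
    ray.get(worker.remote(queue))
    print(queue.get())  # -> "result-from-worker"
    ray.shutdown()
```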

ray_lightning/util.py

Lines changed: 17 additions & 75 deletions
@@ -1,88 +1,30 @@
-# Remove after Ray 1.2 release.
-import asyncio
-from typing import Optional, Dict, Callable
+from typing import Callable
 
-import ray
-from ray.util.queue import Queue as RayQueue, Empty, Full
-
-
-class Unavailable:
-    """No object should be instance of this class"""
-
-    def __init__(self, *args, **kwargs):
-        raise RuntimeError("This class should never be instantiated.")
-
-
-# Remove after Ray 1.2 release.
-if getattr(RayQueue, "shutdown", None) is not None:
-    from ray.util.queue import _QueueActor
-else:
-    # On Ray <v1.2, we have to create our own class so we can create it with
-    # custom resources.
-    class _QueueActor:
-        """A class with basic Queue functionality."""
+from pytorch_lightning.accelerators import GPUAccelerator
+from pytorch_lightning import Trainer, LightningModule
 
-        def __init__(self, maxsize):
-            self.maxsize = maxsize
-            self.queue = asyncio.Queue(self.maxsize)
-
-        def qsize(self):
-            return self.queue.qsize()
-
-        def empty(self):
-            return self.queue.empty()
-
-        def full(self):
-            return self.queue.full()
-
-        async def put(self, item, timeout=None):
-            try:
-                await asyncio.wait_for(self.queue.put(item), timeout)
-            except asyncio.TimeoutError:
-                raise Full
+import ray
 
-        async def get(self, timeout=None):
-            try:
-                return await asyncio.wait_for(self.queue.get(), timeout)
-            except asyncio.TimeoutError:
-                raise Empty
 
-        def put_nowait(self, item):
-            self.queue.put_nowait(item)
+class DelayedGPUAccelerator(GPUAccelerator):
+    """Same as GPUAccelerator, but doesn't do any CUDA setup.
 
-        def put_nowait_batch(self, items):
-            # If maxsize is 0, queue is unbounded, so no need to check size.
-            if self.maxsize > 0 and len(items) + self.qsize() > self.maxsize:
-                raise Full(f"Cannot add {len(items)} items to queue of size "
-                           f"{self.qsize()} and maxsize {self.maxsize}.")
-            for item in items:
-                self.queue.put_nowait(item)
+    This allows the driver script to be launched from CPU-only machines (
+    like the laptop) but have training still execute on GPU.
+    """
 
-        def get_nowait(self):
-            return self.queue.get_nowait()
+    def setup(self, trainer: Trainer, model: LightningModule) -> None:
+        return super(GPUAccelerator, self).setup(trainer, model)
 
-        def get_nowait_batch(self, num_items):
-            if num_items > self.qsize():
-                raise Empty(f"Cannot get {num_items} items from queue of size "
-                            f"{self.qsize()}.")
-            return [self.queue.get_nowait() for _ in range(num_items)]
+    def on_train_start(self) -> None:
+        super(DelayedGPUAccelerator, self).on_train_start()
 
 
-class Queue(RayQueue):
-    def __init__(self, maxsize: int = 0,
-                 actor_options: Optional[Dict] = None) -> None:
-        actor_options = actor_options or {}
-        self.maxsize = maxsize
-        self.actor = ray.remote(_QueueActor).options(**actor_options).remote(
-            self.maxsize)
+class Unavailable:
+    """No object should be instance of this class"""
 
-    def shutdown(self):
-        if getattr(RayQueue, "shutdown", None) is not None:
-            super(Queue, self).shutdown()
-        else:
-            if self.actor:
-                ray.kill(self.actor)
-                self.actor = None
+    def __init__(self, *args, **kwargs):
+        raise RuntimeError("This class should never be instantiated.")
 
 
 def _handle_queue(queue):
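The trick in `DelayedGPUAccelerator.setup` is the `super(GPUAccelerator, self)` call: it starts the method lookup above `GPUAccelerator` in the MRO, so the CUDA device setup that `GPUAccelerator.setup` normally performs is skipped on the driver, while `on_train_start` still defers to the regular `GPUAccelerator` behavior once training actually begins on the GPU workers. A toy illustration of that "skip the parent, call the grandparent" pattern, using stand-in classes rather than the real PyTorch Lightning ones:

```python
class Accelerator:
    def setup(self):
        print("generic setup, no CUDA calls")


class GPUAccelerator(Accelerator):
    def setup(self):
        print("would touch CUDA here, e.g. torch.cuda.set_device(...)")
        super().setup()


class DelayedGPUAccelerator(GPUAccelerator):
    def setup(self):
        # Skip GPUAccelerator.setup by starting the lookup one level higher,
        # mirroring super(GPUAccelerator, self).setup(trainer, model) above.
        return super(GPUAccelerator, self).setup()


if __name__ == "__main__":
    DelayedGPUAccelerator().setup()  # prints only "generic setup, no CUDA calls"
```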
