This repository was archived by the owner on Nov 3, 2023. It is now read-only.

Commit f2ffd82

Fix Tune GPU Checkpointing (#70)
* fix
* add gpu test
* update docs
1 parent c3b13a7 commit f2ffd82

File tree

4 files changed: +53 −23 lines

* ray_lightning/ray_ddp.py
* ray_lightning/tests/test_tune.py
* ray_lightning/tune.py
* ray_lightning/util.py

ray_lightning/ray_ddp.py

Lines changed: 4 additions & 18 deletions
@@ -1,4 +1,3 @@
-import io
 import socket
 from contextlib import closing
 from typing import Callable, Dict, List, Union, Any
@@ -17,7 +16,8 @@
 from ray.util.queue import Queue
 
 from ray_lightning.session import init_session
-from ray_lightning.util import process_results
+from ray_lightning.util import process_results, to_state_stream, \
+    load_state_stream
 from ray_lightning.tune import TUNE_INSTALLED, is_session_enabled
 from ray_lightning.ray_environment import RayEnvironment
 
@@ -174,15 +174,6 @@ def _setup_env_vars(self):
         values = [os.getenv(k) for k in keys]
         ray.get([w.set_env_vars.remote(keys, values) for w in self.workers])
 
-    def _load_state_stream(self, state_stream):
-        _buffer = io.BytesIO(state_stream)
-        to_gpu = self.use_gpu and torch.cuda.is_available()
-        state_dict = torch.load(
-            _buffer,
-            map_location=("cpu" if not to_gpu
-                          else lambda storage, loc: storage.cuda()))
-        return state_dict
-
     def execution_loop(self, trainer, tune_enabled: bool = True):
         """Main execution loop for training, testing, & prediction.
 
@@ -217,7 +208,7 @@ def execution_loop(self, trainer, tune_enabled: bool = True):
         results = process_results(futures, queue)
         # Get the results, checkpoint path, and model weights from worker 0.
         results, best_path, state_stream = results[0]
-        state_dict = self._load_state_stream(state_stream)
+        state_dict = load_state_stream(state_stream, to_gpu=self.use_gpu)
         # Set the state for PTL using the output from remote training.
         self._results = results
         self._model = model
@@ -348,18 +339,13 @@ def root_device(self):
         else:
             return torch.device("cpu")
 
-    def _to_state_stream(self, model_state_dict):
-        _buffer = io.BytesIO()
-        torch.save(model_state_dict, _buffer)
-        return _buffer.getvalue()
-
     def transfer_distrib_spawn_state_on_fit_end(self, results):
         """Sets the training output as attributes so it can be retrieved."""
         if self.global_rank == 0:
             # Save training results as attributes.
             self._results = results
             self.model_state_stream = \
-                self._to_state_stream(self.lightning_module.state_dict())
+                to_state_stream(self.lightning_module.state_dict())
             best_model_path = None
             if self.lightning_module.trainer.checkpoint_callback is not None:
                 best_model_path = \
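For orientation, here is a minimal sketch of how these plugin changes are exercised from user code. The ToyModel module below is purely illustrative and not part of the commit; RayPlugin(num_workers=..., use_gpu=...) and passing the plugin through Trainer(plugins=[...]) follow the project's documented usage, and use_gpu=True assumes GPUs are visible to Ray.

import torch
from torch import nn
from torch.utils.data import DataLoader
import pytorch_lightning as pl

from ray_lightning import RayPlugin


class ToyModel(pl.LightningModule):
    # Minimal LightningModule used only to drive the plugin.
    def __init__(self):
        super().__init__()
        self.layer = nn.Linear(32, 2)

    def forward(self, x):
        return self.layer(x)

    def training_step(self, batch, batch_idx):
        return self(batch).sum()

    def train_dataloader(self):
        return DataLoader(torch.randn(64, 32), batch_size=8)

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)


# After trainer.fit() returns, worker 0's weights come back as a byte
# stream and are restored on the driver via
# load_state_stream(state_stream, to_gpu=use_gpu).
plugin = RayPlugin(num_workers=2, use_gpu=True)
trainer = pl.Trainer(max_epochs=1, plugins=[plugin])
trainer.fit(ToyModel())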

ray_lightning/tests/test_tune.py

Lines changed: 17 additions & 0 deletions
@@ -2,6 +2,7 @@
 import pytest
 
 import ray
+import torch
 from ray import tune
 
 from ray_lightning import RayPlugin, HorovodRayPlugin
@@ -84,3 +85,19 @@ def test_checkpoint_horovod(tmpdir, ray_start_4_cpus):
     """Tests if Tune checkpointing works with HorovodRayAccelerator."""
     plugin = HorovodRayPlugin(num_hosts=1, num_slots=2, use_gpu=False)
     checkpoint_test(tmpdir, plugin)
+
+
+@pytest.mark.skipif(
+    torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
+def test_checkpoint_ddp_gpu(tmpdir, ray_start_4_cpus):
+    """Tests if Tune checkpointing works with RayAccelerator."""
+    plugin = RayPlugin(num_workers=2, use_gpu=False)
+    checkpoint_test(tmpdir, plugin)
+
+
+@pytest.mark.skipif(
+    torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
+def test_checkpoint_horovod_gpu(tmpdir, ray_start_4_cpus):
+    """Tests if Tune checkpointing works with HorovodRayAccelerator."""
+    plugin = HorovodRayPlugin(num_hosts=1, num_slots=2, use_gpu=False)
+    checkpoint_test(tmpdir, plugin)

ray_lightning/tune.py

Lines changed: 8 additions & 5 deletions
@@ -1,12 +1,12 @@
 from typing import Dict, List, Union
 
+import fsspec
 import os
 
-from pytorch_lightning.utilities.cloud_io import atomic_save
 from pytorch_lightning import Trainer, LightningModule
 
 from ray_lightning.session import put_queue, get_actor_rank
-from ray_lightning.util import Unavailable
+from ray_lightning.util import to_state_stream, Unavailable
 
 try:
     from ray import tune
@@ -143,20 +143,23 @@ def __init__(self,
         self._filename = filename
 
     @staticmethod
-    def _create_checkpoint(checkpoint_dict: dict, global_step: int,
+    def _create_checkpoint(checkpoint_stream, global_step: int,
                            filename: str):
         with tune.checkpoint_dir(step=global_step) as checkpoint_dir:
             file_path = os.path.join(checkpoint_dir, filename)
-            atomic_save(checkpoint_dict, file_path)
+            with fsspec.open(file_path, "wb") as f:
+                f.write(checkpoint_stream)
 
     def _handle(self, trainer: Trainer, pl_module: LightningModule):
         if trainer.running_sanity_check:
             return
         checkpoint_dict = trainer.checkpoint_connector.dump_checkpoint()
+        # Convert to a state stream first.
+        checkpoint_stream = to_state_stream(checkpoint_dict)
         global_step = trainer.global_step
         if get_actor_rank() == 0:
             put_queue(lambda: self._create_checkpoint(
-                checkpoint_dict, global_step, self._filename))
+                checkpoint_stream, global_step, self._filename))
 
 class TuneReportCheckpointCallback(TuneCallback):
     """PyTorch Lightning to Tune reporting and checkpointing callback.

ray_lightning/util.py

Lines changed: 24 additions & 0 deletions
@@ -1,5 +1,7 @@
+import io
 from typing import Callable
 
+import torch
 from pytorch_lightning.accelerators import GPUAccelerator
 from pytorch_lightning import Trainer, LightningModule
 
@@ -51,3 +53,25 @@ def process_results(training_result_futures, queue):
     # Process any remaining items in queue.
     _handle_queue(queue)
     return ray.get(training_result_futures)
+
+
+def to_state_stream(model_state_dict):
+    """Converts the given state dict to a stream of bytes."""
+    _buffer = io.BytesIO()
+    torch.save(model_state_dict, _buffer)
+    return _buffer.getvalue()
+
+
+def load_state_stream(state_stream, to_gpu):
+    """Converts the state stream to a state dict on the appropriate device.
+
+    Converts to GPU if ``to_gpu`` is True and CUDA is available.
+
+    """
+    _buffer = io.BytesIO(state_stream)
+    to_gpu = to_gpu and torch.cuda.is_available()
+    state_dict = torch.load(
+        _buffer,
+        map_location=("cpu"
+                      if not to_gpu else lambda storage, loc: storage.cuda()))
+    return state_dict
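A quick sanity check of the new helpers (a sketch, not part of the commit): requesting GPU placement is safe even on a CPU-only machine, because load_state_stream ANDs the to_gpu flag with torch.cuda.is_available() before picking the map_location.

import torch

from ray_lightning.util import load_state_stream, to_state_stream

stream = to_state_stream({"weight": torch.ones(3)})

# Falls back to CPU tensors when no CUDA device is present.
restored = load_state_stream(stream, to_gpu=True)
expected = "cuda" if torch.cuda.is_available() else "cpu"
assert restored["weight"].device.type == expected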
