This repository was archived by the owner on Nov 3, 2023. It is now read-only.

Commit 0a47405

PTL 1.2.10 Compatibility (#41)
* pass in ddp_kwargs
* compat
* format
* hvd ranks default
1 parent f735548 commit 0a47405

File tree

5 files changed: +77 -15 lines changed

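For orientation, here is a minimal, hedged sketch of how the plugin is driven under PTL 1.2.10, modeled on the updated test further down; the model class and trainer settings are placeholders, not part of this commit:

import pytorch_lightning as pl
import ray
from ray_lightning import RayPlugin  # assumes the package's top-level export

ray.init()

# num_workers/num_cpus_per_worker/use_gpu mirror the constructor arguments
# touched in this commit; any extra **ddp_kwargs are now forwarded through
# to DDPSpawnPlugin.__init__.
plugin = RayPlugin(num_workers=2, num_cpus_per_worker=1, use_gpu=False)

model = MyLightningModule()  # hypothetical pl.LightningModule subclass
trainer = pl.Trainer(max_epochs=1, plugins=[plugin])
trainer.fit(model)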

ray_lightning/ray_ddp.py

Lines changed: 15 additions & 6 deletions
@@ -7,11 +7,13 @@
 import torch
 from pytorch_lightning.plugins import DDPSpawnPlugin
 from pytorch_lightning import _logger as log, LightningModule
+from pytorch_lightning.utilities import rank_zero_only
 from ray.util.sgd.utils import find_free_port
 
 from ray_lightning.session import init_session
 from ray_lightning.util import process_results, Queue
 from ray_lightning.tune import TUNE_INSTALLED, is_session_enabled
+from ray_lightning.ray_environment import RayEnvironment
 
 
 @ray.remote
@@ -95,13 +97,17 @@ def __init__(self,
         if not ray.is_initialized():
             ray.init()
         super().__init__(
-            sync_batchnorm=None, parallel_devices=[], **ddp_kwargs)
+            sync_batchnorm=None,
+            parallel_devices=[],
+            cluster_environment=RayEnvironment(world_size=num_workers),
+            **ddp_kwargs)
         self.nickname = "ddp_ray"
         self.num_workers = num_workers
         self.num_cpus_per_worker = num_cpus_per_worker
         self.use_gpu = use_gpu
         self.workers = []
         self.init_hook = init_hook
+        self._local_rank = 0
 
     def _create_worker(self):
         """Creates Ray actor."""
@@ -225,7 +231,7 @@ def train_remote(self,
         self.lightning_module.trainer.accelerator_connector\
             ._training_type_plugin = self
         self.lightning_module.trainer.accelerator.training_type_plugin = self
-        self.global_rank = global_rank
+        self.cluster_environment.set_global_rank(global_rank)
 
         if queue is not None:
             # Initialize session.
@@ -263,11 +269,14 @@ def init_ddp_connection(self,
             world_size=world_size,
         )
 
-    def set_world_ranks(self, process_idx: int):
+    def set_world_ranks(self, process_idx: int = 0):
         """Set the appropriate rank attribues for the trainer."""
-        self.local_rank = self.global_to_local[self.global_rank]
-        self.global_rank = self.global_rank
-        self.world_size = self.num_workers
+        assert self.cluster_environment is not None
+        if self.global_rank is not None:
+            self._local_rank = self.global_to_local[self.global_rank]
+        self.cluster_environment.set_global_rank(self.global_rank)
+        self.cluster_environment.set_world_size(self.num_workers)
+        rank_zero_only.rank = self.cluster_environment.global_rank()
 
     @property
     def root_device(self):
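The substantive change above is that rank bookkeeping now flows through the cluster environment instead of plain attributes on the plugin, with set_world_ranks consulting global_to_local to recover the worker-local index. A small self-contained sketch of that mapping, with an invented two-node layout (not code from this repository):

# Hypothetical layout: 4 global ranks spread across 2 nodes, 2 workers each.
# global_to_local[rank] is the worker's index on its own node, which is what
# set_world_ranks stores in self._local_rank.
node_of_rank = {0: "A", 1: "A", 2: "B", 3: "B"}

seen_per_node = {}
global_to_local = []
for rank in sorted(node_of_rank):
    node = node_of_rank[rank]
    global_to_local.append(seen_per_node.get(node, 0))
    seen_per_node[node] = seen_per_node.get(node, 0) + 1

assert global_to_local == [0, 1, 0, 1]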

ray_lightning/ray_environment.py

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
+from pytorch_lightning.plugins.environments import ClusterEnvironment
+from pytorch_lightning.utilities import rank_zero_only
+
+
+class RayEnvironment(ClusterEnvironment):
+    """Environment for PTL training on a Ray cluster."""
+
+    def __init__(self, world_size):
+        self.set_world_size(world_size)
+        self._global_rank = None
+
+    def creates_children(self) -> bool:
+        return False
+
+    def master_address(self) -> str:
+        raise NotImplementedError
+
+    def master_port(self) -> int:
+        raise NotImplementedError
+
+    def world_size(self) -> int:
+        return self._world_size
+
+    def set_world_size(self, size: int) -> None:
+        self._world_size = size
+
+    def global_rank(self) -> int:
+        return self._global_rank
+
+    def set_global_rank(self, rank: int) -> None:
+        self._global_rank = rank
+        rank_zero_only.rank = rank
+
+    def local_rank(self) -> int:
+        raise NotImplementedError
+
+    def node_rank(self) -> int:
+        raise NotImplementedError
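The new environment is small enough that its contract fits in a few asserts. A minimal sketch (ranks and world size invented for illustration):

from pytorch_lightning.utilities import rank_zero_only
from ray_lightning.ray_environment import RayEnvironment

env = RayEnvironment(world_size=4)
assert env.world_size() == 4

# set_global_rank also updates the rank consulted by @rank_zero_only, so
# logging and checkpointing decorators behave correctly on remote workers.
env.set_global_rank(2)
assert env.global_rank() == 2
assert rank_zero_only.rank == 2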

ray_lightning/ray_horovod.py

Lines changed: 18 additions & 3 deletions
@@ -92,6 +92,24 @@ def __setstate__(self, d):
         d["executor"] = None
         self.__dict__.update(d)
 
+    @property
+    def global_rank(self) -> int:
+        if not hvd.is_initialized():
+            return 0
+        return hvd.rank()
+
+    @property
+    def local_rank(self) -> int:
+        if not hvd.is_initialized():
+            return 0
+        return hvd.local_rank()
+
+    @property
+    def world_size(self) -> int:
+        if not hvd.is_initialized():
+            return self.num_hosts * self.num_slots
+        return hvd.size()
+
     def setup(self, model: LightningModule):
         """Creates the RayExecutor object."""
         self._model = model
@@ -152,9 +170,6 @@ def train_remote(self, model: ObjectRef, queue: Queue = None, **kwargs):
         self.lightning_module.trainer.accelerator.training_type_plugin = self
 
         hvd.init()
-        self.global_rank = hvd.rank()
-        self.local_rank = hvd.local_rank()
-        self.world_size = hvd.size()
         rank_zero_only.rank = self.global_rank
 
         if queue is not None:
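Per the "hvd ranks default" item in the commit message, the rank attributes become lazy properties: they delegate to Horovod once hvd.init() has run and fall back to defaults before that, so the driver process can query them safely. The pattern in isolation, as a sketch rather than the plugin itself:

import horovod.torch as hvd

def current_world_size(num_hosts: int, num_slots: int) -> int:
    # Before hvd.init(), report the configured size (driver side);
    # afterwards, defer to Horovod's own view of the job.
    if not hvd.is_initialized():
        return num_hosts * num_slots
    return hvd.size()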

ray_lightning/tests/test_ddp.py

Lines changed: 2 additions & 2 deletions
@@ -31,14 +31,14 @@ def test_actor_creation(tmpdir, ray_start_2_cpus, num_workers):
     model = BoringModel()
 
     def check_num_actor():
-        assert len(ray.actors()) == num_workers
+        assert len(ray.state.actors()) == num_workers
 
     model.on_epoch_end = check_num_actor
     plugin = RayPlugin(num_workers=num_workers)
     trainer = get_trainer(tmpdir, plugins=[plugin])
     trainer.fit(model)
     assert all(actor["State"] == ray.gcs_utils.ActorTableData.DEAD
-               for actor in list(ray.actors().values()))
+               for actor in list(ray.state.actors().values()))
 
 
 def test_distributed_sampler(tmpdir, ray_start_2_cpus):
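The test change tracks Ray relocating the actor-table accessor from ray.actors() to ray.state.actors(). If code must span both Ray versions, a hedged shim like the following would work; the commit itself simply calls ray.state.actors() directly:

import ray

def list_actors():
    # Prefer the newer ray.state.actors(); fall back for older releases.
    state = getattr(ray, "state", None)
    if state is not None and hasattr(state, "actors"):
        return ray.state.actors()
    return ray.actors()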

requirements-test.txt

Lines changed: 4 additions & 4 deletions
@@ -1,9 +1,9 @@
-flake8==3.7.7
+flake8==3.9.1
 flake8-comprehensions
-flake8-quotes==2.0.0
+flake8-quotes
 yapf==0.23.0
 pytest
-pytorch-lightning==1.2.6
-lightning-bolts==0.3.2
+pytorch-lightning==1.2.10
+lightning-bolts==0.3.3
 ray[tune]
 torchvision
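The pins move pytorch-lightning to 1.2.10 and lightning-bolts to 0.3.3, and relax the flake8-quotes pin. A small sanity check that an environment matches the new pins (a sketch; assumes lightning-bolts' pl_bolts import name):

import pytorch_lightning as pl
import pl_bolts

assert pl.__version__ == "1.2.10", pl.__version__
assert pl_bolts.__version__ == "0.3.3", pl_bolts.__version__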
