README.md: 30 additions & 26 deletions
@@ -1,11 +1,11 @@
<!--$UNCOMMENT(ray-lightning)=-->

# Distributed PyTorch Lightning Training on Ray
- This library adds new PyTorch Lightning plugins for distributed training using the Ray distributed computing framework.
+ This library adds new PyTorch Lightning strategies for distributed training using the Ray distributed computing framework.

- These PyTorch Lightning Plugins on Ray enable quick and easy parallel training while still leveraging all the benefits of PyTorch Lightning and using your desired training protocol, either [PyTorch Distributed Data Parallel](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html) or [Horovod](https://github.com/horovod/horovod).
+ These PyTorch Lightning strategies on Ray enable quick and easy parallel training while still leveraging all the benefits of PyTorch Lightning and using your desired training protocol, either [PyTorch Distributed Data Parallel](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html) or [Horovod](https://github.com/horovod/horovod).

- Once you add your plugin to the PyTorch Lightning Trainer, you can parallelize training to all the cores in your laptop, or across a massive multi-node, multi-GPU cluster with no additional code changes.
+ Once you add your strategy to the PyTorch Lightning Trainer, you can parallelize training to all the cores in your laptop, or across a massive multi-node, multi-GPU cluster with no additional code changes.

This library also comes with an integration with <!--$UNCOMMENT{ref}`Ray Tune <tune-main>`--><!--$REMOVE-->[Ray Tune](https://tune.io)<!--$END_REMOVE--> for distributed hyperparameter tuning experiments.
@@ -39,29 +39,30 @@ Here are the supported PyTorch Lightning versions:
|---|---|
| 0.1 | 1.4 |
| 0.2 | 1.5 |
- | master | 1.5 |
+ | 0.3 | 1.6 |
+ | master | 1.6 |

- ## PyTorch Distributed Data Parallel Plugin on Ray
- The `RayPlugin` provides Distributed Data Parallel training on a Ray cluster. PyTorch DDP is used as the distributed training protocol, and Ray is used to launch and manage the training worker processes.
+ ## PyTorch Distributed Data Parallel Strategy on Ray
+ The `RayStrategy` provides Distributed Data Parallel training on a Ray cluster. PyTorch DDP is used as the distributed training protocol, and Ray is used to launch and manage the training worker processes.

[...]

# The actual number of GPUs is determined by ``num_workers``.
- trainer = pl.Trainer(..., plugins=[plugin])
+ trainer = pl.Trainer(..., strategy=strategy)
trainer.fit(ptl_model)
```

- Because Ray is used to launch processes, instead of the same script being called multiple times, you CAN use this plugin even in cases when you cannot use the standard `DDPPlugin` such as
+ Because Ray is used to launch processes, instead of the same script being called multiple times, you CAN use this strategy even in cases when you cannot use the standard `DDPStrategy` such as

- Jupyter Notebooks, Google Colab, Kaggle
- Calling `fit` or `test` multiple times in the same script
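For context on the renamed API in the hunk above, here is a minimal, self-contained sketch of how `RayStrategy` would be wired into a Trainer. The constructor arguments (`num_workers`, `num_cpus_per_worker`, `use_gpu`) and the tiny `BoringModel` are illustrative assumptions, not lines taken from the README:

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

import pytorch_lightning as pl
import ray
from ray_lightning import RayStrategy


class BoringModel(pl.LightningModule):
    """Tiny stand-in model so the example runs end to end."""

    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 1)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return torch.nn.functional.mse_loss(self.layer(x), y)

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.01)


if __name__ == "__main__":
    ray.init()  # or ray.init(address="auto") to connect to an existing cluster

    train_loader = DataLoader(
        TensorDataset(torch.randn(64, 32), torch.randn(64, 1)), batch_size=8
    )

    # Two Ray workers, CPU only; set use_gpu=True to give each worker a GPU.
    strategy = RayStrategy(num_workers=2, num_cpus_per_worker=1, use_gpu=False)
    trainer = pl.Trainer(max_epochs=1, strategy=strategy)
    trainer.fit(BoringModel(), train_loader)
```

Passing the strategy object to `strategy=` (rather than `plugins=[...]`) is the change this diff makes throughout.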
@@ -94,40 +95,40 @@ Now you can run your training script on the laptop, but have it execute as if you...

**Note:** When using with Ray Client, you must disable checkpointing and logging for your Trainer by setting `checkpoint_callback` and `logger` to `False`.

- ## Horovod Plugin on Ray
- Or if you prefer to use Horovod as the distributed training protocol, use the `HorovodRayPlugin` instead.
+ ## Horovod Strategy on Ray
+ Or if you prefer to use Horovod as the distributed training protocol, use the `HorovodRayStrategy` instead.

[...]

# The actual number of GPUs is determined by ``num_workers``.
- trainer = pl.Trainer(..., plugins=[plugin])
+ trainer = pl.Trainer(..., strategy=strategy)
trainer.fit(ptl_model)
```

See the [Pytorch Lightning docs](https://pytorch-lightning.readthedocs.io/en/stable/advanced/model_parallel.html#sharded-training) for more information on sharded training.
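A minimal sketch of the Horovod path introduced in this hunk; `HorovodRayStrategy`'s arguments are assumed to mirror `RayStrategy`'s, and `MyLightningModule` / `train_loader` are hypothetical placeholders for your own module and data loader:

```python
import pytorch_lightning as pl
import ray
from ray_lightning import HorovodRayStrategy

ray.init()  # connect to or start a local Ray cluster

# One Horovod training process per Ray worker (assumed arguments: num_workers, use_gpu).
strategy = HorovodRayStrategy(num_workers=2, use_gpu=False)
trainer = pl.Trainer(max_epochs=1, strategy=strategy)
trainer.fit(MyLightningModule(), train_loader)  # hypothetical model and DataLoader
```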
@@ -140,7 +141,7 @@ Example using `ray_lightning` with Tune:
```python
from ray import tune

- from ray_lightning import RayPlugin
+ from ray_lightning import RayStrategy
from ray_lightning.examples.ray_ddp_example import MNISTClassifier
from ray_lightning.tune import TuneReportCallback, get_tune_resources
# ...
```
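The rest of the Tune example is elided in this hunk, so here is a rough sketch of how these imports typically fit together. The search space, the metric name, `MyLightningModule`, and the `TuneReportCallback(metrics, on="validation_end")` signature are illustrative assumptions, not the README's actual example:

```python
import pytorch_lightning as pl
from ray import tune
from ray_lightning import RayStrategy
from ray_lightning.tune import TuneReportCallback, get_tune_resources

NUM_WORKERS = 2


def train_fn(config):
    # `MyLightningModule` is a hypothetical LightningModule that logs "val_loss".
    model = MyLightningModule(lr=config["lr"])
    trainer = pl.Trainer(
        max_epochs=2,
        strategy=RayStrategy(num_workers=NUM_WORKERS, use_gpu=False),
        callbacks=[TuneReportCallback({"loss": "val_loss"}, on="validation_end")],
    )
    trainer.fit(model)


analysis = tune.run(
    train_fn,
    config={"lr": tune.loguniform(1e-4, 1e-1)},
    num_samples=4,
    # Each trial reserves num_workers * num_cpus_per_worker + 1 CPUs
    # (the extra CPU is for the Trainable driver).
    resources_per_trial=get_tune_resources(num_workers=NUM_WORKERS),
)
print("Best hyperparameters found were: ", analysis.best_config)
```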
@@ -184,26 +185,29 @@ print("Best hyperparameters found were: ", analysis.best_config)
**Note:** Ray Tune requires 1 additional CPU per trial to use for the Trainable driver. So the actual number of resources each trial requires is `num_workers * num_cpus_per_worker + 1`.

## FAQ
- > I see that `RayPlugin` is based off of Pytorch Lightning's `DDPSpawnPlugin`. However, doesn't the PTL team discourage the use of spawn?
+ > I see that `RayStrategy` is based off of Pytorch Lightning's `DDPSpawnStrategy`. However, doesn't the PTL team discourage the use of spawn?

As discussed [here](https://github.com/pytorch/pytorch/issues/51688#issuecomment-773539003), using a spawn approach instead of launch is not all that detrimental. The original factors for discouraging spawn were:
1. not being able to use 'spawn' in a Jupyter or Colab notebook, and
2. not being able to use multiple workers for data loading.

- Neither of these should be an issue with the `RayPlugin` due to Ray's serialization mechanisms. The only thing to keep in mind is that when using this plugin, your model does have to be serializable/pickleable.
+ Neither of these should be an issue with the `RayStrategy` due to Ray's serialization mechanisms. The only thing to keep in mind is that when using this strategy, your model does have to be serializable/pickleable.
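A quick way to check the pickleability requirement mentioned above before launching a long run (a sketch; `MyLightningModule` is a hypothetical placeholder, and `ray.cloudpickle` is the serializer bundled with Ray):

```python
import ray.cloudpickle as cloudpickle

# `MyLightningModule` is a hypothetical placeholder for your own LightningModule.
model = MyLightningModule()

# Raises (e.g. TypeError) if the model holds state that cannot be serialized
# and shipped to the Ray worker processes.
cloudpickle.dumps(model)
```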
[...]

+ > Extension horovod.torch has not been built: /home/ubuntu/anaconda3/envs/tensorflow2_p38/lib/python3.8/site-packages/horovod/torch/mpi_lib/_mpi_lib.cpython-38-x86_64-linux-gnu.so not found
+ > If this is not expected, reinstall Horovod with HOROVOD_WITH_PYTORCH=1 to debug the build error.
+ > Warning! MPI libs are missing, but python applications are still avaiable.
+ > ```
+
+ One might fix this issue by:
+ ```bash
+ $ pip uninstall -y horovod
+ $ conda install gcc_linux-64 gxx_linux-64
+ $ [flags] pip install --no-cache-dir horovod
+ ```
+
+ (from [here](https://github.com/horovod/horovod/issues/656), [here](https://github.com/tlkh/ai-lab/issues/27) and [here](https://horovod.readthedocs.io/en/stable/install_include.html))
+
+ - Install Horovod from scratch with torch:
+
+ ```bash
+ conda create -n hd python=3.8 scipy numpy pandas -y
+ # [...]
+ ```
+ [reference 1](https://stackoverflow.com/questions/54948216/usr-lib-x86-64-linux-gnu-libstdc-so-6-version-glibcxx-3-4-21-not-found-req), [reference 2](https://github.com/horovod/horovod/issues/401), [reference 3](https://github.com/Lightning-AI/lightning/issues/4472), [reference 4](https://github.com/horovod/horovod/issues/2276), [reference 5](https://github.com/Lightning-AI/lightning/blob/master/dockers/base-cuda/Dockerfile#L105-L121), [reference 6](https://horovod.readthedocs.io/en/stable/gpus_include.html), [reference 7](https://horovod.readthedocs.io/en/stable/conda_include.html), [reference 8](https://github.com/horovod/horovod/issues/3545), [reference 9](https://github.com/KAUST-CTL/horovod-gpu-data-science-project) and [reference 10](https://kose-y.github.io/blog/2017/12/installing-cuda-aware-mpi/)
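After reinstalling Horovod with either approach above, a quick sanity check (not part of the README) that the PyTorch extension was actually built:

```python
import horovod.torch as hvd  # fails with "Extension horovod.torch has not been built" if the build is still broken

hvd.init()
print(f"Horovod initialized: rank {hvd.rank()} of {hvd.size()}")
```

Alternatively, `horovodrun --check-build` prints which frameworks and controllers Horovod was compiled with.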