This repository was archived by the owner on Nov 3, 2023. It is now read-only.

Commit c8bcae7

Fix for fractional GPU (#125)
Closes #124. Fixes the device calculation to take fractional GPUs into account, and also raises a warning advising against this in the multi-worker case, since sharing GPUs across workers will often lead to failures with NCCL training. The test was run manually and passes.
1 parent fac8b8e · commit c8bcae7
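For context, here is a minimal usage sketch (not part of this commit) of the configuration the new warning targets: fractional GPUs per worker combined with the gloo backend. The RayPlugin name and the resources_per_worker={"GPU": 0.5} route for requesting a fraction of a GPU are assumptions inferred from the constructor arguments visible in the diff below; verify them against the plugin's docstring.

import os

import pytorch_lightning as pl
from ray_lightning import RayPlugin

# NCCL cannot share a device across worker processes, so the commit's warning
# suggests switching PyTorch Lightning's distributed backend to gloo.
os.environ["PL_TORCH_DISTRIBUTED_BACKEND"] = "gloo"

plugin = RayPlugin(
    num_workers=2,
    use_gpu=True,
    # Assumed mechanism for requesting half a GPU per worker (hypothetical).
    resources_per_worker={"GPU": 0.5},
)
trainer = pl.Trainer(plugins=[plugin], max_epochs=1)
# trainer.fit(model)  # model: any LightningModule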

2 files changed: +41, -6 lines

ray_lightning/ray_ddp.py
16 additions, 2 deletions

@@ -4,6 +4,7 @@
 from contextlib import closing
 import os
 import socket
+import warnings

 import numpy as np
 import torch
@@ -137,6 +138,18 @@ def __init__(self,
         self.num_gpus_per_worker = int(use_gpu)

         self.use_gpu = self.num_gpus_per_worker > 0
+
+        if self.use_gpu and self.num_gpus_per_worker < 1 and num_workers > 1:
+            warnings.warn("Identified less than 1 GPU being set per worker. "
+                          "If using NCCL backend (which is the default for "
+                          "GPU training), GPU devices cannot be shared "
+                          "across processes/workers and training is likely "
+                          "to fail. It is recommended to use 1 GPU per "
+                          "worker for training, or if you must use "
+                          "fractional GPUs, then use the gloo backend by "
+                          "setting PL_TORCH_DISTRIBUTED_BACKEND=gloo "
+                          "environment variable.")
+
         self.additional_resources_per_worker = resources_per_worker
         self.workers = []
         self.init_hook = init_hook
@@ -514,8 +527,9 @@ def node_rank(self) -> int:
     def root_device(self):
         if self.use_gpu and torch.cuda.is_available():
             if self._is_remote:
-                # Adjust for if there are multiple GPUs per worker.
-                device_id = self.local_rank * self.num_gpus_per_worker
+                # Adjust to support multiple GPUs per worker or fractional
+                # GPUs per worker.
+                device_id = ray.get_gpu_ids()[0]
                 return torch.device("cuda", device_id)
             else:
                 # If the root device is requested on the driver, just return
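The device fix above relies on ray.get_gpu_ids(), which returns the IDs of the GPUs that Ray assigned to the calling worker, so element 0 identifies the GPU the worker was placed on regardless of whether it holds a whole GPU or only a fraction of one. A small standalone sketch of that behaviour (not part of this commit; the GPU count and fractions are illustrative):

import ray

ray.init(num_gpus=1)

@ray.remote(num_gpus=0.5)
def which_gpu():
    # Returns the IDs of the GPUs Ray assigned to this worker; with
    # fractional requests, multiple workers can be assigned the same ID.
    return ray.get_gpu_ids()

# Both tasks fit on the single GPU, so both should report the same ID.
print(ray.get([which_gpu.remote(), which_gpu.remote()]))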

ray_lightning/tests/test_ddp_gpu.py
25 additions, 4 deletions

@@ -80,15 +80,36 @@ def on_epoch_end(self, trainer, pl_module):


 @pytest.mark.skipif(
-    torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
-@pytest.mark.parametrize("num_gpus_per_worker", [1, 2])
-def test_correct_devices(tmpdir, ray_start_4_gpus, num_gpus_per_worker):
+    torch.cuda.device_count() < 4, reason="test requires multi-GPU machine")
+@pytest.mark.parametrize("num_gpus_per_worker", [0.4, 0.5, 1, 2])
+def test_correct_devices(tmpdir, ray_start_4_gpus, num_gpus_per_worker,
+                         monkeypatch):
     """Tests if GPU devices are correctly set."""
     model = BoringModel()

+    if num_gpus_per_worker < 1:
+        monkeypatch.setenv("PL_TORCH_DISTRIBUTED_BACKEND", "gloo")
+
+    def get_gpu_placement(current_worker_index, num_gpus_per_worker):
+        """Simulates GPU resource bin packing."""
+        next_gpu_index = 0
+        starting_resource_count = num_gpus_per_worker
+        for _ in range(current_worker_index + 1):
+            current_gpu_index = next_gpu_index
+            next_resources = starting_resource_count + \
+                num_gpus_per_worker - 0.0001
+            # If the next worker cannot fit on the current GPU, then we move
+            # onto the next GPU.
+            if int(next_resources) != current_gpu_index:
+                increment = max(1, int(num_gpus_per_worker))
+                next_gpu_index = current_gpu_index + increment
+
+        return current_gpu_index
+
     class CheckDevicesCallback(Callback):
         def on_epoch_end(self, trainer, pl_module):
-            assert trainer.root_gpu == trainer.local_rank * num_gpus_per_worker
+            assert trainer.root_gpu == get_gpu_placement(
+                trainer.local_rank, num_gpus_per_worker)
             assert trainer.root_gpu == pl_module.device.index
             assert torch.cuda.current_device() == trainer.root_gpu
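For intuition, the get_gpu_placement helper added above simulates Ray's bin packing of fractional GPU requests onto whole devices: worker 0 always maps to GPU 0; with 0.4 or 0.5 GPUs per worker, worker 1 also maps to GPU 0; with 1 GPU per worker it maps to GPU 1; and with 2 GPUs per worker it maps to GPU 2 (occupying GPUs 2 and 3). The 0.0001 subtraction keeps allocations that land exactly on a device boundary, such as 0.5 + 0.5, on the same GPU instead of spilling the second worker onto the next one.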
