@@ -12,6 +12,8 @@
 import socket
 import uuid
 
+import torch
+
 from monarch._src.actor.actor_mesh import ActorMesh
 from monarch._src.actor.shape import Extent
 
@@ -40,8 +42,14 @@ class _RemoteInfoFetcher(Actor):
4042 """An actor responsible for getting remote host information."""
4143
4244 @endpoint
43- def get_info (self ) -> tuple [str , str ]:
44- return socket .gethostname (), _get_port ()
45+ def get_info (self ) -> tuple [str , str , int ]:
46+ """Returns hostname, port, and GPU count."""
+        try:
+            gpu_count = torch.cuda.device_count()
+        except Exception:
+            # If torch or CUDA is unavailable, assume no GPUs
+            gpu_count = 0
+        return socket.gethostname(), _get_port(), gpu_count
 
 
 class EnvSetter(Actor):
@@ -77,8 +85,8 @@ def set_env(self, env_vars: dict[str, str]):
             os.environ[k] = v
 
 
-async def get_remote_info(host_mesh: HostMesh) -> tuple[str, str]:
-    """Returns the host name and port of the host mesh."""
+async def get_host_info(host_mesh: HostMesh) -> tuple[str, str, int]:
+    """Returns the host name, port, and GPU count of the host mesh."""
     throwaway_procs = host_mesh.spawn_procs(per_host={"procs": 1})
     fetcher = throwaway_procs.spawn("_fetcher", _RemoteInfoFetcher)
 
@@ -88,11 +96,11 @@ async def get_remote_info(host_mesh: HostMesh) -> tuple[str, str]:
     fetcher = fetcher.slice(**singleton_slice)
     # Fetcher should be a singleton at this point - call_one() will fail otherwise
 
-    host, port = await fetcher.get_info.call_one()
+    host, port, gpu_count = await fetcher.get_info.call_one()
 
     # Stopping this proc is the right thing to do, but Monarch does not yet handle manual stops well.
     # await throwaway_procs.stop()
-    return host, port
+    return host, port, gpu_count
 
 
 async def set_environment(proc_mesh: ProcMesh, env_vars: dict[str, str]):
@@ -110,14 +118,37 @@ async def set_environment(proc_mesh: ProcMesh, env_vars: dict[str, str]):
 
 
 class GpuManager:
-    """Tracks and assigns GPU devices on a host."""
+    """Tracks and assigns GPU devices on a host.
+
+    Args:
+        available_devices: Set of GPU device IDs to manage. If None, uses all
+            devices from 0 to max_device_count - 1.
+        max_device_count: Maximum number of GPU devices on this host. Defaults to 8.
+
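+    Example (illustrative only; device IDs might come from CUDA_VISIBLE_DEVICES):
+        >>> manager = GpuManager({0, 1, 9})
+        >>> manager.max_device_count  # grows to fit the highest device ID
+        10
+        >>> sorted(manager.available_gpus)
+        [0, 1, 9]
+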
127+ """
114128
-    def __init__(self, available_devices: set[int] | None = None):
+    def __init__(
+        self, available_devices: set[int] | None = None, max_device_count: int = 8
+    ):
         if available_devices is None:
-            available_devices = set(range(0, 8))
-        assert all(isinstance(x, int) for x in available_devices)
-        assert all(x >= 0 and x < 8 for x in available_devices)
+            available_devices = set(range(0, max_device_count))
+        else:
+            # Validate types first so max() below cannot fail on non-integers
+            assert all(
+                isinstance(x, int) for x in available_devices
+            ), f"All device IDs must be integers, got: {available_devices}"
+            # When available_devices is provided (e.g., from CUDA_VISIBLE_DEVICES),
+            # adjust max_device_count to accommodate the highest device ID
+            if available_devices:
+                max_device_count = max(max(available_devices) + 1, max_device_count)
+
+        assert all(
+            isinstance(x, int) for x in available_devices
+        ), f"All device IDs must be integers, got: {available_devices}"
+        assert all(
+            x >= 0 for x in available_devices
+        ), f"All device IDs must be non-negative, got: {available_devices}"
         self.available_gpus = available_devices
+        self.max_device_count = max_device_count
 
     def get_available_gpus(self) -> list[str]:
         """Returns a list of available GPU devices."""
@@ -166,8 +197,18 @@ def __init__(self, cfg: ProvisionerConfig | None = None):
166197 f"Invalid CUDA_VISIBLE_DEVICES format: '{ cuda_visible_devices } '. "
167198 f"Expected comma-separated integers (e.g., '0,1,2'). Error: { e } "
168199 ) from e
200+
201+ # Get the actual GPU count for the local host
202+ try :
203+ local_gpu_count = torch .cuda .device_count ()
204+ except Exception :
205+ # If torch is not available or CUDA is not available, assume no GPUs
206+ local_gpu_count = 0
207+
169208 self ._host_gpu_map = {
170- self ._this_host_id : GpuManager (available_local_devices ),
209+ self ._this_host_id : GpuManager (
210+ available_local_devices , max_device_count = local_gpu_count
211+ ),
171212 }
172213 self ._proc_host_map = {}
173214 self ._host_mesh_map = {}
@@ -272,9 +313,18 @@ async def get_proc_mesh(
                     num_hosts=num_hosts,
                 )
                 host_id = uuid.uuid1()
-                gpu_manager = GpuManager()
+                # Get host info including GPU count from the remote host
+                host_addr, host_port, remote_gpu_count = await get_host_info(
+                    host_mesh
+                )
+                gpu_manager = GpuManager(max_device_count=remote_gpu_count)
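+                # Sizes the manager from the reported count instead of the old default of 8.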
                 self._host_gpu_map[host_id] = gpu_manager
                 host_mesh._host_id = host_id
+                # Use the fetched addr/port if not explicitly provided
+                if addr is None:
+                    addr = host_addr
+                if port is None:
+                    port = host_port
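+                # With addr/port cached here, the fallback fetch below only runs for reused host meshes.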
             else:
                 host_id = host_mesh._host_id
                 gpu_manager = self._host_gpu_map[host_id]
@@ -286,7 +336,7 @@ async def get_proc_mesh(
 
             if with_gpus:
                 if not addr or not port:
-                    addr, port = await get_remote_info(host_mesh)
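+                    # The returned gpu_count is discarded; this host's GpuManager was sized when first registered.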
+                    addr, port, _ = await get_host_info(host_mesh)
                 gpu_ids = gpu_manager.get_gpus(num_procs)
 
                 env_vars["MASTER_ADDR"] = addr