Skip to content

Commit 4d292cf

Browse files
authored
Enables CUDA_VISIBLE_DEVICES for local dev (#208)
* Enables CUDA_VISIBLE_DEVICES for local development. * Removes a test. * Additional cleanup.
1 parent 674e174 commit 4d292cf

File tree

2 files changed

+273
-3
lines changed

2 files changed

+273
-3
lines changed

src/forge/controller/provisioner.py

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
import asyncio
99
import functools
1010
import logging
11+
12+
import os
1113
import socket
1214
import uuid
1315

@@ -47,8 +49,12 @@ class GpuManager:
4749
4850
"""
4951

50-
def __init__(self):
51-
self.available_gpus = set(range(0, 8))
52+
def __init__(self, available_devices: set[int] | None = None):
53+
if available_devices is None:
54+
available_devices = set(range(0, 8))
55+
assert all(isinstance(x, int) for x in available_devices)
56+
assert all(x >= 0 and x < 8 for x in available_devices)
57+
self.available_gpus = available_devices
5258

5359
def get_available_gpus(self) -> list[str]:
5460
"""Returns a list of available GPU devices."""
@@ -80,8 +86,25 @@ def __init__(self):
8086
# we generate a hash per HostMesh. We'll
8187
# remove this once this is supported in Monarch.
8288
self._this_host_id = uuid.uuid1()
89+
90+
# For the local host, we may want to set CUDA_VISIBLE_DEVICES
91+
# for small scale testing. We inherit the environment's
92+
# CUDA_VISIBLE_DEVICES **only for the local host** and not
93+
# for remote hosts.
94+
available_local_devices = None
95+
cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
96+
if cuda_visible_devices is not None and cuda_visible_devices.strip():
97+
try:
98+
available_local_devices = set(
99+
int(x.strip()) for x in cuda_visible_devices.split(",") if x.strip()
100+
)
101+
except ValueError as e:
102+
raise ValueError(
103+
f"Invalid CUDA_VISIBLE_DEVICES format: '{cuda_visible_devices}'. "
104+
f"Expected comma-separated integers (e.g., '0,1,2'). Error: {e}"
105+
) from e
83106
self._host_gpu_map = {
84-
self._this_host_id: GpuManager(),
107+
self._this_host_id: GpuManager(available_local_devices),
85108
}
86109

87110
async def create_host_mesh(self, name: str, num_hosts: int) -> HostMesh:
Lines changed: 247 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,247 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
"""Tests for Provisioner CUDA_VISIBLE_DEVICES functionality."""
8+
9+
import os
10+
from unittest import mock
11+
12+
import pytest
13+
from forge.controller.provisioner import GpuManager, Provisioner
14+
from forge.types import ProcessConfig
15+
16+
17+
class TestGpuManagerCudaVisibleDevices:
    """Test GpuManager with different CUDA_VISIBLE_DEVICES configurations."""

    def test_gpu_manager_default_initialization(self):
        """Test GpuManager initializes with default 8 GPUs when no specific devices provided."""
        gpu_manager = GpuManager()
        gpus = gpu_manager.get_available_gpus()
        assert gpus == [str(i) for i in range(8)]
        assert len(gpus) == 8

    def test_gpu_manager_custom_devices(self):
        """Test GpuManager with specific available devices."""
        gpu_manager = GpuManager(available_devices={0, 2, 4, 6})
        gpus = gpu_manager.get_available_gpus()
        assert sorted(gpus) == sorted(["0", "2", "4", "6"])
        assert len(gpus) == 4

    def test_gpu_manager_empty_devices(self):
        """Test GpuManager with no available devices."""
        gpu_manager = GpuManager(available_devices=set())
        gpus = gpu_manager.get_available_gpus()
        assert gpus == []
        assert len(gpus) == 0

    def test_gpu_manager_invalid_device_range(self):
        """Test GpuManager validation of device ranges."""
        # Negative device
        with pytest.raises(AssertionError):
            GpuManager(available_devices={-1})
        # Device >= 8
        with pytest.raises(AssertionError):
            GpuManager(available_devices={8})
        # String instead of int
        with pytest.raises(AssertionError):
            GpuManager(available_devices={"0"})

    def test_gpu_allocation_with_custom_devices(self):
        """Test GPU allocation with custom device set."""
        gpu_manager = GpuManager(available_devices={1, 3, 5})

        # Get 2 GPUs
        taken = gpu_manager.get_gpus(2)
        assert len(taken) == 2
        assert all(gpu in ["1", "3", "5"] for gpu in taken)

        # Check remaining
        left = gpu_manager.get_available_gpus()
        assert len(left) == 1

        # Total allocated + remaining should equal original
        assert set(taken + left) == {"1", "3", "5"}

    def test_gpu_release_with_custom_devices(self):
        """Test GPU release with custom device set."""
        gpu_manager = GpuManager(available_devices={2, 4, 7})

        # Allocate all
        taken = gpu_manager.get_gpus(3)
        assert len(taken) == 3
        assert gpu_manager.get_available_gpus() == []

        # Release some
        gpu_manager.release_gpus([taken[0]])
        left = gpu_manager.get_available_gpus()
        assert len(left) == 1
        assert left[0] == taken[0]
89+
90+
class TestProvisionerCudaVisibleDevices:
    """Test Provisioner's handling of CUDA_VISIBLE_DEVICES environment variable."""

    @mock.patch.dict(os.environ, {}, clear=True)
    def test_provisioner_no_cuda_visible_devices(self):
        """Test Provisioner when CUDA_VISIBLE_DEVICES is not set."""
        provisioner = Provisioner()

        # Should have default GpuManager for local host
        gpu_manager = provisioner._host_gpu_map[provisioner._this_host_id]
        gpus = gpu_manager.get_available_gpus()
        assert gpus == [str(i) for i in range(8)]

    @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1,2,3"}, clear=True)
    def test_provisioner_with_cuda_visible_devices(self):
        """Test Provisioner with CUDA_VISIBLE_DEVICES set."""
        provisioner = Provisioner()

        # Should have GpuManager configured with specified devices
        gpu_manager = provisioner._host_gpu_map[provisioner._this_host_id]
        gpus = gpu_manager.get_available_gpus()
        assert sorted(gpus) == sorted(["0", "1", "2", "3"])
        assert len(gpus) == 4

    @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,2,5,7"}, clear=True)
    def test_provisioner_non_contiguous_gpus(self):
        """Test Provisioner with non-contiguous GPU IDs."""
        provisioner = Provisioner()

        gpu_manager = provisioner._host_gpu_map[provisioner._this_host_id]
        gpus = gpu_manager.get_available_gpus()
        assert sorted(gpus) == sorted(["0", "2", "5", "7"])
        assert len(gpus) == 4

    @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "3,1,4,1"}, clear=True)
    def test_provisioner_duplicate_gpu_ids(self):
        """Test Provisioner handles duplicate GPU IDs in CUDA_VISIBLE_DEVICES."""
        provisioner = Provisioner()

        gpu_manager = provisioner._host_gpu_map[provisioner._this_host_id]
        gpus = gpu_manager.get_available_gpus()
        # Should deduplicate: {3, 1, 4}
        assert sorted(gpus) == sorted(["1", "3", "4"])
        assert len(gpus) == 3

    @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": ""}, clear=True)
    def test_provisioner_empty_cuda_visible_devices(self):
        """Test Provisioner with empty CUDA_VISIBLE_DEVICES."""
        provisioner = Provisioner()

        # Empty string should result in default behavior (no devices specified)
        gpu_manager = provisioner._host_gpu_map[provisioner._this_host_id]
        gpus = gpu_manager.get_available_gpus()
        assert gpus == [str(i) for i in range(8)]

    @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1,2"}, clear=True)
    @pytest.mark.asyncio
    async def test_get_proc_mesh_respects_cuda_visible_devices(self):
        """Test that get_proc_mesh uses CUDA_VISIBLE_DEVICES for local allocation."""
        provisioner = Provisioner()

        # Verify initial state
        gpu_manager = provisioner._host_gpu_map[provisioner._this_host_id]
        assert sorted(gpu_manager.get_available_gpus()) == ["0", "1", "2"]

        # Note - this can run even on CPU because with_gpus just sets environment
        # variables.
        config = ProcessConfig(num_procs=2, with_gpus=True, num_hosts=None)
        _ = await provisioner.get_proc_mesh(
            num_procs=config.num_procs,
            with_gpus=config.with_gpus,
            num_hosts=config.num_hosts,
        )
        # Verify GPUs were allocated from available set
        assert len(gpu_manager.get_available_gpus()) == 1  # Started with 3, allocated 2
171+
172+
class TestProvisionerEnvironmentIsolation:
    """Test that CUDA_VISIBLE_DEVICES only affects local host, not remote hosts."""

    @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"}, clear=True)
    def test_remote_host_ignores_cuda_visible_devices(self):
        """Test that remote hosts get default GPU configuration."""
        provisioner = Provisioner()

        # Local host should respect CUDA_VISIBLE_DEVICES
        gpu_manager = provisioner._host_gpu_map[provisioner._this_host_id]
        assert sorted(gpu_manager.get_available_gpus()) == ["0", "1"]

        # When creating remote allocations, they should get default GPU sets.
        # This is verified by checking that remote allocations create new
        # GpuManager instances without the available_devices parameter
        # (line 154 in provisioner.py).
        assert len(provisioner._host_gpu_map) == 1  # Only local host initially

        # The remote host creation in create_host_mesh creates GpuManager()
        # without available_devices parameter, so it gets default 8 GPUs
193+
194+
class TestIntegrationScenarios:
    """Integration test scenarios for CUDA_VISIBLE_DEVICES functionality."""

    @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "1,3"}, clear=True)
    def test_full_allocation_cycle(self):
        """Test complete allocation and release cycle with CUDA_VISIBLE_DEVICES."""
        provisioner = Provisioner()
        gpu_manager = provisioner._host_gpu_map[provisioner._this_host_id]

        # Initial state
        assert sorted(gpu_manager.get_available_gpus()) == ["1", "3"]

        # Allocate all available GPUs
        taken = gpu_manager.get_gpus(2)
        assert len(taken) == 2
        assert sorted(taken) == ["1", "3"]
        assert gpu_manager.get_available_gpus() == []

        # Try to allocate more - should fail
        with pytest.raises(RuntimeError, match="Not enough GPUs available"):
            gpu_manager.get_gpus(1)

        # Release some GPUs
        gpu_manager.release_gpus([taken[0]])
        left = gpu_manager.get_available_gpus()
        assert len(left) == 1
        assert left[0] == taken[0]

        # Release all GPUs
        gpu_manager.release_gpus([taken[1]])
        assert sorted(gpu_manager.get_available_gpus()) == ["1", "3"]

    @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"}, clear=True)
    def test_single_gpu_scenario(self):
        """Test scenario with only one GPU available."""
        provisioner = Provisioner()
        gpu_manager = provisioner._host_gpu_map[provisioner._this_host_id]

        # Should have only GPU 0
        assert gpu_manager.get_available_gpus() == ["0"]

        # Allocate the single GPU
        taken = gpu_manager.get_gpus(1)
        assert taken == ["0"]
        assert gpu_manager.get_available_gpus() == []

        # Should fail to allocate any more
        with pytest.raises(RuntimeError):
            gpu_manager.get_gpus(1)

        # Release and verify
        gpu_manager.release_gpus(taken)
        assert gpu_manager.get_available_gpus() == ["0"]

0 commit comments

Comments
 (0)