1111
1212import os
1313import socket
14+ from functools import partial
1415
1516from monarch .actor import proc_mesh , ProcMesh
1617from monarch .tools import commands
1718from monarch .tools .config import Config
1819from omegaconf import DictConfig
1920
2021from forge .controller import ForgeActor
22+
23+ from forge .controller .system_controllers .gpu_manager import get_gpu_ids , release_gpus
2124from forge .types import ProcessConfig
2225
2326logger : logging .Logger = logging .getLogger (__name__ )
@@ -48,27 +51,52 @@ async def spawn_actors(
4851 set_address : bool = False ,
4952):
5053 """Setup process Mesh and spawn Actors."""
51- mesh = await get_proc_mesh (processes , set_address )
54+ mesh = await get_proc_mesh (processes )
5255 actors = await mesh .spawn (name , actor_cls , ** cfg )
5356 actors .mesh = mesh
5457 return actors
5558
5659
57- async def get_proc_mesh (process_config : ProcessConfig , set_address = False ) -> ProcMesh :
58- env = None
59- if set_address :
60- env = {
61- "MASTER_ADDR" : str (socket .gethostname ()),
62- "MASTER_PORT" : str (_find_free_port ()),
63- }
60+ async def get_proc_mesh (process_config : ProcessConfig ) -> ProcMesh :
61+ """Returns a proc mesh with the given process config."""
62+ # TODO - modify this to work with multi-host
63+ env = {
64+ "MASTER_ADDR" : str (socket .gethostname ()),
65+ "MASTER_PORT" : str (_find_free_port ()),
66+ }
67+ gpu_ids = None
68+
69+ def _setup_env (env : dict [str , str ]):
70+ """Sets up the environment on proc mesh creation."""
71+ for k , v in env .items ():
72+ os .environ [k ] = v
73+
6474 if process_config .scheduler == "local" :
6575 if process_config .num_hosts != 1 :
6676 raise ValueError ("Local scheduler only supports 1 host" )
67- return await proc_mesh (gpus = process_config .num_procs , env = env )
77+
78+ if process_config .with_gpus :
79+ gpu_ids = await get_gpu_ids (process_config .num_procs )
80+ env ["CUDA_VISIBLE_DEVICES" ] = "," .join (map (str , gpu_ids ))
81+
82+ # TODO - update to use this_host() whenever it supports
83+ # being run within actors:
84+ # AttributeError: NYI: attempting to get ProcMesh attribute `slice` on object that's
85+ # actually a ProcMeshRef
86+ # return this_host().spawn_procs(
87+ # per_host={"procs": process_config.num_procs},
88+ # bootstrap=partial(_setup_env, env=env),
89+ # )
90+ m = proc_mesh (gpus = process_config .num_procs , env = env )
91+ m ._gpu_ids = gpu_ids
92+ return m
6893 elif process_config .scheduler == "mast" :
6994 if not MAST_SUPPORTED :
7095 raise ValueError ("MAST is not supported on this platform" )
7196
97+ if process_config .with_gpus :
98+ raise ValueError ("NYI - need to add HostMesh tracking in GpuManager" )
99+
72100 logging .info ("Scheduling on MAST with: " , process_config )
73101 jobname = f"monarch-{ getpass .getuser ()} "
74102 config = Config (
@@ -104,12 +132,7 @@ async def get_proc_mesh(process_config: ProcessConfig, set_address=False) -> Pro
104132 )
105133 alloc = await allocator .allocate (AllocSpec (constraints , ** mesh_dimensions ))
106134 if env :
107-
108- def setup (): # noqa: FB811
109- for k , v in env .items ():
110- os .environ [k ] = v
111-
112- p = await ProcMesh .from_alloc (alloc , setup = setup )
135+ p = await ProcMesh .from_alloc (alloc , setup = partial (_setup_env , env = env ))
113136 else :
114137 p = await ProcMesh .from_alloc (alloc )
115138 await p .logging_option (stream_to_client = True , aggregate_window_sec = 3 )
@@ -118,6 +141,15 @@ def setup(): # noqa: FB811
118141 raise ValueError ("Unsupported scheduler: {}" .format (process_config .scheduler ))
119142
120143
async def stop_proc_mesh(mesh: ProcMesh) -> None:
    """Release any GPUs recorded on the mesh, then stop the mesh.

    GPU ids are read from the private ``_gpu_ids`` attribute on the mesh
    object. A mesh without that attribute, or with it set to ``None``, is
    simply stopped.

    Args:
        mesh: The proc mesh to shut down.

    Raises:
        Any exception raised by ``release_gpus`` or ``mesh.stop()`` is
        propagated to the caller.
    """
    # A single getattr covers both "attribute missing" and "attribute is None".
    gpu_ids = getattr(mesh, "_gpu_ids", None)
    try:
        if gpu_ids is not None:
            logger.debug("Releasing GPUs: %s", gpu_ids)
            await release_gpus(gpu_ids)
    finally:
        # Always stop the mesh, even if releasing the GPUs failed, so its
        # processes are not left running.
        await mesh.stop()
151+
152+
121153def _find_free_port () -> int :
122154 with socket .socket (socket .AF_INET , socket .SOCK_STREAM ) as s :
123155 s .bind (("localhost" , 0 ))
0 commit comments