|
4 | 4 | # This source code is licensed under the BSD-style license found in the |
5 | 5 | # LICENSE file in the root directory of this source tree. |
6 | 6 |
|
| 7 | +from forge.env import MONARCH_HOSTMESH_V1 |
| 8 | + |
7 | 9 | """Remote and local resource manager for allocation and provisioning.""" |
8 | 10 | import asyncio |
9 | 11 | import functools |
|
14 | 16 | import uuid |
15 | 17 |
|
16 | 18 | from monarch._src.actor.shape import NDSlice, Shape |
17 | | -from monarch.actor import Actor, endpoint, HostMesh, ProcMesh, this_host |
| 19 | +from monarch.actor import Actor, endpoint, ProcMesh |
| 20 | + |
18 | 21 | from monarch.tools import commands |
19 | 22 |
|
20 | 23 | from forge.controller.launcher import BaseLauncher, get_launcher |
|
27 | 30 | logger.setLevel(logging.DEBUG) |
28 | 31 |
|
29 | 32 |
|
| 33 | +if MONARCH_HOSTMESH_V1.get_value(): |
| 34 | + from monarch._src.actor.v1.host_mesh import HostMesh, this_host |
| 35 | + |
| 36 | + logger.info("Using Monarch HostMesh v1...") |
| 37 | +else: |
| 38 | + from monarch.actor import HostMesh, this_host |
| 39 | + |
| 40 | + |
30 | 41 | def _get_port() -> str: |
31 | 42 | with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: |
32 | 43 | s.bind(("localhost", 0)) |
@@ -148,14 +159,29 @@ async def create_host_mesh(self, name: str, num_hosts: int) -> HostMesh: |
148 | 159 | alloc, alloc_constraints, server_name = await self.launcher.get_allocator( |
149 | 160 | name, num_hosts |
150 | 161 | ) |
151 | | - return ( |
152 | | - HostMesh( |
| 162 | + |
| 163 | + if MONARCH_HOSTMESH_V1.get_value(): |
| 164 | + # We are asking Monarch to allocate a single process on |
| 165 | + # every host, reflected in the Extent we provide below. |
| 166 | + |
| 167 | + # Technically, this is ["hosts", "procs"] but to reduce |
| 168 | + # confusion on its relationship with procs elsewhere, |
| 169 | + # we call it "no_dim". |
| 170 | + |
| 171 | + # TODO - remove this once Monarch supports HostMesh without it. |
| 172 | + host_mesh = HostMesh.allocate_nonblocking( |
| 173 | + name=name, |
| 174 | + extent=Extent(["hosts", "no_dim"], [num_hosts, 1]), |
| 175 | + allocator=alloc, |
| 176 | + alloc_constraints=alloc_constraints, |
| 177 | + ) |
| 178 | + else: |
| 179 | + host_mesh = HostMesh( |
153 | 180 | Shape(["hosts"], NDSlice.new_row_major([num_hosts])), |
154 | 181 | allocator=alloc, |
155 | 182 | alloc_constraints=alloc_constraints, |
156 | | - ), |
157 | | - server_name, |
158 | | - ) |
| 183 | + ) |
| 184 | + return host_mesh, server_name |
159 | 185 |
|
160 | 186 | async def get_proc_mesh( |
161 | 187 | self, |
@@ -256,7 +282,7 @@ def bootstrap(env: dict[str, str]): |
256 | 282 | env_vars[env_var.name] = str(env_var.get_value()) |
257 | 283 |
|
258 | 284 | procs = host_mesh.spawn_procs( |
259 | | - per_host={"gpus": num_procs}, |
| 285 | + per_host={"procs": num_procs}, |
260 | 286 | bootstrap=functools.partial(bootstrap, env=env_vars), |
261 | 287 | ) |
262 | 288 |
|
|
0 commit comments