Skip to content

Commit cd0e0e3

Browse files
zdevitometa-codesync[bot]
authored andcommitted
envvar to force v1 by default (#1463)
Summary: Pull Request resolved: #1463 Having monarch.actor selectively import from the same parts. This fixes the public API. Internal uses will probably need to call the add create_host_mesh_from_alloc which will give the right kind of host mesh based on environment variable. `MONARCH_HOST_MESH_V1_REMOVE_ME_BEFORE_RELEASE` is the name of the flag. Chosen because this needs to be gone before the public release. ghstack-source-id: 314898148 Reviewed By: mariusae Differential Revision: D84107651 fbshipit-source-id: e924f2ae172ca6de02ff5f9d04c42f9d038d6048
1 parent e6e1cea commit cd0e0e3

File tree

6 files changed

+67
-56
lines changed

6 files changed

+67
-56
lines changed

python/monarch/_src/actor/actor_mesh.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,7 @@ def context() -> Context:
237237
_transport_lock = threading.Lock()
238238

239239

240-
def enable_transport(transport: ChannelTransport) -> None:
240+
def enable_transport(transport: "ChannelTransport | str") -> None:
241241
"""
242242
Allow monarch to communicate with transport type 'transport'
243243
This must be called before any other calls in the monarch API.
@@ -247,6 +247,15 @@ def enable_transport(transport: ChannelTransport) -> None:
247247
Currently only one transport type may be enabled at one time.
248248
In the future we may allow multiple to be enabled.
249249
"""
250+
if isinstance(transport, str):
251+
transport = {
252+
"tcp": ChannelTransport.Tcp,
253+
"ipc": ChannelTransport.Unix,
254+
"metatls": ChannelTransport.MetaTlsWithIpV6,
255+
}.get(transport)
256+
if transport is None:
257+
raise ValueError(f"unknown transport: {transport}")
258+
250259
if _context.get(None) is not None:
251260
raise RuntimeError(
252261
"`enable_transport()` must be called before any other calls in the monarch API. "

python/monarch/_src/actor/v1/__init__.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,46 @@
55
# LICENSE file in the root directory of this source tree.
66

77
# pyre-unsafe
8+
import os
9+
from typing import TYPE_CHECKING
10+
11+
from monarch._rust_bindings.monarch_hyperactor.alloc import AllocConstraints
12+
from monarch._rust_bindings.monarch_hyperactor.shape import Shape, Slice
13+
14+
from monarch._src.actor.allocator import AllocateMixin
15+
16+
from monarch._src.actor.endpoint import Extent
17+
from monarch._src.actor.host_mesh import HostMesh as HostMeshV0
18+
from monarch._src.actor.v1.host_mesh import HostMesh as HostMeshV1
19+
20+
enabled = os.environ.get("MONARCH_HOST_MESH_V1_REMOVE_ME_BEFORE_RELEASE", "0") != "0"
21+
22+
if TYPE_CHECKING or not enabled:
23+
from monarch._src.actor.host_mesh import HostMesh, this_host, this_proc
24+
from monarch._src.actor.proc_mesh import get_or_spawn_controller, ProcMesh
25+
else:
26+
from monarch._src.actor.v1.host_mesh import HostMesh, this_host, this_proc
27+
from monarch._src.actor.v1.proc_mesh import get_or_spawn_controller, ProcMesh
28+
29+
30+
def host_mesh_from_alloc(
31+
name: str, extent: Extent, allocator: AllocateMixin, constraints: AllocConstraints
32+
) -> "HostMeshV0 | HostMeshV1":
33+
if enabled:
34+
return HostMeshV1.allocate_nonblocking(name, extent, allocator, constraints)
35+
else:
36+
return HostMeshV0(
37+
Shape(extent.labels, Slice.new_row_major(extent.sizes)),
38+
allocator,
39+
constraints,
40+
)
41+
42+
43+
__all__ = [
44+
"HostMesh",
45+
"this_host",
46+
"this_proc",
47+
"get_or_spawn_controller",
48+
"ProcMesh",
49+
"host_mesh_from_alloc",
50+
]

python/monarch/_src/job/job.py

Lines changed: 2 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,12 @@
1616
from typing import cast, Dict, List, Literal, NamedTuple, Optional, Sequence
1717

1818
from monarch._rust_bindings.monarch_hyperactor.channel import ChannelTransport
19-
from monarch._rust_bindings.monarch_hyperactor.config import configure
2019

2120
from monarch._src.actor.bootstrap import attach_to_workers
2221

2322
# note: the jobs api is intended as a library so it should
2423
# only be importing _public_ monarch API functions.
25-
from monarch._src.actor.host_mesh import HostMesh, this_host
26-
27-
from typing_extensions import Self
24+
from monarch.actor import enable_transport, HostMesh, this_host
2825

2926

3027
class JobState:
@@ -441,47 +438,14 @@ def _kill(self):
441438
pass
442439

443440

444-
class FakeLocalLoginJob(LoginJob):
445-
"""
446-
447-
Fake it that we are logging in by just making a local process that runs the bootstrap.
448-
"""
449-
450-
def __init__(self):
451-
super().__init__()
452-
configure(default_transport=ChannelTransport.Tcp)
453-
454-
self._next_port = 12345
455-
456-
def _start_host(self, host: str) -> ProcessState:
457-
port = self._next_port
458-
self._next_port += 1
459-
460-
env = {**os.environ}
461-
if "FB_XAR_INVOKED_NAME" in os.environ:
462-
env["PYTHONPATH"] = ":".join(sys.path)
463-
addr = f"tcp://[::1]:{port}"
464-
bind_addr = f"tcp://[::1]:{port}"
465-
proc = subprocess.Popen(
466-
[
467-
sys.executable,
468-
"-c",
469-
f'from monarch.actor import run_worker_loop_forever; run_worker_loop_forever(address={repr(bind_addr)}, ca="trust_all_connections")',
470-
],
471-
env=env,
472-
start_new_session=True,
473-
)
474-
return ProcessState(proc.pid, addr)
475-
476-
477441
class SSHJob(LoginJob):
478442
def __init__(
479443
self,
480444
python_exe: str = "python",
481445
ssh_args: Sequence[str] = (),
482446
monarch_port: int = 22222,
483447
):
484-
configure(default_transport=ChannelTransport.Tcp)
448+
enable_transport("tcp")
485449
self._python_exe = python_exe
486450
self._ssh_args = ssh_args
487451
self._port = monarch_port

python/monarch/_src/job/meta.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,14 @@
1919
)
2020

2121
from monarch._rust_bindings.monarch_hyperactor.pytokio import PythonTask, Shared
22-
from monarch._rust_bindings.monarch_hyperactor.shape import Shape, Slice
22+
from monarch._rust_bindings.monarch_hyperactor.shape import Extent
2323
from monarch._src.actor.allocator import AllocateMixin
24-
from monarch._src.actor.host_mesh import HostMesh
2524
from monarch._src.actor.meta.allocator import (
2625
MastAllocator,
2726
MastAllocatorBase,
2827
MastAllocatorConfig,
2928
)
29+
from monarch._src.actor.v1 import host_mesh_from_alloc
3030

3131
from monarch._src.job.job import BatchJob, JobState, JobTrait
3232

@@ -173,10 +173,8 @@ def _state(self) -> JobState:
173173
job_started,
174174
)
175175
constraints = AllocConstraints({MastAllocator.ALLOC_LABEL_TASK_GROUP: name})
176-
host_meshes[name] = HostMesh(
177-
Shape(["hosts"], Slice.new_row_major([num_host])),
178-
allocator,
179-
constraints,
176+
host_meshes[name] = host_mesh_from_alloc(
177+
name, Extent(["hosts"], [num_host]), allocator, constraints
180178
)
181179

182180
return JobState(host_meshes)

python/monarch/_src/rdma/rdma.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
from monarch._src.actor.actor_mesh import Actor, context
2828
from monarch._src.actor.endpoint import endpoint
2929
from monarch._src.actor.future import Future
30-
from monarch._src.actor.proc_mesh import get_or_spawn_controller, ProcMesh
30+
from monarch._src.actor.v1 import get_or_spawn_controller, ProcMesh
3131
from pyre_extensions import none_throws
3232

3333

python/monarch/actor/__init__.py

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
Monarch Actor API - Public interface for actor functionality.
1010
"""
1111

12-
from monarch._rust_bindings.monarch_hyperactor.channel import ChannelTransport
1312
from monarch._rust_bindings.monarch_hyperactor.shape import Extent
1413
from monarch._src.actor.actor_mesh import (
1514
Accumulator,
@@ -35,19 +34,17 @@
3534
from monarch._src.actor.endpoint import endpoint
3635
from monarch._src.actor.future import Future
3736

38-
from monarch._src.actor.host_mesh import (
37+
from monarch._src.actor.host_mesh import hosts_from_config
38+
from monarch._src.actor.proc_mesh import local_proc_mesh, proc_mesh, sim_proc_mesh
39+
40+
from monarch._src.actor.v1 import (
41+
get_or_spawn_controller,
3942
HostMesh,
40-
hosts_from_config,
43+
ProcMesh,
4144
this_host,
4245
this_proc,
4346
)
44-
from monarch._src.actor.proc_mesh import (
45-
get_or_spawn_controller,
46-
local_proc_mesh,
47-
proc_mesh,
48-
ProcMesh,
49-
sim_proc_mesh,
50-
)
47+
5148

5249
__all__ = [
5350
"Accumulator",

0 commit comments

Comments
 (0)