Skip to content

Commit 2069b3d

Browse files
committed
documentation
1 parent 4bdc10e commit 2069b3d

File tree

3 files changed

+15
-0
lines changed

3 files changed

+15
-0
lines changed

apps/grpo/main.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,9 @@ async def main(cfg: DictConfig):
363363

364364
# In the HostMesh v1 case, we spawn a torchstore storage volume
365365
# per trainer process.
366+
# We initialize torchstore after service initialization because it currently
367+
# requires access to the underlying proc meshes in the local rank strategy.
368+
# We should be able to hide this in the future.
366369
if MONARCH_HOSTMESH_V1.get_value():
367370
# TODO: support multiple host meshes
368371
trainer_num_procs = cfg.actors.trainer["procs"]

src/forge/controller/launcher.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,9 @@ async def remote_setup(self, procs: ProcMesh) -> tuple[str, int]:
121121
class Slurmlauncher(BaseLauncher):
122122
async def initialize(self) -> None:
123123
if MONARCH_HOSTMESH_V1.get_value():
124+
# HostMeshV1 currently requires explicit configuration
125+
# of the underlying transport from client to mesh.
126+
# This call can be removed once HostMeshV1 no longer requires it.
124127
configure(default_transport=ChannelTransport.Tcp)
125128

126129
async def get_allocator(self, name: str, num_hosts: int) -> tuple[Any, Any, str]:
@@ -178,6 +181,9 @@ def __init__(self, cfg: LauncherConfig | None = None):
178181

179182
async def initialize(self) -> None:
180183
if MONARCH_HOSTMESH_V1.get_value():
184+
# HostMeshV1 currently requires explicit configuration
185+
# of the underlying transport from client to mesh.
186+
# This call can be removed once HostMeshV1 no longer requires it.
181187
configure(default_transport=ChannelTransport.MetaTlsWithHostname)
182188

183189
await self.launch_mast_job()

src/forge/controller/provisioner.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,12 @@ async def create_host_mesh(self, name: str, num_hosts: int) -> HostMesh:
183183
return host_mesh, server_name
184184

185185
def get_host_mesh(self, name: str) -> HostMesh:
186+
"""Returns the host mesh given its associated name.
187+
188+
This is currently an experimental API for HostMesh v1 and
189+
should not be relied on in the long term.
190+
191+
"""
186192
return self._host_mesh_map[name]
187193

188194
async def get_proc_mesh(

0 commit comments

Comments
 (0)