Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions examples/monarch/train_distributed.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import torch
from monarch.actor import Actor, current_rank, endpoint, HostMesh, ProcMesh, this_host
from monarch.job import SlurmJob
- from monarch.utils import setup_env_for_distributed
+ from monarch.spmd import setup_torch_elastic_env_async
from torchtitan.config import ConfigManager, JobConfig
from torchtitan.tools.logging import init_logger, logger
from torchtitan.train import Trainer
Expand Down Expand Up @@ -157,7 +157,7 @@ async def start_replica(self) -> None:

async with trainers_proc_mesh:
await trainers_proc_mesh.logging_option(stream_to_client=True)
- await setup_env_for_distributed(trainers_proc_mesh)
+ await setup_torch_elastic_env_async(trainers_proc_mesh)

training_actors = trainers_proc_mesh.spawn(
"training_actors",
Expand Down
Loading