from typing import Dict

import torch
from monarch.actor import Actor, current_rank, endpoint, HostMesh, ProcMesh, this_host
from monarch.job import SlurmJob
from monarch.utils import setup_env_for_distributed
from torchtitan.config import ConfigManager, JobConfig
from torchtitan.tools.logging import init_logger, logger
from torchtitan.train import Trainer
from utils.failure import Failure, FailureActor, FailureController


# ==== Allocation boilerplate ====
class MonarchSlurm:
    """Provision and track SLURM-backed host meshes via Monarch's `SlurmJob` API.

    One `SlurmJob` is created per mesh name; handles are kept in
    `job_handles` and torn down automatically at interpreter exit.
    """

    # Prefix used for every SLURM job name this scheduler submits.
    job_name_prefix: str = "monarch-torchft"

    def __init__(self):
        # Maps mesh name -> live SlurmJob handle.
        self.job_handles: Dict[str, SlurmJob] = {}
        # Best-effort cleanup of all submitted jobs when the process exits.
        atexit.register(self.kill_jobs)

    async def get_or_create_job(
        self, mesh_name: str, nodes_per_mesh: int = 1, gpus_per_node: int = 8
    ) -> None:
        """Submit a SLURM job hosting `mesh_name` and remember its handle.

        Args:
            mesh_name: Logical name of the host mesh (also keys `job_handles`).
            nodes_per_mesh: Number of SLURM nodes backing the mesh.
            gpus_per_node: GPUs requested per node.
        """
        job = SlurmJob(
            meshes={mesh_name: nodes_per_mesh},
            gpus_per_node=gpus_per_node,
            job_name=f"{self.job_name_prefix}-{mesh_name}",
        )
        # NOTE(review): presumably apply() is the submit/get-or-create step —
        # confirm whether it reuses an existing job of the same name.
        job.apply()
        self.job_handles[mesh_name] = job

    def kill_jobs(self):
        """Destroy every job this scheduler has submitted (atexit hook)."""
        for mesh_name in self.job_handles.keys():
            self.kill_job(mesh_name)

    def kill_job(self, mesh_name: str):
        """Destroy the SLURM job backing `mesh_name`; log (don't raise) on failure."""
        try:
            job = self.job_handles[mesh_name]
            logger.info(f"Destroying job for mesh {mesh_name}")
            job.kill()
        except Exception as e:
            # Best-effort teardown: a failed kill must not abort the others.
            logger.exception(f"Failed to destroy job for {mesh_name}: {e}")

    def proc_mesh(self, mesh_name: str, num_procs: int) -> ProcMesh:
        """Spawn `num_procs` processes (one per GPU) on the job's host mesh.

        Args:
            mesh_name: Name of a mesh previously created via `get_or_create_job`.
            num_procs: Number of processes to spawn per host ("gpus" dimension).

        Returns:
            The spawned `ProcMesh`.
        """
        job = self.job_handles[mesh_name]
        # NOTE(review): job state is cached on disk, presumably so repeated
        # lookups skip re-querying SLURM — confirm the path convention.
        cached_path = f".{mesh_name}/job_state.pkl"
        mesh: HostMesh = getattr(job.state(cached_path=cached_path), mesh_name)
        return mesh.spawn_procs({"gpus": num_procs})

# ==== allocation boilerplate ====
@@ -177,34 +151,31 @@ async def start_replica(self) -> None:
177151 init_logger ()
178152 logger .info (f"{ self .uid } Spawning trainers" )
179153
180- trainers_proc_mesh : ProcMesh | None = None
181154 try :
182155 trainers_proc_mesh = self .scheduler .proc_mesh (
183156 f"replica_{ self .replica_id } " ,
184- self .spec .hosts_per_replica ,
185- self .spec .gpus_per_node ,
186- )
187- await trainers_proc_mesh .logging_option (stream_to_client = True )
188- await setup_env_for_distributed (trainers_proc_mesh )
189-
190- training_actors = trainers_proc_mesh .spawn (
191- "training_actors" ,
192- TrainingActor ,
193- self .spec .job_config ,
194- self .replica_id ,
157+ num_procs = self .spec .gpus_per_node ,
195158 )
196159
197- self . failure_actors = trainers_proc_mesh . spawn (
198- "failure_actors" , FailureActor
199- )
160+ async with trainers_proc_mesh :
161+ await trainers_proc_mesh . logging_option ( stream_to_client = True )
162+ await setup_env_for_distributed ( trainers_proc_mesh )
200163
201- logger .info (f"{ self .uid } Starting trainers" )
202- await training_actors .start_training .call (self .spec .lighthouse_address )
203- await trainers_proc_mesh .stop ()
204- except Exception as e :
205- if trainers_proc_mesh :
206- await trainers_proc_mesh .stop ()
207- raise e
164+ training_actors = trainers_proc_mesh .spawn (
165+ "training_actors" ,
166+ TrainingActor ,
167+ self .spec .job_config ,
168+ self .replica_id ,
169+ )
170+
171+ self .failure_actors = trainers_proc_mesh .spawn (
172+ "failure_actors" , FailureActor
173+ )
174+
175+ logger .info (f"{ self .uid } Starting trainers" )
176+ await training_actors .start_training .call (self .spec .lighthouse_address )
177+ except Exception :
178+ raise
208179
209180 @endpoint
210181 async def inject_failure (self , failure_type : Failure ):
@@ -216,8 +187,7 @@ async def inject_failure(self, failure_type: Failure):
216187
217188 await self .failure_actors .fail .choose (failure_type )
218189 except Exception as e :
219- error_msg = f"{ self .uid } Injected failure: { e } "
220- logger .error (error_msg )
190+ logger .exception (f"{ self .uid } Injected failure: { e } " )
221191 else :
222192 error_msg = f"{ self .uid } No failure actors available"
223193 logger .error (error_msg )
@@ -268,7 +238,7 @@ async def start_training(self) -> None:
268238 async def start_lighthouse (self ) -> None :
269239 if self .spec .remote_lighthouse :
270240 await self .scheduler .get_or_create_job ("lighthouse" )
271- self .lighthouse_mesh = self .scheduler .proc_mesh ("lighthouse" , num_gpus = 1 )
241+ self .lighthouse_mesh = self .scheduler .proc_mesh ("lighthouse" , num_procs = 1 )
272242 else :
273243 self .lighthouse_mesh = this_host ().spawn_procs ({"gpus" : 1 })
274244
@@ -287,7 +257,7 @@ async def stop_lighthouse(self) -> None:
287257 await self .lighthouse_mesh .stop ()
288258 logger .info ("[Controller] Lighthouse stopped" )
289259 except Exception as e :
290- logger .warning (f"[Controller] Failed to stop lighthouse: { e } " )
260+ logger .exception (f"[Controller] Failed to stop lighthouse: { e } " )
291261
292262 async def _run_replica (self , replica_id : int , attempt_number : int ) -> None :
293263 if attempt_number >= MAX_ATTEMPT :
@@ -300,7 +270,7 @@ async def _run_replica(self, replica_id: int, attempt_number: int) -> None:
300270 await self ._teardown (replica_id )
301271 except Exception as e :
302272 await self ._teardown (replica_id )
303- logger .info (f"[Controller] replica { replica_id } failed: { e } " )
273+ logger .exception (f"[Controller] replica { replica_id } failed: { e } " )
304274 await self ._run_replica (replica_id , attempt_number + 1 )
305275
306276 async def _spin_up_replica (self , replica_id : int , attempt_number : int = 0 ) -> None :
@@ -332,11 +302,18 @@ async def _spin_up_replica(self, replica_id: int, attempt_number: int = 0) -> No
332302 async def _teardown (self , replica_id : int ) -> None :
333303 try :
334304 replica = self .replicas [replica_id ]
335- await replica .proc_mesh .stop ()
305+ try :
306+ await replica .proc_mesh .stop ()
307+ except Exception as e :
308+ logger .exception (
309+ f"[Controller] Failed to stop replica { replica_id } , it may already be stopped. { e } "
310+ )
336311 del self .replicas [replica_id ]
337312 del replica .proc_mesh
338313 except Exception as e :
339- logger .error (f"[Controller] Failed to _teardown replica { replica_id } : { e } " )
314+ logger .exception (
315+ f"[Controller] Failed to teardown replica { replica_id } : { e } "
316+ )
340317
341318
342319# === CLI / CONFIG === #