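Monarch example: raise timeouts and retry limits, set NCCL fault-tolerance process group, exclude deadlocked replica by rid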

amirafzali · amirafzali · commit 911abd44b266 · 2025-09-30T21:10:02.000Z
diff --git a/examples/monarch/train_distributed.py b/examples/monarch/train_distributed.py
@@ -101,7 +101,7 @@ def start_lighthouse(self) -> str:
         from torchft.coordination import LighthouseServer
 
         self.lighthouse = LighthouseServer(
-            bind="[::]:0", min_replicas=1, join_timeout_ms=10000
+            bind="[::]:0", min_replicas=1, join_timeout_ms=60000
         )
         return self.lighthouse.address()
 
@@ -184,9 +184,7 @@ async def start_replica(self) -> None:
                 self.spec.hosts_per_replica,
                 self.spec.gpus_per_node,
             )
-            await trainers_proc_mesh.logging_option(
-                stream_to_client=True, aggregate_window_sec=None
-            )
+            await trainers_proc_mesh.logging_option(stream_to_client=True)
             await setup_env_for_distributed(trainers_proc_mesh)
 
             training_actors = trainers_proc_mesh.spawn(
@@ -228,9 +226,9 @@ async def inject_failure(self, failure_type: Failure):
 # delay before re-creating proc mesh on existing job. change as needed.
 PROC_ATTEMPT_DELAY = 0
 # proc attempts before getting a new scheduler allocation. change as needed.
-PROC_ATTEMPTS = 3
+PROC_ATTEMPTS = 4
 # attempts before failing training on replica. change as needed.
-MAX_ATTEMPT = PROC_ATTEMPTS * 3
+MAX_ATTEMPT = PROC_ATTEMPTS * 4
 
 
 class OrchestrationManager:
@@ -274,9 +272,7 @@ async def start_lighthouse(self) -> None:
         else:
             self.lighthouse_mesh = this_host().spawn_procs({"gpus": 1})
 
-        await self.lighthouse_mesh.logging_option(
-            stream_to_client=True, aggregate_window_sec=None
-        )
+        await self.lighthouse_mesh.logging_option(stream_to_client=True)
         self.lighthouse_actor = self.lighthouse_mesh.spawn(
             "lighthouse_actor", LighthouseActor
         )
@@ -337,8 +333,8 @@ async def _teardown(self, replica_id: int) -> None:
         try:
             replica = self.replicas[replica_id]
             await replica.proc_mesh.stop()
-            del replica.proc_mesh
             del self.replicas[replica_id]
+            del replica.proc_mesh
         except Exception as e:
             logger.error(f"[Controller] Failed to _teardown replica {replica_id}: {e}")
 
@@ -418,12 +414,17 @@ def make_job_spec(args: argparse.Namespace) -> JobSpec:
         "--fault_tolerance.enable",
         "--fault_tolerance.group_size",
         str(args.replica_count),
+        "--fault_tolerance.process_group",
+        "nccl",
+        "--fault_tolerance.process_group_timeout_ms",
+        "60000",
+
         "--parallelism.data_parallel_shard_degree",
         str(data_parallel_shard_degree),
         "--activation_checkpoint.mode",
         "full",
         "--comm.train_timeout_seconds",
-        "60",
+        "300",
         "--training.steps",
         str(args.training_steps),
         "--training.dataset",
diff --git a/examples/monarch/utils/failure.py b/examples/monarch/utils/failure.py
@@ -91,9 +91,12 @@ def kill_slurm(scheduler: "MonarchSlurm") -> None:
         scheduler.kill_job(selected)
 
     @staticmethod
-    async def execute_failures(replicas: Dict[int, "Replica"], scheduler: "MonarchSlurm"):
-        startup_wait = 30
-        rest_time = 60
+    async def execute_failures(
+        replicas: Dict[int, "Replica"],
+        scheduler: "MonarchSlurm",
+        startup_wait: int = 120,
+        rest_time: int = 120
+    ):
         logger.info(f"[FailureController] Starting failure injection in {startup_wait} seconds")
         await asyncio.sleep(startup_wait)  # allow startups.
 
@@ -102,8 +105,8 @@ async def execute_failures(replicas: Dict[int, "Replica"], scheduler: "MonarchSl
             try:
                 running_replicas = list(replicas.values())
                 # allow deadlocked replicas more time to recover
-                if last_failure == Failure.DEADLOCK and last_replica in running_replicas:
-                    running_replicas.remove(last_replica)
+                if last_failure == Failure.DEADLOCK and last_replica:
+                    running_replicas = [r for r in running_replicas if r.rid != last_replica.rid]
                 
                 last_replica = random.choice(running_replicas)
                 last_failure = random.choice(list(Failure))