-
Notifications
You must be signed in to change notification settings - Fork 18
Training termination for grpo/main with finite step limit #376
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
ef013c3
e966d5c
bb711fe
36cc2da
ea9d09c
3b61452
283f305
aeef847
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -349,6 +349,11 @@ async def main(cfg: DictConfig): | |
), | ||
) | ||
|
||
if "steps" not in cfg.trainer.training: | ||
raise ValueError("`cfg.trainer.training.steps` must be defined (can be null).") | ||
|
||
max_steps = cfg.trainer.training.steps | ||
|
||
print("All services initialized successfully!") | ||
|
||
# ---- Core RL loops ---- # | ||
|
@@ -434,7 +439,7 @@ async def continuous_training(): | |
training_step = 0 | ||
restart_tracer = True # Flag to control when to restart tracer | ||
|
||
while True: | ||
while max_steps is None or training_step < max_steps: | ||
|
||
# Restart tracer when needed (initial start or after completing a training step) | ||
# Otherwise, we cannot measure time waiting for buffer | ||
if restart_tracer: | ||
|
@@ -471,6 +476,10 @@ async def continuous_training(): | |
# Flush metrics every training step to WandB | ||
await mlogger.flush.call_one(training_step) | ||
|
||
print( | ||
Review comment (hidden-comment UI omitted): "can we get the logger from here?" — Reply: "We don't really use the logger elsewhere in the main script, do we?" — Reply: "The entire main script is using print instead of logger right now." |
||
f"Reached training limit ({max_steps} steps). Exiting continuous_training loop." | ||
) | ||
|
||
num_rollout_threads = cfg.get("rollout_threads", 1) | ||
num_training_threads = cfg.get("training_threads", 1) | ||
print( | ||
|
@@ -482,14 +491,18 @@ async def continuous_training(): | |
training_task = asyncio.create_task(continuous_training()) | ||
|
||
try: | ||
await asyncio.gather(*rollout_tasks, training_task) | ||
await training_task | ||
except KeyboardInterrupt: | ||
print("Training interrupted by user") | ||
finally: | ||
print("Shutting down...") | ||
for rollout_task in rollout_tasks: | ||
rollout_task.cancel() | ||
# gracefully await all tasks, ignoring cancellation noise | ||
await asyncio.gather(*rollout_tasks, return_exceptions=True) | ||
# Give replicas time to drain and complete in-flight requests | ||
await asyncio.sleep(1) | ||
|
||
training_task.cancel() | ||
finally: | ||
print("Shutting down...") | ||
|
||
# give mlogger time to shutdown backends, otherwise they can stay running. | ||
# TODO (felipemello) find more elegant solution | ||
|
Uh oh!
There was an error while loading. Please reload this page.