meta-pytorch · DNXie · Oct 14, 2025 · Oct 8, 2025 · Oct 9, 2025 · Oct 9, 2025
diff --git a/apps/grpo/main.py b/apps/grpo/main.py
@@ -490,23 +490,10 @@ async def continuous_training():
         training_task.cancel()
     finally:
         print("Shutting down...")
-
         # give mlogger time to shutdown backends, otherwise they can stay running.
         # TODO (felipemello) find more elegant solution
         await mlogger.shutdown.call_one()
         await asyncio.sleep(2)
-
-        await asyncio.gather(
-            DatasetActor.shutdown(dataloader),
-            policy.shutdown(),
-            RLTrainer.shutdown(trainer),
-            ReplayBuffer.shutdown(replay_buffer),
-            ComputeAdvantages.shutdown(compute_advantages),
-            ref_model.shutdown(),
-            reward_actor.shutdown(),
-        )
-        # TODO - add a global shutdown that implicitly shuts down all services
-        # and remote allocations
         await shutdown()
 
 

diff --git a/src/forge/actors/policy.py b/src/forge/actors/policy.py
@@ -223,7 +223,7 @@ async def launch(  # pyright: ignore[reportIncompatibleMethodOverride]
 
     @classmethod
     async def shutdown(  # pyright: ignore[reportIncompatibleMethodOverride]
-        cls: type["Policy"], actor: "Policy"
+        cls: type["Policy"], actor: "Policy", quiet: bool = False
     ):
         assert (
             actor._policy_proc is not None

diff --git a/src/forge/controller/actor.py b/src/forge/controller/actor.py
@@ -12,7 +12,7 @@
 
 from monarch.actor import Actor, current_rank, current_size, endpoint
 
-from forge.controller.provisioner import get_proc_mesh, stop_proc_mesh
+from forge.controller.provisioner import _get_provisioner, get_proc_mesh, stop_proc_mesh
 
 from forge.types import ProcessConfig, ServiceConfig
 
@@ -127,7 +127,9 @@ async def as_service(
         logger.info("Spawning Service for %s", cls.__name__)
         service = Service(cfg, cls, actor_args, actor_kwargs)
         await service.__initialize__()
-        return ServiceInterface(service, cls)
+        service_interface = ServiceInterface(service, cls)
+        await cls.register_allocation(service_interface)
+        return service_interface
 
     @endpoint
     async def setup(self):
@@ -144,6 +146,17 @@ async def setup(self):
         """
         pass
 
+    @classmethod
+    async def register_allocation(cls, alloc: "ForgeActor | ServiceInterface") -> None:
+        """Registers an allocation (service/actor) with the provisioner."""
+        # Best-effort: look up inside the try so failures are logged, not raised.
+        try:
+            provisioner = await _get_provisioner()
+            if provisioner is not None:
+                await provisioner.track_allocation(alloc)
+        except Exception as e:
+            logger.warning(f"Failed to register allocation {alloc}: {e}")
+
     @classmethod
     async def launch(cls, *args, **kwargs) -> "ForgeActor":
         """Provisions and deploys a new actor.
@@ -185,13 +198,16 @@ async def as_actor(cls: Type[T], *args, **actor_kwargs) -> T:
         """
         logger.info("Spawning single actor %s", cls.__name__)
         actor = await cls.launch(*args, **actor_kwargs)
+        await cls.register_allocation(actor)
         return actor
 
     @classmethod
-    async def shutdown(cls, actor: "ForgeActor"):
+    async def shutdown(cls, actor: "ForgeActor", quiet: bool = False):
         """Shuts down an actor.
         This method is used by `Service` to teardown a replica.
         """
+        if not quiet:
+            logger.info(f"Shutting down actor {getattr(actor, 'name', cls.__name__)}")
         if actor._proc_mesh is None:
             raise AssertionError("Called shutdown on a replica with no proc_mesh.")
         await stop_proc_mesh(actor._proc_mesh)
diff --git a/src/forge/controller/provisioner.py b/src/forge/controller/provisioner.py
@@ -12,7 +12,7 @@
 import os
 import socket
 import uuid
-from typing import Optional
+from typing import Any, Optional
 
 from monarch._src.actor.shape import NDSlice, Shape
 from monarch.actor import Actor, endpoint, HostMesh, ProcMesh, this_host
@@ -132,6 +132,8 @@ def __init__(self, cfg: ProvisionerConfig | None = None):
         if not self.launcher:
             logger.warning("Launcher not provided, remote allocations will not work.")
 
+        self._allocations: list[Any] = []  # all live actor/service instances
+
     async def initialize(self):
         """Call this after creating the instance"""
         if self.launcher is not None:
@@ -303,8 +305,46 @@ async def stop_proc_mesh(self, proc_mesh: ProcMesh):
                 commands.kill(server_name)
             del self._proc_host_map[proc_mesh]
 
+    async def track_allocation(self, alloc: Any):
+        """Record ``alloc`` so ``shutdown_all_allocations`` can shut it down later."""
+        from forge.controller.service import ServiceInterface  # local import; presumably avoids an import cycle - confirm
+
+        self._allocations.append(alloc)
+        alloc_type = "service" if isinstance(alloc, ServiceInterface) else "actor"
+        logger.info(  # report through the module logger rather than print()
+            f"Registered allocation {alloc_type} {alloc}, current allocations len: {len(self._allocations)}"
+        )
+
+    async def shutdown_all_allocations(self):
+        """Shut down every tracked allocation, walking the tracking list in reverse.
+
+        Failures are logged as warnings and do not stop the remaining shutdowns;
+        the tracking list is emptied once the loop finishes.
+        """
+        from monarch._src.actor.actor_mesh import ActorMesh
+
+        from forge.controller.actor import ForgeActor
+        from forge.controller.service import ServiceInterface
+
+        for tracked in reversed(self._allocations):
+            try:
+                if isinstance(tracked, ServiceInterface):
+                    await tracked.shutdown()
+                    continue
+                if not isinstance(tracked, (ForgeActor, ActorMesh)):
+                    logger.warning(f"Unknown allocation type: {type(tracked)}")
+                    continue
+                # Prefer the class recorded in `_class`; fall back to the runtime type.
+                owner_cls = getattr(tracked, "_class", None) or tracked.__class__
+                await owner_cls.shutdown(tracked)
+            except Exception as e:
+                logger.warning(f"Failed to shut down {tracked}: {e}")
+
+        self._allocations.clear()
+
     async def shutdown(self):
         """Tears down all remaining remote allocations."""
+        await self.shutdown_all_allocations()
         async with self._lock:
             for server_name in self._server_names:
                 commands.kill(server_name)

diff --git a/src/forge/controller/service/interface.py b/src/forge/controller/service/interface.py
@@ -200,6 +200,7 @@ async def shutdown(self) -> None:
         """
         Shut down the underlying Service.
         """
+        logger.info(f"Shutting down service {self.actor_def.__name__}")
         await self._service.stop()
 
     def session(self) -> "SessionContext":

diff --git a/src/forge/controller/service/replica.py b/src/forge/controller/service/replica.py
@@ -405,7 +405,7 @@ async def stop(self):
         # Stop the actor
         if self.actor:
             try:
-                await self.actor_def.shutdown(self.actor)
+                await self.actor_def.shutdown(self.actor, quiet=True)
             except Exception as e:
                 logger.warning(
                     "Error stopping proc_mesh for replica %d: %s", self.idx, e

diff --git a/tests/sandbox/rl_trainer/main.py b/tests/sandbox/rl_trainer/main.py
@@ -222,7 +222,6 @@ async def continuous_training():
         print("Training interrupted by user")
     finally:
         print("Shutting down trainer...")
-        await RLTrainer.shutdown(trainer)
         await mlogger.shutdown.call_one()
         await shutdown()
         print("Trainer shutdown complete.")

diff --git a/tests/sandbox/toy_rl/sumdigits.py b/tests/sandbox/toy_rl/sumdigits.py
@@ -568,15 +568,6 @@ async def continuous_training():
         training_task.cancel()
     finally:
         print("Shutting down...")
-        await asyncio.gather(
-            DatasetActor.shutdown(dataloader),
-            policy.shutdown(),
-            Trainer.shutdown(trainer),
-            ReplayBuffer.shutdown(replay_buffer),
-            reward_actor.shutdown(),
-        )
-        # TODO - add a global shutdown that implicitly shuts down all services
-        # and remote allocations
         await shutdown()
 
 

diff --git a/tests/sandbox/toy_rl/toy_metrics/main.py b/tests/sandbox/toy_rl/toy_metrics/main.py
@@ -112,12 +112,6 @@ async def main():
     # shutdown
     await mlogger.shutdown.call_one()
     await asyncio.sleep(2)
-
-    await asyncio.gather(
-        trainer.shutdown(),
-        generator.shutdown(),
-    )
-
     await shutdown()
 
 

diff --git a/tests/sandbox/vllm/main.py b/tests/sandbox/vllm/main.py
@@ -67,7 +67,6 @@ async def run(cfg: DictConfig):
         print("-" * 80)
 
     print("\nShutting down...")
-    await policy.shutdown()
     await shutdown()