Modifies service.py to use get_proc_mesh, and unifies the ProcConfig with ServiceConfig (#63)

allenwang28 · Allen Wang · web-flow · commit 5d0d7a849e8e · 2025-08-20T18:02:12.000-04:00
* [service] Removes autoscaling!

* changes recoverable proc mesh to use get_proc_mesh

---------

Co-authored-by: Allen Wang <allencwang@fb.com>
diff --git a/src/forge/controller/recoverable_mesh.py b/src/forge/controller/recoverable_mesh.py
@@ -44,9 +44,12 @@
 
 from monarch._rust_bindings.monarch_hyperactor.shape import Shape, Slice
 from monarch._src.actor.actor_mesh import Actor
-from monarch._src.actor.proc_mesh import proc_mesh, ProcMesh
+from monarch._src.actor.proc_mesh import ProcMesh
 from monarch._src.actor.shape import MeshTrait
 
+from forge.controller.proc_mesh import get_proc_mesh
+from forge.types import ProcessConfig
+
 T = TypeVar("T", bound=Actor)
 logger: logging.Logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
@@ -82,18 +85,19 @@ class RecoverableProcMesh(MeshTrait):
     services that need high availability.
 
     Args:
-        num_gpus: Number of GPUs to allocate for the process mesh
+        proc_config: ProcessConfig containing mesh configuration including num_procs
 
     Attributes:
-        num_gpus: Number of GPUs allocated to this mesh
+        num_procs: Number of processes allocated to this mesh
         state: Current state of the mesh (HEALTHY, RECOVERING, UNHEALTHY, STOPPED)
         healthy: True if the mesh is operational and ready for requests
         failed: True if the mesh has failed and needs recovery
 
     Example:
         Basic usage with automatic recovery:
 
-        >>> mesh = RecoverableProcMesh(num_gpus=2)
+        >>> proc_config = ProcessConfig(num_procs=2, scheduler="local")
+        >>> mesh = RecoverableProcMesh(proc_config)
         >>>
         >>> async def setup_actor(proc_mesh):
         ...     actor = await proc_mesh.spawn("MyActor", MyActorClass)
@@ -104,7 +108,8 @@ class RecoverableProcMesh(MeshTrait):
 
         Context manager for automatic cleanup:
 
-        >>> async with RecoverableProcMesh(num_gpus=1) as mesh:
+        >>> proc_config = ProcessConfig(num_procs=1)
+        >>> async with RecoverableProcMesh(proc_config) as mesh:
         ...     await mesh.spawn(setup_actor)
         ...     # Use mesh for operations
         ...     # Mesh automatically stopped and cleaned up on exit
@@ -121,9 +126,10 @@ class RecoverableProcMesh(MeshTrait):
 
     def __init__(
         self,
-        num_procs: int,
+        proc_config: ProcessConfig,
     ) -> None:
-        self.num_procs = num_procs
+        self._proc_config: ProcessConfig = proc_config
+        self.num_procs = proc_config.num_procs
         self._proc_mesh: Optional[ProcMesh] = None
         self._recovery_task: Optional[asyncio.Task[None]] = None
         self.state: MeshState = MeshState.UNHEALTHY
@@ -185,7 +191,7 @@ async def _recover(
                 logger.warning(f"Error stopping old ProcMesh: {e}")
 
         try:
-            self._proc_mesh = await proc_mesh(gpus=self.num_procs)
+            self._proc_mesh = await get_proc_mesh(process_config=self._proc_config)
             if self._proc_mesh is not None:
                 await hook(self._proc_mesh)
             self.state = MeshState.HEALTHY
diff --git a/src/forge/controller/service.py b/src/forge/controller/service.py
@@ -47,6 +47,7 @@
 from monarch.actor import ActorError, ProcMesh
 
 from forge.controller import RecoverableProcMesh
+from forge.types import ServiceConfig
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
@@ -187,17 +188,6 @@ def get_sessions_per_replica(self) -> float:
         return self.total_sessions / self.healthy_replicas
 
 
-@dataclass
-class ServiceConfig:
-    procs_per_replica: int
-    num_replicas: int
-    health_poll_rate: float = 0.2
-    replica_max_concurrent_requests: int = 10
-    return_first_rank_result: bool = (
-        True  # Auto-unwrap ValueMesh to first rank's result
-    )
-
-
 @dataclass
 class Replica:
     proc_mesh: RecoverableProcMesh
@@ -335,9 +325,7 @@ async def __initialize__(self):
         replicas = []
         num_replicas = self._cfg.num_replicas
         for i in range(num_replicas):
-            mesh = RecoverableProcMesh(
-                self._cfg.procs_per_replica,
-            )
+            mesh = RecoverableProcMesh(proc_config=self._cfg.to_process_config())
             replica = Replica(
                 proc_mesh=mesh,
                 actor=None,
diff --git a/src/forge/types.py b/src/forge/types.py
@@ -98,3 +98,35 @@ class ProcessConfig:
     oncall: str = "torchtune"
     identity: str = "pytorch_distributed"
     image: str = "forge_workspace:latest"
+
+
+@dataclass
+class ServiceConfig:
+    """A service config."""
+
+    procs_per_replica: int
+    num_replicas: int
+    num_hosts: int = 1
+    scheduler: Literal["mast", "local"] = "local"
+    oncall: str = "torchtune"
+    identity: str = "pytorch_distributed"
+    image: str = "forge_workspace:latest"
+    # ServiceConfig-specific fields
+    health_poll_rate: float = 0.2
+    replica_max_concurrent_requests: int = 10
+    return_first_rank_result: bool = (
+        True  # Whether or not to auto-unwrap ValueMesh to first rank's result
+    )
+
+    def to_process_config(self) -> ProcessConfig:
+        """Extract ProcessConfig from this ServiceConfig.
+        Maps procs_per_replica to num_procs for ProcessConfig.
+        """
+        return ProcessConfig(
+            scheduler=self.scheduler,
+            num_procs=self.procs_per_replica,
+            num_hosts=self.num_hosts,
+            oncall=self.oncall,
+            identity=self.identity,
+            image=self.image,
+        )