
Commit 261aff7

more cleanup

committed
1 parent 6be0e93 commit 261aff7

5 files changed, +73 -13 lines changed

src/forge/controller/launcher.py

Lines changed: 3 additions & 1 deletion
@@ -293,7 +293,9 @@ def create_server_handle(self) -> str:


 def get_launcher(cfg: LauncherConfig | None = None) -> BaseLauncher | None:
-    if not cfg or cfg.launcher == Launcher.SLURM:
+    if not cfg:
+        return None
+    if cfg.launcher == Launcher.SLURM:
         return Slurmlauncher()
     elif cfg.launcher == Launcher.MAST:
         return Mastlauncher(cfg)
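For reference, a minimal sketch of how callers see the new dispatch: with no config, get_launcher now returns None instead of silently defaulting to SLURM. The import paths and the LauncherConfig constructor below are assumptions inferred from this diff, not verified against the repo.

    # Hypothetical usage; LauncherConfig fields and module paths are assumed.
    from forge.controller.launcher import get_launcher
    from forge.types import Launcher, LauncherConfig

    # No config now means "no launcher" rather than a SLURM default.
    assert get_launcher(None) is None

    # An explicit config still selects the matching launcher.
    launcher = get_launcher(LauncherConfig(launcher=Launcher.SLURM))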

src/forge/controller/provisioner.py

Lines changed: 51 additions & 7 deletions
@@ -4,7 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

-"""Remote resource allocation and provisioning."""
+"""Resource allocation and provisioning for both local and remote."""
 import asyncio
 import functools
 import logging
@@ -160,20 +160,40 @@ async def get_proc_mesh(
         mesh_name: Optional[str] = None,
         host_mesh: HostMesh | None = None,
         env_vars: dict[str, str] | None = None,
+        addr: str | None = None,
+        port: str | None = None,
     ):
         """Gets a proc mesh.

-        num_hosts = None implies that you want a local allocation, this may change.
+        Args:
+            num_procs: The number of processes to allocate.
+            with_gpus: Whether to include GPU allocations.
+                This only adds the CUDA_VISIBLE_DEVICES environment variable.
+            num_hosts: The number of hosts to allocate.
+                If this is set, a remote allocation is created.
+                If this is None, it uses the local host.
+                This behavior may change in the future.
+            host_mesh: The host mesh to allocate the process on.
+                If None, a new host mesh will be created.
+            port: The distributed port to use.
+                If None, a port will be detected.
+            addr: The distributed address to use.
+                If None, an address will be detected.
+
+        Returns:
+            A proc mesh.

         """
         if env_vars is None:
             env_vars = {}

+        is_remote = num_hosts is not None and num_hosts > 0
+
         async with self._lock:
             server_name = None
-            if num_hosts is not None and num_hosts > 0:
-                created_hosts = len(self._server_names)
+            if is_remote:
                 if mesh_name is None:
+                    created_hosts = len(self._server_names)
                     mesh_name = f"alloc_{created_hosts}"
                 if host_mesh is None:
                     host_mesh, server_name = await self.create_host_mesh(
@@ -188,18 +208,22 @@ async def get_proc_mesh(
                 host_id = host_mesh._host_id
                 gpu_manager = self._host_gpu_map[host_id]
             else:
+                # fallback to local
                 host_mesh = this_host()
                 gpu_manager = self._host_gpu_map[self._this_host_id]
                 host_mesh._host_id = self._this_host_id

             def bootstrap(env: dict[str, str]):
+                # bootstrap is run on all processes. We use this
+                # to set environment variables like CUDA etc.
                 import os

                 for k, v in env.items():
                     os.environ[k] = v

             if with_gpus:
-                addr, port = await get_remote_info(host_mesh)
+                if not addr or not port:
+                    addr, port = await get_remote_info(host_mesh)
                 gpu_ids = gpu_manager.get_gpus(num_procs)

                 env_vars["MASTER_ADDR"] = addr
@@ -213,7 +237,9 @@ def bootstrap(env: dict[str, str]):
                 per_host={"gpus": num_procs},
                 bootstrap=functools.partial(bootstrap, env=env_vars),
             )
-            await self.launcher.remote_setup(procs)
+
+            if is_remote:
+                await self.launcher.remote_setup(procs)

             # Tag the proc mesh with additional metadata for our own cleanup later
             if with_gpus:
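The two behavioral pieces in the hunks above, shown in isolation: MASTER_ADDR/MASTER_PORT detection is skipped when the caller supplies addr and port, and the bootstrap callable copies environment variables into every spawned process. This is a standalone sketch of the pattern with a hypothetical detect_addr_port stand-in for get_remote_info, not the provisioner's actual code.

    import functools
    import os


    def detect_addr_port() -> tuple[str, str]:
        # Stand-in for get_remote_info(host_mesh) in the real code.
        return "localhost", "29500"


    def make_bootstrap(addr: str | None, port: str | None, env_vars: dict[str, str]):
        # Only detect an address/port when the caller did not pin one.
        if not addr or not port:
            addr, port = detect_addr_port()
        env_vars["MASTER_ADDR"] = addr
        env_vars["MASTER_PORT"] = port

        def bootstrap(env: dict[str, str]) -> None:
            # Runs in each spawned process: copy the env vars into os.environ.
            for k, v in env.items():
                os.environ[k] = v

        # The provisioner passes something like this to spawn_procs(bootstrap=...).
        return functools.partial(bootstrap, env=env_vars)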
@@ -284,8 +310,24 @@ async def get_proc_mesh(
     process_config: ProcessConfig,
     host_mesh: HostMesh | None = None,
     env_vars: dict[str, str] | None = None,
+    port: str | None = None,
+    addr: str | None = None,
 ) -> ProcMesh:
-    """Returns a proc mesh from the provisioner."""
+    """Returns a proc mesh from the provisioner.
+
+    Args:
+        process_config: The process config.
+        host_mesh: The host mesh to allocate the process on.
+            If None, a new host mesh will be created.
+        port: The distributed port to use.
+            If None, a port will be detected.
+        addr: The distributed address to use.
+            If None, an address will be detected.
+
+    Returns:
+        A proc mesh.
+
+    """
     provisioner = await _get_provisioner()
     return await provisioner.get_proc_mesh(
         num_procs=process_config.procs,
@@ -294,6 +336,8 @@ async def get_proc_mesh(
         mesh_name=process_config.mesh_name,
         host_mesh=host_mesh,
         env_vars=env_vars,
+        port=port,
+        addr=addr,
     )
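A hedged usage sketch of the module-level wrapper above, pinning the rendezvous address and port rather than letting them be detected. Module paths, the ProcessConfig field names, and the chosen values are assumptions based on this diff.

    import asyncio

    from forge.controller.provisioner import get_proc_mesh
    from forge.types import ProcessConfig


    async def main() -> None:
        # hosts stays at its default (None), so this allocates on the local host.
        cfg = ProcessConfig(procs=2, with_gpus=True, mesh_name="demo")
        # addr/port are optional; omitting them falls back to detection as before.
        procs = await get_proc_mesh(cfg, addr="localhost", port="29500")
        print(procs)


    if __name__ == "__main__":
        asyncio.run(main())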

src/forge/types.py

Lines changed: 16 additions & 4 deletions
@@ -95,7 +95,17 @@ class Launcher(Enum):

 @dataclass
 class ProcessConfig:
-    """A proc_mesh config for the torchx scheduler."""
+    """A configuration for allocating Monarch ProcMeshes.
+
+    Args:
+        procs (int): Number of processes to launch for each replica of the service.
+        with_gpus (bool, optional): Whether to allocate GPUs for the service processes.
+        hosts (int | None, optional): Number of hosts to allocate for each replica.
+            If this is set to None, it will use the local host.
+            If this is set to a positive integer, it will run on a remote host.
+        mesh_name (str | None, optional): Name of the mesh to use for the proc_mesh.
+
+    """

     procs: int = 1
     with_gpus: bool = False
@@ -105,13 +115,15 @@ class ProcessConfig:

 @dataclass
 class ServiceConfig:
-    """
-    A service config.
+    """The configuration for a Forge service.
+
     Args:
         procs (int): Number of processes to launch for each replica of the service.
         num_replicas (int): Number of replicas to launch for the service.
         with_gpus (bool, optional): Whether to allocate GPUs for the service processes.
         hosts (int | None, optional): Number of hosts to allocate for each replica.
+            If this is set to None, it will use the local host.
+            If this is set to a positive integer, it will run on a remote host.
         health_poll_rate (float, optional): Frequency (in seconds) to poll for health status.
         replica_max_concurrent_requests (int, optional): Maximum number of concurrent requests per replica.
         return_first_rank_result (bool, optional): Whether to auto-unwrap ValueMesh to the first rank's result.
@@ -121,14 +133,14 @@ class ServiceConfig:
     num_replicas: int
     with_gpus: bool = False
    hosts: int | None = None
-    # ServiceConfig-specific fields
     health_poll_rate: float = 0.2
     replica_max_concurrent_requests: int = 10
     return_first_rank_result: bool = True
     mesh_name: str | None = None

     def to_process_config(self) -> ProcessConfig:
         """Extract ProcessConfig from this ServiceConfig.
+
         Maps procs to procs for ProcessConfig.
         """
         return ProcessConfig(
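As a quick illustration of the documented fields, a small sketch constructing a ServiceConfig and deriving its ProcessConfig; the values are arbitrary and the import path is assumed from src/forge/types.py.

    from forge.types import ProcessConfig, ServiceConfig

    # hosts=None keeps each replica on the local host, per the docstring above.
    service_cfg = ServiceConfig(procs=2, num_replicas=4, with_gpus=False, hosts=None)

    # to_process_config() carries the proc_mesh-relevant fields over.
    proc_cfg: ProcessConfig = service_cfg.to_process_config()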

tests/unit_tests/test_provisioner.py

Lines changed: 2 additions & 0 deletions
@@ -161,6 +161,8 @@ async def test_get_proc_mesh_respects_cuda_visible_devices(self):
             num_procs=2,
             with_gpus=True,
             num_hosts=None,
+            port="12345",
+            addr="localhost",
         )
         # Verify GPUs were allocated from available set
         remaining_available = local_gpu_manager.get_available_gpus()

tests/unit_tests/test_replay_buffer.py

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@
 class TestReplayBuffer:
     @pytest_asyncio.fixture
     async def replay_buffer(self) -> ReplayBuffer:
-        replay_buffer = await ReplayBuffer.options(procs=1, with_gpus=True).as_actor(
+        replay_buffer = await ReplayBuffer.options(procs=1, with_gpus=False).as_actor(
             batch_size=2, max_policy_age=1
         )
         await replay_buffer.setup.call()
