
Commit 37cb96d

changes for running

1 parent 9c3bf46

File tree: 6 files changed (+60, -7 lines)


apps/grpo/main.py

Lines changed: 22 additions & 2 deletions
@@ -28,6 +28,7 @@
 from forge.controller.actor import ForgeActor
 from forge.controller.provisioner import init_provisioner, shutdown
 from forge.data.rewards import MathReward, ThinkingReward
+from forge.env import MONARCH_HOSTMESH_V1
 from forge.observability.metric_actors import get_or_create_metric_logger
 from forge.observability.metrics import record_metric, Reduce
 from forge.observability.perf_tracker import Tracer
@@ -314,14 +315,23 @@ async def main(cfg: DictConfig):
     max_res_tokens = cfg.max_res_tokens

     # ---- Global setups ---- #
+    provisioner = None
     if cfg.get("provisioner", None) is not None:
-        await init_provisioner(
+        provisioner = await init_provisioner(
             ProvisionerConfig(launcher_config=LauncherConfig(**cfg.provisioner))
         )
+    else:
+        provisioner = await init_provisioner()
+
     metric_logging_cfg = cfg.get("metric_logging", {"console": {"log_per_rank": False}})
     mlogger = await get_or_create_metric_logger()
     await mlogger.init_backends.call_one(metric_logging_cfg)
+
+    # In the host mesh v0 case, actors on remote hosts are not able to communicate
+    # with one another. Therefore we use the controller as our storage volume.
+    if provisioner is None or not MONARCH_HOSTMESH_V1.get_value():
+        await ts.initialize(strategy=ts.ControllerStorageVolumes())
+        print("Torchstore successfuly initialized with controller storage strategy")

     # ---- Setup services ---- #

@@ -351,6 +361,16 @@ async def main(cfg: DictConfig):

     print("All services initialized successfully!")

+    if provisioner is not None and MONARCH_HOSTMESH_V1.get_value():
+        # TODO: support multiple host meshes
+        trainer_host_mesh_name = cfg.actors.trainer["mesh_name"]
+        trainer_hosts = provisioner.get_host_mesh(trainer_host_mesh_name)
+        await ts.initialize(
+            mesh=trainer_hosts.spawn_procs(per_host={"gpus": 8}),
+            strategy=ts.LocalRankStrategy(),
+        )
+        print("Torchstore successfuly initialized with local rank strategy")
+
     # ---- Core RL loops ---- #
     async def continuous_rollouts():
         rollout_count = 0
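
The hunks above make the torchstore layout conditional: with the older (v0) host mesh, or when no provisioner is configured, storage stays on the controller; with MONARCH_HOSTMESH_V1 enabled, storage is sharded per rank on the trainer's host mesh. Below is a minimal sketch of that decision, factored into one hypothetical helper purely for readability; it reuses only the calls visible in the diff and assumes `ts` is the torchstore alias imported in main.py.

```python
import torchstore as ts

from forge.env import MONARCH_HOSTMESH_V1


async def init_torchstore(provisioner, cfg) -> None:
    """Sketch of the storage-strategy choice made in apps/grpo/main.py."""
    if provisioner is None or not MONARCH_HOSTMESH_V1.get_value():
        # v0 host meshes: remote hosts cannot reach each other, so the
        # controller itself hosts the storage volumes.
        await ts.initialize(strategy=ts.ControllerStorageVolumes())
        return

    # v1 host meshes: co-locate storage with the trainer, one shard per local rank.
    trainer_hosts = provisioner.get_host_mesh(cfg.actors.trainer["mesh_name"])
    await ts.initialize(
        mesh=trainer_hosts.spawn_procs(per_host={"gpus": 8}),
        strategy=ts.LocalRankStrategy(),
    )
```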

apps/grpo/qwen3_1_7b.yaml

Lines changed: 7 additions & 0 deletions
@@ -117,26 +117,33 @@ services:
   policy:
     procs: ${policy.engine_config.tensor_parallel_size}
     num_replicas: 1
+    mesh_name: policy
     with_gpus: true
   ref_model:
     procs: 1
     num_replicas: 1
+    mesh_name: ref_model
     with_gpus: true
   reward_actor:
     procs: 1
     num_replicas: 1
+    mesh_name: reward_actor
     with_gpus: false

 actors:
   dataset:
     procs: 1
     with_gpus: false
+    mesh_name: dataset
   trainer:
     procs: 1
     with_gpus: true
+    mesh_name: trainer
   replay_buffer:
     procs: 1
     with_gpus: false
+    mesh_name: replay_buffer
   compute_advantages:
     procs: 1
     with_gpus: false
+    mesh_name: compute_advantages
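
Every service and actor entry now carries a mesh_name, which the provisioner uses as the key for its host-mesh registry (main.py looks up cfg.actors.trainer["mesh_name"]). Below is a small illustrative check, not part of the commit, that a loaded config has the field everywhere; the config path and the use of OmegaConf are assumptions for the sake of the example.

```python
from omegaconf import OmegaConf

cfg = OmegaConf.load("apps/grpo/qwen3_1_7b.yaml")

for section in ("services", "actors"):
    for name, entry in cfg.get(section, {}).items():
        # Fail fast if an entry was added without a mesh_name.
        assert "mesh_name" in entry, f"{section}.{name} is missing mesh_name"
        print(f"{section}.{name} -> mesh_name={entry.mesh_name}")
```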

apps/grpo/qwen3_32b.yaml

Lines changed: 7 additions & 0 deletions
@@ -122,26 +122,33 @@ services:
     num_replicas: 1
     hosts: 1
     with_gpus: true
+    mesh_name: policy
   ref_model:
     procs: ${ref_model.parallelism.tensor_parallel_degree}
     num_replicas: 1
     with_gpus: true
+    mesh_name: ref_model
   reward_actor:
     procs: 1
     num_replicas: 1
     with_gpus: false
+    mesh_name: reward_actor

 actors:
   dataset:
     procs: 1
     with_gpus: false
+    mesh_name: dataset
   trainer:
     procs: 8
     hosts: 1
     with_gpus: true
+    mesh_name: trainer
   replay_buffer:
     procs: 1
     with_gpus: false
+    mesh_name: replay_buffer
   compute_advantages:
     procs: 1
     with_gpus: false
+    mesh_name: compute_advantages

apps/grpo/qwen3_8b.yaml

Lines changed: 7 additions & 0 deletions
@@ -117,25 +117,32 @@ services:
     procs: ${policy.engine_config.tensor_parallel_size}
     num_replicas: 1
     with_gpus: true
+    mesh_name: policy
   ref_model:
     procs: 1
     num_replicas: 1
     with_gpus: true
+    mesh_name: ref_model
   reward_actor:
     procs: 1
     num_replicas: 1
     with_gpus: false
+    mesh_name: reward_actor

 actors:
   dataset:
     procs: 1
     with_gpus: false
+    mesh_name: dataset
   trainer:
     procs: 2
     with_gpus: true
+    mesh_name: trainer
   replay_buffer:
     procs: 1
     with_gpus: false
+    mesh_name: replay_buffer
   compute_advantages:
     procs: 1
     with_gpus: false
+    mesh_name: compute_advantages

src/forge/controller/provisioner.py

Lines changed: 16 additions & 4 deletions
@@ -136,6 +136,7 @@ def __init__(self, cfg: ProvisionerConfig | None = None):
             self._this_host_id: GpuManager(available_local_devices),
         }
         self._proc_host_map = {}
+        self._host_mesh_map = {}
         self.launcher: BaseLauncher | None = get_launcher(
             cfg.launcher_config if cfg is not None else None
         )
@@ -183,6 +184,9 @@ async def create_host_mesh(self, name: str, num_hosts: int) -> HostMesh:
         )
         return host_mesh, server_name

+    def get_host_mesh(self, name: str) -> HostMesh:
+        return self._host_mesh_map[name]
+
     async def get_proc_mesh(
         self,
         num_procs: int,
@@ -281,10 +285,16 @@ def bootstrap(env: dict[str, str]):
         for env_var in all_env_vars():
             env_vars[env_var.name] = str(env_var.get_value())

-        procs = host_mesh.spawn_procs(
-            per_host={"procs": num_procs},
-            bootstrap=functools.partial(bootstrap, env=env_vars),
-        )
+        if MONARCH_HOSTMESH_V1.get_value():
+            procs = host_mesh.spawn_procs(
+                per_host={"procs": num_procs},
+                setup=functools.partial(bootstrap, env=env_vars),
+            )
+        else:
+            procs = host_mesh.spawn_procs(
+                per_host={"procs": num_procs},
+                bootstrap=functools.partial(bootstrap, env=env_vars),
+            )

         if is_remote:
             await self.launcher.remote_setup(procs)
@@ -293,6 +303,8 @@ def bootstrap(env: dict[str, str]):
         if with_gpus:
             # Applies any launcher specific remote setup.
            procs._gpu_ids = gpu_ids
+
+        self._host_mesh_map[mesh_name] = host_mesh
         procs._host = host_mesh

         # If we created a server, track so we can tear it down later.
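
The spawn call now branches on MONARCH_HOSTMESH_V1 because the per-process environment hook is passed under a different keyword in the two host-mesh APIs. Below is a minimal sketch of that switch, using only the two spawn_procs calls shown in the diff; host_mesh, num_procs, env_vars, and bootstrap are placeholders supplied by the caller, and the helper itself is illustrative rather than part of the provisioner.

```python
import functools

from forge.env import MONARCH_HOSTMESH_V1


def spawn_with_env(host_mesh, num_procs: int, env_vars: dict[str, str], bootstrap):
    """Sketch of the v0/v1 spawn switch added to Provisioner.get_proc_mesh."""
    per_host = {"procs": num_procs}
    hook = functools.partial(bootstrap, env=env_vars)
    if MONARCH_HOSTMESH_V1.get_value():
        # v1 host meshes take the environment hook as `setup=`.
        return host_mesh.spawn_procs(per_host=per_host, setup=hook)
    # v0 host meshes keep the older `bootstrap=` keyword.
    return host_mesh.spawn_procs(per_host=per_host, bootstrap=hook)
```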

src/forge/env.py

Lines changed: 1 addition & 1 deletion
@@ -108,7 +108,7 @@ def get_value(self) -> Any:

 TORCHSTORE_USE_RDMA = EnvVar(
     name="TORCHSTORE_RDMA_ENABLED",
-    default=False,
+    default=1,
     description="Whether or not to use RDMA in TorchStore.",
 )
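
With the default flipped to 1, RDMA is on unless TORCHSTORE_RDMA_ENABLED is set in the environment. A small usage sketch follows, assuming EnvVar.get_value falls back to the default only when the variable is absent; how the string value is parsed is left to the EnvVar helper.

```python
import os

from forge.env import TORCHSTORE_USE_RDMA

# With the variable unset, get_value() should return the new default, 1.
print(TORCHSTORE_USE_RDMA.get_value())

# Explicit opt-out, e.g. on hosts without RDMA-capable NICs.
os.environ["TORCHSTORE_RDMA_ENABLED"] = "0"
print(TORCHSTORE_USE_RDMA.get_value())
```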
