
Commit 37cb96d

changes for running

1 parent 9c3bf46

File tree: 6 files changed (+60, -7 lines)


apps/grpo/main.py

Lines changed: 22 additions & 2 deletions
@@ -28,6 +28,7 @@
 from forge.controller.actor import ForgeActor
 from forge.controller.provisioner import init_provisioner, shutdown
 from forge.data.rewards import MathReward, ThinkingReward
+from forge.env import MONARCH_HOSTMESH_V1
 from forge.observability.metric_actors import get_or_create_metric_logger
 from forge.observability.metrics import record_metric, Reduce
 from forge.observability.perf_tracker import Tracer
@@ -314,14 +315,23 @@ async def main(cfg: DictConfig):
     max_res_tokens = cfg.max_res_tokens

     # ---- Global setups ---- #
+    provisioner = None
     if cfg.get("provisioner", None) is not None:
-        await init_provisioner(
+        provisioner = await init_provisioner(
             ProvisionerConfig(launcher_config=LauncherConfig(**cfg.provisioner))
         )
+    else:
+        provisioner = await init_provisioner()
+
     metric_logging_cfg = cfg.get("metric_logging", {"console": {"log_per_rank": False}})
     mlogger = await get_or_create_metric_logger()
     await mlogger.init_backends.call_one(metric_logging_cfg)
+
+    # In the host mesh v0 case, actors on remote hosts are not able to communicate
+    # with one another. Therefore we use the controller as our storage volume.
+    if provisioner is None or not MONARCH_HOSTMESH_V1.get_value():
+        await ts.initialize(strategy=ts.ControllerStorageVolumes())
+        print("Torchstore successfuly initialized with controller storage strategy")

     # ---- Setup services ---- #

@@ -351,6 +361,16 @@ async def main(cfg: DictConfig):

     print("All services initialized successfully!")

+    if provisioner is not None and MONARCH_HOSTMESH_V1.get_value():
+        # TODO: support multiple host meshes
+        trainer_host_mesh_name = cfg.actors.trainer["mesh_name"]
+        trainer_hosts = provisioner.get_host_mesh(trainer_host_mesh_name)
+        await ts.initialize(
+            mesh=trainer_hosts.spawn_procs(per_host={"gpus": 8}),
+            strategy=ts.LocalRankStrategy(),
+        )
+        print("Torchstore successfuly initialized with local rank strategy")
+
     # ---- Core RL loops ---- #
     async def continuous_rollouts():
         rollout_count = 0
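
The hunks above make the torchstore layout conditional: with the older (v0) host mesh, or when no provisioner is configured, storage stays on the controller; with MONARCH_HOSTMESH_V1 enabled, storage is sharded per rank on the trainer's host mesh. Below is a minimal sketch of that decision, factored into one hypothetical helper purely for readability; it reuses only the calls visible in the diff and assumes `ts` is the torchstore alias imported in main.py.

```python
import torchstore as ts

from forge.env import MONARCH_HOSTMESH_V1


async def init_torchstore(provisioner, cfg) -> None:
    """Sketch of the storage-strategy choice made in apps/grpo/main.py."""
    if provisioner is None or not MONARCH_HOSTMESH_V1.get_value():
        # v0 host meshes: remote hosts cannot reach each other, so the
        # controller itself hosts the storage volumes.
        await ts.initialize(strategy=ts.ControllerStorageVolumes())
        return

    # v1 host meshes: co-locate storage with the trainer, one shard per local rank.
    trainer_hosts = provisioner.get_host_mesh(cfg.actors.trainer["mesh_name"])
    await ts.initialize(
        mesh=trainer_hosts.spawn_procs(per_host={"gpus": 8}),
        strategy=ts.LocalRankStrategy(),
    )
```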

apps/grpo/qwen3_1_7b.yaml

Lines changed: 7 additions & 0 deletions
@@ -117,26 +117,33 @@ services:
   policy:
     procs: ${policy.engine_config.tensor_parallel_size}
     num_replicas: 1
+    mesh_name: policy
     with_gpus: true
   ref_model:
     procs: 1
     num_replicas: 1
+    mesh_name: ref_model
     with_gpus: true
   reward_actor:
     procs: 1
     num_replicas: 1
+    mesh_name: reward_actor
     with_gpus: false

 actors:
   dataset:
     procs: 1
     with_gpus: false
+    mesh_name: dataset
   trainer:
     procs: 1
     with_gpus: true
+    mesh_name: trainer
   replay_buffer:
     procs: 1
     with_gpus: false
+    mesh_name: replay_buffer
   compute_advantages:
     procs: 1
     with_gpus: false
+    mesh_name: compute_advantages
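
Every service and actor entry now carries a mesh_name, which the provisioner uses as the key for its host-mesh registry (main.py looks up cfg.actors.trainer["mesh_name"]). Below is a small illustrative check, not part of the commit, that a loaded config has the field everywhere; the config path and the use of OmegaConf are assumptions for the sake of the example.

```python
from omegaconf import OmegaConf

cfg = OmegaConf.load("apps/grpo/qwen3_1_7b.yaml")

for section in ("services", "actors"):
    for name, entry in cfg.get(section, {}).items():
        # Fail fast if an entry was added without a mesh_name.
        assert "mesh_name" in entry, f"{section}.{name} is missing mesh_name"
        print(f"{section}.{name} -> mesh_name={entry.mesh_name}")
```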

apps/grpo/qwen3_32b.yaml

Lines changed: 7 additions & 0 deletions
@@ -122,26 +122,33 @@ services:
     num_replicas: 1
     hosts: 1
     with_gpus: true
+    mesh_name: policy
   ref_model:
     procs: ${ref_model.parallelism.tensor_parallel_degree}
     num_replicas: 1
     with_gpus: true
+    mesh_name: ref_model
   reward_actor:
     procs: 1
     num_replicas: 1
     with_gpus: false
+    mesh_name: reward_actor

 actors:
   dataset:
     procs: 1
     with_gpus: false
+    mesh_name: dataset
   trainer:
     procs: 8
     hosts: 1
     with_gpus: true
+    mesh_name: trainer
   replay_buffer:
     procs: 1
     with_gpus: false
+    mesh_name: replay_buffer
   compute_advantages:
     procs: 1
     with_gpus: false
+    mesh_name: compute_advantages

apps/grpo/qwen3_8b.yaml

Lines changed: 7 additions & 0 deletions
@@ -117,25 +117,32 @@ services:
     procs: ${policy.engine_config.tensor_parallel_size}
     num_replicas: 1
     with_gpus: true
+    mesh_name: policy
   ref_model:
     procs: 1
     num_replicas: 1
     with_gpus: true
+    mesh_name: ref_model
   reward_actor:
     procs: 1
     num_replicas: 1
     with_gpus: false
+    mesh_name: reward_actor

 actors:
   dataset:
     procs: 1
     with_gpus: false
+    mesh_name: dataset
   trainer:
     procs: 2
     with_gpus: true
+    mesh_name: trainer
   replay_buffer:
     procs: 1
     with_gpus: false
+    mesh_name: replay_buffer
   compute_advantages:
     procs: 1
     with_gpus: false
+    mesh_name: compute_advantages

src/forge/controller/provisioner.py

Lines changed: 16 additions & 4 deletions
@@ -136,6 +136,7 @@ def __init__(self, cfg: ProvisionerConfig | None = None):
             self._this_host_id: GpuManager(available_local_devices),
         }
         self._proc_host_map = {}
+        self._host_mesh_map = {}
         self.launcher: BaseLauncher | None = get_launcher(
             cfg.launcher_config if cfg is not None else None
         )
@@ -183,6 +184,9 @@ async def create_host_mesh(self, name: str, num_hosts: int) -> HostMesh:
         )
         return host_mesh, server_name

+    def get_host_mesh(self, name: str) -> HostMesh:
+        return self._host_mesh_map[name]
+
     async def get_proc_mesh(
         self,
         num_procs: int,
@@ -281,10 +285,16 @@ def bootstrap(env: dict[str, str]):
         for env_var in all_env_vars():
             env_vars[env_var.name] = str(env_var.get_value())

-        procs = host_mesh.spawn_procs(
-            per_host={"procs": num_procs},
-            bootstrap=functools.partial(bootstrap, env=env_vars),
-        )
+        if MONARCH_HOSTMESH_V1.get_value():
+            procs = host_mesh.spawn_procs(
+                per_host={"procs": num_procs},
+                setup=functools.partial(bootstrap, env=env_vars),
+            )
+        else:
+            procs = host_mesh.spawn_procs(
+                per_host={"procs": num_procs},
+                bootstrap=functools.partial(bootstrap, env=env_vars),
+            )

         if is_remote:
             await self.launcher.remote_setup(procs)
@@ -293,6 +303,8 @@ def bootstrap(env: dict[str, str]):
         if with_gpus:
             # Applies any launcher specific remote setup.
            procs._gpu_ids = gpu_ids
+
+        self._host_mesh_map[mesh_name] = host_mesh
         procs._host = host_mesh

         # If we created a server, track so we can tear it down later.
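
The spawn call now branches on MONARCH_HOSTMESH_V1 because the per-process environment hook is passed under a different keyword in the two host-mesh APIs. Below is a minimal sketch of that switch, using only the two spawn_procs calls shown in the diff; host_mesh, num_procs, env_vars, and bootstrap are placeholders supplied by the caller, and the helper itself is illustrative rather than part of the provisioner.

```python
import functools

from forge.env import MONARCH_HOSTMESH_V1


def spawn_with_env(host_mesh, num_procs: int, env_vars: dict[str, str], bootstrap):
    """Sketch of the v0/v1 spawn switch added to Provisioner.get_proc_mesh."""
    per_host = {"procs": num_procs}
    hook = functools.partial(bootstrap, env=env_vars)
    if MONARCH_HOSTMESH_V1.get_value():
        # v1 host meshes take the environment hook as `setup=`.
        return host_mesh.spawn_procs(per_host=per_host, setup=hook)
    # v0 host meshes keep the older `bootstrap=` keyword.
    return host_mesh.spawn_procs(per_host=per_host, bootstrap=hook)
```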

src/forge/env.py

Lines changed: 1 addition & 1 deletion
@@ -108,7 +108,7 @@ def get_value(self) -> Any:

 TORCHSTORE_USE_RDMA = EnvVar(
     name="TORCHSTORE_RDMA_ENABLED",
-    default=False,
+    default=1,
     description="Whether or not to use RDMA in TorchStore.",
 )
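
With the default flipped to 1, RDMA is on unless TORCHSTORE_RDMA_ENABLED is set in the environment. A small usage sketch follows, assuming EnvVar.get_value falls back to the default only when the variable is absent; how the string value is parsed is left to the EnvVar helper.

```python
import os

from forge.env import TORCHSTORE_USE_RDMA

# With the variable unset, get_value() should return the new default, 1.
print(TORCHSTORE_USE_RDMA.get_value())

# Explicit opt-out, e.g. on hosts without RDMA-capable NICs.
os.environ["TORCHSTORE_RDMA_ENABLED"] = "0"
print(TORCHSTORE_USE_RDMA.get_value())
```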
