22 | 22 | ) |
23 | 23 | from areal.platforms import current_platform |
24 | 24 | from areal.utils import logging, name_resolve, names |
25 | | -from areal.utils.launcher import JobException, JobInfo, JobState, get_env_vars |
26 | | -from areal.utils.network import find_free_ports, gethostip |
| 25 | +from areal.utils.launcher import ( |
| 26 | + JobException, |
| 27 | + JobInfo, |
| 28 | + JobState, |
| 29 | + get_env_vars, |
| 30 | + wait_llm_server_addrs, |
| 31 | +) |
| 32 | +from areal.utils.network import find_free_ports |
27 | 33 | from areal.utils.recover import check_if_recover |
28 | 34 |
29 | 35 | logger = logging.getLogger("Local Scheduler") |
@@ -136,7 +142,9 @@ def submit_array( |
136 | 142 | ) |
137 | 143 | c = f"{c} 2>&1 | tee -a {self.log_path_of(job_name)}" |
138 | 144 | logger.info("Starting local process with command: %s", c) |
139 | | - process = subprocess.Popen(c, shell=isinstance(c, str)) |
| 145 | + process = subprocess.Popen( |
| 146 | + c, shell=isinstance(c, str), stdout=sys.stdout, stderr=sys.stdout |
| 147 | + ) |
140 | 148 | self._jobs[f"{job_name}/{offset + i}"] = process |
141 | 149 | self._job_counter[job_name] += 1 |
142 | 150 |
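The `Popen` change streams each child's output to the launcher console in addition to the `tee`'d log file. A minimal sketch of the pattern (the command and log path here are hypothetical, for illustration only):

```python
import subprocess
import sys

# Hypothetical command and log path, standing in for the launcher's own.
log_path = "/tmp/llm_server.log"
cmd = f"echo 'server up' 2>&1 | tee -a {log_path}"

# shell=True (cmd is a str) is needed for the `2>&1 | tee` pipeline;
# pointing stdout/stderr at sys.stdout mirrors the child's output on the
# launcher's own console while tee appends it to the log file.
proc = subprocess.Popen(cmd, shell=isinstance(cmd, str), stdout=sys.stdout, stderr=sys.stdout)
proc.wait()
```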
@@ -275,72 +283,64 @@ def local_main(config, run_id: int = 0): |
275 | 283 | f"run_id={run_id}, is_recover_run={is_recover_run}" |
276 | 284 | ) |
277 | 285 |
278 | | - server_cmd = [] |
279 | | - server_addrs = [] |
280 | | - if alloc_mode.gen_backend == "sglang": |
281 | | - base_seed = config.sglang.random_seed |
282 | | - config.sglang = to_structured_cfg(config.sglang, SGLangConfig) |
283 | | - ports = find_free_ports(alloc_mode.gen.dp_size * 2, port_range=(10000, 50000)) |
284 | | - host_ip = gethostip() |
285 | | - host = "localhost" if not config.sglang.enable_metrics else host_ip |
286 | | - for i in range(alloc_mode.gen.dp_size): |
287 | | - config.sglang.random_seed = base_seed + i |
288 | | - cmd = SGLangConfig.build_cmd( |
289 | | - config.sglang, |
290 | | - host=host, |
291 | | - tp_size=alloc_mode.gen.tp_size, |
292 | | - base_gpu_id=0, |
293 | | - port=ports[i * 2], |
294 | | - dist_init_addr=f"localhost:{ports[i*2+1]}", |
295 | | - ) |
296 | | - server_cmd.append(cmd) |
297 | | - server_addrs.append(f"{host}:{ports[i * 2]}") |
| 286 | + if alloc_mode.gen_backend in ("sglang", "vllm"): |
| 287 | + # The launcher starts LLM servers according to the allocation mode. |
| 288 | + if alloc_mode.gen_backend == "sglang": |
| 289 | + config.sglang = to_structured_cfg(config.sglang, SGLangConfig) |
| 290 | + random_seed = config.sglang.random_seed |
| 291 | + else: |
| 292 | + config.vllm = to_structured_cfg(config.vllm, vLLMConfig) |
| 293 | + random_seed = config.vllm.seed |
| 294 | + |
| 295 | + backend_spec = { |
| 296 | + "sglang": { |
| 297 | + "module": "areal.launcher.sglang_server", |
| 298 | + "seed_arg": "sglang.random_seed", |
| 299 | + "set_device_env": False, |
| 300 | + }, |
| 301 | + "vllm": { |
| 302 | + "module": "areal.launcher.vllm_server", |
| 303 | + "seed_arg": "vllm.seed", |
| 304 | + "set_device_env": True, # vLLM needs `device_control_env_var` to control GPU allocation |
| 305 | + }, |
| 306 | + } |
| 307 | + |
| 308 | + spec = backend_spec[alloc_mode.gen_backend] |
| 309 | + |
| 310 | + base_seed = random_seed |
| 311 | + seed_arg = spec["seed_arg"] |
| 312 | + module = spec["module"] |
| 313 | + server_cmd = ( |
| 314 | + f"python3 -m {module} {' '.join(sys.argv[2:])} {seed_arg}={base_seed}" |
| 315 | + ) |
298 | 316 |
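The two per-backend branches collapse into a lookup table plus a single wrapper command, with `sys.argv[2:]` forwarding the user's CLI overrides to the server module. A hedged sketch of the composition (the forwarded arguments below are made up):

```python
# Sketch of the command composition above; the forwarded args are hypothetical.
backend_spec = {
    "sglang": {"module": "areal.launcher.sglang_server", "seed_arg": "sglang.random_seed"},
    "vllm": {"module": "areal.launcher.vllm_server", "seed_arg": "vllm.seed"},
}

def build_server_cmd(backend: str, base_seed: int, forwarded: list[str]) -> str:
    spec = backend_spec[backend]
    return f"python3 -m {spec['module']} {' '.join(forwarded)} {spec['seed_arg']}={base_seed}"

print(build_server_cmd("vllm", 1, ["experiment_name=demo", "trial_name=t0"]))
# -> python3 -m areal.launcher.vllm_server experiment_name=demo trial_name=t0 vllm.seed=1
```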
299 | 317 | # Launch inference servers. |
300 | 318 | launcher.submit_array( |
301 | 319 | job_name="llm_server", |
302 | 320 | cmd=server_cmd, |
303 | | - count=alloc_mode.gen.dp_size, |
304 | | - gpu=alloc_mode.gen.pp_size * alloc_mode.gen.tp_size, |
| 321 | + count=1, |
| 322 | + gpu=alloc_mode.gen.pp_size |
| 323 | + * alloc_mode.gen.tp_size |
| 324 | + * alloc_mode.gen.dp_size, |
305 | 325 | env_vars=get_env_vars( |
306 | 326 | config.cluster.cluster_name, |
307 | 327 | config.launcher.inference_server_env_vars, |
308 | 328 | ), |
309 | 329 | ) |
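With a single submitted job, the GPU request covers the whole generation slice, `pp_size * tp_size * dp_size`: for example, pp=1, tp=2, dp=4 now asks for one 8-GPU `llm_server` job instead of four 2-GPU jobs, and the server module is presumably expected to shard GPUs across its dp replicas internally (hence the `device_control_env_var` note for vLLM above).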
310 | | - logger.info( |
311 | | - f"LLM inference server launched at: AREAL_LLM_SERVER_ADDRS={','.join(server_addrs)}" |
312 | | - ) |
313 | | - elif alloc_mode.gen_backend == "vllm": |
314 | | - base_seed = config.vllm.seed |
315 | | - config.vllm = to_structured_cfg(config.vllm, vLLMConfig) |
316 | | - ports = find_free_ports(alloc_mode.gen.dp_size * 2, port_range=(10000, 50000)) |
317 | | - host = "localhost" |
318 | | - for i in range(alloc_mode.gen.dp_size): |
319 | | - config.vllm.seed = base_seed + i |
320 | | - cmd = vLLMConfig.build_cmd( |
321 | | - config.vllm, |
322 | | - host=host, |
323 | | - tp_size=alloc_mode.gen.tp_size, |
324 | | - port=ports[i * 2], |
325 | | - dist_init_addr=f"localhost:{ports[i*2+1]}", |
326 | | - ) |
327 | | - server_cmd.append(cmd) |
328 | | - server_addrs.append(f"{host}:{ports[i * 2]}") |
329 | 330 |
330 | | - # Launch inference servers. |
331 | | - launcher.submit_array( |
332 | | - job_name="llm_server", |
333 | | - cmd=server_cmd, |
334 | | - count=alloc_mode.gen.dp_size, |
335 | | - gpu=alloc_mode.gen.pp_size * alloc_mode.gen.tp_size, |
336 | | - env_vars=get_env_vars( |
337 | | - config.cluster.cluster_name, |
338 | | - config.launcher.inference_server_env_vars, |
339 | | - ), |
| 331 | + # Get LLM server addresses via name resolve. |
| 332 | + try: |
| 333 | + server_addrs = wait_llm_server_addrs( |
| 334 | + config.experiment_name, |
| 335 | + config.trial_name, |
| 336 | + n_rollout_servers=alloc_mode.gen.dp_size, |
340 | 337 | ) |
341 | 338 | logger.info( |
342 | 339 | f"LLM inference server launched at: AREAL_LLM_SERVER_ADDRS={','.join(server_addrs)}" |
343 | 340 | ) |
| 341 | + except (TimeoutError, KeyboardInterrupt) as e: |
| 342 | + launcher.stop_all(signal="SIGINT") |
| 343 | + raise e |
344 | 344 |
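Instead of computing server addresses up front from free ports, the launcher now blocks until the server processes publish their endpoints through name resolve. A hedged sketch of that waiting pattern, assuming `wait_llm_server_addrs` polls until `n_rollout_servers` addresses appear (the real helper lives in `areal.utils.launcher`; this is not its actual implementation):

```python
import time
from typing import Callable

def wait_for_addrs(poll: Callable[[], list[str]], n_expected: int, timeout: float = 300.0) -> list[str]:
    """Poll `poll()` (the addresses published so far) until `n_expected`
    entries appear, raising TimeoutError after `timeout` seconds."""
    deadline = time.monotonic() + timeout
    addrs: list[str] = []
    while time.monotonic() < deadline:
        addrs = poll()
        if len(addrs) >= n_expected:
            return addrs[:n_expected]
        time.sleep(1.0)
    raise TimeoutError(f"only {len(addrs)}/{n_expected} LLM servers registered")
```

On `TimeoutError` or `KeyboardInterrupt`, the code above tears down every submitted job via `launcher.stop_all(signal="SIGINT")` before re-raising.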
345 | 345 | # Launch trainer entrypoint |
346 | 346 | if alloc_mode.type_ != AllocationType.LLM_SERVER_ONLY: |