
Commit 3c0bd3d

fix: fix launching vLLM with ray (#420)
1 parent f53feed commit 3c0bd3d

6 files changed: +18 -14 lines changed

README.md

Lines changed: 1 addition & 1 deletion

@@ -74,7 +74,7 @@ state-of-the-art 7B and 32B models for mathematical reasoning. Check out our
 | Task | Description | Performance |
 | ------------------------------------------------ | ------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------- |
 | **[Math](examples/math/)** | Mathematical problem solving (SFT, GRPO, or PPO) | TBA |
-| **[Multi-Turn Math](examples/multi-turn-math/)** | Iterative mathematical problem solving with self-correction | [Training Curve](examples/multi-turn-math/reward_curve.png) |
+| **[Multi-Turn Math](examples/multi-turn-math/)** | Iterative mathematical problem solving with self-correction | [Training Curve](examples/multi-turn-math/reward_curve.png) |
 | **[LoRA Math](examples/lora/)** | Math Agent Trained With LoRA | TBA |
 | **[VLM Math](examples/vlm/)** | CLEVR visual counting tasks | TBA |
 | **[Reasoning](examples/countdown/)** | Countdown numbers game with custom rewards | [Training Curve](/examples/countdown/countdown_training_curve.png) |

areal/api/cli_args.py

Lines changed: 1 addition & 1 deletion

@@ -295,7 +295,7 @@ class TrainEngineConfig:
     lora_alpha: int = field(default=16, metadata={"help": "lora alpha"})
     target_modules: List[str] = field(
         default_factory=list,
-        metadata={"help": "lora target_modules. None defaults to 'all-linear'"},
+        metadata={"help": "lora target_modules."},
     )
     peft_type: str = field(
         default="lora",

areal/launcher/sglang_server.py

Lines changed: 4 additions & 2 deletions

@@ -6,6 +6,7 @@
 import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor
+from copy import deepcopy
 from typing import Optional

 import psutil
@@ -182,9 +183,10 @@ def run(self):
             host_ip = gethostip()

             base_gpu_id = (server_local_idx - server_idx_offset) * gpus_per_server
-            self.config.random_seed = base_random_seed + server_local_idx
+            config = deepcopy(self.config)
+            config.random_seed = base_random_seed + server_local_idx
             cmd = SGLangConfig.build_cmd(
-                self.config,
+                config,
                 tp_size=self.allocation_mode.gen.tp_size,
                 base_gpu_id=base_gpu_id,
                 host=host_ip,
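
Both launcher hunks fix the same aliasing bug: the loop used to write the per-server seed onto the shared `self.config`, so when one process launches several servers, later iterations overwrite the seed that earlier servers still reference. Deep-copying the config before mutating it keeps each server's seed local. A minimal standalone sketch of the problem and the fix (`ServerConfig` and the helper names are hypothetical, simplified from the diff):

```python
from copy import deepcopy
from dataclasses import dataclass

@dataclass
class ServerConfig:
    random_seed: int = 0

def build_configs_buggy(shared: ServerConfig, n_servers: int, base_seed: int):
    # Buggy pattern: every iteration mutates the one shared config object,
    # so all servers end up observing whichever seed was written last.
    configs = []
    for idx in range(n_servers):
        shared.random_seed = base_seed + idx
        configs.append(shared)  # all entries alias the same object
    return configs

def build_configs_fixed(shared: ServerConfig, n_servers: int, base_seed: int):
    # Fixed pattern from the commit: deep-copy before mutating, so each
    # server gets its own config with its own seed.
    configs = []
    for idx in range(n_servers):
        config = deepcopy(shared)
        config.random_seed = base_seed + idx
        configs.append(config)
    return configs

shared = ServerConfig()
print([c.random_seed for c in build_configs_buggy(shared, 3, 100)])  # [102, 102, 102]
print([c.random_seed for c in build_configs_fixed(shared, 3, 100)])  # [100, 101, 102]
```

The vLLM launcher below applies the identical `deepcopy` pattern to `config.seed`.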

areal/launcher/vllm_server.py

Lines changed: 8 additions & 6 deletions

@@ -4,6 +4,7 @@
 import time
 import uuid
 from concurrent.futures import ThreadPoolExecutor
+from copy import deepcopy
 from typing import Optional

 import requests
@@ -105,6 +106,7 @@ def run(self):
             n_servers_per_proc = max(1, n_visible_devices // gpus_per_server)
             server_idx_offset = min(list(map(int, visible))) // gpus_per_server
         else:
+            visible = [str(i) for i in range(self.n_gpus_per_node)]
             n_servers_per_proc = n_servers_per_node
             server_idx_offset = 0

@@ -114,8 +116,8 @@ def run(self):
         launch_server_args = []
         server_addresses = []
         base_random_seed = self.config.seed
-        for server_local_idx in range(
-            server_idx_offset, server_idx_offset + n_servers_per_proc
+        for j, server_local_idx in enumerate(
+            range(server_idx_offset, server_idx_offset + n_servers_per_proc)
         ):
             port_range = (
                 server_local_idx * ports_per_server + 10000,
@@ -126,15 +128,15 @@ def run(self):
             dist_init_addr = f"localhost:{dist_init_port}"
             host_ip = gethostip()

-            base_gpu_id = (server_local_idx - server_idx_offset) * gpus_per_server
             custom_env = {
                 device_control_env_var: ",".join(
-                    map(str, range(base_gpu_id, base_gpu_id + gpus_per_server))
+                    visible[j * gpus_per_server : (j + 1) * gpus_per_server]
                )
            }
-            self.config.seed = base_random_seed + server_local_idx
+            config = deepcopy(self.config)
+            config.seed = base_random_seed + server_local_idx
             cmd = vLLMConfig.build_cmd(
-                self.config,
+                config,
                 tp_size=self.allocation_mode.gen.tp_size,
                 host=host_ip,
                 port=server_port,
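
Besides the `deepcopy` fix, this hunk changes how each server's device list is built: rather than expanding a computed `base_gpu_id` with `range` (which assumes the ids are contiguous and start at the offset), it slices the list of device ids that are actually visible to the process, which under Ray need not be contiguous. A sketch of the difference, using hypothetical values around the two expressions taken from the diff:

```python
# Suppose Ray handed this worker a non-contiguous device set.
visible = "2,3,6,7".split(",")  # parsed from the device-control env var
gpus_per_server = 2
server_idx_offset = min(map(int, visible)) // gpus_per_server  # 2 // 2 == 1
n_servers_per_proc = len(visible) // gpus_per_server           # 2

for j, server_local_idx in enumerate(
    range(server_idx_offset, server_idx_offset + n_servers_per_proc)
):
    # Old scheme: derives ids from the server index, assuming they are
    # contiguous and start at the computed base.
    base_gpu_id = (server_local_idx - server_idx_offset) * gpus_per_server
    old = ",".join(map(str, range(base_gpu_id, base_gpu_id + gpus_per_server)))

    # New scheme from the commit: take the ids that are actually visible.
    new = ",".join(visible[j * gpus_per_server : (j + 1) * gpus_per_server])

    print(f"server {server_local_idx}: old={old!r} new={new!r}")
# server 1: old='0,1' new='2,3'
# server 2: old='2,3' new='6,7'
```

The added `visible = [str(i) for i in range(self.n_gpus_per_node)]` in the `else:` branch makes the same slicing work when no device list was set, by treating all node GPUs as visible.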

docs/algorithms/rloo.md

Lines changed: 1 addition & 1 deletion

@@ -6,7 +6,7 @@ Author: [Honghua DONG](https://github.com/dhh1995)

 ![rloo figure](../figures/reinforce.png)

-REINFORCE Leave One-Out (RLOO), introduced by Ahmadian et al. (2024), is an RL method that removes the need for a value function (critic).
+REINFORCE Leave One-Out (RLOO), introduced by Ahmadian et al. (2024), is an RL method that removes the need for a value function (critic).
 Instead, it estimates the baseline by averaging rewards of other sampled responses for the same prompt within the group.

 The overall core objective is:
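
For reference, the leave-one-out baseline the quoted passage describes is commonly written as follows; this is a sketch following Ahmadian et al. (2024), not text taken from the doc itself:

```latex
% Leave-one-out baseline: for k sampled responses y_1, ..., y_k to prompt x,
% each sample is judged against the mean reward of the other k-1 samples.
b_i = \frac{1}{k-1} \sum_{j \neq i} R(x, y_j),
\qquad
A_i = R(x, y_i) - b_i
% REINFORCE-style gradient estimate averaged over the group:
\nabla_\theta J(\theta) \approx \frac{1}{k} \sum_{i=1}^{k} A_i \, \nabla_\theta \log \pi_\theta(y_i \mid x)
```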

docs/cli_reference.md

Lines changed: 3 additions & 3 deletions

@@ -333,7 +333,7 @@ Configuration for PPO actor model, a subclass of a TrainEngine.
 | `use_lora` | boolean | `False` | Whether to use LoRA. Only support FSDP. Note that should be enabled together with vLLM/SGLang. |
 | `lora_rank` | integer | `32` | lora rank |
 | `lora_alpha` | integer | `16` | lora alpha |
-| `target_modules` | list of string | **Required** | lora target_modules. None defaults to 'all-linear' |
+| `target_modules` | list of string | **Required** | lora target_modules. |
 | `peft_type` | string | `"lora"` | peft method type. Only LoRA is supported for now. |
 | `group_size` | integer | `1` | Number of sequences in each group |
 | `ppo_n_minibatches` | integer | `4` | Number of minibatches for each PPO update |
@@ -388,7 +388,7 @@ Configuration for PPO critic model, a subclass of a TrainEngine.
 | `use_lora` | boolean | `False` | Whether to use LoRA. Only support FSDP. Note that should be enabled together with vLLM/SGLang. |
 | `lora_rank` | integer | `32` | lora rank |
 | `lora_alpha` | integer | `16` | lora alpha |
-| `target_modules` | list of string | **Required** | lora target_modules. None defaults to 'all-linear' |
+| `target_modules` | list of string | **Required** | lora target_modules. |
 | `peft_type` | string | `"lora"` | peft method type. Only LoRA is supported for now. |
 | `ppo_n_minibatches` | integer | `4` | Number of minibatches for each PPO update |
 | `eps_clip` | float | `0.5` | Clipping factor for value loss |
@@ -420,7 +420,7 @@ Core configuration for model training, including optimization and backend settin
 | `use_lora` | boolean | `False` | Whether to use LoRA. Only support FSDP. Note that should be enabled together with vLLM/SGLang. |
 | `lora_rank` | integer | `32` | lora rank |
 | `lora_alpha` | integer | `16` | lora alpha |
-| `target_modules` | list of string | **Required** | lora target_modules. None defaults to 'all-linear' |
+| `target_modules` | list of string | **Required** | lora target_modules. |
 | `peft_type` | string | `"lora"` | peft method type. Only LoRA is supported for now. |

 (section-generation-hyperparameters)=
