Commit 3861859

Add bench results for frozenlake and alfworld (#416)

1 parent 34c3a53 commit 3861859

11 files changed: +414 −28 lines

benchmark/bench.py

Lines changed: 33 additions & 2 deletions

```diff
@@ -105,16 +105,27 @@ def check_taskset_path(dataset_name: str, taskset_path: str) -> str:
     if dataset_name == "gsm8k" and taskset_path == "openai/gsm8k":
         return taskset_path
 
+    base_dir = os.path.dirname(__file__)
+    frozenlake_data_script_path = os.path.abspath(
+        os.path.join(
+            base_dir,
+            "..",
+            "examples",
+            "grpo_frozen_lake",
+            "get_frozen_lake_data.py",
+        )
+    )
     dataset_script_map = {
         "countdown": "gen_countdown_data.py",
         "guru_math": "gen_guru_math_data.py",
+        "alfworld": "get_alfworld_full_data.py",
+        "frozenlake": frozenlake_data_script_path,
     }
     if dataset_name not in dataset_script_map:
         raise ValueError(
             f"Unsupported dataset: {dataset_name}. Please specify a valid taskset path."
         )
 
-    base_dir = os.path.dirname(__file__)
     script_filename = dataset_script_map[dataset_name]
     script_module_name = script_filename[:-3]  # remove .py
 
@@ -134,6 +145,13 @@ def check_taskset_path(dataset_name: str, taskset_path: str) -> str:
         taskset_path = module.DEFAULT_DATA_PATH
     taskset_path = os.path.realpath(taskset_path)
 
+    # For frozenlake, check if train.parquet and test.parquet already exist
+    if dataset_name == "frozenlake":
+        train_path = os.path.join(taskset_path, "train.parquet")
+        test_path = os.path.join(taskset_path, "test.parquet")
+        if os.path.exists(train_path) and os.path.exists(test_path):
+            return taskset_path
+
     gen_script_path = os.path.join(base_dir, "scripts", script_filename)
     subprocess.run([sys.executable, gen_script_path, "--local_dir", taskset_path], check=True)
 
@@ -168,11 +186,20 @@ def prepare_configs(args, rank, current_time):
     )
     if args.critic_lr:
         config["trainer"]["trainer_config"]["critic"]["optim"]["lr"] = args.critic_lr
+    if args.dataset == "alfworld":
+        print(
+            "Warning: The current benchmark script of ALFWorld only supports GRPO; the SFT stage will be supported soon."
+        )
     taskset_config = config["buffer"]["explorer_input"]["taskset"]
     taskset_config["path"] = check_taskset_path(
         args.dataset,
         args.taskset_path or os.environ.get("TASKSET_PATH") or taskset_config["path"],
     )
+    eval_taskset_configs = config["buffer"]["explorer_input"]["eval_tasksets"]
+    if len(eval_taskset_configs) > 0:
+        # TODO: support separately setting the path for the eval taskset
+        for eval_taskset_config in eval_taskset_configs:
+            eval_taskset_config["path"] = taskset_config["path"]
     if args.lr:
         config["algorithm"]["optimizer"]["lr"] = args.lr
     if args.sync_interval:
@@ -236,7 +263,11 @@ def main(args):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("dataset", type=str.lower, choices=["gsm8k", "countdown", "guru_math"])
+    parser.add_argument(
+        "dataset",
+        type=str.lower,
+        choices=["gsm8k", "countdown", "guru_math", "alfworld", "frozenlake"],
+    )
     parser.add_argument(
         "--dlc", action="store_true", help="Specify when running in Aliyun PAI DLC."
     )
```
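The new frozenlake branch in `check_taskset_path` skips data generation when the parquet files are already present. A condensed, standalone sketch of that resolve-or-generate pattern (with the generation script path passed in explicitly, rather than looked up as in `bench.py`):

```python
import os
import subprocess
import sys


def resolve_taskset(dataset_name: str, taskset_path: str, gen_script: str) -> str:
    """Return taskset_path, regenerating the data only when it is missing."""
    taskset_path = os.path.realpath(taskset_path)
    if dataset_name == "frozenlake":
        train = os.path.join(taskset_path, "train.parquet")
        test = os.path.join(taskset_path, "test.parquet")
        if os.path.exists(train) and os.path.exists(test):
            return taskset_path  # reuse previously generated parquet files
    # Otherwise invoke the generation script, as bench.py does via subprocess.
    subprocess.run([sys.executable, gen_script, "--local_dir", taskset_path], check=True)
    return taskset_path
```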
Lines changed: 86 additions & 0 deletions

New file (ALFWorld benchmark configuration):

```yaml
mode: both
project: Trinity-RFT
group: ${oc.env:TRINITY_GROUP,os-bench}
name: ${oc.env:TRINITY_NAME,alfworld}
checkpoint_root_dir: placeholder
algorithm:
  algorithm_type: grpo
  repeat_times: 8
  loss_agg_mode: "seq-mean-token-sum"
  optimizer:
    lr: 1e-6
  sample_strategy: warmup
  policy_loss_fn: ppo
  advantage_fn: grpo
  kl_penalty_fn: none
  kl_loss_fn: k2
  entropy_loss_fn: default
  kl_loss_fn_args:
    kl_coef: 0.001
data_processor: {}
model:
  model_path: placeholder
  max_prompt_tokens: 10240
  max_response_tokens: 4096
cluster:
  node_num: 1
  gpu_per_node: 8
buffer:
  batch_size: 32
  total_epochs: 5
  explorer_input:
    taskset:
      name: alfworld
      split: train
      storage_type: file
      path: null
      format:
        prompt_key: 'game_file'
      rollout_args:
        temperature: 1.0
        logprobs: 0
    eval_tasksets:
      - name: alfworld
        split: test
        storage_type: file
        path: null
        format:
          prompt_key: 'game_file'
        rollout_args:
          temperature: 1.0
          logprobs: 0
    default_workflow_type: 'alfworld_workflow'
explorer:
  eval_on_startup: true
  eval_interval: 10
  runner_per_model: 8
  max_timeout: 3600
  max_retry_times: 2
  rollout_model:
    engine_num: 4
    tensor_parallel_size: 1
    enforce_eager: false
    enable_prefix_caching: false
    enable_chunked_prefill: true
    gpu_memory_utilization: 0.7
    dtype: bfloat16
    seed: 42
    enable_thinking: false
    enable_openai_api: false
  auxiliary_models: []
  bench_on_latest_checkpoint: true
trainer:
  trainer_type: verl
  save_interval: 1000
  enable_preview: true
  grad_clip: 1.0
  use_dynamic_bsz: true
  max_token_len_per_gpu: 16384
  ulysses_sequence_parallel_size: 1
monitor:
  monitor_type: wandb
synchronizer:
  sync_method: nccl
  sync_style: fixed
  sync_interval: 1
  sync_timeout: 3600
```
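`prepare_configs` in `bench.py` overwrites a few fields of this config before launching, notably the taskset paths. A minimal sketch of that overriding pattern, assuming PyYAML is available and using hypothetical local file names and a placeholder data path:

```python
import os

import yaml  # PyYAML, assumed available

# Load the benchmark config and point the training and eval tasksets at the
# same local data path, mirroring what prepare_configs does in bench.py.
with open("alfworld.yaml") as f:  # hypothetical file name
    config = yaml.safe_load(f)

taskset_path = os.environ.get("TASKSET_PATH", "/data/alfworld")  # placeholder
config["buffer"]["explorer_input"]["taskset"]["path"] = taskset_path
for eval_taskset in config["buffer"]["explorer_input"]["eval_tasksets"]:
    eval_taskset["path"] = taskset_path

with open("alfworld_patched.yaml", "w") as f:  # hypothetical output name
    yaml.safe_dump(config, f)
```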
Lines changed: 91 additions & 0 deletions

New file (Frozen Lake benchmark configuration):

```yaml
mode: both
project: Trinity-RFT
group: ${oc.env:TRINITY_GROUP,frozenlake-bench}
name: ${oc.env:TRINITY_NAME,frozenlake}
checkpoint_root_dir: placeholder
algorithm:
  algorithm_type: grpo
  repeat_times: 8
  loss_agg_mode: "seq-mean-token-sum"
  optimizer:
    lr: 1e-6
  policy_loss_fn: ppo
  advantage_fn: grpo
  kl_penalty_fn: none
  kl_loss_fn: k2
  entropy_loss_fn: default
  policy_loss_fn_args:
    clip_range_low: 0.2
    clip_range_high: 0.28
  kl_loss_fn_args:
    kl_coef: 0.0
data_processor: {}
model:
  model_path: Qwen/Qwen2.5-3B-Instruct
  max_prompt_tokens: 4096
  max_response_tokens: 10240
cluster:
  node_num: 1
  gpu_per_node: 8
buffer:
  batch_size: 64
  total_epochs: 3
  explorer_input:
    taskset:
      name: frozenlake
      storage_type: file
      path: null
      split: train
      workflow_args:
        env_max_steps: 8
        agent_max_steps: 10
        is_slippery: false
    eval_tasksets:
      - name: frozenlake
        storage_type: file
        path: null
        split: test
        workflow_args:
          env_max_steps: 8
          agent_max_steps: 10
          is_slippery: false
        repeat_times: 4
        rollout_args:
          temperature: 0.7
          top_p: 0.8
          top_k: 20
    default_workflow_type: 'frozen_lake_workflow'
explorer:
  eval_on_startup: true
  eval_interval: 10
  runner_per_model: 8
  max_timeout: 900
  max_retry_times: 2
  rollout_model:
    engine_num: 4
    tensor_parallel_size: 1
    enforce_eager: false
    enable_prefix_caching: false
    enable_chunked_prefill: true
    gpu_memory_utilization: 0.85
    dtype: bfloat16
    seed: 42
    enable_thinking: false
    enable_openai_api: false
  auxiliary_models: []
  bench_on_latest_checkpoint: true
trainer:
  trainer_type: verl
  save_interval: 1000
  enable_preview: true
  grad_clip: 1.0
  use_dynamic_bsz: true
  max_token_len_per_gpu: 16384
  ulysses_sequence_parallel_size: 1
# monitor:
#   monitor_type: wandb
synchronizer:
  sync_method: nccl
  sync_style: fixed
  sync_interval: 1
  sync_timeout: 1200
```
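This config enables asymmetric clipping via `clip_range_low: 0.2` and `clip_range_high: 0.28`. As a rough, generic illustration of what such parameters control in a PPO-style surrogate loss (a sketch assuming PyTorch tensors of per-token log-probabilities and advantages, not Trinity-RFT's actual loss code):

```python
import torch


def clipped_policy_loss(logprob, old_logprob, advantage,
                        clip_range_low=0.2, clip_range_high=0.28):
    """Asymmetric PPO-style clipped surrogate loss (per token, to be aggregated).

    Generic sketch only: illustrates the role of clip_range_low / clip_range_high.
    """
    ratio = torch.exp(logprob - old_logprob)
    clipped_ratio = torch.clamp(ratio, 1.0 - clip_range_low, 1.0 + clip_range_high)
    # Take the pessimistic (minimum) surrogate, as in standard PPO.
    return -torch.min(ratio * advantage, clipped_ratio * advantage)
```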

benchmark/reports/alfworld.md

Lines changed: 48 additions & 0 deletions
# ALFWorld Benchmark Results

## 1. Task Introduction

[ALFWorld](https://github.com/alfworld/alfworld) is a text-based interactive environment in which an agent completes household tasks in a simulated home, interacting with the environment through natural language commands.

The environment is configured as follows (a minimal interaction-loop sketch follows the list):
* Environment: Text-based interactive environment built on TextWorld
* Action Space: Commands such as `pick`, `go to`, `place`, etc.
* Reward Structure: +1 for successfully completing the task, -0.1 otherwise
* Maximum Steps: 30 (configurable via `max_env_steps`)
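As a rough illustration of the interaction pattern being benchmarked, the following sketch shows a generic observe/act loop with the reward scheme above; `env` and `agent` are hypothetical stand-ins, not the actual ALFWorld/TextWorld API:

```python
def rollout(env, agent, max_env_steps: int = 30) -> float:
    """Generic multi-turn loop: +1 on success, -0.1 otherwise."""
    observation = env.reset()
    for _ in range(max_env_steps):
        command = agent.act(observation)  # e.g. "go to desk 1", "pick up book"
        observation, done, success = env.step(command)
        if done:
            return 1.0 if success else -0.1
    return -0.1  # ran out of steps without completing the task
```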

See the [documentation](https://modelscope.github.io/Trinity-RFT/en/main/tutorial/example_multi_turn.html) for data preparation.

## 2. Experimental Settings

We evaluate performance in the Trinity-RFT framework, version [0.3.3](https://github.com/modelscope/Trinity-RFT/releases/tag/v0.3.3) (verl==0.5.0, vllm==0.11.0), and compare against the latest release of rLLM at commit [ef6451f](https://github.com/rllm-org/rllm/commit/ef6451fbd7eba224c4a87e3fd944d7c0e2bcc0ea) (verl==0.5.0) as of Nov. 6, 2025.
Since rLLM does not yet support the ALFWorld environment, we implemented this task in rLLM for the comparison.

We evaluate both frameworks on this task with the GRPO algorithm.
We fine-tune a `Qwen2.5-3B-Instruct` model that has already been trained on an SFT dataset, applying GRPO to the training tasks. For both frameworks, we fix the key parameters to `batch_size=32`, `repeat_times=8`, `lr=1e-6`, and `kl_coef=0.001`.

For better efficiency, we use 64 rollout workers in rLLM, and set `explorer.engine_num` to 4 and `explorer.runner_per_model` to 8 in Trinity-RFT.

## 3. Results and Analysis

We compare the sample efficiency of the two frameworks by plotting the training reward and test score against training steps. As shown in the following figure, Trinity-RFT and rLLM reach similar training and test results at the same step.

![](../../docs/sphinx_doc/assets/bench_alfworld_step.png)

We further compare wall-clock efficiency on the ALFWorld task.
The following tables detail the wall-clock time each framework needs to reach the given performance thresholds, i.e., training reward = 0.8 and test score = 0.6.

| Method | Training Reward | Time to Reach Target (Hours) | Speedup |
|-------------|-----------------|------------------------------|---------|
| rLLM        | 0.830           | 9.33                         | -       |
| Trinity-RFT | 0.826           | 2.53                         | 3.69×   |

| Method | Test Score | Time to Reach Target (Hours) | Speedup |
|-------------|------------|------------------------------|---------|
| rLLM        | 0.670      | 6.65                         | -       |
| Trinity-RFT | 0.632      | 1.14                         | 5.83×   |

Trinity-RFT thus achieves a noticeable speedup on the ALFWorld task, as also visible in the wall-clock figure below.
The primary reason lies in the rollout mechanisms of the two frameworks: Trinity-RFT uses multiprocessing during rollout, whereas rLLM employs multithreading, which restricts rollout parallelism in the ALFWorld environment because the environment is not thread-safe (see [this issue](https://github.com/alfworld/alfworld/issues/71)).

![](../../docs/sphinx_doc/assets/bench_alfworld_time.png)
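To make the distinction concrete, here is a minimal standard-library sketch contrasting process-based and thread-based rollout dispatch; `run_episode` is a hypothetical stand-in for a single ALFWorld rollout, not the actual workflow code:

```python
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor


def run_episode(task_id: int) -> float:
    # Hypothetical stand-in for one ALFWorld rollout. The real environment
    # keeps non-thread-safe global state, so concurrent threads can interfere.
    return 0.0


if __name__ == "__main__":
    task_ids = list(range(32))

    # Process-based dispatch (multiprocessing): each worker process owns an
    # independent copy of the environment state.
    with ProcessPoolExecutor(max_workers=8) as pool:
        rewards = list(pool.map(run_episode, task_ids))

    # Thread-based dispatch (multithreading): all threads share one interpreter,
    # so a non-thread-safe environment limits achievable parallelism.
    with ThreadPoolExecutor(max_workers=8) as pool:
        rewards = list(pool.map(run_episode, task_ids))
```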

benchmark/reports/frozenlake.md

Lines changed: 48 additions & 0 deletions
# Frozen Lake Benchmark Results

## 1. Task Introduction

The [Frozen Lake](https://gymnasium.farama.org/environments/toy_text/frozen_lake/) task involves walking across a frozen lake from Start (S) to Goal (G) without falling into any Holes (H). We formulate it as a multi-step workflow in which the agent interacts with the environment over multiple steps to reach the goal.

The environment is configured as follows (a map-generation sketch follows the list):
* Map Size: From 2x2 to 5x5, randomly generated.
* Mode: Non-Slippery
* Action Space: Up, Down, Left, Right
* Reward Structure: +1 for reaching the goal, 0 otherwise.
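The settings above map directly onto Gymnasium's FrozenLake environment; a minimal sketch of generating a random non-slippery map and stepping through it (independent of the Trinity-RFT workflow code) might look like this:

```python
import gymnasium as gym
from gymnasium.envs.toy_text.frozen_lake import generate_random_map

# Build a random non-slippery map (the benchmark uses sizes between 2x2 and 5x5).
desc = generate_random_map(size=4)
env = gym.make("FrozenLake-v1", desc=desc, is_slippery=False)

obs, info = env.reset(seed=42)
done = False
while not done:
    action = env.action_space.sample()  # a trained agent would choose the action here
    obs, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated
env.close()
```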

The training and test data are generated by the following script:
```bash
python examples/grpo_frozen_lake/get_frozen_lake_data.py
```
This command generates 10,000 training tasks and 100 test tasks.

To filter out unsolvable tasks, we require each generated map to have a valid path of at most `env_max_steps=8` steps; the agent may take at most `agent_max_steps=10` steps to reach the goal.
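The filtering criterion above (a valid path within `env_max_steps=8` moves) can be checked with a simple breadth-first search over the map grid; a standalone sketch of such a check (not the actual data-generation script) is:

```python
from collections import deque


def solvable_within(desc: list[str], max_steps: int = 8) -> bool:
    """Return True if 'G' is reachable from 'S' within max_steps moves,
    never stepping on a hole ('H'). `desc` is a square FrozenLake map,
    e.g. ["SFFF", "FHFH", "FFFH", "HFFG"]."""
    n = len(desc)
    start = next((r, c) for r in range(n) for c in range(n) if desc[r][c] == "S")
    queue = deque([(start, 0)])
    seen = {start}
    while queue:
        (r, c), steps = queue.popleft()
        if desc[r][c] == "G":
            return True
        if steps == max_steps:
            continue
        for dr, dc in ((1, 0), (-1, 0), (0, 1), (0, -1)):
            nr, nc = r + dr, c + dc
            if 0 <= nr < n and 0 <= nc < n and desc[nr][nc] != "H" and (nr, nc) not in seen:
                seen.add((nr, nc))
                queue.append(((nr, nc), steps + 1))
    return False
```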

## 2. Experimental Settings

We evaluate performance in the Trinity-RFT framework, version [0.3.3](https://github.com/modelscope/Trinity-RFT/releases/tag/v0.3.3) (verl==0.5.0, vllm==0.11.0), and compare against the latest release of rLLM at commit [ef6451f](https://github.com/rllm-org/rllm/commit/ef6451fbd7eba224c4a87e3fd944d7c0e2bcc0ea) (verl==0.5.0) as of Nov. 6, 2025.

We fine-tune a Qwen2.5-3B-Instruct model on the training tasks with GRPO. For all experiments, we fix the key parameters to `batch_size=64`, `repeat_times=8`, and `lr=1e-6`. We run each experiment three times and report the average results.

For a fair comparison, we tune the efficiency-related configurations of both frameworks. For rLLM, we adopt the default configuration in `examples/frozenlake/train_frozenlake_agent.sh`, except that we increase the batch size to 64 for stability and set the number of rollout workers to 64 for efficiency. For Trinity-RFT, we set `explorer.engine_num` to 4 for efficiency.

## 3. Results and Analysis

We compare the sample efficiency of the two frameworks by plotting the training reward and test score in the following figure. At the same step, Trinity-RFT and rLLM achieve similar rewards and test scores, verifying the correctness of training.

![](../../docs/sphinx_doc/assets/bench_frozenlake_step.png)

The following tables detail the wall-clock time each framework needs to reach the target performance thresholds: training reward = 0.6, training reward = 0.8, and test score = 0.8. Trinity-RFT reaches every target in less time.

| Target | Method | Training Reward | Time to Reach Target (Hours) | Speedup |
|--------|-------------|-----------------|------------------------------|---------|
| 0.6    | rLLM        | 0.6023          | 3.967                        | -       |
| 0.6    | Trinity-RFT | 0.6188          | 2.87                         | 1.38×   |
| 0.8    | rLLM        | 0.8007          | 5.91                         | -       |
| 0.8    | Trinity-RFT | 0.8033          | 5.44                         | 1.09×   |

| Target | Method | Test Score | Time to Reach Target (Hours) | Speedup |
|--------|-------------|------------|------------------------------|---------|
| 0.8    | rLLM        | 0.8096     | 6.82                         | -       |
| 0.8    | Trinity-RFT | 0.8262     | 5.15                         | 1.32×   |
