35 changes: 33 additions & 2 deletions benchmark/bench.py
@@ -105,16 +105,27 @@ def check_taskset_path(dataset_name: str, taskset_path: str) -> str:
if dataset_name == "gsm8k" and taskset_path == "openai/gsm8k":
return taskset_path

base_dir = os.path.dirname(__file__)
frozenlake_data_script_path = os.path.abspath(
os.path.join(
base_dir,
"..",
"examples",
"grpo_frozen_lake",
"get_frozen_lake_data.py",
)
)
dataset_script_map = {
"countdown": "gen_countdown_data.py",
"guru_math": "gen_guru_math_data.py",
"alfworld": "get_alfworld_full_data.py",
"frozenlake": frozenlake_data_script_path,
}
if dataset_name not in dataset_script_map:
raise ValueError(
f"Unsupported dataset: {dataset_name}. Please specify a valid taskset path."
)

base_dir = os.path.dirname(__file__)
script_filename = dataset_script_map[dataset_name]
script_module_name = script_filename[:-3] # remove .py

@@ -134,6 +145,13 @@ def check_taskset_path(dataset_name: str, taskset_path: str) -> str:
taskset_path = module.DEFAULT_DATA_PATH
taskset_path = os.path.realpath(taskset_path)

# For frozenlake, check if train.parquet and test.parquet already exist
if dataset_name == "frozenlake":
train_path = os.path.join(taskset_path, "train.parquet")
test_path = os.path.join(taskset_path, "test.parquet")
if os.path.exists(train_path) and os.path.exists(test_path):
return taskset_path

gen_script_path = os.path.join(base_dir, "scripts", script_filename)
subprocess.run([sys.executable, gen_script_path, "--local_dir", taskset_path], check=True)

@@ -168,11 +186,20 @@ def prepare_configs(args, rank, current_time):
)
if args.critic_lr:
config["trainer"]["trainer_config"]["critic"]["optim"]["lr"] = args.critic_lr
if args.dataset == "alfworld":
print(
"Warning: The current benchmark script of ALFWorld only supports GRPO; the SFT stage will be supported soon."
)
taskset_config = config["buffer"]["explorer_input"]["taskset"]
taskset_config["path"] = check_taskset_path(
args.dataset,
args.taskset_path or os.environ.get("TASKSET_PATH") or taskset_config["path"],
)
eval_taskset_config = config["buffer"]["explorer_input"]["eval_tasksets"]
if len(eval_taskset_config) > 0:
# TODO: support separately setting path for eval taskset
for eval_taskset_config in eval_taskset_config:
eval_taskset_config["path"] = taskset_config["path"]
if args.lr:
config["algorithm"]["optimizer"]["lr"] = args.lr
if args.sync_interval:
@@ -236,7 +263,11 @@ def main(args):

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("dataset", type=str.lower, choices=["gsm8k", "countdown", "guru_math"])
parser.add_argument(
"dataset",
type=str.lower,
choices=["gsm8k", "countdown", "guru_math", "alfworld", "frozenlake"],
)
parser.add_argument(
"--dlc", action="store_true", help="Specify when running in Aliyun PAI DLC."
)
86 changes: 86 additions & 0 deletions benchmark/config/alfworld-template.yaml
@@ -0,0 +1,86 @@
mode: both
project: Trinity-RFT
group: ${oc.env:TRINITY_GROUP,os-bench}
name: ${oc.env:TRINITY_NAME,alfworld}
checkpoint_root_dir: placeholder
algorithm:
algorithm_type: grpo
repeat_times: 8
loss_agg_mode: "seq-mean-token-sum"
optimizer:
lr: 1e-6
sample_strategy: warmup
policy_loss_fn: ppo
advantage_fn: grpo
kl_penalty_fn: none
kl_loss_fn: k2
entropy_loss_fn: default
kl_loss_fn_args:
kl_coef: 0.001
data_processor: {}
model:
model_path: placeholder
max_prompt_tokens: 10240
max_response_tokens: 4096
cluster:
node_num: 1
gpu_per_node: 8
buffer:
batch_size: 32
total_epochs: 5
explorer_input:
taskset:
name: alfworld
split: train
storage_type: file
path: null
format:
prompt_key: 'game_file'
rollout_args:
temperature: 1.0
logprobs: 0
eval_tasksets:
- name: alfworld
split: test
storage_type: file
path: null
format:
prompt_key: 'game_file'
rollout_args:
temperature: 1.0
logprobs: 0
default_workflow_type: 'alfworld_workflow'
explorer:
eval_on_startup: true
eval_interval: 10
runner_per_model: 8
max_timeout: 3600
max_retry_times: 2
rollout_model:
engine_num: 4
tensor_parallel_size: 1
enforce_eager: false
enable_prefix_caching: false
enable_chunked_prefill: true
gpu_memory_utilization: 0.7
dtype: bfloat16
seed: 42
enable_thinking: false
enable_openai_api: false
auxiliary_models: []
bench_on_latest_checkpoint: true
trainer:
trainer_type: verl
save_interval: 1000
enable_preview: true
grad_clip: 1.0
use_dynamic_bsz: true
max_token_len_per_gpu: 16384
ulysses_sequence_parallel_size: 1
monitor:
monitor_type: wandb
synchronizer:
sync_method: nccl
sync_style: fixed
sync_interval: 1
sync_timeout: 3600
91 changes: 91 additions & 0 deletions benchmark/config/frozenlake-template.yaml
@@ -0,0 +1,91 @@
mode: both
project: Trinity-RFT
group: ${oc.env:TRINITY_GROUP,frozenlake-bench}
name: ${oc.env:TRINITY_NAME,frozenlake}
checkpoint_root_dir: placeholder
algorithm:
algorithm_type: grpo
repeat_times: 8
loss_agg_mode: "seq-mean-token-sum"
optimizer:
lr: 1e-6
policy_loss_fn: ppo
advantage_fn: grpo
kl_penalty_fn: none
kl_loss_fn: k2
entropy_loss_fn: default
policy_loss_fn_args:
clip_range_low: 0.2
clip_range_high: 0.28
kl_loss_fn_args:
kl_coef: 0.0
data_processor: {}
model:
model_path: Qwen/Qwen2.5-3B-Instruct
max_prompt_tokens: 4096
max_response_tokens: 10240
cluster:
node_num: 1
gpu_per_node: 8
buffer:
batch_size: 64
total_epochs: 3
explorer_input:
taskset:
name: frozenlake
storage_type: file
path: null
split: train
workflow_args:
env_max_steps: 8
agent_max_steps: 10
is_slippery: false
eval_tasksets:
- name: frozenlake
storage_type: file
path: null
split: test
workflow_args:
env_max_steps: 8
agent_max_steps: 10
is_slippery: false
repeat_times: 4
rollout_args:
temperature: 0.7
top_p: 0.8
top_k: 20
default_workflow_type: 'frozen_lake_workflow'
explorer:
eval_on_startup: true
eval_interval: 10
runner_per_model: 8
max_timeout: 900
max_retry_times: 2
rollout_model:
engine_num: 4
tensor_parallel_size: 1
enforce_eager: false
enable_prefix_caching: false
enable_chunked_prefill: true
gpu_memory_utilization: 0.85
dtype: bfloat16
seed: 42
enable_thinking: false
enable_openai_api: false
auxiliary_models: []
bench_on_latest_checkpoint: true
trainer:
trainer_type: verl
save_interval: 1000
enable_preview: true
grad_clip: 1.0
use_dynamic_bsz: true
max_token_len_per_gpu: 16384
ulysses_sequence_parallel_size: 1
# monitor:
# monitor_type: wandb
synchronizer:
sync_method: nccl
sync_style: fixed
sync_interval: 1
sync_timeout: 1200
48 changes: 48 additions & 0 deletions benchmark/reports/alfworld.md
@@ -0,0 +1,48 @@
# ALFWorld Benchmark Results

## 1. Task Introduction

[ALFWorld](https://github.com/alfworld/alfworld) is a text-based interactive environment in which an agent completes household tasks in a simulated home by issuing natural-language commands.

The environment is configured as follows:
* Environment: Text-based interactive environment built on TextWorld
* Action Space: Commands such as `pick`, `go to`, `place`, etc.
* Reward Structure: +1 for successfully completing the task, -0.1 otherwise
* Maximum Steps: 30 (configurable via `max_env_steps`)
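
As a rough illustration of this reward scheme, the sketch below rolls out one episode and accumulates the reward. It is based only on the description above; the `env` and `agent` objects are hypothetical stand-ins, not the actual ALFWorld or Trinity-RFT workflow API.

```python
# Minimal sketch of the reward scheme above; `env` and `agent` are hypothetical
# stand-ins, not the actual ALFWorld/Trinity-RFT interfaces.
MAX_ENV_STEPS = 30  # "Maximum Steps" above, configurable via `max_env_steps`

def run_episode(env, agent) -> float:
    """Roll out one episode and return the accumulated reward."""
    observation = env.reset()
    total_reward = 0.0
    for _ in range(MAX_ENV_STEPS):
        command = agent.act(observation)          # e.g. "go to drawer 1", "pick apple"
        observation, task_success, done = env.step(command)
        total_reward += 1.0 if task_success else -0.1  # +1 on success, -0.1 otherwise
        if done:
            break
    return total_reward
```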

See the [documentation](https://modelscope.github.io/Trinity-RFT/en/main/tutorial/example_multi_turn.html) for data preparation.

## 2. Experimental Settings

We evaluate the following methods in the Trinity-RFT framework, version [0.3.3](https://github.com/modelscope/Trinity-RFT/releases/tag/v0.3.3) (verl==0.5.0, vllm==0.11.0), and compare against rLLM at commit [ef6451f](https://github.com/rllm-org/rllm/commit/ef6451fbd7eba224c4a87e3fd944d7c0e2bcc0ea) (verl==0.5.0), its latest release as of Nov. 6, 2025.
Since rLLM does not yet support the ALFWorld environment, we implement this task in rLLM for comparison.

We evaluate the GRPO algorithm on this task in both Trinity-RFT and rLLM.
We fine-tune a `Qwen2.5-3B-Instruct` model, which has already been trained on an SFT dataset, on the training tasks with GRPO. For both frameworks, we fix the key parameters to `batch_size=32`, `repeat_times=8`, `lr=1e-6`, and `kl_coef=0.001`.

For better efficiency, we use 64 rollout workers in rLLM and set the `explorer.engine_num` to 4 and `explorer.runner_per_model` to 8 in Trinity-RFT.
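
For reference, a run can be launched through `benchmark/bench.py`, which now accepts `alfworld` as a dataset choice; the taskset location is resolved from the `TASKSET_PATH` environment variable (or a `--taskset_path` argument matching the `args.taskset_path` attribute read in the script). The path below is a placeholder.

```bash
# Placeholder path; bench.py falls back to TASKSET_PATH when no taskset path is given.
TASKSET_PATH=/path/to/alfworld_data python benchmark/bench.py alfworld
```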

## 3. Results and Analysis

We compare the sample efficiency of different methods by plotting the reward and test score vs. training steps. As shown in the following figures, Trinity-RFT and rLLM reach similar training and test results at the same step.

![](../../docs/sphinx_doc/assets/bench_alfworld_step.png)

We further compare training efficiency on the ALFWorld task.
The following tables detail the wall-clock time required for each method to reach specific performance thresholds, i.e., training reward = 0.8 and test score = 0.6.

| Method | Training Reward | Time to Reach Target (Hours) | Speedup |
|----------|------------------|-------------------------------|---------|
| rLLM | 0.830 | 9.33 | - |
| Trinity-RFT | 0.826 | 2.53 | 3.69× |


| Method | Test Score | Time to Reach Target (Hours) | Speedup |
|----------|------------------|-------------------------------|---------|
| rLLM | 0.670 | 6.65 | - |
| Trinity-RFT | 0.632 | 1.14 | 5.83× |

The results show that Trinity-RFT achieves a noticeable speedup on the ALFWorld task, as also shown in the following figures.
The efficiency gain stems primarily from the difference between the rollout mechanisms of the two frameworks: Trinity-RFT uses multiprocessing during rollout, whereas rLLM employs multithreading, which limits rollout parallelism in the ALFWorld environment because the environment is not thread-safe (see [this issue](https://github.com/alfworld/alfworld/issues/71)).
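
To illustrate the distinction (a generic sketch, not code from either framework): with a thread pool, all rollout workers share one Python process, so a non-thread-safe environment forces serialization; with a process pool, each worker holds an isolated environment instance and can run truly in parallel.

```python
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

def rollout(task_id: int) -> float:
    # Stand-in for one ALFWorld episode; in the real setting this would create
    # and step an environment instance. With threads, all instances live in one
    # process and a non-thread-safe environment must be serialized; with
    # processes, each worker owns an isolated copy.
    return float(task_id % 2)  # dummy "reward"

if __name__ == "__main__":
    tasks = range(32)
    with ThreadPoolExecutor(max_workers=8) as pool:    # rLLM-style: threads
        thread_rewards = list(pool.map(rollout, tasks))
    with ProcessPoolExecutor(max_workers=8) as pool:   # Trinity-RFT-style: processes
        process_rewards = list(pool.map(rollout, tasks))
```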

![](../../docs/sphinx_doc/assets/bench_alfworld_time.png)
48 changes: 48 additions & 0 deletions benchmark/reports/frozenlake.md
@@ -0,0 +1,48 @@
# Frozen Lake Benchmark Results

## 1. Task Introduction

[Frozen lake](https://gymnasium.farama.org/environments/toy_text/frozen_lake/) involves walking over a frozen lake from Start (S) to Goal (G) without falling into any Holes (H). We formulate this task as a multi-step workflow, where the agent interacts with the environment for multiple steps to reach the goal.

The environment is configured as follows:
* Map Size: From 2x2 to 5x5, randomly generated.
* Mode: Non-Slippery
* Action Space: Up, Down, Left, Right
* Reward Structure: +1 for reaching the goal, 0 otherwise.
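
For reference, a minimal interaction loop against the standard Gymnasium `FrozenLake-v1` environment looks roughly as follows; this is a sketch of the public Gymnasium API (here with a random policy), not the actual Trinity-RFT workflow code.

```python
import gymnasium as gym
from gymnasium.envs.toy_text.frozen_lake import generate_random_map

# Randomly generated, non-slippery map; the benchmark uses sizes from 2x2 to 5x5.
desc = generate_random_map(size=4)
env = gym.make("FrozenLake-v1", desc=desc, is_slippery=False)

obs, info = env.reset(seed=42)
total_reward = 0.0
for _ in range(10):                     # cap on agent steps, cf. agent_max_steps
    action = env.action_space.sample()  # the trained agent picks Up/Down/Left/Right here
    obs, reward, terminated, truncated, info = env.step(action)
    total_reward += reward              # +1 only when the goal is reached
    if terminated or truncated:
        break
env.close()
```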

The training and test data are generated by the following script:
```bash
python examples/grpo_frozen_lake/get_frozen_lake_data.py
```
This command generates 10000 training tasks and 100 test tasks.

To filter out unsolvable tasks, we only keep game maps that have a valid path within `env_max_steps=8` steps. Moreover, the agent can take at most `agent_max_steps=10` steps to reach the goal.
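
This filtering amounts to a breadth-first search over the grid. The sketch below is illustrative only; it assumes maps in Gymnasium's string format (e.g. `["SFF", "FHF", "FFG"]`) and is not the actual generation script.

```python
from collections import deque

def solvable_within(desc: list[str], max_steps: int = 8) -> bool:
    """BFS from S to G, treating holes (H) as walls; True if G is reachable in <= max_steps moves."""
    n = len(desc)
    start = next((r, c) for r in range(n) for c in range(len(desc[r])) if desc[r][c] == "S")
    queue, seen = deque([(start, 0)]), {start}
    while queue:
        (r, c), steps = queue.popleft()
        if desc[r][c] == "G":
            return True
        if steps == max_steps:
            continue
        for dr, dc in ((1, 0), (-1, 0), (0, 1), (0, -1)):
            nr, nc = r + dr, c + dc
            if 0 <= nr < n and 0 <= nc < len(desc[nr]) and desc[nr][nc] != "H" and (nr, nc) not in seen:
                seen.add((nr, nc))
                queue.append(((nr, nc), steps + 1))
    return False
```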


## 2. Experimental Settings

We evaluate the performance of the following methods in the Trinity-RFT framework, version [0.3.3](https://github.com/modelscope/Trinity-RFT/releases/tag/v0.3.3) (verl==0.5.0, vllm==0.11.0), and compare against rLLM at commit [ef6451f](https://github.com/rllm-org/rllm/commit/ef6451fbd7eba224c4a87e3fd944d7c0e2bcc0ea) (verl==0.5.0), its latest release as of Nov. 6, 2025.

We fine-tune a Qwen2.5-3B-Instruct model on the training tasks with GRPO. For all experiments, we fix the key parameters to `batch_size=64`, `repeat_times=8`, and `lr=1e-6`. We run each experiment three times and report the averaged results.

For a fair comparison, we tune the efficiency-related configurations of both frameworks. For rLLM, we adopt the default configurations in `examples/frozenlake/train_frozenlake_agent.sh`, except that we increase the batch size to 64 for stability and set the number of rollout workers to 64 for efficiency. For Trinity-RFT, we set `explorer.engine_num` to 4 for efficiency.

## 3. Results and Analysis

We compare the sample efficiency of the two frameworks by plotting the reward and test score in the following figures. At the same training step, Trinity-RFT and rLLM achieve similar rewards and test scores, verifying the correctness of training.

![](../../docs/sphinx_doc/assets/bench_frozenlake_step.png)

The following tables detail the wall-clock time required for each method to reach specific performance thresholds, i.e., training reward = 0.6, reward = 0.8, and test score = 0.8. Trinity-RFT consistently reaches each target in less time.

| Method | Training Reward | Time to Reach Target (Hours) | Speedup |
|----------|------------------|-------------------------------|---------|
| rLLM | 0.6023 | 3.967 | - |
| Trinity-RFT | 0.6188 | 2.87 | 1.38× |
| rLLM | 0.8007 | 5.91 | - |
| Trinity-RFT | 0.8033 | 5.44 | 1.09× |

| Method | Test Score | Time to Reach Target (Hours) | Speedup |
|----------|------------------|-------------------------------|---------|
| rLLM | 0.8096 | 6.82 | - |
| Trinity-RFT | 0.8262 | 5.15 | 1.32× |