diff --git a/benchmark/bench.py b/benchmark/bench.py
index 6adf062d9d..ac336d904a 100644
--- a/benchmark/bench.py
+++ b/benchmark/bench.py
@@ -105,16 +105,27 @@ def check_taskset_path(dataset_name: str, taskset_path: str) -> str:
     if dataset_name == "gsm8k" and taskset_path == "openai/gsm8k":
         return taskset_path
 
+    base_dir = os.path.dirname(__file__)
+    frozenlake_data_script_path = os.path.abspath(
+        os.path.join(
+            base_dir,
+            "..",
+            "examples",
+            "grpo_frozen_lake",
+            "get_frozen_lake_data.py",
+        )
+    )
     dataset_script_map = {
         "countdown": "gen_countdown_data.py",
         "guru_math": "gen_guru_math_data.py",
+        "alfworld": "get_alfworld_full_data.py",
+        "frozenlake": frozenlake_data_script_path,
     }
     if dataset_name not in dataset_script_map:
         raise ValueError(
             f"Unsupported dataset: {dataset_name}. Please specify a valid taskset path."
         )
-    base_dir = os.path.dirname(__file__)
     script_filename = dataset_script_map[dataset_name]
     script_module_name = script_filename[:-3]  # remove .py
@@ -134,6 +145,13 @@ def check_taskset_path(dataset_name: str, taskset_path: str) -> str:
         taskset_path = module.DEFAULT_DATA_PATH
     taskset_path = os.path.realpath(taskset_path)
 
+    # For frozenlake, skip regeneration if train.parquet and test.parquet already exist
+    if dataset_name == "frozenlake":
+        train_path = os.path.join(taskset_path, "train.parquet")
+        test_path = os.path.join(taskset_path, "test.parquet")
+        if os.path.exists(train_path) and os.path.exists(test_path):
+            return taskset_path
+
     gen_script_path = os.path.join(base_dir, "scripts", script_filename)
     subprocess.run([sys.executable, gen_script_path, "--local_dir", taskset_path], check=True)
@@ -168,11 +186,20 @@ def prepare_configs(args, rank, current_time):
     )
     if args.critic_lr:
         config["trainer"]["trainer_config"]["critic"]["optim"]["lr"] = args.critic_lr
+    if args.dataset == "alfworld":
+        print(
+            "Warning: the ALFWorld benchmark script currently supports GRPO only; the SFT stage will be supported soon."
+        )
     taskset_config = config["buffer"]["explorer_input"]["taskset"]
     taskset_config["path"] = check_taskset_path(
         args.dataset,
         args.taskset_path or os.environ.get("TASKSET_PATH") or taskset_config["path"],
     )
+    eval_taskset_configs = config["buffer"]["explorer_input"]["eval_tasksets"]
+    if len(eval_taskset_configs) > 0:
+        # TODO: support separately setting the path for eval tasksets
+        for eval_taskset_config in eval_taskset_configs:
+            eval_taskset_config["path"] = taskset_config["path"]
     if args.lr:
         config["algorithm"]["optimizer"]["lr"] = args.lr
     if args.sync_interval:
@@ -236,7 +263,11 @@ def main(args):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("dataset", type=str.lower, choices=["gsm8k", "countdown", "guru_math"])
+    parser.add_argument(
+        "dataset",
+        type=str.lower,
+        choices=["gsm8k", "countdown", "guru_math", "alfworld", "frozenlake"],
+    )
     parser.add_argument(
         "--dlc", action="store_true", help="Specify when running in Aliyun PAI DLC."
     )
diff --git a/benchmark/config/alfworld-template.yaml b/benchmark/config/alfworld-template.yaml
new file mode 100644
index 0000000000..edd0626624
--- /dev/null
+++ b/benchmark/config/alfworld-template.yaml
@@ -0,0 +1,86 @@
+mode: both
+project: Trinity-RFT
+group: ${oc.env:TRINITY_GROUP,os-bench}
+name: ${oc.env:TRINITY_NAME,alfworld}
+checkpoint_root_dir: placeholder
+algorithm:
+  algorithm_type: grpo
+  repeat_times: 8
+  loss_agg_mode: "seq-mean-token-sum"
+  optimizer:
+    lr: 1e-6
+  sample_strategy: warmup
+  policy_loss_fn: ppo
+  advantage_fn: grpo
+  kl_penalty_fn: none
+  kl_loss_fn: k2
+  entropy_loss_fn: default
+  kl_loss_fn_args:
+    kl_coef: 0.001
+data_processor: {}
+model:
+  model_path: placeholder
+  max_prompt_tokens: 10240
+  max_response_tokens: 4096
+cluster:
+  node_num: 1
+  gpu_per_node: 8
+buffer:
+  batch_size: 32
+  total_epochs: 5
+  explorer_input:
+    taskset:
+      name: alfworld
+      split: train
+      storage_type: file
+      path: null
+      format:
+        prompt_key: 'game_file'
+      rollout_args:
+        temperature: 1.0
+        logprobs: 0
+    eval_tasksets:
+      - name: alfworld
+        split: test
+        storage_type: file
+        path: null
+        format:
+          prompt_key: 'game_file'
+        rollout_args:
+          temperature: 1.0
+          logprobs: 0
+    default_workflow_type: 'alfworld_workflow'
+explorer:
+  eval_on_startup: true
+  eval_interval: 10
+  runner_per_model: 8
+  max_timeout: 3600
+  max_retry_times: 2
+  rollout_model:
+    engine_num: 4
+    tensor_parallel_size: 1
+    enforce_eager: false
+    enable_prefix_caching: false
+    enable_chunked_prefill: true
+    gpu_memory_utilization: 0.7
+    dtype: bfloat16
+    seed: 42
+    enable_thinking: false
+    enable_openai_api: false
+  auxiliary_models: []
+  bench_on_latest_checkpoint: true
+trainer:
+  trainer_type: verl
+  save_interval: 1000
+  enable_preview: true
+  grad_clip: 1.0
+  use_dynamic_bsz: true
+  max_token_len_per_gpu: 16384
+  ulysses_sequence_parallel_size: 1
+monitor:
+  monitor_type: wandb
+synchronizer:
+  sync_method: nccl
+  sync_style: fixed
+  sync_interval: 1
+  sync_timeout: 3600
diff --git a/benchmark/config/frozenlake-template.yaml b/benchmark/config/frozenlake-template.yaml
new file mode 100644
index 0000000000..7208b19c76
--- /dev/null
+++ b/benchmark/config/frozenlake-template.yaml
@@ -0,0 +1,91 @@
+mode: both
+project: Trinity-RFT
+group: ${oc.env:TRINITY_GROUP,frozenlake-bench}
+name: ${oc.env:TRINITY_NAME,frozenlake}
+checkpoint_root_dir: placeholder
+algorithm:
+  algorithm_type: grpo
+  repeat_times: 8
+  loss_agg_mode: "seq-mean-token-sum"
+  optimizer:
+    lr: 1e-6
+  policy_loss_fn: ppo
+  advantage_fn: grpo
+  kl_penalty_fn: none
+  kl_loss_fn: k2
+  entropy_loss_fn: default
+  policy_loss_fn_args:
+    clip_range_low: 0.2
+    clip_range_high: 0.28
+  kl_loss_fn_args:
+    kl_coef: 0.0
+data_processor: {}
+model:
+  model_path: Qwen/Qwen2.5-3B-Instruct
+  max_prompt_tokens: 4096
+  max_response_tokens: 10240
+cluster:
+  node_num: 1
+  gpu_per_node: 8
+buffer:
+  batch_size: 64
+  total_epochs: 3
+  explorer_input:
+    taskset:
+      name: frozenlake
+      storage_type: file
+      path: null
+      split: train
+      workflow_args:
+        env_max_steps: 8
+        agent_max_steps: 10
+        is_slippery: false
+    eval_tasksets:
+      - name: frozenlake
+        storage_type: file
+        path: null
+        split: test
+        workflow_args:
+          env_max_steps: 8
+          agent_max_steps: 10
+          is_slippery: false
+        repeat_times: 4
+        rollout_args:
+          temperature: 0.7
+          top_p: 0.8
+          top_k: 20
+    default_workflow_type: 'frozen_lake_workflow'
+explorer:
+  eval_on_startup: true
+  eval_interval: 10
+  runner_per_model: 8
+  max_timeout: 900
+  max_retry_times: 2
+  rollout_model:
+    engine_num: 4
+    tensor_parallel_size: 1
+    enforce_eager: false
+    enable_prefix_caching: false
+    enable_chunked_prefill: true
+    gpu_memory_utilization: 0.85
+    dtype: bfloat16
+    seed: 42
+    enable_thinking: false
+    enable_openai_api: false
+  auxiliary_models: []
+  bench_on_latest_checkpoint: true
+trainer:
+  trainer_type: verl
+  save_interval: 1000
+  enable_preview: true
+  grad_clip: 1.0
+  use_dynamic_bsz: true
+  max_token_len_per_gpu: 16384
+  ulysses_sequence_parallel_size: 1
+# monitor:
+#   monitor_type: wandb
+synchronizer:
+  sync_method: nccl
+  sync_style: fixed
+  sync_interval: 1
+  sync_timeout: 1200
diff --git a/benchmark/reports/alfworld.md b/benchmark/reports/alfworld.md
new file mode 100644
index 0000000000..e6663478f7
--- /dev/null
+++ b/benchmark/reports/alfworld.md
@@ -0,0 +1,48 @@
+# ALFWorld Benchmark Results
+
+## 1. Task Introduction
+
+[ALFWorld](https://github.com/alfworld/alfworld) is a text-based interactive environment in which an agent completes household tasks in a virtual home. The agent interacts with the environment through natural-language commands.
+
+The environment is configured as follows:
+* Environment: Text-based interactive environment built on TextWorld
+* Action Space: Commands such as `pick`, `go to`, `place`, etc.
+* Reward Structure: +1 for successfully completing the task, -0.1 otherwise
+* Maximum Steps: 30 (configurable via `max_env_steps`)
+
+See the [documentation](https://modelscope.github.io/Trinity-RFT/en/main/tutorial/example_multi_turn.html) for data preparation.
+
+## 2. Experimental Settings
+
+We evaluate performance using the Trinity-RFT framework, version [0.3.3](https://github.com/modelscope/Trinity-RFT/releases/tag/v0.3.3) (verl==0.5.0, vllm==0.11.0), and compare against the latest release of rLLM, commit [ef6451f](https://github.com/rllm-org/rllm/commit/ef6451fbd7eba224c4a87e3fd944d7c0e2bcc0ea) (verl==0.5.0), as of Nov. 6, 2025.
+Since rLLM does not yet support the ALFWorld environment, we implemented this task in rLLM for the comparison.
+
+We evaluate the GRPO algorithm on this task in both Trinity-RFT and rLLM.
+Specifically, we fine-tune a `Qwen2.5-3B-Instruct` model that has already been trained on an SFT dataset. For all methods, we fix the key parameters to `batch_size=32`, `repeat_times=8`, `lr=1e-6`, and `kl_coef=0.001`.
+
+For better efficiency, we use 64 rollout workers in rLLM, and set `explorer.engine_num` to 4 and `explorer.runner_per_model` to 8 in Trinity-RFT.
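+
+With the dataset prepared, the benchmark is launched through the common entry point. A minimal sketch (assuming the `bench.py` interface added in this patch; the taskset path can also be supplied via the `--taskset_path` flag or the `TASKSET_PATH` environment variable):
+
+```bash
+# Launch the ALFWorld benchmark; when no taskset path is given, the game
+# files are downloaded and the taskset is generated automatically.
+python benchmark/bench.py alfworld
+```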
+
+## 3. Results and Analysis
+
+We compare the sample efficiency of the two frameworks by plotting the reward and test score against training steps. As shown in the following figures, Trinity-RFT and rLLM reach similar training and test results at the same step.
+
+![](../../docs/sphinx_doc/assets/bench_alfworld_step.png)
+
+We further compare wall-clock efficiency on the ALFWorld task.
+The following tables detail the time each method needs to reach the specified performance thresholds, i.e., reward = 0.8 and test score = 0.6.
+
+| Method | Training Reward | Time to Reach Target (Hours) | Speedup |
+|----------|------------------|-------------------------------|---------|
+| rLLM | 0.830 | 9.33 | - |
+| Trinity-RFT | 0.826 | 2.53 | 3.69× |
+
+| Method | Test Score | Time to Reach Target (Hours) | Speedup |
+|----------|------------------|-------------------------------|---------|
+| rLLM | 0.670 | 6.65 | - |
+| Trinity-RFT | 0.632 | 1.14 | 5.83× |
+
+Trinity-RFT achieves a noticeable speedup on the ALFWorld task, as also shown in the following figures.
+The speedup stems primarily from the difference between the two rollout mechanisms: Trinity-RFT uses multiprocessing during rollout, whereas rLLM employs multithreading, which restricts the parallelism of the rollout process because the ALFWorld environment is not thread-safe (see [this issue](https://github.com/alfworld/alfworld/issues/71)).
+
+![](../../docs/sphinx_doc/assets/bench_alfworld_time.png)
diff --git a/benchmark/reports/frozenlake.md b/benchmark/reports/frozenlake.md
new file mode 100644
index 0000000000..f9fb864c3c
--- /dev/null
+++ b/benchmark/reports/frozenlake.md
@@ -0,0 +1,48 @@
+# Frozen Lake Benchmark Results
+
+## 1. Task Introduction
+
+[Frozen Lake](https://gymnasium.farama.org/environments/toy_text/frozen_lake/) involves walking over a frozen lake from Start (S) to Goal (G) without falling into any Holes (H). We formulate this task as a multi-step workflow, in which the agent interacts with the environment over multiple steps to reach the goal.
+
+The environment is configured as follows:
+* Map Size: From 2x2 to 5x5, randomly generated.
+* Mode: Non-Slippery
+* Action Space: Up, Down, Left, Right
+* Reward Structure: +1 for reaching the goal, 0 otherwise.
+
+The training and test data are generated by the following script:
+```bash
+python examples/grpo_frozen_lake/get_frozen_lake_data.py
+```
+This command generates 10000 training tasks and 100 test tasks.
+
+To filter out unsolvable tasks, we require each game map to have a valid path of at most `env_max_steps=8` steps. Moreover, the agent can take at most `agent_max_steps=10` steps to reach the goal.
+
+
+## 2. Experimental Settings
+
+We evaluate performance using the Trinity-RFT framework, version [0.3.3](https://github.com/modelscope/Trinity-RFT/releases/tag/v0.3.3) (verl==0.5.0, vllm==0.11.0), and compare against the latest release of rLLM, commit [ef6451f](https://github.com/rllm-org/rllm/commit/ef6451fbd7eba224c4a87e3fd944d7c0e2bcc0ea) (verl==0.5.0), as of Nov. 6, 2025.
+
+We fine-tune a Qwen2.5-3B-Instruct model on the training tasks with GRPO. For all experiments, we fix the key parameters to `batch_size=64`, `repeat_times=8`, and `lr=1e-6`. We run each experiment three times and report the average results.
+
+For a fair comparison, we tune the efficiency-related configurations of both frameworks. For rLLM, we adopt the default configuration in `examples/frozenlake/train_frozenlake_agent.sh`, except that we increase the batch size to 64 for stability and set the number of rollout workers to 64 for efficiency. For Trinity-RFT, we set `explorer.engine_num` to 4 for efficiency.
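+
+The benchmark run itself goes through the shared entry point. A minimal sketch (assuming the `bench.py` interface added in this patch; the path below is the data script's default output directory):
+
+```bash
+# Launch the Frozen Lake benchmark, reusing a pre-generated taskset;
+# existing train/test parquet files are detected and not regenerated.
+python benchmark/bench.py frozenlake --taskset_path examples/data/frozenlake
+```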
+
+## 3. Results and Analysis
+
+We compare the sample efficiency of the two frameworks by plotting the reward and test score in the following figures. At the same step, Trinity-RFT and rLLM achieve similar rewards and test scores, verifying the training correctness.
+
+![](../../docs/sphinx_doc/assets/bench_frozenlake_step.png)
+
+The following tables detail the wall-clock time required for each method to reach a given performance threshold. Trinity-RFT requires less time to reach each target, i.e., reward = 0.6, reward = 0.8, and test score = 0.8.
+
+| Method | Training Reward | Time to Reach Target (Hours) | Speedup |
+|----------|------------------|-------------------------------|---------|
+| rLLM | 0.6023 | 3.97 | - |
+| Trinity-RFT | 0.6188 | 2.87 | 1.38× |
+| rLLM | 0.8007 | 5.91 | - |
+| Trinity-RFT | 0.8033 | 5.44 | 1.09× |
+
+| Method | Test Score | Time to Reach Target (Hours) | Speedup |
+|----------|------------------|-------------------------------|---------|
+| rLLM | 0.8096 | 6.82 | - |
+| Trinity-RFT | 0.8262 | 5.15 | 1.32× |
diff --git a/benchmark/scripts/get_alfworld_full_data.py b/benchmark/scripts/get_alfworld_full_data.py
new file mode 100644
index 0000000000..bc75af7896
--- /dev/null
+++ b/benchmark/scripts/get_alfworld_full_data.py
@@ -0,0 +1,51 @@
+import argparse
+import os
+import subprocess
+import sys
+
+DEFAULT_DATA_PATH = os.path.join(
+    os.path.dirname(os.path.abspath(__file__)), "..", "data", "alfworld"
+)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--local_dir", default=DEFAULT_DATA_PATH)
+    args = parser.parse_args()
+
+    # Step 1: Get all game files from Huggingface
+    game_data_dir = os.path.join(args.local_dir, "..", "alfworld_game_data")
+    if os.path.exists(game_data_dir) and os.path.exists(os.path.join(game_data_dir, "json_2.1.1")):
+        print(f"Game data directory already exists: {game_data_dir}")
+
+    else:
+        os.makedirs(game_data_dir, exist_ok=True)
+        subprocess.run([sys.executable, "-m", "pip", "install", "alfworld[full]"], check=True)
+        # Set environment variable for the alfworld-download command
+        env = os.environ.copy()
+        env["ALFWORLD_DATA"] = game_data_dir
+        subprocess.run(["alfworld-download"], check=True, env=env)
+
+    # Step 2: Run the script to generate the mapping file
+    base_dir = os.path.dirname(__file__)
+    data_prepare_path = os.path.abspath(
+        os.path.join(
+            base_dir,
+            "..",
+            "..",
+            "examples",
+            "grpo_alfworld",
+            "get_alfworld_data.py",
+        )
+    )
+    subprocess.run(
+        [
+            sys.executable,
+            data_prepare_path,
+            "--game_data_path",
+            game_data_dir,
+            "--local_dir",
+            args.local_dir,
+        ],
+        check=True,
+    )
diff --git a/docs/sphinx_doc/assets/bench_alfworld_step.png b/docs/sphinx_doc/assets/bench_alfworld_step.png
new file mode 100644
index 0000000000..998fcdc8e2
Binary files /dev/null and b/docs/sphinx_doc/assets/bench_alfworld_step.png differ
diff --git a/docs/sphinx_doc/assets/bench_alfworld_time.png b/docs/sphinx_doc/assets/bench_alfworld_time.png
new file mode 100644
index 0000000000..502cb40db4
Binary files /dev/null and b/docs/sphinx_doc/assets/bench_alfworld_time.png differ
diff --git a/docs/sphinx_doc/assets/bench_frozenlake_step.png b/docs/sphinx_doc/assets/bench_frozenlake_step.png
new file mode 100644
index 0000000000..2177133cd9
Binary files /dev/null and b/docs/sphinx_doc/assets/bench_frozenlake_step.png differ
diff --git a/examples/grpo_alfworld/get_alfworld_data.py b/examples/grpo_alfworld/get_alfworld_data.py
index 9989e8bffa..8cdccbe569 100644
--- a/examples/grpo_alfworld/get_alfworld_data.py
+++ b/examples/grpo_alfworld/get_alfworld_data.py
@@ -2,6 +2,7 @@
 We use this script to create the huggingface format dataset files for the alfworld dataset.
 NOTE: You need to install the alfworld dataset first: https://github.com/alfworld/alfworld
 """
+import argparse
 import glob
 import json
 import os
@@ -10,16 +11,13 @@
 random.seed(42)
 
 
-def create_dataset_files(output_dir, train_size=None, test_size=None):
-    # The ALFWORLD_DATA is the dataset path in the environment variable ALFWORLD_DATA, you need to set it when install alfworld dataset
-    from alfworld.info import ALFWORLD_DATA
-
+def create_dataset_files(game_data_path, output_dir, train_size=None, test_size=None):
     # get all matched game files from train and valid_seen directories
     train_game_files = glob.glob(
-        os.path.expanduser(f"{ALFWORLD_DATA}/json_2.1.1/train/*/*/game.tw-pddl")
+        os.path.expanduser(f"{game_data_path}/json_2.1.1/train/*/*/game.tw-pddl")
     )
     test_game_files = glob.glob(
-        os.path.expanduser(f"{ALFWORLD_DATA}/json_2.1.1/valid_seen/*/*/game.tw-pddl")
+        os.path.expanduser(f"{game_data_path}/json_2.1.1/valid_seen/*/*/game.tw-pddl")
     )
 
     # get absolute path
@@ -86,8 +84,28 @@ def create_dataset_files(output_dir, train_size=None, test_size=None):
 
 
 if __name__ == "__main__":
-    current_file_dir = os.path.dirname(os.path.abspath(__file__))
-    output_dir = f"{current_file_dir}/alfworld_data"
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--game_data_path", type=str, default=None, required=False)
+    parser.add_argument("--local_dir", type=str, default=None, required=False)
+    parser.add_argument("--train_size", type=int, default=None, required=False)
+    parser.add_argument("--test_size", type=int, default=None, required=False)
+    args = parser.parse_args()
+
+    if args.game_data_path is None:
+        # ALFWORLD_DATA is the dataset path read from the environment variable of
+        # the same name; you need to set it when installing the alfworld dataset
+        from alfworld.info import ALFWORLD_DATA
+
+        args.game_data_path = ALFWORLD_DATA
+
+    if args.local_dir is None:
+        current_file_dir = os.path.dirname(os.path.abspath(__file__))
+        args.local_dir = f"{current_file_dir}/alfworld_data"
+
     # use all data by default, or specify train_size and test_size if needed
-    create_dataset_files(output_dir)
-    # create_dataset_files(output_dir, train_size=1024, test_size=100)  # use subset of data for testing
+    create_dataset_files(
+        game_data_path=args.game_data_path,
+        output_dir=args.local_dir,
+        train_size=args.train_size,
+        test_size=args.test_size,
+    )
diff --git a/examples/grpo_frozen_lake/get_frozen_lake_data.py b/examples/grpo_frozen_lake/get_frozen_lake_data.py
index 17b4aae87b..65c7e4102d 100644
--- a/examples/grpo_frozen_lake/get_frozen_lake_data.py
+++ b/examples/grpo_frozen_lake/get_frozen_lake_data.py
@@ -1,6 +1,7 @@
 """
 Modified from https://github.com/rllm-org/rllm/blob/main/examples/frozenlake/prepare_frozenlake_data.py
 """
+import argparse
 import os
 
 import numpy as np
@@ -8,46 +9,45 @@
 from trinity.common.constants import TASKSET_PATH_ENV_VAR
 
-path_from_env = os.environ.get(TASKSET_PATH_ENV_VAR)
-if path_from_env is not None:
-    DATA_ROOT_DIR = os.path.dirname(path_from_env)
-else:
-    DATA_ROOT_DIR = os.path.join(os.path.dirname(__file__), "data")
+DEFAULT_DATA_PATH = os.path.join(
+    os.path.dirname(os.path.abspath(__file__)), "..", "data", "frozenlake"
+)
 
 
-def save_dataset_to_local(name: str, data: list[dict], split: str = "default") -> str:
-    """Save dataset directly to local DATA_PATH.
+def save_dataset_to_local(data_path: str, data: list[dict], split: str = "default") -> str:
+    """Save dataset directly to local data_path.
 
     Args:
-        name: Name of the dataset
+        data_path: Path to save the dataset
         data: List of dictionaries containing the dataset examples
         split: Split name (e.g., 'train', 'test', 'default')
 
     Returns:
         str: Path to the saved parquet file
     """
-    dataset_dir = os.path.join(DATA_ROOT_DIR, name)
-    os.makedirs(dataset_dir, exist_ok=True)
+    os.makedirs(data_path, exist_ok=True)
 
     # Convert to DataFrame and save
     data_df = pd.DataFrame(data)
-    dataset_path = os.path.join(dataset_dir, f"{split}.parquet")
+    dataset_path = os.path.join(data_path, f"{split}.parquet")
     data_df.to_parquet(dataset_path)
 
     print(
-        f"Saved dataset '{name}' split '{split}' with {len(data)} examples at {dataset_path}. Make sure to set the environment variable {TASKSET_PATH_ENV_VAR} to {DATA_ROOT_DIR}/{name}."
+        f"Saved dataset frozenlake split '{split}' with {len(data)} examples at {dataset_path}. Make sure to set the environment variable {TASKSET_PATH_ENV_VAR} to {data_path}."
     )
     return dataset_path
 
 
-def prepare_frozenlake_data(train_size=10000, test_size=100, map_max_size=6):
+def prepare_frozenlake_data(data_path, train_size=10000, test_size=100, map_max_size=6):
     """
     Prepare and save FrozenLake datasets for training and testing.
 
     Args:
+        data_path (str): Path to save the dataset
         train_size (int): Number of training examples to generate
        test_size (int): Number of test examples to generate
+        map_max_size (int): Maximum size of the map
 
     Returns:
         tuple: (train_data, test_data) - Lists of data dictionaries
@@ -78,14 +78,27 @@ def frozenlake_process_fn(seed, size, p, idx):
     ]
 
     # Save datasets directly to local DATA_PATH
-    save_dataset_to_local("frozenlake", train_data, "train")
-    save_dataset_to_local("frozenlake", test_data, "test")
+    save_dataset_to_local(data_path, train_data, "train")
+    save_dataset_to_local(data_path, test_data, "test")
 
     return train_data, test_data
 
 
 if __name__ == "__main__":
-    train_data, test_data = prepare_frozenlake_data()
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--local_dir", default=DEFAULT_DATA_PATH)
+    parser.add_argument("--train_size", type=int, default=10000)
+    parser.add_argument("--test_size", type=int, default=100)
+    parser.add_argument("--map_max_size", type=int, default=6)
+    args = parser.parse_args()
+
+    train_data, test_data = prepare_frozenlake_data(
+        data_path=args.local_dir,
+        train_size=args.train_size,
+        test_size=args.test_size,
+        map_max_size=args.map_max_size,
+    )
+
     print(f"Train dataset: {len(train_data)} examples")
     print(f"Test dataset: {len(test_data)} examples")
     print("Sample train example:", train_data[0])
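
With the arguments above, the data script can be invoked directly. A minimal sketch (the values shown are the argparse defaults; `--local_dir` resolves to `examples/data/frozenlake` relative to the script's location when omitted):

```bash
# Regenerate the Frozen Lake taskset with explicit sizes and map bound.
python examples/grpo_frozen_lake/get_frozen_lake_data.py \
    --local_dir examples/data/frozenlake \
    --train_size 10000 --test_size 100 --map_max_size 6
```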