Commit 3861859

Add bench results for frozenlake and alfworld (#416)

1 parent 34c3a53 commit 3861859

11 files changed: +414 −28 lines

benchmark/bench.py

Lines changed: 33 additions & 2 deletions

```diff
@@ -105,16 +105,27 @@ def check_taskset_path(dataset_name: str, taskset_path: str) -> str:
     if dataset_name == "gsm8k" and taskset_path == "openai/gsm8k":
         return taskset_path
 
+    base_dir = os.path.dirname(__file__)
+    frozenlake_data_script_path = os.path.abspath(
+        os.path.join(
+            base_dir,
+            "..",
+            "examples",
+            "grpo_frozen_lake",
+            "get_frozen_lake_data.py",
+        )
+    )
     dataset_script_map = {
         "countdown": "gen_countdown_data.py",
         "guru_math": "gen_guru_math_data.py",
+        "alfworld": "get_alfworld_full_data.py",
+        "frozenlake": frozenlake_data_script_path,
     }
     if dataset_name not in dataset_script_map:
         raise ValueError(
             f"Unsupported dataset: {dataset_name}. Please specify a valid taskset path."
         )
 
-    base_dir = os.path.dirname(__file__)
     script_filename = dataset_script_map[dataset_name]
     script_module_name = script_filename[:-3]  # remove .py
 
@@ -134,6 +145,13 @@ def check_taskset_path(dataset_name: str, taskset_path: str) -> str:
         taskset_path = module.DEFAULT_DATA_PATH
     taskset_path = os.path.realpath(taskset_path)
 
+    # For frozenlake, check if train.parquet and test.parquet already exist
+    if dataset_name == "frozenlake":
+        train_path = os.path.join(taskset_path, "train.parquet")
+        test_path = os.path.join(taskset_path, "test.parquet")
+        if os.path.exists(train_path) and os.path.exists(test_path):
+            return taskset_path
+
     gen_script_path = os.path.join(base_dir, "scripts", script_filename)
     subprocess.run([sys.executable, gen_script_path, "--local_dir", taskset_path], check=True)
 
@@ -168,11 +186,20 @@ def prepare_configs(args, rank, current_time):
     )
     if args.critic_lr:
         config["trainer"]["trainer_config"]["critic"]["optim"]["lr"] = args.critic_lr
+    if args.dataset == "alfworld":
+        print(
+            "Warning: The current benchmark script of ALFWorld only supports GRPO; the SFT stage will be supported soon."
+        )
     taskset_config = config["buffer"]["explorer_input"]["taskset"]
     taskset_config["path"] = check_taskset_path(
         args.dataset,
         args.taskset_path or os.environ.get("TASKSET_PATH") or taskset_config["path"],
     )
+    eval_taskset_configs = config["buffer"]["explorer_input"]["eval_tasksets"]
+    if len(eval_taskset_configs) > 0:
+        # TODO: support separately setting the path for the eval taskset
+        for eval_taskset_config in eval_taskset_configs:
+            eval_taskset_config["path"] = taskset_config["path"]
     if args.lr:
         config["algorithm"]["optimizer"]["lr"] = args.lr
     if args.sync_interval:
@@ -236,7 +263,11 @@ def main(args):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("dataset", type=str.lower, choices=["gsm8k", "countdown", "guru_math"])
+    parser.add_argument(
+        "dataset",
+        type=str.lower,
+        choices=["gsm8k", "countdown", "guru_math", "alfworld", "frozenlake"],
+    )
     parser.add_argument(
         "--dlc", action="store_true", help="Specify when running in Aliyun PAI DLC."
     )
```
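The new frozenlake branch in `check_taskset_path` skips data generation when the parquet files are already present. A condensed, standalone sketch of that resolve-or-generate pattern (with the generation script path passed in explicitly, rather than looked up as in `bench.py`):

```python
import os
import subprocess
import sys


def resolve_taskset(dataset_name: str, taskset_path: str, gen_script: str) -> str:
    """Return taskset_path, regenerating the data only when it is missing."""
    taskset_path = os.path.realpath(taskset_path)
    if dataset_name == "frozenlake":
        train = os.path.join(taskset_path, "train.parquet")
        test = os.path.join(taskset_path, "test.parquet")
        if os.path.exists(train) and os.path.exists(test):
            return taskset_path  # reuse previously generated parquet files
    # Otherwise invoke the generation script, as bench.py does via subprocess.
    subprocess.run([sys.executable, gen_script, "--local_dir", taskset_path], check=True)
    return taskset_path
```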
Lines changed: 86 additions & 0 deletions

New file (ALFWorld benchmark configuration):

```yaml
mode: both
project: Trinity-RFT
group: ${oc.env:TRINITY_GROUP,os-bench}
name: ${oc.env:TRINITY_NAME,alfworld}
checkpoint_root_dir: placeholder
algorithm:
  algorithm_type: grpo
  repeat_times: 8
  loss_agg_mode: "seq-mean-token-sum"
  optimizer:
    lr: 1e-6
  sample_strategy: warmup
  policy_loss_fn: ppo
  advantage_fn: grpo
  kl_penalty_fn: none
  kl_loss_fn: k2
  entropy_loss_fn: default
  kl_loss_fn_args:
    kl_coef: 0.001
data_processor: {}
model:
  model_path: placeholder
  max_prompt_tokens: 10240
  max_response_tokens: 4096
cluster:
  node_num: 1
  gpu_per_node: 8
buffer:
  batch_size: 32
  total_epochs: 5
  explorer_input:
    taskset:
      name: alfworld
      split: train
      storage_type: file
      path: null
      format:
        prompt_key: 'game_file'
      rollout_args:
        temperature: 1.0
        logprobs: 0
    eval_tasksets:
      - name: alfworld
        split: test
        storage_type: file
        path: null
        format:
          prompt_key: 'game_file'
        rollout_args:
          temperature: 1.0
          logprobs: 0
    default_workflow_type: 'alfworld_workflow'
explorer:
  eval_on_startup: true
  eval_interval: 10
  runner_per_model: 8
  max_timeout: 3600
  max_retry_times: 2
  rollout_model:
    engine_num: 4
    tensor_parallel_size: 1
    enforce_eager: false
    enable_prefix_caching: false
    enable_chunked_prefill: true
    gpu_memory_utilization: 0.7
    dtype: bfloat16
    seed: 42
    enable_thinking: false
    enable_openai_api: false
  auxiliary_models: []
  bench_on_latest_checkpoint: true
trainer:
  trainer_type: verl
  save_interval: 1000
  enable_preview: true
  grad_clip: 1.0
  use_dynamic_bsz: true
  max_token_len_per_gpu: 16384
  ulysses_sequence_parallel_size: 1
monitor:
  monitor_type: wandb
synchronizer:
  sync_method: nccl
  sync_style: fixed
  sync_interval: 1
  sync_timeout: 3600
```
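`prepare_configs` in `bench.py` overwrites a few fields of this config before launching, notably the taskset paths. A minimal sketch of that overriding pattern, assuming PyYAML is available and using hypothetical local file names and a placeholder data path:

```python
import os

import yaml  # PyYAML, assumed available

# Load the benchmark config and point the training and eval tasksets at the
# same local data path, mirroring what prepare_configs does in bench.py.
with open("alfworld.yaml") as f:  # hypothetical file name
    config = yaml.safe_load(f)

taskset_path = os.environ.get("TASKSET_PATH", "/data/alfworld")  # placeholder
config["buffer"]["explorer_input"]["taskset"]["path"] = taskset_path
for eval_taskset in config["buffer"]["explorer_input"]["eval_tasksets"]:
    eval_taskset["path"] = taskset_path

with open("alfworld_patched.yaml", "w") as f:  # hypothetical output name
    yaml.safe_dump(config, f)
```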
Lines changed: 91 additions & 0 deletions

New file (Frozen Lake benchmark configuration):

```yaml
mode: both
project: Trinity-RFT
group: ${oc.env:TRINITY_GROUP,frozenlake-bench}
name: ${oc.env:TRINITY_NAME,frozenlake}
checkpoint_root_dir: placeholder
algorithm:
  algorithm_type: grpo
  repeat_times: 8
  loss_agg_mode: "seq-mean-token-sum"
  optimizer:
    lr: 1e-6
  policy_loss_fn: ppo
  advantage_fn: grpo
  kl_penalty_fn: none
  kl_loss_fn: k2
  entropy_loss_fn: default
  policy_loss_fn_args:
    clip_range_low: 0.2
    clip_range_high: 0.28
  kl_loss_fn_args:
    kl_coef: 0.0
data_processor: {}
model:
  model_path: Qwen/Qwen2.5-3B-Instruct
  max_prompt_tokens: 4096
  max_response_tokens: 10240
cluster:
  node_num: 1
  gpu_per_node: 8
buffer:
  batch_size: 64
  total_epochs: 3
  explorer_input:
    taskset:
      name: frozenlake
      storage_type: file
      path: null
      split: train
      workflow_args:
        env_max_steps: 8
        agent_max_steps: 10
        is_slippery: false
    eval_tasksets:
      - name: frozenlake
        storage_type: file
        path: null
        split: test
        workflow_args:
          env_max_steps: 8
          agent_max_steps: 10
          is_slippery: false
        repeat_times: 4
        rollout_args:
          temperature: 0.7
          top_p: 0.8
          top_k: 20
    default_workflow_type: 'frozen_lake_workflow'
explorer:
  eval_on_startup: true
  eval_interval: 10
  runner_per_model: 8
  max_timeout: 900
  max_retry_times: 2
  rollout_model:
    engine_num: 4
    tensor_parallel_size: 1
    enforce_eager: false
    enable_prefix_caching: false
    enable_chunked_prefill: true
    gpu_memory_utilization: 0.85
    dtype: bfloat16
    seed: 42
    enable_thinking: false
    enable_openai_api: false
  auxiliary_models: []
  bench_on_latest_checkpoint: true
trainer:
  trainer_type: verl
  save_interval: 1000
  enable_preview: true
  grad_clip: 1.0
  use_dynamic_bsz: true
  max_token_len_per_gpu: 16384
  ulysses_sequence_parallel_size: 1
# monitor:
#   monitor_type: wandb
synchronizer:
  sync_method: nccl
  sync_style: fixed
  sync_interval: 1
  sync_timeout: 1200
```
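This config enables asymmetric clipping via `clip_range_low: 0.2` and `clip_range_high: 0.28`. As a rough, generic illustration of what such parameters control in a PPO-style surrogate loss (a sketch assuming PyTorch tensors of per-token log-probabilities and advantages, not Trinity-RFT's actual loss code):

```python
import torch


def clipped_policy_loss(logprob, old_logprob, advantage,
                        clip_range_low=0.2, clip_range_high=0.28):
    """Asymmetric PPO-style clipped surrogate loss (per token, to be aggregated).

    Generic sketch only: illustrates the role of clip_range_low / clip_range_high.
    """
    ratio = torch.exp(logprob - old_logprob)
    clipped_ratio = torch.clamp(ratio, 1.0 - clip_range_low, 1.0 + clip_range_high)
    # Take the pessimistic (minimum) surrogate, as in standard PPO.
    return -torch.min(ratio * advantage, clipped_ratio * advantage)
```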

benchmark/reports/alfworld.md

Lines changed: 48 additions & 0 deletions
# ALFWorld Benchmark Results

## 1. Task Introduction

[ALFWorld](https://github.com/alfworld/alfworld) is a text-based interactive environment in which an agent completes household tasks in a simulated home, interacting with the environment through natural language commands.

The environment is configured as follows (a minimal interaction-loop sketch follows the list):
* Environment: Text-based interactive environment built on TextWorld
* Action Space: Commands such as `pick`, `go to`, `place`, etc.
* Reward Structure: +1 for successfully completing the task, -0.1 otherwise
* Maximum Steps: 30 (configurable via `max_env_steps`)
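As a rough illustration of the interaction pattern being benchmarked, the following sketch shows a generic observe/act loop with the reward scheme above; `env` and `agent` are hypothetical stand-ins, not the actual ALFWorld/TextWorld API:

```python
def rollout(env, agent, max_env_steps: int = 30) -> float:
    """Generic multi-turn loop: +1 on success, -0.1 otherwise."""
    observation = env.reset()
    for _ in range(max_env_steps):
        command = agent.act(observation)  # e.g. "go to desk 1", "pick up book"
        observation, done, success = env.step(command)
        if done:
            return 1.0 if success else -0.1
    return -0.1  # ran out of steps without completing the task
```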

See the [documentation](https://modelscope.github.io/Trinity-RFT/en/main/tutorial/example_multi_turn.html) for data preparation.

## 2. Experimental Settings

We evaluate performance in the Trinity-RFT framework, version [0.3.3](https://github.com/modelscope/Trinity-RFT/releases/tag/v0.3.3) (verl==0.5.0, vllm==0.11.0), and compare against the latest release of rLLM at commit [ef6451f](https://github.com/rllm-org/rllm/commit/ef6451fbd7eba224c4a87e3fd944d7c0e2bcc0ea) (verl==0.5.0) as of Nov. 6, 2025.
Since rLLM does not yet support the ALFWorld environment, we implemented this task in rLLM for the comparison.

We evaluate both frameworks on this task with the GRPO algorithm.
We fine-tune a `Qwen2.5-3B-Instruct` model that has already been trained on an SFT dataset, applying GRPO to the training tasks. For both frameworks, we fix the key parameters to `batch_size=32`, `repeat_times=8`, `lr=1e-6`, and `kl_coef=0.001`.

For better efficiency, we use 64 rollout workers in rLLM, and set `explorer.engine_num` to 4 and `explorer.runner_per_model` to 8 in Trinity-RFT.

## 3. Results and Analysis

We compare the sample efficiency of the two frameworks by plotting the training reward and test score against training steps. As shown in the following figure, Trinity-RFT and rLLM reach similar training and test results at the same step.

![](../../docs/sphinx_doc/assets/bench_alfworld_step.png)

We further compare wall-clock efficiency on the ALFWorld task.
The following tables detail the wall-clock time each framework needs to reach the given performance thresholds, i.e., training reward = 0.8 and test score = 0.6.

| Method | Training Reward | Time to Reach Target (Hours) | Speedup |
|-------------|-----------------|------------------------------|---------|
| rLLM        | 0.830           | 9.33                         | -       |
| Trinity-RFT | 0.826           | 2.53                         | 3.69×   |

| Method | Test Score | Time to Reach Target (Hours) | Speedup |
|-------------|------------|------------------------------|---------|
| rLLM        | 0.670      | 6.65                         | -       |
| Trinity-RFT | 0.632      | 1.14                         | 5.83×   |

Trinity-RFT thus achieves a noticeable speedup on the ALFWorld task, as also visible in the wall-clock figure below.
The primary reason lies in the rollout mechanisms of the two frameworks: Trinity-RFT uses multiprocessing during rollout, whereas rLLM employs multithreading, which restricts rollout parallelism in the ALFWorld environment because the environment is not thread-safe (see [this issue](https://github.com/alfworld/alfworld/issues/71)).

![](../../docs/sphinx_doc/assets/bench_alfworld_time.png)
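To make the distinction concrete, here is a minimal standard-library sketch contrasting process-based and thread-based rollout dispatch; `run_episode` is a hypothetical stand-in for a single ALFWorld rollout, not the actual workflow code:

```python
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor


def run_episode(task_id: int) -> float:
    # Hypothetical stand-in for one ALFWorld rollout. The real environment
    # keeps non-thread-safe global state, so concurrent threads can interfere.
    return 0.0


if __name__ == "__main__":
    task_ids = list(range(32))

    # Process-based dispatch (multiprocessing): each worker process owns an
    # independent copy of the environment state.
    with ProcessPoolExecutor(max_workers=8) as pool:
        rewards = list(pool.map(run_episode, task_ids))

    # Thread-based dispatch (multithreading): all threads share one interpreter,
    # so a non-thread-safe environment limits achievable parallelism.
    with ThreadPoolExecutor(max_workers=8) as pool:
        rewards = list(pool.map(run_episode, task_ids))
```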

benchmark/reports/frozenlake.md

Lines changed: 48 additions & 0 deletions
# Frozen Lake Benchmark Results

## 1. Task Introduction

The [Frozen Lake](https://gymnasium.farama.org/environments/toy_text/frozen_lake/) task involves walking across a frozen lake from Start (S) to Goal (G) without falling into any Holes (H). We formulate it as a multi-step workflow in which the agent interacts with the environment over multiple steps to reach the goal.

The environment is configured as follows (a map-generation sketch follows the list):
* Map Size: From 2x2 to 5x5, randomly generated.
* Mode: Non-Slippery
* Action Space: Up, Down, Left, Right
* Reward Structure: +1 for reaching the goal, 0 otherwise.
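The settings above map directly onto Gymnasium's FrozenLake environment; a minimal sketch of generating a random non-slippery map and stepping through it (independent of the Trinity-RFT workflow code) might look like this:

```python
import gymnasium as gym
from gymnasium.envs.toy_text.frozen_lake import generate_random_map

# Build a random non-slippery map (the benchmark uses sizes between 2x2 and 5x5).
desc = generate_random_map(size=4)
env = gym.make("FrozenLake-v1", desc=desc, is_slippery=False)

obs, info = env.reset(seed=42)
done = False
while not done:
    action = env.action_space.sample()  # a trained agent would choose the action here
    obs, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated
env.close()
```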

The training and test data are generated by the following script:
```bash
python examples/grpo_frozen_lake/get_frozen_lake_data.py
```
This command generates 10,000 training tasks and 100 test tasks.

To filter out unsolvable tasks, we require each generated map to have a valid path of at most `env_max_steps=8` steps; the agent may take at most `agent_max_steps=10` steps to reach the goal.
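The filtering criterion above (a valid path within `env_max_steps=8` moves) can be checked with a simple breadth-first search over the map grid; a standalone sketch of such a check (not the actual data-generation script) is:

```python
from collections import deque


def solvable_within(desc: list[str], max_steps: int = 8) -> bool:
    """Return True if 'G' is reachable from 'S' within max_steps moves,
    never stepping on a hole ('H'). `desc` is a square FrozenLake map,
    e.g. ["SFFF", "FHFH", "FFFH", "HFFG"]."""
    n = len(desc)
    start = next((r, c) for r in range(n) for c in range(n) if desc[r][c] == "S")
    queue = deque([(start, 0)])
    seen = {start}
    while queue:
        (r, c), steps = queue.popleft()
        if desc[r][c] == "G":
            return True
        if steps == max_steps:
            continue
        for dr, dc in ((1, 0), (-1, 0), (0, 1), (0, -1)):
            nr, nc = r + dr, c + dc
            if 0 <= nr < n and 0 <= nc < n and desc[nr][nc] != "H" and (nr, nc) not in seen:
                seen.add((nr, nc))
                queue.append(((nr, nc), steps + 1))
    return False
```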

## 2. Experimental Settings

We evaluate performance in the Trinity-RFT framework, version [0.3.3](https://github.com/modelscope/Trinity-RFT/releases/tag/v0.3.3) (verl==0.5.0, vllm==0.11.0), and compare against the latest release of rLLM at commit [ef6451f](https://github.com/rllm-org/rllm/commit/ef6451fbd7eba224c4a87e3fd944d7c0e2bcc0ea) (verl==0.5.0) as of Nov. 6, 2025.

We fine-tune a Qwen2.5-3B-Instruct model on the training tasks with GRPO. For all experiments, we fix the key parameters to `batch_size=64`, `repeat_times=8`, and `lr=1e-6`. We run each experiment three times and report the average results.

For a fair comparison, we tune the efficiency-related configurations of both frameworks. For rLLM, we adopt the default configuration in `examples/frozenlake/train_frozenlake_agent.sh`, except that we increase the batch size to 64 for stability and set the number of rollout workers to 64 for efficiency. For Trinity-RFT, we set `explorer.engine_num` to 4 for efficiency.

## 3. Results and Analysis

We compare the sample efficiency of the two frameworks by plotting the training reward and test score in the following figure. At the same step, Trinity-RFT and rLLM achieve similar rewards and test scores, verifying the correctness of training.

![](../../docs/sphinx_doc/assets/bench_frozenlake_step.png)

The following tables detail the wall-clock time each framework needs to reach the target performance thresholds: training reward = 0.6, training reward = 0.8, and test score = 0.8. Trinity-RFT reaches every target in less time.

| Target | Method | Training Reward | Time to Reach Target (Hours) | Speedup |
|--------|-------------|-----------------|------------------------------|---------|
| 0.6    | rLLM        | 0.6023          | 3.967                        | -       |
| 0.6    | Trinity-RFT | 0.6188          | 2.87                         | 1.38×   |
| 0.8    | rLLM        | 0.8007          | 5.91                         | -       |
| 0.8    | Trinity-RFT | 0.8033          | 5.44                         | 1.09×   |

| Target | Method | Test Score | Time to Reach Target (Hours) | Speedup |
|--------|-------------|------------|------------------------------|---------|
| 0.8    | rLLM        | 0.8096     | 6.82                         | -       |
| 0.8    | Trinity-RFT | 0.8262     | 5.15                         | 1.32×   |
