diff --git a/benchmark/bench.py b/benchmark/bench.py
index 6adf062d9d..ac336d904a 100644
--- a/benchmark/bench.py
+++ b/benchmark/bench.py
@@ -105,16 +105,27 @@ def check_taskset_path(dataset_name: str, taskset_path: str) -> str:
     if dataset_name == "gsm8k" and taskset_path == "openai/gsm8k":
         return taskset_path
 
+    base_dir = os.path.dirname(__file__)
+    frozenlake_data_script_path = os.path.abspath(
+        os.path.join(
+            base_dir,
+            "..",
+            "examples",
+            "grpo_frozen_lake",
+            "get_frozen_lake_data.py",
+        )
+    )
     dataset_script_map = {
         "countdown": "gen_countdown_data.py",
         "guru_math": "gen_guru_math_data.py",
+        "alfworld": "get_alfworld_full_data.py",
+        "frozenlake": frozenlake_data_script_path,
     }
     if dataset_name not in dataset_script_map:
         raise ValueError(
             f"Unsupported dataset: {dataset_name}. Please specify a valid taskset path."
         )
-    base_dir = os.path.dirname(__file__)
     script_filename = dataset_script_map[dataset_name]
     script_module_name = script_filename[:-3]  # remove .py
@@ -134,6 +145,13 @@ def check_taskset_path(dataset_name: str, taskset_path: str) -> str:
         taskset_path = module.DEFAULT_DATA_PATH
     taskset_path = os.path.realpath(taskset_path)
 
+    # For frozenlake, skip regeneration if train.parquet and test.parquet already exist
+    if dataset_name == "frozenlake":
+        train_path = os.path.join(taskset_path, "train.parquet")
+        test_path = os.path.join(taskset_path, "test.parquet")
+        if os.path.exists(train_path) and os.path.exists(test_path):
+            return taskset_path
+
     gen_script_path = os.path.join(base_dir, "scripts", script_filename)
     subprocess.run([sys.executable, gen_script_path, "--local_dir", taskset_path], check=True)
@@ -168,11 +186,20 @@ def prepare_configs(args, rank, current_time):
     )
     if args.critic_lr:
         config["trainer"]["trainer_config"]["critic"]["optim"]["lr"] = args.critic_lr
+    if args.dataset == "alfworld":
+        print(
+            "Warning: the ALFWorld benchmark script currently supports GRPO only; the SFT stage will be supported soon."
+        )
     taskset_config = config["buffer"]["explorer_input"]["taskset"]
     taskset_config["path"] = check_taskset_path(
         args.dataset,
         args.taskset_path or os.environ.get("TASKSET_PATH") or taskset_config["path"],
     )
+    eval_taskset_configs = config["buffer"]["explorer_input"]["eval_tasksets"]
+    if len(eval_taskset_configs) > 0:
+        # TODO: support separately setting the path for eval tasksets
+        for eval_taskset_config in eval_taskset_configs:
+            eval_taskset_config["path"] = taskset_config["path"]
     if args.lr:
         config["algorithm"]["optimizer"]["lr"] = args.lr
     if args.sync_interval:
@@ -236,7 +263,11 @@ def main(args):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("dataset", type=str.lower, choices=["gsm8k", "countdown", "guru_math"])
+    parser.add_argument(
+        "dataset",
+        type=str.lower,
+        choices=["gsm8k", "countdown", "guru_math", "alfworld", "frozenlake"],
+    )
     parser.add_argument(
         "--dlc", action="store_true", help="Specify when running in Aliyun PAI DLC."
     )
diff --git a/benchmark/config/alfworld-template.yaml b/benchmark/config/alfworld-template.yaml
new file mode 100644
index 0000000000..edd0626624
--- /dev/null
+++ b/benchmark/config/alfworld-template.yaml
@@ -0,0 +1,86 @@
+mode: both
+project: Trinity-RFT
+group: ${oc.env:TRINITY_GROUP,os-bench}
+name: ${oc.env:TRINITY_NAME,alfworld}
+checkpoint_root_dir: placeholder
+algorithm:
+  algorithm_type: grpo
+  repeat_times: 8
+  loss_agg_mode: "seq-mean-token-sum"
+  optimizer:
+    lr: 1e-6
+  sample_strategy: warmup
+  policy_loss_fn: ppo
+  advantage_fn: grpo
+  kl_penalty_fn: none
+  kl_loss_fn: k2
+  entropy_loss_fn: default
+  kl_loss_fn_args:
+    kl_coef: 0.001
+data_processor: {}
+model:
+  model_path: placeholder
+  max_prompt_tokens: 10240
+  max_response_tokens: 4096
+cluster:
+  node_num: 1
+  gpu_per_node: 8
+buffer:
+  batch_size: 32
+  total_epochs: 5
+  explorer_input:
+    taskset:
+      name: alfworld
+      split: train
+      storage_type: file
+      path: null
+      format:
+        prompt_key: 'game_file'
+      rollout_args:
+        temperature: 1.0
+        logprobs: 0
+    eval_tasksets:
+      - name: alfworld
+        split: test
+        storage_type: file
+        path: null
+        format:
+          prompt_key: 'game_file'
+        rollout_args:
+          temperature: 1.0
+          logprobs: 0
+    default_workflow_type: 'alfworld_workflow'
+explorer:
+  eval_on_startup: true
+  eval_interval: 10
+  runner_per_model: 8
+  max_timeout: 3600
+  max_retry_times: 2
+  rollout_model:
+    engine_num: 4
+    tensor_parallel_size: 1
+    enforce_eager: false
+    enable_prefix_caching: false
+    enable_chunked_prefill: true
+    gpu_memory_utilization: 0.7
+    dtype: bfloat16
+    seed: 42
+    enable_thinking: false
+    enable_openai_api: false
+  auxiliary_models: []
+  bench_on_latest_checkpoint: true
+trainer:
+  trainer_type: verl
+  save_interval: 1000
+  enable_preview: true
+  grad_clip: 1.0
+  use_dynamic_bsz: true
+  max_token_len_per_gpu: 16384
+  ulysses_sequence_parallel_size: 1
+monitor:
+  monitor_type: wandb
+synchronizer:
+  sync_method: nccl
+  sync_style: fixed
+  sync_interval: 1
+  sync_timeout: 3600
diff --git a/benchmark/config/frozenlake-template.yaml b/benchmark/config/frozenlake-template.yaml
new file mode 100644
index 0000000000..7208b19c76
--- /dev/null
+++ b/benchmark/config/frozenlake-template.yaml
@@ -0,0 +1,91 @@
+mode: both
+project: Trinity-RFT
+group: ${oc.env:TRINITY_GROUP,frozenlake-bench}
+name: ${oc.env:TRINITY_NAME,frozenlake}
+checkpoint_root_dir: placeholder
+algorithm:
+  algorithm_type: grpo
+  repeat_times: 8
+  loss_agg_mode: "seq-mean-token-sum"
+  optimizer:
+    lr: 1e-6
+  policy_loss_fn: ppo
+  advantage_fn: grpo
+  kl_penalty_fn: none
+  kl_loss_fn: k2
+  entropy_loss_fn: default
+  policy_loss_fn_args:
+    clip_range_low: 0.2
+    clip_range_high: 0.28
+  kl_loss_fn_args:
+    kl_coef: 0.0
+data_processor: {}
+model:
+  model_path: Qwen/Qwen2.5-3B-Instruct
+  max_prompt_tokens: 4096
+  max_response_tokens: 10240
+cluster:
+  node_num: 1
+  gpu_per_node: 8
+buffer:
+  batch_size: 64
+  total_epochs: 3
+  explorer_input:
+    taskset:
+      name: frozenlake
+      storage_type: file
+      path: null
+      split: train
+      workflow_args:
+        env_max_steps: 8
+        agent_max_steps: 10
+        is_slippery: false
+    eval_tasksets:
+      - name: frozenlake
+        storage_type: file
+        path: null
+        split: test
+        workflow_args:
+          env_max_steps: 8
+          agent_max_steps: 10
+          is_slippery: false
+        repeat_times: 4
+        rollout_args:
+          temperature: 0.7
+          top_p: 0.8
+          top_k: 20
+    default_workflow_type: 'frozen_lake_workflow'
+explorer:
+  eval_on_startup: true
+  eval_interval: 10
+  runner_per_model: 8
+  max_timeout: 900
+  max_retry_times: 2
+  rollout_model:
+    engine_num: 4
+    tensor_parallel_size: 1
+    enforce_eager: false
+    enable_prefix_caching: false
+    enable_chunked_prefill: true
+    gpu_memory_utilization: 0.85
+    dtype: bfloat16
+    seed: 42
+    enable_thinking: false
+    enable_openai_api: false
+  auxiliary_models: []
+  bench_on_latest_checkpoint: true
+trainer:
+  trainer_type: verl
+  save_interval: 1000
+  enable_preview: true
+  grad_clip: 1.0
+  use_dynamic_bsz: true
+  max_token_len_per_gpu: 16384
+  ulysses_sequence_parallel_size: 1
+# monitor:
+#   monitor_type: wandb
+synchronizer:
+  sync_method: nccl
+  sync_style: fixed
+  sync_interval: 1
+  sync_timeout: 1200
diff --git a/benchmark/reports/alfworld.md b/benchmark/reports/alfworld.md
new file mode 100644
index 0000000000..e6663478f7
--- /dev/null
+++ b/benchmark/reports/alfworld.md
@@ -0,0 +1,48 @@
+# ALFWorld Benchmark Results
+
+## 1. Task Introduction
+
+[ALFWorld](https://github.com/alfworld/alfworld) is a text-based interactive environment in which an agent completes household tasks in a virtual home. The agent interacts with the environment through natural-language commands.
+
+The environment is configured as follows:
+* Environment: Text-based interactive environment built on TextWorld
+* Action Space: Commands such as `pick`, `go to`, `place`, etc.
+* Reward Structure: +1 for successfully completing the task, -0.1 otherwise
+* Maximum Steps: 30 (configurable via `max_env_steps`)
+
+See the [documentation](https://modelscope.github.io/Trinity-RFT/en/main/tutorial/example_multi_turn.html) for data preparation.
+
+## 2. Experimental Settings
+
+We evaluate performance using the Trinity-RFT framework, version [0.3.3](https://github.com/modelscope/Trinity-RFT/releases/tag/v0.3.3) (verl==0.5.0, vllm==0.11.0), and compare against the latest release of rLLM, commit [ef6451f](https://github.com/rllm-org/rllm/commit/ef6451fbd7eba224c4a87e3fd944d7c0e2bcc0ea) (verl==0.5.0), as of Nov. 6, 2025.
+Since rLLM does not yet support the ALFWorld environment, we implemented this task in rLLM for the comparison.
+
+We evaluate the GRPO algorithm on this task in both Trinity-RFT and rLLM.
+Specifically, we fine-tune a `Qwen2.5-3B-Instruct` model that has already been trained on an SFT dataset. For all methods, we fix the key parameters to `batch_size=32`, `repeat_times=8`, `lr=1e-6`, and `kl_coef=0.001`.
+
+For better efficiency, we use 64 rollout workers in rLLM, and set `explorer.engine_num` to 4 and `explorer.runner_per_model` to 8 in Trinity-RFT.
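+
+With the dataset prepared, the benchmark is launched through the common entry point. A minimal sketch (assuming the `bench.py` interface added in this patch; the taskset path can also be supplied via the `--taskset_path` flag or the `TASKSET_PATH` environment variable):
+
+```bash
+# Launch the ALFWorld benchmark; when no taskset path is given, the game
+# files are downloaded and the taskset is generated automatically.
+python benchmark/bench.py alfworld
+```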
+
+## 3. Results and Analysis
+
+We compare the sample efficiency of the two frameworks by plotting the reward and test score against training steps. As shown in the following figures, Trinity-RFT and rLLM reach similar training and test results at the same step.
+
+![](../../docs/sphinx_doc/assets/bench_alfworld_step.png)
+
+We further compare wall-clock efficiency on the ALFWorld task.
+The following tables detail the time each method needs to reach the specified performance thresholds, i.e., reward = 0.8 and test score = 0.6.
+
+| Method | Training Reward | Time to Reach Target (Hours) | Speedup |
+|----------|------------------|-------------------------------|---------|
+| rLLM | 0.830 | 9.33 | - |
+| Trinity-RFT | 0.826 | 2.53 | 3.69× |
+
+| Method | Test Score | Time to Reach Target (Hours) | Speedup |
+|----------|------------------|-------------------------------|---------|
+| rLLM | 0.670 | 6.65 | - |
+| Trinity-RFT | 0.632 | 1.14 | 5.83× |
+
+Trinity-RFT achieves a noticeable speedup on the ALFWorld task, as also shown in the following figures.
+The speedup stems primarily from the difference between the two rollout mechanisms: Trinity-RFT uses multiprocessing during rollout, whereas rLLM employs multithreading, which restricts the parallelism of the rollout process because the ALFWorld environment is not thread-safe (see [this issue](https://github.com/alfworld/alfworld/issues/71)).
+
+![](../../docs/sphinx_doc/assets/bench_alfworld_time.png)
diff --git a/benchmark/reports/frozenlake.md b/benchmark/reports/frozenlake.md
new file mode 100644
index 0000000000..f9fb864c3c
--- /dev/null
+++ b/benchmark/reports/frozenlake.md
@@ -0,0 +1,48 @@
+# Frozen Lake Benchmark Results
+
+## 1. Task Introduction
+
+[Frozen Lake](https://gymnasium.farama.org/environments/toy_text/frozen_lake/) involves walking over a frozen lake from Start (S) to Goal (G) without falling into any Holes (H). We formulate this task as a multi-step workflow, in which the agent interacts with the environment over multiple steps to reach the goal.
+
+The environment is configured as follows:
+* Map Size: From 2x2 to 5x5, randomly generated.
+* Mode: Non-Slippery
+* Action Space: Up, Down, Left, Right
+* Reward Structure: +1 for reaching the goal, 0 otherwise.
+
+The training and test data are generated by the following script:
+```bash
+python examples/grpo_frozen_lake/get_frozen_lake_data.py
+```
+This command generates 10000 training tasks and 100 test tasks.
+
+To filter out unsolvable tasks, we require each game map to have a valid path of at most `env_max_steps=8` steps. Moreover, the agent can take at most `agent_max_steps=10` steps to reach the goal.
+
+
+## 2. Experimental Settings
+
+We evaluate performance using the Trinity-RFT framework, version [0.3.3](https://github.com/modelscope/Trinity-RFT/releases/tag/v0.3.3) (verl==0.5.0, vllm==0.11.0), and compare against the latest release of rLLM, commit [ef6451f](https://github.com/rllm-org/rllm/commit/ef6451fbd7eba224c4a87e3fd944d7c0e2bcc0ea) (verl==0.5.0), as of Nov. 6, 2025.
+
+We fine-tune a Qwen2.5-3B-Instruct model on the training tasks with GRPO. For all experiments, we fix the key parameters to `batch_size=64`, `repeat_times=8`, and `lr=1e-6`. We run each experiment three times and report the average results.
+
+For a fair comparison, we tune the efficiency-related configurations of both frameworks. For rLLM, we adopt the default configuration in `examples/frozenlake/train_frozenlake_agent.sh`, except that we increase the batch size to 64 for stability and set the number of rollout workers to 64 for efficiency. For Trinity-RFT, we set `explorer.engine_num` to 4 for efficiency.
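+
+The benchmark run itself goes through the shared entry point. A minimal sketch (assuming the `bench.py` interface added in this patch; the path below is the data script's default output directory):
+
+```bash
+# Launch the Frozen Lake benchmark, reusing a pre-generated taskset;
+# existing train/test parquet files are detected and not regenerated.
+python benchmark/bench.py frozenlake --taskset_path examples/data/frozenlake
+```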
+
+## 3. Results and Analysis
+
+We compare the sample efficiency of the two frameworks by plotting the reward and test score in the following figures. At the same step, Trinity-RFT and rLLM achieve similar rewards and test scores, verifying the training correctness.
+
+![](../../docs/sphinx_doc/assets/bench_frozenlake_step.png)
+
+The following tables detail the wall-clock time required for each method to reach a given performance threshold. Trinity-RFT requires less time to reach each target, i.e., reward = 0.6, reward = 0.8, and test score = 0.8.
+
+| Method | Training Reward | Time to Reach Target (Hours) | Speedup |
+|----------|------------------|-------------------------------|---------|
+| rLLM | 0.6023 | 3.97 | - |
+| Trinity-RFT | 0.6188 | 2.87 | 1.38× |
+| rLLM | 0.8007 | 5.91 | - |
+| Trinity-RFT | 0.8033 | 5.44 | 1.09× |
+
+| Method | Test Score | Time to Reach Target (Hours) | Speedup |
+|----------|------------------|-------------------------------|---------|
+| rLLM | 0.8096 | 6.82 | - |
+| Trinity-RFT | 0.8262 | 5.15 | 1.32× |
diff --git a/benchmark/scripts/get_alfworld_full_data.py b/benchmark/scripts/get_alfworld_full_data.py
new file mode 100644
index 0000000000..bc75af7896
--- /dev/null
+++ b/benchmark/scripts/get_alfworld_full_data.py
@@ -0,0 +1,51 @@
+import argparse
+import os
+import subprocess
+import sys
+
+DEFAULT_DATA_PATH = os.path.join(
+    os.path.dirname(os.path.abspath(__file__)), "..", "data", "alfworld"
+)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--local_dir", default=DEFAULT_DATA_PATH)
+    args = parser.parse_args()
+
+    # Step 1: Get all game files from Huggingface
+    game_data_dir = os.path.join(args.local_dir, "..", "alfworld_game_data")
+    if os.path.exists(game_data_dir) and os.path.exists(os.path.join(game_data_dir, "json_2.1.1")):
+        print(f"Game data directory already exists: {game_data_dir}")
+
+    else:
+        os.makedirs(game_data_dir, exist_ok=True)
+        subprocess.run([sys.executable, "-m", "pip", "install", "alfworld[full]"], check=True)
+        # Set environment variable for the alfworld-download command
+        env = os.environ.copy()
+        env["ALFWORLD_DATA"] = game_data_dir
+        subprocess.run(["alfworld-download"], check=True, env=env)
+
+    # Step 2: Run the script to generate the mapping file
+    base_dir = os.path.dirname(__file__)
+    data_prepare_path = os.path.abspath(
+        os.path.join(
+            base_dir,
+            "..",
+            "..",
+            "examples",
+            "grpo_alfworld",
+            "get_alfworld_data.py",
+        )
+    )
+    subprocess.run(
+        [
+            sys.executable,
+            data_prepare_path,
+            "--game_data_path",
+            game_data_dir,
+            "--local_dir",
+            args.local_dir,
+        ],
+        check=True,
+    )
diff --git a/docs/sphinx_doc/assets/bench_alfworld_step.png b/docs/sphinx_doc/assets/bench_alfworld_step.png
new file mode 100644
index 0000000000..998fcdc8e2
Binary files /dev/null and b/docs/sphinx_doc/assets/bench_alfworld_step.png differ
diff --git a/docs/sphinx_doc/assets/bench_alfworld_time.png b/docs/sphinx_doc/assets/bench_alfworld_time.png
new file mode 100644
index 0000000000..502cb40db4
Binary files /dev/null and b/docs/sphinx_doc/assets/bench_alfworld_time.png differ
diff --git a/docs/sphinx_doc/assets/bench_frozenlake_step.png b/docs/sphinx_doc/assets/bench_frozenlake_step.png
new file mode 100644
index 0000000000..2177133cd9
Binary files /dev/null and b/docs/sphinx_doc/assets/bench_frozenlake_step.png differ
diff --git a/examples/grpo_alfworld/get_alfworld_data.py b/examples/grpo_alfworld/get_alfworld_data.py
index 9989e8bffa..8cdccbe569 100644
--- a/examples/grpo_alfworld/get_alfworld_data.py
+++ b/examples/grpo_alfworld/get_alfworld_data.py
@@ -2,6 +2,7 @@
 We use this script to create the huggingface format dataset files for the alfworld dataset.
 NOTE: You need to install the alfworld dataset first: https://github.com/alfworld/alfworld
 """
+import argparse
 import glob
 import json
 import os
@@ -10,16 +11,13 @@
 random.seed(42)
 
 
-def create_dataset_files(output_dir, train_size=None, test_size=None):
-    # The ALFWORLD_DATA is the dataset path in the environment variable ALFWORLD_DATA, you need to set it when install alfworld dataset
-    from alfworld.info import ALFWORLD_DATA
-
+def create_dataset_files(game_data_path, output_dir, train_size=None, test_size=None):
     # get all matched game files from train and valid_seen directories
     train_game_files = glob.glob(
-        os.path.expanduser(f"{ALFWORLD_DATA}/json_2.1.1/train/*/*/game.tw-pddl")
+        os.path.expanduser(f"{game_data_path}/json_2.1.1/train/*/*/game.tw-pddl")
     )
     test_game_files = glob.glob(
-        os.path.expanduser(f"{ALFWORLD_DATA}/json_2.1.1/valid_seen/*/*/game.tw-pddl")
+        os.path.expanduser(f"{game_data_path}/json_2.1.1/valid_seen/*/*/game.tw-pddl")
     )
 
     # get absolute path
@@ -86,8 +84,28 @@ def create_dataset_files(output_dir, train_size=None, test_size=None):
 
 
 if __name__ == "__main__":
-    current_file_dir = os.path.dirname(os.path.abspath(__file__))
-    output_dir = f"{current_file_dir}/alfworld_data"
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--game_data_path", type=str, default=None, required=False)
+    parser.add_argument("--local_dir", type=str, default=None, required=False)
+    parser.add_argument("--train_size", type=int, default=None, required=False)
+    parser.add_argument("--test_size", type=int, default=None, required=False)
+    args = parser.parse_args()
+
+    if args.game_data_path is None:
+        # ALFWORLD_DATA is the dataset path read from the environment variable of
+        # the same name; you need to set it when installing the alfworld dataset
+        from alfworld.info import ALFWORLD_DATA
+
+        args.game_data_path = ALFWORLD_DATA
+
+    if args.local_dir is None:
+        current_file_dir = os.path.dirname(os.path.abspath(__file__))
+        args.local_dir = f"{current_file_dir}/alfworld_data"
+
     # use all data by default, or specify train_size and test_size if needed
-    create_dataset_files(output_dir)
-    # create_dataset_files(output_dir, train_size=1024, test_size=100)  # use subset of data for testing
+    create_dataset_files(
+        game_data_path=args.game_data_path,
+        output_dir=args.local_dir,
+        train_size=args.train_size,
+        test_size=args.test_size,
+    )
diff --git a/examples/grpo_frozen_lake/get_frozen_lake_data.py b/examples/grpo_frozen_lake/get_frozen_lake_data.py
index 17b4aae87b..65c7e4102d 100644
--- a/examples/grpo_frozen_lake/get_frozen_lake_data.py
+++ b/examples/grpo_frozen_lake/get_frozen_lake_data.py
@@ -1,6 +1,7 @@
 """
 Modified from https://github.com/rllm-org/rllm/blob/main/examples/frozenlake/prepare_frozenlake_data.py
 """
+import argparse
 import os
 
 import numpy as np
@@ -8,46 +9,45 @@
 from trinity.common.constants import TASKSET_PATH_ENV_VAR
 
-path_from_env = os.environ.get(TASKSET_PATH_ENV_VAR)
-if path_from_env is not None:
-    DATA_ROOT_DIR = os.path.dirname(path_from_env)
-else:
-    DATA_ROOT_DIR = os.path.join(os.path.dirname(__file__), "data")
+DEFAULT_DATA_PATH = os.path.join(
+    os.path.dirname(os.path.abspath(__file__)), "..", "data", "frozenlake"
+)
 
 
-def save_dataset_to_local(name: str, data: list[dict], split: str = "default") -> str:
-    """Save dataset directly to local DATA_PATH.
+def save_dataset_to_local(data_path: str, data: list[dict], split: str = "default") -> str:
+    """Save dataset directly to local data_path.
 
     Args:
-        name: Name of the dataset
+        data_path: Path to save the dataset
         data: List of dictionaries containing the dataset examples
         split: Split name (e.g., 'train', 'test', 'default')
 
     Returns:
         str: Path to the saved parquet file
     """
-    dataset_dir = os.path.join(DATA_ROOT_DIR, name)
-    os.makedirs(dataset_dir, exist_ok=True)
+    os.makedirs(data_path, exist_ok=True)
 
     # Convert to DataFrame and save
     data_df = pd.DataFrame(data)
-    dataset_path = os.path.join(dataset_dir, f"{split}.parquet")
+    dataset_path = os.path.join(data_path, f"{split}.parquet")
     data_df.to_parquet(dataset_path)
 
     print(
-        f"Saved dataset '{name}' split '{split}' with {len(data)} examples at {dataset_path}. Make sure to set the environment variable {TASKSET_PATH_ENV_VAR} to {DATA_ROOT_DIR}/{name}."
+        f"Saved dataset frozenlake split '{split}' with {len(data)} examples at {dataset_path}. Make sure to set the environment variable {TASKSET_PATH_ENV_VAR} to {data_path}."
     )
     return dataset_path
 
 
-def prepare_frozenlake_data(train_size=10000, test_size=100, map_max_size=6):
+def prepare_frozenlake_data(data_path, train_size=10000, test_size=100, map_max_size=6):
     """
     Prepare and save FrozenLake datasets for training and testing.
 
     Args:
+        data_path (str): Path to save the dataset
         train_size (int): Number of training examples to generate
        test_size (int): Number of test examples to generate
+        map_max_size (int): Maximum size of the map
 
     Returns:
         tuple: (train_data, test_data) - Lists of data dictionaries
@@ -78,14 +78,27 @@ def frozenlake_process_fn(seed, size, p, idx):
     ]
 
     # Save datasets directly to local DATA_PATH
-    save_dataset_to_local("frozenlake", train_data, "train")
-    save_dataset_to_local("frozenlake", test_data, "test")
+    save_dataset_to_local(data_path, train_data, "train")
+    save_dataset_to_local(data_path, test_data, "test")
 
     return train_data, test_data
 
 
 if __name__ == "__main__":
-    train_data, test_data = prepare_frozenlake_data()
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--local_dir", default=DEFAULT_DATA_PATH)
+    parser.add_argument("--train_size", type=int, default=10000)
+    parser.add_argument("--test_size", type=int, default=100)
+    parser.add_argument("--map_max_size", type=int, default=6)
+    args = parser.parse_args()
+
+    train_data, test_data = prepare_frozenlake_data(
+        data_path=args.local_dir,
+        train_size=args.train_size,
+        test_size=args.test_size,
+        map_max_size=args.map_max_size,
+    )
+
     print(f"Train dataset: {len(train_data)} examples")
     print(f"Test dataset: {len(test_data)} examples")
     print("Sample train example:", train_data[0])
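
With the arguments above, the data script can be invoked directly. A minimal sketch (the values shown are the argparse defaults; `--local_dir` resolves to `examples/data/frozenlake` relative to the script's location when omitted):

```bash
# Regenerate the Frozen Lake taskset with explicit sizes and map bound.
python examples/grpo_frozen_lake/get_frozen_lake_data.py \
    --local_dir examples/data/frozenlake \
    --train_size 10000 --test_size 100 --map_max_size 6
```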