apply suggestions from reviews

chenyushuo · chenyushuo · commit 3d0f268e237c · 2025-12-01T11:56:55.000+08:00
diff --git a/benchmark/README.md b/benchmark/README.md
@@ -77,10 +77,10 @@ python bench.py countdown --model_path /path/to/Qwen/Qwen2.5-1.5B-Instruct
 The chart below shows performance based on this [commit](https://github.com/modelscope/Trinity-RFT/tree/068da409d215bb2450d93b6b7a56740d4751669d).
 ![View Results](../docs/sphinx_doc/assets/countdown-bench.png)
 
-### 3. Guru
+### 3. Guru-Math
 To reproduce this experiment:
 ```bash
-python bench.py guru --model_path /path/to/Qwen/Qwen2.5-7B
+python bench.py guru_math --model_path /path/to/Qwen/Qwen2.5-7B
 ```
 
 #### Guru Results
diff --git a/benchmark/bench.py b/benchmark/bench.py
@@ -87,17 +87,16 @@ def check_taskset_path(dataset_name: str, taskset_path: str) -> str:
         subprocess.CalledProcessError: If the generation script fails (due to check=True).
 
     Side Effects:
-        - Modifies `taskset_config` by setting the "path" key to the resolved path.
         - May create directories and files on disk via the external generation script.
         - Executes a subprocess to run the dataset generation script.
 
     Examples:
-        For dataset_name='guru' and taskset_config={"path": None},
+        For dataset_name='guru_math' and taskset_config={"path": None},
         this function will run the following command and
-        generate the guru dataset to default location (DEFAULT_DATA_PATH in scripts/gen_guru_data.py):
+        generate the guru_math dataset to default location (DEFAULT_DATA_PATH in scripts/gen_guru_math_data.py):
 
         ```bash
-        python scripts/gen_guru_data.py --local_dir DEFAULT_DATA_PATH
+        python scripts/gen_guru_math_data.py --local_dir DEFAULT_DATA_PATH
         ```
     """
     if taskset_path:
@@ -108,7 +107,7 @@ def check_taskset_path(dataset_name: str, taskset_path: str) -> str:
 
     dataset_script_map = {
         "countdown": "gen_countdown_data.py",
-        "guru": "gen_guru_data.py",
+        "guru_math": "gen_guru_math_data.py",
     }
     if dataset_name not in dataset_script_map:
         raise ValueError(
@@ -223,16 +222,21 @@ def main(args):
         dist.barrier()
         dist.destroy_process_group()
         cmd_list.append("--dlc")
-    if args.dataset == "guru":
-        base_path = os.path.dirname(os.path.abspath(__file__))
+
+    # load plugins
+    base_path = os.path.dirname(os.path.abspath(__file__))
+    plugin_dir = os.path.join(base_path, "plugins", args.dataset)
+    if os.path.exists(plugin_dir):
         cmd_list.append("--plugin-dir")
-        cmd_list.append(os.path.join(base_path, "plugins"))
+        cmd_list.append(plugin_dir)
+
+    # run command
     subprocess.run(cmd_list, check=True)
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("dataset", type=str.lower, choices=["gsm8k", "countdown", "guru"])
+    parser.add_argument("dataset", type=str.lower, choices=["gsm8k", "countdown", "guru_math"])
     parser.add_argument(
         "--dlc", action="store_true", help="Specify when running in Aliyun PAI DLC."
     )
diff --git a/benchmark/plugins/guru_math/naive_dapo.py b/benchmark/plugins/guru_math/naive_dapo.py
@@ -485,7 +485,7 @@ def compute_score(solution_str: str, ground_truth: str, extra_info: dict) -> dic
         extra_info: dict with additional info for the score computation
 
     Returns:
-        Reward score (1.0 for correct, -1.0 for incorrect)
+        Reward score (1.0 for correct, 0.0 for incorrect)
     """
     # First assert intended generation and gt type
     model_output = str(solution_str)
@@ -513,7 +513,6 @@ def compute_score(solution_str: str, ground_truth: str, extra_info: dict) -> dic
         except Exception:
             correct = False
 
-    # reward = 1.0 if correct else -1.0
     reward = 1.0 if correct else 0.0
     acc = correct
 
diff --git a/benchmark/plugins/guru_math/reward.py b/benchmark/plugins/guru_math/reward.py
diff --git a/benchmark/scripts/gen_guru_math_data.py b/benchmark/scripts/gen_guru_math_data.py
@@ -4,7 +4,9 @@
 from datasets import load_dataset
 from huggingface_hub import hf_hub_download
 
-DEFAULT_DATA_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "data", "guru")
+DEFAULT_DATA_PATH = os.path.join(
+    os.path.dirname(os.path.abspath(__file__)), "..", "data", "guru_math"
+)
 
 
 def process_fn(example, idx):