Commit 88c0528
feat(benchmark): add xbench-ds prep support and add_new_tools doc (#37)

* add new tools doc, support xbench-ds benchmark preparation
* docs(prepare-benchmark): add xbench-ds

1 parent 5e1bedd commit 88c0528

File tree

8 files changed: +130 -3 lines changed
Lines changed: 71 additions & 2 deletions

The `# - Coming Soon -` stub is replaced with the full guide:

````markdown
# Adding New Tools

## What This Does

Extend the agent’s functionality by introducing a new tool. Each tool is implemented as an MCP server and registered via configuration.

## Implementation Steps

### 1. Create MCP Server

Create a new file `src/tool/mcp_servers/new_mcp_server.py` (underscores — hyphenated file names cannot be imported as Python modules) that implements the tool’s core logic.

```python
from fastmcp import FastMCP

# Initialize FastMCP server
mcp = FastMCP("new-mcp-server")

@mcp.tool()
async def tool_name(param: str) -> str:
    """
    Explanation of the tool, its parameters, and return value.
    """
    tool_result = ...  # Your logic here
    return tool_result

if __name__ == "__main__":
    mcp.run(transport="stdio")
```

> Tool schemas are generated automatically from docstrings and type hints by FastMCP.

### 2. Create Tool Config

Add a new config file at `config/tools/new-tool-name.yaml`:

```yaml
name: "new-tool-name"
tool_command: "python"
args:
  - "-m"
  - "src.tool.mcp_servers.new_mcp_server"  # Match the server file created above
```

### 3. Register Tool in Agent Config

Enable the new tool inside your agent config (e.g., `config/agent-with-new-tool.yaml`):

```yaml
main_agent:
  ...
  tool_config:
    - tool-reasoning
    - new-tool-name  # 👈 Add your new tool here
  ...
sub_agents:
  agent-worker:
    ...
    tool_config:
      - tool-searching
      - tool-image-video
      - tool-reading
      - tool-code
      - tool-audio
      - new-tool-name  # 👈 Add your new tool here
  ...
```

## Examples

- `tool-reasoning` – reasoning utilities
- `tool-image-video` – visual understanding
- `new-tool-name` – your custom tool

---

**Last Updated:** Sep 2025

**Doc Contributor:** Team @ MiroMind AI
````
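The guide notes that FastMCP derives tool schemas from docstrings and type hints. As a rough, self-contained illustration of that idea — not FastMCP's actual implementation, and with a hypothetical `build_param_schema` helper — a parameter schema can be assembled from a function signature with the standard library alone:

```python
import inspect
from typing import get_type_hints


def build_param_schema(func):
    """Build a minimal JSON-Schema-like dict from a function's
    signature and type hints (illustration only)."""
    type_map = {str: "string", int: "integer", float: "number", bool: "boolean"}
    hints = get_type_hints(func)
    # One schema entry per declared parameter, typed via its annotation
    properties = {
        name: {"type": type_map.get(hints.get(name), "string")}
        for name in inspect.signature(func).parameters
    }
    return {
        "description": inspect.getdoc(func),
        "properties": properties,
        "required": list(properties),
    }


async def tool_name(param: str) -> str:
    """Explanation of the tool."""
    return param


schema = build_param_schema(tool_name)
print(schema["properties"])  # {'param': {'type': 'string'}}
```

This is why the docstring and the `param: str` annotation in the server file matter: they are the only schema the agent ever sees.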

docs/mkdocs/docs/download_datasets.md

Lines changed: 2 additions & 0 deletions

````diff
@@ -78,6 +78,7 @@ uv run main.py prepare-benchmark get webwalkerqa
 uv run main.py prepare-benchmark get browsecomp-test
 uv run main.py prepare-benchmark get browsecomp-zh-test
 uv run main.py prepare-benchmark get hle
+uv run main.py prepare-benchmark get xbench-ds
 ```

 ### What This Script Does
@@ -92,6 +93,7 @@ uv run main.py prepare-benchmark get hle
 - `browsecomp-test` - English BrowseComp test set
 - `browsecomp-zh-test` - Chinese BrowseComp test set
 - `hle` - HLE dataset
+- `xbench-ds` - xbench-DeepSearch dataset

 ### Customizing Dataset Selection
````
scripts/run_evaluate_multiple_runs_nohintreason_hle.sh

Lines changed: 1 addition & 0 deletions

```diff
@@ -48,6 +48,7 @@ for i in $(seq 1 $NUM_RUNS); do
 benchmark.execution.max_tasks=null \
 benchmark.execution.max_concurrent=$MAX_CONCURRENT \
 benchmark.execution.pass_at_k=1 \
+output_dir="$RESULTS_DIR/$RUN_ID" \
 hydra.run.dir=${RESULTS_DIR}/$RUN_ID \
 > "$RESULTS_DIR/${RUN_ID}_output.log" 2>&1
```

scripts/run_evaluate_multiple_runs_nosandbox_gaia-validation.sh

Lines changed: 1 addition & 0 deletions

```diff
@@ -48,6 +48,7 @@ for i in $(seq 1 $NUM_RUNS); do
 benchmark.execution.max_tasks=null \
 benchmark.execution.max_concurrent=$MAX_CONCURRENT \
 benchmark.execution.pass_at_k=1 \
+output_dir="$RESULTS_DIR/$RUN_ID" \
 hydra.run.dir=${RESULTS_DIR}/$RUN_ID \
 > "$RESULTS_DIR/${RUN_ID}_output.log" 2>&1
```
5354

scripts/run_evaluate_multiple_runs_xbench-ds.sh

Lines changed: 1 addition & 0 deletions

```diff
@@ -48,6 +48,7 @@ for i in $(seq 1 $NUM_RUNS); do
 benchmark.execution.max_tasks=null \
 benchmark.execution.max_concurrent=$MAX_CONCURRENT \
 benchmark.execution.pass_at_k=1 \
+output_dir="$RESULTS_DIR/$RUN_ID" \
 hydra.run.dir=${RESULTS_DIR}/$RUN_ID \
 > "$RESULTS_DIR/${RUN_ID}_output.log" 2>&1
```
5354

scripts/run_prepare_benchmark.sh

Lines changed: 2 additions & 1 deletion

```diff
@@ -19,4 +19,5 @@ uv run main.py prepare-benchmark get frames-test
 uv run main.py prepare-benchmark get webwalkerqa
 uv run main.py prepare-benchmark get browsecomp-test
 uv run main.py prepare-benchmark get browsecomp-zh-test
-uv run main.py prepare-benchmark get hle
+uv run main.py prepare-benchmark get hle
+uv run main.py prepare-benchmark get xbench-ds
```
utils/prepare_benchmark/gen_xbench_ds.py

Lines changed: 43 additions & 0 deletions

New file:

```python
# SPDX-FileCopyrightText: 2025 MiromindAI
#
# SPDX-License-Identifier: Apache-2.0

import base64
from typing import Generator, MutableMapping

from datasets import load_dataset

from utils.prepare_benchmark.common import Task


def xor_decrypt(data, key):
    """
    XOR decrypt data with a key
    """
    key_bytes = key.encode('utf-8')
    key_length = len(key_bytes)
    return bytes([data[i] ^ key_bytes[i % key_length] for i in range(len(data))])


def gen_xbench_ds(hf_token: str) -> Generator[Task, None, None]:
    dataset = load_dataset(
        "xbench/DeepSearch",
        split="train",
    )
    for x in dataset:
        metadata: MutableMapping = x  # type: ignore
        task_id = metadata.pop("id")

        # Each row's fields are XOR-encrypted with its "canary" key, then base64-encoded
        key = metadata.pop("canary")
        prompt = xor_decrypt(base64.b64decode(metadata.pop("prompt")), key).decode('utf-8')
        answer = xor_decrypt(base64.b64decode(metadata.pop("answer")), key).decode('utf-8')
        reference_steps = xor_decrypt(base64.b64decode(metadata.pop("reference_steps")), key).decode('utf-8')
        task = Task(
            task_id=task_id,
            task_question=prompt,
            ground_truth=answer,
            file_path=None,
            metadata={"reference_steps": reference_steps},
        )
        yield task
```
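Because XOR with a repeating key is its own inverse, the same `xor_decrypt` routine also encrypts. A stand-alone round-trip check — with a made-up key and plaintext, independent of the actual dataset — shows why the decode path above recovers the original fields:

```python
import base64


def xor_decrypt(data, key):
    """XOR data with a repeating key; XOR is its own inverse."""
    key_bytes = key.encode('utf-8')
    key_length = len(key_bytes)
    return bytes([data[i] ^ key_bytes[i % key_length] for i in range(len(data))])


key = "example-canary"         # hypothetical key
plaintext = "Who wrote SICP?"  # hypothetical prompt

# "Encrypt" by XOR-ing, then base64-encode, as the dataset rows are stored
encoded = base64.b64encode(xor_decrypt(plaintext.encode('utf-8'), key))

# Decoding reverses both steps exactly
decoded = xor_decrypt(base64.b64decode(encoded), key).decode('utf-8')
print(decoded)  # Who wrote SICP?
```

The canary scheme keeps the plaintext questions and answers out of web crawls while remaining trivially reversible for evaluation.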

utils/prepare_benchmark/main.py

Lines changed: 9 additions & 0 deletions

```diff
@@ -17,6 +17,7 @@
 from utils.prepare_benchmark.gen_gaia_text_only import gen_gaia_text_only
 from utils.prepare_benchmark.gen_hle import gen_hle_test
 from utils.prepare_benchmark.gen_webwalkerqa import gen_webwalkerqa
+from utils.prepare_benchmark.gen_xbench_ds import gen_xbench_ds


 @dataclasses.dataclass
@@ -29,6 +30,7 @@ class _Env:
         "browsecomp-test",
         "browsecomp-zh-test",
         "hle",
+        "xbench-ds",
     )
     meta_filename = "standardized_data.jsonl"
     data_dir: pathlib.Path
@@ -99,6 +101,13 @@ def gen():
             for x in gen_hle_test(env.hf_token, env.data_dir):
                 yield x

+            return gen
+        case "xbench-ds":
+
+            def gen():
+                for x in gen_xbench_ds(env.hf_token):
+                    yield x
+
             return gen
         case _:
             raise ValueError("not supported")
```
