diff --git a/config/agent_prompts/main_gaia.py b/config/agent_prompts/main_gaia.py deleted file mode 100644 index 569e7260..00000000 --- a/config/agent_prompts/main_gaia.py +++ /dev/null @@ -1,185 +0,0 @@ -from config.agent_prompts.base_agent_prompt import BaseAgentPrompt -import datetime -from typing import Any - - -class MainAgentGaiaPrompt(BaseAgentPrompt): - """ - MainAgentGaiaPrompt inherits from BaseAgentPrompt and can be extended - with main agent-specific prompt logic or configuration. - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.is_main_agent = True - - def generate_system_prompt_with_mcp_tools( - self, mcp_servers: list[Any], chinese_context: bool = False - ) -> str: - formatted_date = datetime.datetime.today().strftime("%Y-%m-%d") - - # Basic system prompt - prompt = f"""In this environment you have access to a set of tools you can use to answer the user's question. - -You only have access to the tools provided below. You can only use one tool per message, and will receive the result of that tool in the user's next response. You use tools step-by-step to accomplish a given task, with each tool-use informed by the result of the previous tool-use. Today is: {formatted_date} - -# Tool-Use Formatting Instructions - -Tool-use is formatted using XML-style tags. The tool-use is enclosed in and each parameter is similarly enclosed within its own set of tags. - -The Model Context Protocol (MCP) connects to servers that provide additional tools and resources to extend your capabilities. You can use the server's tools via the `use_mcp_tool`. - -Description: -Request to use a tool provided by a MCP server. Each MCP server can provide multiple tools with different capabilities. Tools have defined input schemas that specify required and optional parameters. 
- -Parameters: -- server_name: (required) The name of the MCP server providing the tool -- tool_name: (required) The name of the tool to execute -- arguments: (required) A JSON object containing the tool's input parameters, following the tool's input schema, quotes within string must be properly escaped, ensure it's valid JSON - -Usage: - -server name here -tool name here - -{{ -"param1": "value1", -"param2": "value2 \\"escaped string\\"" -}} - - - -Important Notes: -- Tool-use must be placed **at the end** of your response, **top-level**, and not nested within other tags. -- Always adhere to this format for the tool use to ensure proper parsing and execution. - -String and scalar parameters should be specified as is, while lists and objects should use JSON format. Note that spaces for string values are not stripped. The output is not expected to be valid XML and is parsed with regular expressions. -Here are the functions available in JSONSchema format: - -""" - - # Add MCP servers section - if mcp_servers and len(mcp_servers) > 0: - for server in mcp_servers: - prompt += f"## Server name: {server['name']}\n" - - if "tools" in server and len(server["tools"]) > 0: - for tool in server["tools"]: - # Skip tools that failed to load (they only have 'error' key) - if "error" in tool and "name" not in tool: - continue - prompt += f"### Tool name: {tool['name']}\n" - prompt += f"Description: {tool['description']}\n" - prompt += f"Input JSON schema: {tool['schema']}\n" - - # Add the full objective system prompt - prompt += """ -# General Objective - -You accomplish a given task iteratively, breaking it down into clear steps and working through them methodically. - -## Task Strategy - -1. Analyze the user's request and set clear, achievable sub-goals. Prioritize these sub-goals in a logical order. -2. Start with a concise, numbered, step-by-step plan (e.g., 1., 2., 3.) outlining how you will solve the task before taking any action. 
Each sub-goal should correspond to a distinct step in your task-solving process. -3. Work through these sub-goals sequentially. After each step, carefully review and extract all potentially relevant information, details, or implications from the tool result before proceeding. The user may provide tool-use feedback, reflect on the results, and revise your plan if needed. If you encounter new information or challenges, adjust your approach accordingly. Revisit previous steps to ensure earlier sub-goals or clues have not been overlooked or missed. -4. You have access to a wide range of powerful tools. Use them strategically to accomplish each sub-goal. - -## Tool-Use Guidelines - -1. **IMPORTANT: Each step must involve exactly ONE tool call only, unless the task is already solved. You are strictly prohibited from making multiple tool calls in a single response.** -2. Before each tool call: -- Briefly summarize and analyze what is currently known. -- Identify what is missing, uncertain, or unreliable. -- Be concise; do not repeat the same analysis across steps. -- Choose the most relevant tool for the current sub-goal, and explain why this tool is necessary at this point. -- Verify whether all required parameters are either explicitly provided or can be clearly and reasonably inferred from context. -- Do not guess or use placeholder values for missing inputs. -- Skip optional parameters unless they are explicitly specified. -3. All tool queries must include full, self-contained context. Tools do not retain memory between calls. Include all relevant information from earlier steps in each query. -4. Avoid broad, vague, or speculative queries. Every tool call should aim to retrieve new, actionable information that clearly advances the task. -5. **For historical or time-specific content**: Regular search engines return current webpage content, not historical content. 
Archived webpage search is essential for retrieving content as it appeared in the past, use related tools to search for the historical content. -6. Even if a tool result does not directly answer the question, thoroughly extract and summarize all partial information, important details, patterns, constraints, or keywords that may help guide future steps. Never proceed to the next step without first ensuring that all significant insights from the current result have been fully considered. - -## Tool-Use Communication Rules - -1. **CRITICAL: After issuing exactly ONE tool call, STOP your response immediately. You must never make multiple tool calls in a single response. Do not include tool results, do not assume what the results will be, and do not continue with additional analysis or tool calls. The user will provide the actual tool results in their next message.** -2. Do not present the final answer until the entire task is complete. -3. Do not mention tool names. -4. Do not engage in unnecessary back-and-forth or end with vague offers of help. Do not end your responses with questions or generic prompts. -5. Do not use tools that do not exist. -6. Unless otherwise requested, respond in the same language as the user's message. -7. If the task does not require tool use, answer the user directly. - -""" - - # Add Chinese-specific instructions if enabled - if chinese_context: - prompt += """ - ## 中文语境处理指导 - - 当处理中文相关的任务时: - 1. **子任务委托 (Subtask Delegation)**:向worker代理委托的子任务应使用中文描述,确保任务内容准确传达 - 2. **搜索策略 (Search Strategy)**:搜索关键词应使用中文,以获取更准确的中文内容和信息 - 3. **问题分析 (Question Analysis)**:对中文问题的分析和理解应保持中文语境 - 4. **思考过程 (Thinking Process)**:内部分析、推理、总结等思考过程都应使用中文,保持语义表达的一致性 - 5. **信息整理 (Information Organization)**:从中文资源获取的信息应保持中文原文,避免不必要的翻译 - 6. **各种输出 (All Outputs)**:所有输出内容包括步骤说明、状态更新、中间结果等都应使用中文 - 7. 
**最终答案 (Final Answer)**:对于中文语境的问题,最终答案应使用中文回应 - - """ - - return prompt - - def generate_summarize_prompt( - self, - task_description: str, - task_failed: bool = False, - chinese_context: bool = False, - ) -> str: - summarize_prompt = ( - ( - "This is a direct instruction to you (the assistant), not the result of a tool call.\n\n" - ) - + ( - "**Important: You have either exhausted the context token limit or reached the maximum number of interaction turns without arriving at a conclusive answer. Therefore, you failed to complete the task. You Must explicitly state that you failed to complete the task in your response.**\n\n" - if task_failed - else "" - ) - + ( - "We are now ending this session, and your conversation history will be deleted. " - "You must NOT initiate any further tool use. This is your final opportunity to report " - "*all* of the information gathered during the session.\n\n" - "Summarize the above conversation, and output the FINAL ANSWER to the original question.\n\n" - "If a clear answer has already been provided earlier in the conversation, do not rethink or recalculate it — " - "simply extract that answer and reformat it to match the required format below.\n" - "If a definitive answer could not be determined, make a well-informed educated guess based on the conversation.\n\n" - "The original question is repeated here for reference:\n\n" - f"---\n{task_description}\n---\n\n" - "Summarize ALL working history for this task, including your step-by-step thoughts, all tool calls, and all tool results (i.e., the full solving trajectory so far).\n" - "Output the FINAL ANSWER and detailed supporting information of the task given to you.\n\n" - "If you found any useful facts, data, or quotes directly relevant to the original task, include them clearly and completely.\n" - "If you reached a conclusion or answer, include it as part of the response.\n" - "If the task could not be fully answered, return all partially relevant findings, search results, 
quotes, and observations that might help a downstream agent solve the problem.\n" - "If partial, conflicting, or inconclusive information was found, clearly indicate this in your response.\n\n" - "Your final response should be a clear, complete, and structured report.\n" - "Organize the content into logical sections with appropriate headings.\n" - "Do NOT include any tool call instructions, speculative filler, or vague summaries.\n" - "Focus on factual, specific, and well-organized information." - ) - ) - - # Add Chinese-specific summary instructions - if chinese_context: - summarize_prompt += """ - -## 中文总结要求 - -如果原始问题涉及中文语境: -- **总结语言**:使用中文进行总结和回答 -- **思考过程**:回顾和总结思考过程时也应使用中文表达 -- **信息组织**:保持中文信息的原始格式和表达方式 -- **过程描述**:对工作历史、步骤描述、结果分析等各种输出都应使用中文 -- **最终答案**:确保最终答案符合中文表达习惯和用户期望 -""" - return summarize_prompt diff --git a/config/agent_xbench-ds.yaml b/config/agent_xbench-ds.yaml new file mode 100644 index 00000000..13b2ddcc --- /dev/null +++ b/config/agent_xbench-ds.yaml @@ -0,0 +1,75 @@ +defaults: + - benchmark: xbench-ds + - override hydra/job_logging: none + - _self_ # Allow defining variables at the top of this file + + +main_agent: + prompt_class: MainAgentPrompt_GAIA + llm: + provider_class: "ClaudeOpenRouterClient" + model_name: "anthropic/claude-3.7-sonnet" + async_client: true + temperature: 0.3 + top_p: 0.95 + min_p: 0.0 + top_k: -1 + max_tokens: 32000 + openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}" + openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}" + openrouter_provider: "anthropic" + disable_cache_control: false + keep_tool_result: -1 + oai_tool_thinking: false + + tool_config: + - tool-reasoning + + max_turns: -1 # Maximum number of turns for main agent execution + max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn + + input_process: + o3_hint: true + output_process: + o3_final_answer: true + + openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for o3 hints and final answer extraction + 
add_message_id: true + keep_tool_result: -1 + chinese_context: "true" + + +sub_agents: + agent-worker: + prompt_class: SubAgentWorkerPrompt + llm: + provider_class: "ClaudeOpenRouterClient" + model_name: "anthropic/claude-3.7-sonnet" + async_client: true + temperature: 0.3 + top_p: 0.95 + min_p: 0.0 + top_k: -1 + max_tokens: 32000 + openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}" + openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}" + openrouter_provider: "anthropic" + disable_cache_control: false + keep_tool_result: -1 + oai_tool_thinking: false + + tool_config: + - tool-searching + - tool-image-video + - tool-reading + - tool-code + - tool-audio + + max_turns: -1 # Maximum number of turns for main agent execution + max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn + + +# Can define some top-level or default parameters here +output_dir: logs/ +data_dir: "${oc.env:DATA_DIR,data}" # Points to where data is stored + diff --git a/config/benchmark/xbench-ds.yaml b/config/benchmark/xbench-ds.yaml new file mode 100644 index 00000000..0721a541 --- /dev/null +++ b/config/benchmark/xbench-ds.yaml @@ -0,0 +1,16 @@ +# config/benchmark/xbench-ds.yaml +defaults: + - default + - _self_ + +name: "xbench-ds" + +data: + data_dir: "${data_dir}/xbench-ds" + +execution: + max_tasks: null # null means no limit + max_concurrent: 10 + pass_at_k: 1 + +openai_api_key: "${oc.env:OPENAI_API_KEY,???}" diff --git a/docs/mkdocs/docs/xbench_ds.md b/docs/mkdocs/docs/xbench_ds.md new file mode 100644 index 00000000..d5f5c21c --- /dev/null +++ b/docs/mkdocs/docs/xbench_ds.md @@ -0,0 +1,116 @@ +# xbench-DeepSearch + +The **xbench** benchmark is an evaluation framework designed to measure both the intelligence frontier and real-world utility of AI agents. It consists of complementary tracks that test core model capabilities like reasoning, tool use, memory, and workflows grounded in business and professional settings. 
Its **DeepSearch** sub-track measures agents’ ability to conduct open-domain information retrieval, combining fact finding, comparison, and synthesis through multi-step search and tool use. + +See more details at [xbench official website](https://xbench.org/agi/aisearch) and [xbench-DeepSearch Eval Card](https://xbench.org/files/Eval%20Card%20xbench-DeepSearch.pdf). + + +--- + +## Setup and Evaluation Guide + +### Step 1: Download the xbench-DeepSearch Dataset + +**Direct Download (Recommended)** + +!!! tip "Dataset Setup" + Use the integrated prepare-benchmark command to download and process the dataset: + +```bash +uv run main.py prepare-benchmark get xbench-ds +``` + +By default, this will create the standardized dataset at data/xbench-ds/standardized_data.jsonl. + +### Step 2: Configure API Keys + +!!! warning "Required API Configuration" + Set up the required API keys for model access and tool functionality. Update the `.env` file to include the following keys: + +```env title=".env Configuration" +# Search and web scraping capabilities +SERPER_API_KEY="your-serper-api-key" +JINA_API_KEY="your-jina-api-key" + +# Code execution environment +E2B_API_KEY="your-e2b-api-key" + +# Primary LLM provider (Claude-3.7-Sonnet via OpenRouter) +OPENROUTER_API_KEY="your-openrouter-api-key" +OPENROUTER_BASE_URL="https://openrouter.ai/api/v1" + +# Vision understanding capabilities +ANTHROPIC_API_KEY="your-anthropic-api-key" +GEMINI_API_KEY="your-gemini-api-key" + +# LLM as judge, reasoning, and O3 hints +OPENAI_API_KEY="your-openai-api-key" +OPENAI_BASE_URL="https://api.openai.com/v1" +``` + +### Step 3: Run the Evaluation + +```bash +uv run main.py common-benchmark \ + --config_file_name=agent_xbench-ds \ + output_dir="logs/xbench-ds/$(date +"%Y%m%d_%H%M")" +``` + +### Step 4: Monitor Progress and Resume + +!!! 
tip "Progress Tracking" + You can monitor the evaluation progress in real-time: + +```bash title="Check Progress" +uv run utils/progress_check/check_xbench_progress.py $PATH_TO_LOG +``` + +Replace `$PATH_TO_LOG` with your actual output directory path. + +!!! note "Resume Capability" + If the evaluation is interrupted, you can resume from where it left off by specifying the same output directory: + +```bash title="Resume Interrupted Evaluation" +uv run main.py common-benchmark \ + --config_file_name=agent_xbench-ds \ + output_dir="logs/xbench-ds/20250922_1430" +``` + +--- + +## Post-Processing for Enhanced Performance + +!!! tip "Test-Time Scaling for Improved Reliability" + Test-time scaling can significantly improve the reliability of model responses. Instead of simple majority voting, we employ a comprehensive **parallel thinking** approach that: + + - Aggregates final summary steps from each agent run before outputting results + - Uses another agent (o3 by default) to make final decisions based on equivalence and source reliability criteria + - Provides more robust and accurate final answers + +Execute the following command to run multiple xbench-DeepSearch evaluations and automatically enable parallel thinking for enhanced performance. + +```bash title="Multiple runs with parallel thinking post-processing" +bash scripts/run_evaluate_multiple_runs_xbench-ds.sh +``` + +### Running Parallel Thinking Analysis alone + +After completing evaluations (single or multiple runs), you can apply parallel thinking post-processing to aggregate and generate the final result. + +```bash title="Parallel Thinking Post-Processing" +uv run utils/util_llm_parallel_thinking.py \ + --benchmark xbench-ds \ + --results_dir "logs/xbench-ds/20250922_1430" +``` + +The program automatically reads results from each run in the specified directory and performs aggregated analysis. 
The final output files are generated in the `results_dir`: + +- **`llm_parallel_thinking_Nruns.json`** - Detailed analysis results +- **`llm_parallel_thinking_accuracy_Nruns.txt`** - Final accuracy + +Where `N` represents the total number of experimental runs (**minimum of 1**). + +--- + +!!! info "Documentation Info" + **Last Updated:** September 2025 · **Doc Contributor:** Team @ MiroMind AI \ No newline at end of file diff --git a/docs/mkdocs/mkdocs.yml b/docs/mkdocs/mkdocs.yml index f508e144..70f69051 100644 --- a/docs/mkdocs/mkdocs.yml +++ b/docs/mkdocs/mkdocs.yml @@ -53,6 +53,7 @@ nav: - GAIA-Validation-Text-Only: gaia_validation_text_only.md - GAIA-Test: gaia_test.md - FutureX: futurex.md + - xBench-DeepSearch: xbench_ds.md - Download Datasets: download_datasets.md - Add New Benchmarks: contribute_benchmarks.md diff --git a/scripts/run_evaluate_multiple_runs_xbench-ds.sh b/scripts/run_evaluate_multiple_runs_xbench-ds.sh index 263ff13d..4b55e626 100644 --- a/scripts/run_evaluate_multiple_runs_xbench-ds.sh +++ b/scripts/run_evaluate_multiple_runs_xbench-ds.sh @@ -4,31 +4,23 @@ # # SPDX-License-Identifier: Apache-2.0 -# Configuration parameters - dual model configuration +# Configuration parameters NUM_RUNS=3 -MAX_CONCURRENT=20 +AGENT_SET="agent_quickstart_1" BENCHMARK_NAME="xbench-ds" -AGENT_SET="claude03_claude_dual" -ADD_MESSAGE_ID="true" # Set to true to add random message ID to all messages sent to LLM -MAX_TURNS=-1 +MAX_CONCURRENT=5 +export CHINESE_CONTEXT="true" -# Automatically set Chinese context - if BENCHMARK_NAME contains xbench or -zh -if [[ $BENCHMARK_NAME == "xbench-ds" ]] || [[ $BENCHMARK_NAME == "browsecomp-zh" ]]; then - export CHINESE_CONTEXT="true" - echo "检测到中文相关基准测试,已启用中文上下文:CHINESE_CONTEXT=true" -fi - -# export REMOVE_SNIPPETS="true" -# export REMOVE_KNOWLEDGE_GRAPH="true" -# export REMOVE_ANSWER_BOX="true" +# Set results directory with timestamp +TIMESTAMP=$(date +%Y%m%d_%H%M) 
+RESULTS_DIR=${RESULTS_DIR:-"logs/${BENCHMARK_NAME}/${AGENT_SET}_${TIMESTAMP}"} export LOGGER_LEVEL="INFO" -RESULTS_DIR="logs/${BENCHMARK_NAME}/${AGENT_SET}" - echo "Starting $NUM_RUNS runs of the evaluation..." echo "Results will be saved in: $RESULTS_DIR" +# Create results directory mkdir -p "$RESULTS_DIR" for i in $(seq 1 $NUM_RUNS); do @@ -40,11 +32,8 @@ for i in $(seq 1 $NUM_RUNS); do ( uv run main.py common-benchmark \ + --config_file_name=$AGENT_SET \ benchmark=$BENCHMARK_NAME \ - agent=$AGENT_SET \ - agent.add_message_id=$ADD_MESSAGE_ID \ - agent.main_agent.max_turns=$MAX_TURNS \ - agent.sub_agents.agent-worker.max_turns=$MAX_TURNS \ benchmark.execution.max_tasks=null \ benchmark.execution.max_concurrent=$MAX_CONCURRENT \ benchmark.execution.pass_at_k=1 \ @@ -84,4 +73,18 @@ echo "==========================================" echo "Multiple runs evaluation completed!" echo "Check results in: $RESULTS_DIR" echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" -echo "==========================================" \ No newline at end of file +echo "==========================================" + + +echo "==========================================" +echo "Parallel thinking post-processing" +echo "==========================================" + +# Parallel thinking post-processing +uv run utils/util_llm_parallel_thinking.py \ + --benchmark xbench-ds \ + --results_dir "$RESULTS_DIR" + +echo "==========================================" +echo "Parallel thinking post-processing completed!" 
+echo "==========================================" \ No newline at end of file diff --git a/scripts/run_evaluate_sinlge_run_xbench-ds.sh b/scripts/run_evaluate_sinlge_run_xbench-ds.sh new file mode 100644 index 00000000..20d3232f --- /dev/null +++ b/scripts/run_evaluate_sinlge_run_xbench-ds.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +# SPDX-FileCopyrightText: 2025 MiromindAI +# +# SPDX-License-Identifier: Apache-2.0 + +RESULTS_DIR=${RESULTS_DIR:-"logs/xbench-ds/$(date +"%Y%m%d_%H%M")"} +echo "Results will be saved in: $RESULTS_DIR" + +export CHINESE_CONTEXT="true" + +uv run main.py common-benchmark \ + --config_file_name=agent_quickstart_1 \ + benchmark=xbench-ds \ + output_dir=$RESULTS_DIR \ No newline at end of file diff --git a/src/tool/mcp_servers/reading_mcp_server.py b/src/tool/mcp_servers/reading_mcp_server.py index ea242eb7..c0f6ec03 100644 --- a/src/tool/mcp_servers/reading_mcp_server.py +++ b/src/tool/mcp_servers/reading_mcp_server.py @@ -16,7 +16,8 @@ # Initialize FastMCP server mcp = FastMCP("reading-mcp-server") - +SERPER_API_KEY = os.environ.get("SERPER_API_KEY", "") +JINA_API_KEY = os.environ.get("JINA_API_KEY", "") @mcp.tool() async def read_file(uri: str) -> str: @@ -64,7 +65,7 @@ async def read_file(uri: str) -> str: if retry_count > 3: # Try scrape_website tool as fallback try: - scrape_result = await smart_request(uri) + scrape_result = await smart_request(uri, env={"SERPER_API_KEY": SERPER_API_KEY, "JINA_API_KEY": JINA_API_KEY}) return f"[INFO]: Download failed, automatically tried `scrape_website` tool instead.\n\n{scrape_result}" except Exception as scrape_error: return f"[ERROR]: Failed to download {uri}: {e}. 
def _as_text(value) -> str:
    """Coerce a JSON field to text so later string checks cannot raise.

    Falsy values (``None``, ``""``, ``0``, empty containers) become ``""``,
    which keeps them counted as "absent"; strings pass through untouched.
    """
    if not value:
        return ""
    return value if isinstance(value, str) else str(value)


def _percent(count: int, total: int) -> float:
    """Return ``count`` as a percentage of ``total`` (0.0 when total is 0)."""
    return count / total * 100 if total else 0.0


def analyze_xbench_results(
    log_folder: str,
) -> Tuple[
    Dict[str, int],
    List[str],
    List[str],
    list,
    List[Tuple[str, str]],
    List[Tuple[str, str]],
    List[Tuple[str, str]],
]:
    """
    Analyze xbench-DeepSearch benchmark results from JSON log files.

    Args:
        log_folder: Path to folder containing task_*_attempt_*.json files

    Returns:
        A 7-tuple ``(results, completed_files, running_files, failed_files,
        prediction_files, error_files, parse_error_files)``:

        - results: counts per category
        - completed_files / running_files: file names
        - failed_files: file names, or ``(name, reason)`` tuples for files
          whose status is not one of the recognised values
        - prediction_files: ``(name, prediction preview)`` tuples
        - error_files: ``(name, error text)`` tuples
        - parse_error_files: ``(name, exception text)`` tuples

    Raises:
        FileNotFoundError: if ``log_folder`` does not exist.
    """
    log_path = Path(log_folder)

    if not log_path.exists():
        raise FileNotFoundError(f"Log folder not found: {log_folder}")

    # Sorted so the "first 5" listings are stable between invocations;
    # glob order is otherwise filesystem-dependent.
    json_files = sorted(log_path.glob("task_*_attempt_*.json"))

    results = {
        "total_files": 0,
        "completed_status": 0,
        "running_status": 0,
        "failed_status": 0,
        "with_predictions": 0,
        "without_predictions": 0,
        "with_errors": 0,
        "parse_errors": 0,
    }

    completed_files = []
    running_files = []
    failed_files = []
    prediction_files = []
    error_files = []
    parse_error_files = []

    print(f"Scanning {len(json_files)} files in {log_folder}...")

    for json_file in json_files:
        results["total_files"] += 1

        # Only loading can fail; keep the try body limited to it.
        try:
            with open(json_file, "r", encoding="utf-8") as f:
                data = json.load(f)
            if not isinstance(data, dict):
                raise ValueError(
                    f"expected a JSON object, got {type(data).__name__}"
                )
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            results["parse_errors"] += 1
            parse_error_files.append((json_file.name, str(e)))
            print(f"Error parsing {json_file.name}: {e}")
            continue

        # A log may hold null / non-string values; normalise them so one odd
        # file cannot abort the whole scan.
        status = _as_text(data.get("status")).lower()
        final_answer = _as_text(data.get("final_boxed_answer"))
        error_msg = _as_text(data.get("error"))

        # Count by status
        if status == "completed":
            results["completed_status"] += 1
            completed_files.append(json_file.name)
        elif status == "running":
            results["running_status"] += 1
            running_files.append(json_file.name)
        elif status in ["failed", "error"]:
            results["failed_status"] += 1
            failed_files.append(json_file.name)
        else:
            # Unknown status is reported as a failure, with the reason attached
            results["failed_status"] += 1
            failed_files.append((json_file.name, f"Unknown status: {status}"))

        # Count by prediction availability
        if final_answer.strip():
            results["with_predictions"] += 1
            preview = (
                final_answer[:100] + "..."
                if len(final_answer) > 100
                else final_answer
            )
            prediction_files.append((json_file.name, preview))
        else:
            results["without_predictions"] += 1

        # Count by error presence
        if error_msg.strip():
            results["with_errors"] += 1
            error_files.append((json_file.name, error_msg))

    return (
        results,
        completed_files,
        running_files,
        failed_files,
        prediction_files,
        error_files,
        parse_error_files,
    )


def display_results(
    results: Dict[str, int],
    completed_files: List[str],
    running_files: List[str],
    failed_files: List[str],
    prediction_files: List[Tuple[str, str]],
    error_files: List[Tuple[str, str]],
    parse_error_files: List[Tuple[str, str]],
) -> None:
    """Display the analysis results in a formatted way.

    Percentages are reported as 0.0% when no files were scanned, so the
    summary can be printed for an empty (or not yet populated) log folder.
    ``failed_files`` is accepted for interface symmetry with
    ``analyze_xbench_results`` but is not listed in the output.
    """

    print("\n" + "=" * 60)
    print("xbench-DeepSearch BENCHMARK RESULTS SUMMARY")
    print("=" * 60)

    total = results["total_files"]
    completed = results["completed_status"]
    running = results["running_status"]
    failed = results["failed_status"]
    with_predictions = results["with_predictions"]
    with_errors = results["with_errors"]

    print(f"Total files processed: {total:3d}")
    print(
        f"Files with status 'completed': {completed:3d} ({_percent(completed, total):.1f}%)"
    )
    print(
        f"Files with status 'running': {running:3d} ({_percent(running, total):.1f}%)"
    )
    print(
        f"Files with status 'failed': {failed:3d} ({_percent(failed, total):.1f}%)"
    )
    print(
        f"Files with predictions: {with_predictions:3d} ({_percent(with_predictions, total):.1f}%)"
    )
    print(
        f"Files with errors: {with_errors:3d} ({_percent(with_errors, total):.1f}%)"
    )
    print(f"Files with parse errors: {results['parse_errors']:3d}")

    if completed > 0:
        prediction_rate = with_predictions / completed * 100
        print(f"\nPrediction rate (predictions/completed): {prediction_rate:.1f}%")

    print("\n" + "-" * 60)
    print(f"SUMMARY: {completed} tasks completed, {with_predictions} with predictions")
    print("-" * 60)

    # Show some example files for verification
    if completed_files:
        print("\nFirst 5 completed files:")
        for i, filename in enumerate(completed_files[:5], 1):
            print(f" {i}. {filename}")
        if len(completed_files) > 5:
            print(f" ... and {len(completed_files) - 5} more")

    if running_files:
        print("\nFirst 5 running files:")
        for i, filename in enumerate(running_files[:5], 1):
            print(f" {i}. {filename}")
        if len(running_files) > 5:
            print(f" ... and {len(running_files) - 5} more")

    if prediction_files:
        print("\nFirst 5 files with predictions:")
        for i, (filename, prediction) in enumerate(prediction_files[:5], 1):
            print(f" {i}. {filename}")
            print(f" Prediction: {prediction}")
        if len(prediction_files) > 5:
            print(f" ... and {len(prediction_files) - 5} more")

    if error_files:
        print("\nFiles with errors:")
        for filename, error in error_files[:5]:
            print(f" - {filename}: {error[:100]}...")
        if len(error_files) > 5:
            print(f" ... and {len(error_files) - 5} more")

    if parse_error_files:
        print("\nFiles with parse errors:")
        for filename, error in parse_error_files:
            print(f" - {filename}: {error}")


def main():
    """Run the analysis on the folder given in argv (default: current dir).

    Returns:
        0 on success, 1 if the analysis raised (the error and a usage hint
        are printed instead of a traceback).
    """

    # Check if folder path was provided as command line argument
    if len(sys.argv) > 1:
        log_folder = sys.argv[1]
        print(f"Using provided folder path: {log_folder}")
    else:
        log_folder = "."
        print(f"No folder path provided, using current directory: {log_folder}")

    try:
        print(f"Analyzing xbench-DeepSearch benchmark results in: {log_folder}")
        analysis = analyze_xbench_results(log_folder)
        display_results(*analysis)

    except Exception as e:
        # Top-level boundary of a CLI tool: report and signal failure.
        print(f"Error: {e}")
        print(f"\nUsage: python {sys.argv[0]} [LOG_FOLDER_PATH]")
        print(f"Example: python {sys.argv[0]} logs/xbench-ds/claude03_claude_dual/run_1")
        return 1

    return 0


if __name__ == "__main__":
    # sys.exit rather than the site-provided exit(), which is not guaranteed
    # to exist (e.g. under `python -S`).
    sys.exit(main())
ValueError(f"Unsupported benchmark name: {BENCHMARK_NAME}") + raise ValueError(f"Unsupported benchmark name: {benchmark_name}") response, usage = await select_best_solution(prompt, n_runs, semaphore=semaphore) selected_solution = response["final_answer"] @@ -393,7 +393,7 @@ async def process_single_task( ) result = await verify_answer_for_datasets( - client, BENCHMARK_NAME, "", data[0]["ground_truth"], selected_solution + client, benchmark_name, "", data[0]["ground_truth"], selected_solution ) task_result = { @@ -411,9 +411,10 @@ async def process_single_task( async def process_tasks( + benchmark_name: str, task_score_dict: Dict[str, List[Dict[str, Any]]], n_runs: int, - max_concurrent_requests: int = MAX_CONCURRENT_REQUESTS, + max_concurrent_requests: int, ) -> Dict[str, Dict[str, Any]]: """Process all tasks concurrently and select best solutions.""" # Create semaphore for rate limiting @@ -421,7 +422,7 @@ async def process_tasks( # Create tasks for concurrent execution tasks = [ - process_single_task(task_id, data, n_runs, semaphore) + process_single_task(benchmark_name, task_id, data, n_runs, semaphore) for task_id, data in task_score_dict.items() ] @@ -564,7 +565,7 @@ def save_results( async def main( - results_dir: str, max_concurrent_requests: int = MAX_CONCURRENT_REQUESTS + benchmark_name: str, results_dir: str, max_concurrent_requests: int = MAX_CONCURRENT_REQUESTS ) -> None: """Main function to analyze results and select best solutions.""" if not os.path.exists(results_dir): @@ -574,7 +575,7 @@ async def main( print(f"Analyzing results from: {results_dir}") # Load task data from all runs - task_score_dict = load_task_data(results_dir) + task_score_dict = load_task_data(benchmark_name, results_dir) if not task_score_dict: print("No task data found") return @@ -584,17 +585,22 @@ async def main( n_runs = len([d for d in run_dirs if os.path.isdir(d)]) # Process all tasks - task_results = await process_tasks(task_score_dict, n_runs, max_concurrent_requests) + 
task_results = await process_tasks(benchmark_name, task_score_dict, n_runs, max_concurrent_requests) # Save results save_results(results_dir, task_results, n_runs) if __name__ == "__main__": - max_concurrent_requests = MAX_CONCURRENT_REQUESTS + args = ArgumentParser() + args.add_argument("--benchmark", type=str, default="gaia", choices=["gaia", "xbench-ds"]) + args.add_argument("--results_dirs", type=str, default=[]) + args.add_argument("--max_concurrent_requests", type=int, default=25) + args = args.parse_args() - # Use single or multiple directory mode based on whether results_dirs is defined above - results_dirs = RESULTS_DIRS + benchmark_name = args.benchmark + max_concurrent_requests = args.max_concurrent_requests + results_dirs = list(args.results_dirs.split(",")) # Use single or multiple directory mode based on whether results_dirs is defined above if results_dirs: # Multiple directories mode @@ -608,7 +614,7 @@ async def main( async def main_combined(): task_results = await process_tasks( - combined_dict, total_runs, max_concurrent_requests + benchmark_name, combined_dict, total_runs, max_concurrent_requests ) save_results(os.path.dirname(results_dirs[0]), task_results, total_runs)