MiroMindAI
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎.gitignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎config/agent_gaia-test.yaml‎
Lines changed: 75 additions & 0 deletions b/‎config/agent_gaia-test.yaml‎
Lines changed: 75 additions & 0 deletions
diff --git a/‎config/benchmark/futurex.yaml‎
Lines changed: 20 additions & 0 deletions b/‎config/benchmark/futurex.yaml‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎config/benchmark/gaia-test.yaml‎
Lines changed: 16 additions & 0 deletions b/‎config/benchmark/gaia-test.yaml‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎docs/mkdocs/docs/download_datasets.md‎
Lines changed: 2 additions & 0 deletions b/‎docs/mkdocs/docs/download_datasets.md‎
Lines changed: 2 additions & 0 deletions
@@ -208,6 +208,7 @@ marimo/_lsp/
 __marimo__/
 
 logs/
+tmp/
 
 data/*
 !data/README.md
 
@@ -0,0 +1,75 @@
+defaults:
+  - benchmark: gaia-test
+  - override hydra/job_logging: none
+  - _self_  # Allow defining variables at the top of this file
+
+
+main_agent:
+  prompt_class: MainAgentPrompt_GAIA
+  llm: 
+    provider_class: "ClaudeOpenRouterClient"
+    model_name: "anthropic/claude-3.7-sonnet"
+    async_client: true
+    temperature: 0.3
+    top_p: 0.95
+    min_p: 0.0
+    top_k: -1
+    max_tokens: 32000
+    openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}"
+    openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}"
+    openrouter_provider: "anthropic"
+    disable_cache_control: false
+    keep_tool_result: -1
+    oai_tool_thinking: false
+  
+  tool_config:
+    - tool-reasoning
+
+  max_turns: -1  # Maximum number of turns for main agent execution
+  max_tool_calls_per_turn: 10  # Maximum number of tool calls per turn
+  
+  input_process:
+    o3_hint: true
+  output_process:
+    o3_final_answer: true
+
+  openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for o3 hints and final answer extraction
+  add_message_id: true
+  keep_tool_result: -1
+  chinese_context: "${oc.env:CHINESE_CONTEXT,false}"
+
+
+sub_agents:
+  agent-worker:
+    prompt_class: SubAgentWorkerPrompt
+    llm: 
+      provider_class: "ClaudeOpenRouterClient"
+      model_name: "anthropic/claude-3.7-sonnet"
+      async_client: true
+      temperature: 0.3
+      top_p: 0.95
+      min_p: 0.0
+      top_k: -1
+      max_tokens: 32000
+      openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}"
+      openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}"
+      openrouter_provider: "anthropic"
+      disable_cache_control: false
+      keep_tool_result: -1
+      oai_tool_thinking: false
+    
+    tool_config:
+      - tool-searching
+      - tool-image-video
+      - tool-reading
+      - tool-code
+      - tool-audio
+
+    max_turns: -1  # Maximum number of turns for sub-agent execution
+    max_tool_calls_per_turn: 10  # Maximum number of tool calls per turn
+
+
+# Top-level or default parameters can be defined here
+output_dir: logs/
+data_dir: "${oc.env:DATA_DIR,data}"  # Points to where data is stored
+
@@ -0,0 +1,20 @@
+# config/benchmark/futurex.yaml
+defaults:
+  - default
+  - _self_
+
+name: "futurex"
+
+data:
+  data_dir: "${data_dir}/futurex"  # Path to your dataset
+  metadata_file: "standardized_data.jsonl"  # Metadata filename
+  whitelist: []  # Optional: List of specific task_ids to run
+
+execution:
+  max_tasks: null      # null = no limit, or specify a number
+  max_concurrent: 5    # Number of parallel tasks
+  pass_at_k: 1         # Number of attempts per task
+
+# Not a real API key: this placeholder is set to skip evaluation, since we don't have ground truth
+openai_api_key: "skip_evaluation"
+
@@ -0,0 +1,16 @@
+# config/benchmark/gaia-test.yaml
+defaults:
+  - default
+  - _self_
+
+name: "gaia-test"
+
+data:
+  data_dir: "${data_dir}/gaia-test"
+
+execution:
+  max_tasks: null  # null means no limit
+  max_concurrent: 10
+  pass_at_k: 1
+
+openai_api_key: "${oc.env:OPENAI_API_KEY,???}"
@@ -79,6 +79,7 @@ uv run main.py prepare-benchmark get browsecomp-test
 uv run main.py prepare-benchmark get browsecomp-zh-test
 uv run main.py prepare-benchmark get hle
 uv run main.py prepare-benchmark get xbench-ds
+uv run main.py prepare-benchmark get futurex
 ```
 
 ### What This Script Does
@@ -94,6 +95,7 @@ uv run main.py prepare-benchmark get xbench-ds
         - `browsecomp-zh-test` - Chinese BrowseComp test set
         - `hle` - HLE dataset
         - `xbench-ds` - xbench-DeepSearch dataset
+        - `futurex` - Futurex-Online dataset
 
 ### Customizing Dataset Selection