-
Notifications
You must be signed in to change notification settings - Fork 62
Open
Description
Output.jsonl
{"task_id": "scenario_universe_27_f1g6o1", "trace_id": "./runs/tmp_gaia2_results1_claude-sonnet-4-20250514/hf/scenario_universe_27_f1g6o1_28df1839.json", "score": 0.0, "metadata": {"scenario_id": "scenario_universe_27_f1g6o1", "status": "failed", "has_exception": false, "rationale": "Failure: Agent did not perform the following oracle tool call:\ntool name: AgentUserInterface__send_message_to_user\ntool args:\n-content: 17\n\nList of matching attempts:\n-Failure matching agent event (ID: Event-USER-b31d5a58-5d64-47cd-b497-56e7a651dcb7) with oracle event (ID: OracleEvent-AGENT-b33a80e3-ca77-494a-9614-964c4f241066), reason: already matched\n-Failure matching agent event (ID: AGENT-AgentUserInterface.send_message_to_user-1e61435c-224d-41c0-9484-9148124687e7) with oracle event (ID: OracleEvent-AGENT-b33a80e3-ca77-494a-9614-964c4f241066), reason: tool judge reject"}}
{"task_id": "scenario_universe_26_f4zp2l", "trace_id": "./runs/tmp_gaia2_results1_claude-sonnet-4-20250514/hf/scenario_universe_26_f4zp2l_28df1839.json", "score": 0.0, "metadata": {"scenario_id": "scenario_universe_26_f4zp2l", "status": "failed", "has_exception": false, "rationale": "Failure: Agent did not perform the following oracle tool call:\ntool name: AgentUserInterface__send_message_to_user\ntool args:\n-content: Lysverket\n\nList of matching attempts:\n-Failure matching agent event (ID: Event-USER-d22a9098-8f8e-4cd8-a13b-22ef69f5b90f) with oracle event (ID: OracleEvent-AGENT-b22c59d7-9b0a-4e92-8151-c3b7acf27718), reason: already matched\n-Failure matching agent event (ID: AGENT-AgentUserInterface.send_message_to_user-7f090cba-f859-4284-be4d-d0532fe26c50) with oracle event (ID: OracleEvent-AGENT-b22c59d7-9b0a-4e92-8151-c3b7acf27718), reason: tool judge reject"}}
{"task_id": "scenario_universe_23_5xzkat", "trace_id": "./runs/tmp_gaia2_results1_claude-sonnet-4-20250514/hf/scenario_universe_23_5xzkat_28df1839.json", "score": 1.0, "metadata": {"scenario_id": "scenario_universe_23_5xzkat", "status": "success", "has_exception": false, "rationale": "None"}}
{"task_id": "scenario_universe_24_lutben", "trace_id": "./runs/tmp_gaia2_results1_claude-sonnet-4-20250514/hf/scenario_universe_24_lutben_28df1839.json", "score": 0.0, "metadata": {"scenario_id": "scenario_universe_24_lutben", "status": "failed", "has_exception": false, "rationale": "Failure: Agent did not perform the following oracle tool call:\ntool name: AgentUserInterface__send_message_to_user\ntool args:\n-content: Swiss\n\nList of matching attempts:\n-Failure matching agent event (ID: Event-USER-9b199e7c-0b9f-419d-aa9f-8d87ac5bf648) with oracle event (ID: OracleEvent-AGENT-8a63cabe-09e9-4f95-a52c-547a4ffdd7ec), reason: already matched\n-Failure matching agent event (ID: AGENT-AgentUserInterface.send_message_to_user-ad69a220-1c5e-4600-8676-8e1cee6d8a96) with oracle event (ID: OracleEvent-AGENT-8a63cabe-09e9-4f95-a52c-547a4ffdd7ec), reason: tool judge reject"}}
{"task_id": "scenario_universe_24_yc23gu", "trace_id": "./runs/tmp_gaia2_results1_claude-sonnet-4-20250514/hf/scenario_universe_24_yc23gu_28df1839.json", "score": 0.0, "metadata": {"scenario_id": "scenario_universe_24_yc23gu", "status": "failed", "has_exception": true, "exception_type": "LLMEngineException", "exception_message": "Auth error in litellm.", "rationale": "Exception"}}
{"task_id": "scenario_universe_24_as82zr", "trace_id": "./runs/tmp_gaia2_results1_claude-sonnet-4-20250514/hf/scenario_universe_24_as82zr_28df1839.json", "score": 0.0, "metadata": {"scenario_id": "scenario_universe_24_as82zr", "status": "failed", "has_exception": true, "exception_type": "LLMEngineException", "exception_message": "Auth error in litellm.", "rationale": "Exception"}}
{"task_id": "scenario_universe_25_rj6iel", "trace_id": "./runs/tmp_gaia2_results1_claude-sonnet-4-20250514/hf/scenario_universe_25_rj6iel_28df1839.json", "score": 0.0, "metadata": {"scenario_id": "scenario_universe_25_rj6iel", "status": "failed", "has_exception": true, "exception_type": "LLMEngineException", "exception_message": "Auth error in litellm.", "rationale": "Exception"}}
{"task_id": "scenario_universe_25_ox2z4y", "trace_id": "./runs/tmp_gaia2_results1_claude-sonnet-4-20250514/hf/scenario_universe_25_ox2z4y_28df1839.json", "score": 0.0, "metadata": {"scenario_id": "scenario_universe_25_ox2z4y", "status": "failed", "has_exception": true, "exception_type": "LLMEngineException", "exception_message": "Auth error in litellm.", "rationale": "Exception"}}
{"task_id": "scenario_universe_23_ftmujb", "trace_id": "./runs/tmp_gaia2_results1_claude-sonnet-4-20250514/hf/scenario_universe_23_ftmujb_28df1839.json", "score": 0.0, "metadata": {"scenario_id": "scenario_universe_23_ftmujb", "status": "failed", "has_exception": true, "exception_type": "LLMEngineException", "exception_message": "Auth error in litellm.", "rationale": "Exception"}}
{"task_id": "scenario_universe_25_trmrih", "trace_id": "./runs/tmp_gaia2_results1_claude-sonnet-4-20250514/hf/scenario_universe_25_trmrih_28df1839.json", "score": 0.0, "metadata": {"scenario_id": "scenario_universe_25_trmrih", "status": "failed", "has_exception": true, "exception_type": "LLMEngineException", "exception_message": "Auth error in litellm.", "rationale": "Exception"}}
{"task_id": "scenario_universe_26_5efcgz", "trace_id": "./runs/tmp_gaia2_results1_claude-sonnet-4-20250514/hf/scenario_universe_26_5efcgz_28df1839.json", "score": 0.0, "metadata": {"scenario_id": "scenario_universe_26_5efcgz", "status": "failed", "has_exception": false, "rationale": "Failure: Agent did not perform the following oracle tool call:\ntool name: AgentUserInterface__send_message_to_user\ntool args:\n-content: Thailand\n\nList of matching attempts:\n-Failure matching agent event (ID: Event-USER-a97019e6-3141-417b-af6f-06eee8f10775) with oracle event (ID: OracleEvent-AGENT-a09d979c-3306-4014-8c03-2ef3d848ce61), reason: already matched\n-Failure matching agent event (ID: AGENT-AgentUserInterface.send_message_to_user-7e7cadd0-5c8b-4aa4-b8eb-a2eb8932b850) with oracle event (ID: OracleEvent-AGENT-a09d979c-3306-4014-8c03-2ef3d848ce61), reason: tool judge reject"}}
{"task_id": "scenario_universe_23_gu6jrl", "trace_id": "./runs/tmp_gaia2_results1_claude-sonnet-4-20250514/hf/scenario_universe_23_gu6jrl_28df1839.json", "score": 1.0, "metadata": {"scenario_id": "scenario_universe_23_gu6jrl", "status": "success", "has_exception": false, "rationale": "None"}}
{"task_id": "scenario_universe_28_4sn4lc", "trace_id": "./runs/tmp_gaia2_results1_claude-sonnet-4-20250514/hf/scenario_universe_28_4sn4lc_28df1839.json", "score": 0.0, "metadata": {"scenario_id": "scenario_universe_28_4sn4lc", "status": "failed", "has_exception": false, "rationale": "Failure: Agent did not perform the following oracle tool call:\ntool name: AgentUserInterface__send_message_to_user\ntool args:\n-content: 2700 sqft\n\nList of matching attempts:\n-Failure matching agent event (ID: Event-USER-e2c61d22-562b-4601-aacd-3098baadd870) with oracle event (ID: OracleEvent-AGENT-b4c804a1-c0d7-47d4-9a34-e2563b851597), reason: already matched\n-Failure matching agent event (ID: AGENT-AgentUserInterface.send_message_to_user-54879b1c-e070-4362-9444-ddcaca9e00b2) with oracle event (ID: OracleEvent-AGENT-b4c804a1-c0d7-47d4-9a34-e2563b851597), reason: tool judge reject"}}
{"task_id": "scenario_universe_21_bnrehm", "trace_id": "./runs/tmp_gaia2_results1_claude-sonnet-4-20250514/hf/scenario_universe_21_bnrehm_28df1839.json", "score": 0.0, "metadata": {"scenario_id": "scenario_universe_21_bnrehm", "status": "failed", "has_exception": false, "rationale": "Failure: Agent did not perform the following oracle tool call:\ntool name: AgentUserInterface__send_message_to_user\ntool args:\n-content: 23\n\nList of matching attempts:\n-Failure matching agent event (ID: Event-USER-fa6e5760-cb68-4168-9c9a-feb3558e7bb7) with oracle event (ID: OracleEvent-AGENT-3af0dea5-ff8b-4ab7-ad1f-bd9c85402b6d), reason: already matched\n-Failure matching agent event (ID: AGENT-AgentUserInterface.send_message_to_user-9e12d759-fa1c-4bc8-b2f6-c117160dbf0f) with oracle event (ID: OracleEvent-AGENT-3af0dea5-ff8b-4ab7-ad1f-bd9c85402b6d), reason: tool judge reject"}}
{"task_id": "scenario_universe_23_ans8nx", "trace_id": "./runs/tmp_gaia2_results1_claude-sonnet-4-20250514/hf/scenario_universe_23_ans8nx_28df1839.json", "score": 0.0, "metadata": {"scenario_id": "scenario_universe_23_ans8nx", "status": "failed", "has_exception": false, "rationale": "Failure: Agent did not perform the following oracle tool call:\ntool name: AgentUserInterface__send_message_to_user\ntool args:\n-content: 22\n\nList of matching attempts:\n-Failure matching agent event (ID: Event-USER-19251e5d-b27c-4cd5-a475-da469594ec7a) with oracle event (ID: OracleEvent-AGENT-7b865b49-69df-437c-9430-ef451b063149), reason: already matched\n-Failure matching agent event (ID: AGENT-AgentUserInterface.send_message_to_user-ab36e9d8-8075-4078-af2f-0a89f025505d) with oracle event (ID: OracleEvent-AGENT-7b865b49-69df-437c-9430-ef451b063149), reason: tool judge reject"}}
benchmark_stats.jsonl
{
"metadata": {
"model": "claude-sonnet-4-20250514",
"model_provider": "anthropic",
"timestamp": "2025-10-24T11:26:03.882146",
"report_version": "3.0"
},
"statistics": {
"per_capability": {
"search": {
"capability": "search",
"total_runs": 15,
"validated_runs": 15,
"success_runs": 2,
"failed_runs": 13,
"exception_runs": 0,
"no_validation_runs": 0,
"success_rate": 13.333333333333334,
"success_rate_std": 0.0,
"success_rate_sem": 0.0,
"pass_at_k": 2,
"pass_at_k_percent": 13.333333333333334,
"pass_k": 2,
"pass_k_percent": 13.333333333333334,
"total_scenarios": 15,
"avg_run_duration": 173.2953727722168,
"avg_run_duration_std": 73.9041321360137
}
},
"global": {
"total_runs": 15,
"validated_runs": 15,
"success_runs": 2,
"failed_runs": 13,
"exception_runs": 0,
"no_validation_runs": 0,
"pass_at_k": 2,
"pass_at_k_percent": 13.333333333333334,
"pass_k": 2,
"pass_k_percent": 13.333333333333334,
"total_scenarios": 15,
"macro_success_rate": 13.333333333333334,
"macro_success_rate_std": 0.0,
"macro_success_rate_sem": 0.0,
"micro_success_rate": 13.333333333333334,
"micro_success_rate_std": 0.0,
"micro_success_rate_sem": 0.0,
"avg_run_duration": 173.2953727722168,
"avg_run_duration_std": 73.9041321360137,
"job_duration": 368.54944801330566
}
},
"run_configurations": [
{
"phase_name": "standard",
"config": "search",
"a2a_app_prop": "0.0",
"has_tool_augmentation": "False",
"has_env_events": "False",
"total_runs": 15,
"validated_runs": 15,
"success_runs": 2,
"failed_runs": 13,
"exception_runs": 0,
"no_validation_runs": 0
}
]
}
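The exception counts do not match between the two files. Output.jsonl contains 6 runs with "has_exception": true (all LLMEngineException: "Auth error in litellm."), 7 runs that failed validation ("tool judge reject"), and 2 successes. benchmark_stats.jsonl, however, reports "exception_runs": 0 and "failed_runs": 13 for the same 15 runs, so the 6 exception runs are being counted as ordinary validated failures instead of exceptions.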
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels