From 507c2cf3139bdb945bf72a6a48ed25f93651a13c Mon Sep 17 00:00:00 2001 From: Dany Haddad Date: Thu, 12 Mar 2026 21:05:08 +0000 Subject: [PATCH 01/13] commit stuff that's ok to commit --- dvc.lock | 117 +++++++++++++++++++++++++++---------------------------- 1 file changed, 58 insertions(+), 59 deletions(-) diff --git a/dvc.lock b/dvc.lock index 53110fd4..6811b150 100644 --- a/dvc.lock +++ b/dvc.lock @@ -594,13 +594,13 @@ stages: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_storm uv run --extra storm --python 3.11 inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir dev_dvc_logs/solver_outputs/ --solver astabench/solvers/sqa/storm_solver.py@storm_solver - -T split=dev -T scorer_model=google/gemini-2.5-pro --limit=1000 --log-shared + -T split=dev -T scorer_model=google/gemini-3-flash-preview --limit=1000 --log-shared --no-score; mv "$(ls -t dev_dvc_logs/solver_outputs/*task_sqa_solver_storm.eval 2>/dev/null | head -n1)" "dev_dvc_logs/solver_outputs/task_sqa_solver_storm.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-pro + scorer_model: google/gemini-3-flash-preview outs: - path: dev_dvc_logs/solver_outputs/task_sqa_solver_storm.eval hash: md5 @@ -644,18 +644,18 @@ stages: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_elicit uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir dev_dvc_logs/solver_outputs/ --solver astabench/solvers/sqa/elicit/memorized_solver.py@elicit_solver -T split=dev - -T scorer_model=google/gemini-2.5-flash --limit=1000 --log-shared --no-score; + -T scorer_model=google/gemini-3-flash-preview --limit=1000 --log-shared --no-score; mv "$(ls -t dev_dvc_logs/solver_outputs/*task_sqa_solver_elicit.eval 2>/dev/null | head -n1)" "dev_dvc_logs/solver_outputs/task_sqa_solver_elicit.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview outs: - path: dev_dvc_logs/solver_outputs/task_sqa_solver_elicit.eval hash: md5 - md5: 8d08ed8249687b3d41a8e06d7533c852 - size: 4949142 + md5: 336b302907ace455f3b7457eaaee68b2 + size: 4445686 score_all_solvers@anthropic/claude-3-5-sonnet-20240620-test: cmd: echo "Scoring";[[ "anthropic/claude-3-5-sonnet-20240620" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_anthropic/claude-3-5-sonnet-20240620; @@ -885,13 +885,13 @@ stages: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_openai_deep_research uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ --solver astabench/solvers/sqa/general_memorized/memorized_solver.py@formatted_solver - -T scorer_model=google/gemini-2.5-flash -T split=test -S sys_name_or_path=openai_deep_research --limit=1000 + -T scorer_model=google/gemini-3-flash-preview -T split=test -S sys_name_or_path=openai_deep_research --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*openai_deep_research.eval 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_openai_deep_research.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview sqa_solver_version: may-23-2025 outs: - path: test_dvc_logs/solver_outputs/task_sqa_solver_openai_deep_research.eval @@ -943,7 +943,7 @@ stages: sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ --solver astabench/solvers/sqa/formatted_llm.py@formatted_solver --model openai/o4-mini --reasoning-effort high --reasoning-tokens 8192 -M responses_store=false --reasoning-history - none -T split=test -T scorer_model=google/gemini-2.5-pro -T excerpt_prompt=False + none -T split=test -T scorer_model=google/gemini-3-flash-preview -T excerpt_prompt=False --limit=1000 --retry-on-error=10 --log-shared --no-score; [[ "openai/o4-mini" == */* ]] && mkdir -p test_dvc_logs/solver_outputs/task_sqa_solver_$(dirname "openai/o4-mini"); mv "$(ls -t test_dvc_logs/solver_outputs/*openai/o4-mini.eval @@ -951,7 +951,7 @@ stages: params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-pro + scorer_model: google/gemini-3-flash-preview sqa_solver_version: may-23-2025 outs: - path: test_dvc_logs/solver_outputs/task_sqa_solver_openai/o4-mini.eval @@ -1017,18 +1017,18 @@ stages: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_elicit uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ --solver astabench/solvers/sqa/elicit/memorized_solver.py@elicit_solver -T split=test - -T scorer_model=google/gemini-2.5-flash --limit=1000 --log-shared --no-score; + -T scorer_model=google/gemini-3-flash-preview --limit=1000 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*task_sqa_solver_elicit.eval 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_elicit.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview outs: - path: test_dvc_logs/solver_outputs/task_sqa_solver_elicit.eval hash: md5 - md5: 1fc7402e0fd42a53863c6a6f859b9fa8 - size: 4895087 + md5: 16c2ebb4e4568a24b97b1742ecb517fc + size: 4091878 score_all_solvers@elicit-test: cmd: echo "Scoring";[[ "elicit" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_elicit; cp test_dvc_logs/solver_outputs/task_sqa_solver_elicit.eval test_dvc_logs/scored/task_sqa_solver_elicit.eval; @@ -1208,57 +1208,56 @@ stages: md5: 26bbc824e8055613e796a159b16932af size: 1171464 solve_memorized@model0-dev: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_fhouse_crow uv run --extra - sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir dev_dvc_logs/solver_outputs/ - --solver astabench/solvers/sqa/general_memorized/memorized_solver.py@formatted_solver - -T scorer_model=google/gemini-2.5-pro -T split=dev -S sys_name_or_path=fhouse_crow - -S require_snippets=false --limit=1000 --retry-on-error=10 --log-shared --no-score; - mv "$(ls -t dev_dvc_logs/solver_outputs/*fhouse_crow.eval 2>/dev/null | head - -n1)" "dev_dvc_logs/solver_outputs/task_sqa_solver_fhouse_crow.eval" + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_openai_deep_research uv run + --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir + dev_dvc_logs/solver_outputs/ --solver astabench/solvers/sqa/general_memorized/memorized_solver.py@formatted_solver + -T scorer_model=google/gemini-3-flash-preview -T split=dev -S sys_name_or_path=openai_deep_research --limit=1000 + --retry-on-error=10 --log-shared --no-score; mv "$(ls -t dev_dvc_logs/solver_outputs/*openai_deep_research.eval + 2>/dev/null | head -n1)" "dev_dvc_logs/solver_outputs/task_sqa_solver_openai_deep_research.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-pro + scorer_model: google/gemini-3-flash-preview sqa_solver_version: may-23-2025 outs: - - path: dev_dvc_logs/solver_outputs/task_sqa_solver_fhouse_crow.eval + - path: dev_dvc_logs/solver_outputs/task_sqa_solver_openai_deep_research.eval hash: md5 - md5: 4ff53a9992d562521979130e3bb36fcf - size: 1195651 + md5: b201f2f7bd9c83f6e3c901805f385d44 + size: 1195688 solve_perplexity_dr@test: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_perplexity_dr uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ -T with_search_tools=False --model 'perplexity/sonar-deep-research' --solver astabench/solvers/sqa/formatted_perplexity.py@formatted_solver -T split=test - -T scorer_model=google/gemini-2.5-flash --limit=1000 --log-shared --no-score; + -T scorer_model=google/gemini-3-flash-preview --limit=1000 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*task_sqa_solver_perplexity_dr.eval 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_perplexity_dr.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview outs: - path: test_dvc_logs/solver_outputs/task_sqa_solver_perplexity_dr.eval hash: md5 - md5: 253e478e5891269aaf0b598a9320fff9 - size: 1784027 + md5: c1ba04199b4b8b0b3adefccb02e22956 + size: 2027391 solve_perplexity_dr@dev: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_perplexity_dr uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir dev_dvc_logs/solver_outputs/ -T with_search_tools=False --model 'perplexity/sonar-deep-research' --solver astabench/solvers/sqa/formatted_perplexity.py@formatted_solver -T split=dev - -T scorer_model=google/gemini-2.5-flash --limit=1000 --log-shared --no-score; + -T scorer_model=google/gemini-3-flash-preview --limit=1000 --log-shared --no-score; mv "$(ls -t dev_dvc_logs/solver_outputs/*task_sqa_solver_perplexity_dr.eval 2>/dev/null | head -n1)" "dev_dvc_logs/solver_outputs/task_sqa_solver_perplexity_dr.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview outs: - path: dev_dvc_logs/solver_outputs/task_sqa_solver_perplexity_dr.eval hash: md5 - md5: 13761e458ca6d9fd68b66975a2b1c3be - size: 1187381 + md5: 8965cf43f6dc2e9a401b4d286637445d + size: 845996 solve_memorized@model2-dev: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_openai_deep_research uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir @@ -2180,18 +2179,18 @@ stages: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_you uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ --solver astabench/solvers/sqa/formatted_youcom.py@formatted_solver -S api_type='research' - -T split=test -T scorer_model=google/gemini-2.5-flash --limit=1000 --log-shared + -T split=test -T scorer_model=google/gemini-3-flash-preview --limit=1000 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*task_sqa_solver_you.eval 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_you.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview outs: - path: test_dvc_logs/solver_outputs/task_sqa_solver_you.eval hash: md5 - md5: e7be9b73b33e99893b170afb853a21ae - size: 1731090 + md5: d3d528dd45b27e5b9508a0087d978d87 + size: 1626349 create_nice_logs@google/gemini-2.5-pro-preview-03-25-test: cmd: echo "Creating logs"; [[ "google/gemini-2.5-pro-preview-03-25" == */* ]] && mkdir -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname "google/gemini-2.5-pro-preview-03-25"); @@ -2269,13 +2268,13 @@ stages: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_you uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir dev_dvc_logs/solver_outputs/ --solver astabench/solvers/sqa/formatted_youcom.py@formatted_solver -S api_type='research' - -T split=dev -T scorer_model=google/gemini-2.5-pro --limit=1000 --log-shared + -T split=dev -T scorer_model=google/gemini-3-flash-preview --limit=1000 --log-shared --no-score; mv "$(ls -t dev_dvc_logs/solver_outputs/*task_sqa_solver_you.eval 2>/dev/null | head -n1)" "dev_dvc_logs/solver_outputs/task_sqa_solver_you.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-pro + scorer_model: google/gemini-3-flash-preview outs: - path: dev_dvc_logs/solver_outputs/task_sqa_solver_you.eval hash: md5 @@ -2429,18 +2428,18 @@ stages: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_storm uv run --extra storm --python 3.11 inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ --solver astabench/solvers/sqa/storm_solver.py@storm_solver - -T split=test -T scorer_model=google/gemini-2.5-flash --limit=1000 --log-shared + -T split=test -T scorer_model=google/gemini-3-flash-preview --limit=1000 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*task_sqa_solver_storm.eval 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_storm.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview outs: - path: test_dvc_logs/solver_outputs/task_sqa_solver_storm.eval hash: md5 - md5: 21685170723948d1f8d89cf0ed71666d - size: 5802900 + md5: e9f0f203bcfba80faf017d7545d35ed6 + size: 4914841 create_nice_logs@anthropic/claude-sonnet-4-20250514-test: cmd: echo "Creating logs"; [[ "anthropic/claude-sonnet-4-20250514" == */* ]] && mkdir -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname "anthropic/claude-sonnet-4-20250514"); @@ -3639,19 +3638,19 @@ stages: solve_scispace@test: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_scispace uv run inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ - --solver astabench/solvers/sqa/scispace/scispace.py@formatted_solver -T scorer_model=google/gemini-2.5-flash + --solver astabench/solvers/sqa/scispace/scispace.py@formatted_solver -T scorer_model=google/gemini-3-flash-preview -T split=test --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*scispace.eval 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_scispace.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview sqa_solver_version: may-23-2025 outs: - path: test_dvc_logs/solver_outputs/task_sqa_solver_scispace.eval hash: md5 - md5: 8726b423e371a6f02ebb8edf044da131 - size: 13192395 + md5: e517fa1055570efb347dd6a924e478ae + size: 46039197 log_any_remaining_errors_and_record_scores@sqa_o3_high-test: cmd: echo "Collecting errors";[[ "sqa_o3_high" == */* ]] && mkdir -p test_dvc_logs/errors/task_sqa_solver_$(dirname "sqa_o3_high"); mkdir -p test_dvc_logs/scores/task_sqa_solver_$(dirname "sqa_o3_high"); @@ -3708,13 +3707,13 @@ stages: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_fhouse_crow uv run --extra futurehouse inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ --solver astabench/solvers/futurehouse/futurehouse_solver.py - -T scorer_model=google/gemini-2.5-flash -T split=test -S max_wait_time=900 -S - agent=CROW --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls + -T scorer_model=google/gemini-3-flash-preview -T split=test -S max_wait_time=900 + -S agent=CROW --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*fhouse_crow.eval 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_fhouse_crow.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview sqa_solver_version: may-23-2025 outs: - path: test_dvc_logs/solver_outputs/task_sqa_solver_fhouse_crow.eval @@ -3725,20 +3724,20 @@ stages: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_fhouse_falcon uv run --extra futurehouse inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ --solver astabench/solvers/futurehouse/futurehouse_solver.py - -T scorer_model=google/gemini-2.5-flash -T split=test -S max_wait_time=900 -S - agent=FALCON --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls - -t test_dvc_logs/solver_outputs/*fhouse_falcon.eval 2>/dev/null | head -n1)" - "test_dvc_logs/solver_outputs/task_sqa_solver_fhouse_falcon.eval" + -T scorer_model=google/gemini-3-flash-preview -T split=test -S max_wait_time=900 + -S agent=FALCON --limit=1000 --retry-on-error=10 --log-shared --no-score; mv + "$(ls -t test_dvc_logs/solver_outputs/*fhouse_falcon.eval 2>/dev/null | head + -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_fhouse_falcon.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview sqa_solver_version: may-23-2025 outs: - path: test_dvc_logs/solver_outputs/task_sqa_solver_fhouse_falcon.eval hash: md5 - md5: 973247fa294552e3807747362ef43b53 - size: 76892171 + md5: 1444e580204b243a0ca753f3c13bb97e + size: 8435874 solve_sqa@o3_high-test: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_sqa_o3_high uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ @@ -3760,13 +3759,13 @@ stages: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_openscholar uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ --solver astabench/solvers/sqa/openscholar/memorized_solver.py -S path=astabench/solvers/sqa/openscholar/openscholar_cache_test.json - -T scorer_model=google/gemini-2.5-flash -T split=test --limit=1000 --retry-on-error=10 + -T scorer_model=google/gemini-3-flash-preview -T split=test --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*openscholar.eval 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_openscholar.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview outs: - path: test_dvc_logs/solver_outputs/task_sqa_solver_openscholar.eval hash: md5 From 36864d72ac4a5227b4588846a313fb8c751fd2a1 Mon Sep 17 00:00:00 2001 From: Dany Haddad Date: Mon, 16 Mar 2026 20:45:18 +0000 Subject: [PATCH 02/13] Fix DVC solve runtime and refresh lockfile --- dvc.lock | 152 +++++++++++++++++++++++++++++-------------------- dvc.yaml | 42 +++++++------- pyproject.toml | 6 +- 3 files changed, 115 insertions(+), 85 deletions(-) diff --git a/dvc.lock b/dvc.lock index 6811b150..94d86627 100644 --- a/dvc.lock +++ b/dvc.lock @@ -816,71 +816,67 @@ stages: md5: 10496d398144a7a41e639823823d007f size: 1068696 solve_llm@model4-test: - cmd: - INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_anthropic/claude-3-7-sonnet-20250219 - uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain - --log-dir test_dvc_logs/solver_outputs/ --solver astabench/solvers/sqa/formatted_llm.py@formatted_solver - --model anthropic/claude-3-7-sonnet-20250219 -T split=test -T scorer_model=google/gemini-2.5-pro + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_openai/o3 uv run --project + agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display + plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/formatted_llm.py@formatted_solver + --model openai/o3 --reasoning-effort high --reasoning-tokens 8192 -M responses_store=false + --reasoning-history none -T split=test -T scorer_model=google/gemini-3-flash-preview -T excerpt_prompt=False --limit=1000 --retry-on-error=10 --log-shared --no-score; - [[ "anthropic/claude-3-7-sonnet-20250219" == */* ]] && mkdir -p test_dvc_logs/solver_outputs/task_sqa_solver_$(dirname - "anthropic/claude-3-7-sonnet-20250219"); mv "$(ls -t test_dvc_logs/solver_outputs/*anthropic/claude-3-7-sonnet-20250219.eval - 2>/dev/null | head -n1)" - "test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-3-7-sonnet-20250219.eval" + [[ "openai/o3" == */* ]] && mkdir -p test_dvc_logs/solver_outputs/task_sqa_solver_$(dirname + "openai/o3"); mv "$(ls -t test_dvc_logs/solver_outputs/*openai/o3.eval 2>/dev/null + | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_openai/o3.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-pro + scorer_model: google/gemini-3-flash-preview sqa_solver_version: may-23-2025 outs: - - path: - test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-3-7-sonnet-20250219.eval + - path: test_dvc_logs/solver_outputs/task_sqa_solver_openai/o3.eval hash: md5 - md5: a8fbef7fa3689f6859d79611e64981ed - size: 1850390 + md5: 5857c63cdab2b62a6f907c9a27a811ec + size: 17943556 solve_llm@model1-test: cmd: - INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_anthropic/claude-sonnet-4-20250514-thinking - uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain - --log-dir test_dvc_logs/solver_outputs/ --solver astabench/solvers/sqa/formatted_llm.py@formatted_solver - --model anthropic/claude-sonnet-4-20250514 --reasoning-tokens 8192 -T split=test - -T scorer_model=google/gemini-2.5-pro -T excerpt_prompt=False --limit=1000 --retry-on-error=10 - --log-shared --no-score; [[ "anthropic/claude-sonnet-4-20250514" == */* ]] && - mkdir -p test_dvc_logs/solver_outputs/task_sqa_solver_$(dirname "anthropic/claude-sonnet-4-20250514"); - mv "$(ls -t test_dvc_logs/solver_outputs/*anthropic/claude-sonnet-4-20250514-thinking.eval + INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_anthropic/claude-sonnet-4-6-thinking + uv run --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa + --display plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/formatted_llm.py@formatted_solver + --model anthropic/claude-sonnet-4-6 --reasoning-tokens 8192 -T split=test -T + scorer_model=google/gemini-3-flash-preview -T excerpt_prompt=False --limit=1000 + --retry-on-error=10 --log-shared --no-score; [[ "anthropic/claude-sonnet-4-6" + == */* ]] && mkdir -p test_dvc_logs/solver_outputs/task_sqa_solver_$(dirname + "anthropic/claude-sonnet-4-6"); mv "$(ls -t test_dvc_logs/solver_outputs/*anthropic/claude-sonnet-4-6-thinking.eval 2>/dev/null | head -n1)" - "test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-sonnet-4-20250514-thinking.eval" + "test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-pro + scorer_model: google/gemini-3-flash-preview sqa_solver_version: may-23-2025 outs: - path: - test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-sonnet-4-20250514-thinking.eval + test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking.eval hash: md5 - md5: 16646547ee30427da219b0983ca788de - size: 1923240 + md5: 3107b1f474692f8919a571cee6e6cb3f + size: 14844232 solve_llm@model3-test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_anthropic/claude-sonnet-4-20250514 - uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain - --log-dir test_dvc_logs/solver_outputs/ --solver astabench/solvers/sqa/formatted_llm.py@formatted_solver - --model anthropic/claude-sonnet-4-20250514 -T split=test -T scorer_model=google/gemini-2.5-pro + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_anthropic/claude-sonnet-4-6 + uv run --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa + --display plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/formatted_llm.py@formatted_solver + --model anthropic/claude-sonnet-4-6 -T split=test -T scorer_model=google/gemini-3-flash-preview -T excerpt_prompt=False --limit=1000 --retry-on-error=10 --log-shared --no-score; - [[ "anthropic/claude-sonnet-4-20250514" == */* ]] && mkdir -p test_dvc_logs/solver_outputs/task_sqa_solver_$(dirname - "anthropic/claude-sonnet-4-20250514"); mv "$(ls -t test_dvc_logs/solver_outputs/*anthropic/claude-sonnet-4-20250514.eval - 2>/dev/null | head -n1)" - "test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-sonnet-4-20250514.eval" + [[ "anthropic/claude-sonnet-4-6" == */* ]] && mkdir -p test_dvc_logs/solver_outputs/task_sqa_solver_$(dirname + "anthropic/claude-sonnet-4-6"); mv "$(ls -t test_dvc_logs/solver_outputs/*anthropic/claude-sonnet-4-6.eval + 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-sonnet-4-6.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-pro + scorer_model: google/gemini-3-flash-preview sqa_solver_version: may-23-2025 outs: - - path: - test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-sonnet-4-20250514.eval + - path: test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-sonnet-4-6.eval hash: md5 - md5: c509c6bcdd7f0692029d00e3f6187139 - size: 8161774 + md5: e521c24cb487f904c7d9aa56e8ab9ff9 + size: 13576196 solve_memorized@model0-test: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_openai_deep_research uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir @@ -917,27 +913,25 @@ stages: md5: 1444e580204b243a0ca753f3c13bb97e size: 8435874 solve_llm@model2-test: - cmd: - INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_google/gemini-2.5-pro-preview-03-25 - uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain - --log-dir test_dvc_logs/solver_outputs/ --solver astabench/solvers/sqa/formatted_llm.py@formatted_solver - --model google/gemini-2.5-pro-preview-03-25 -T split=test -T scorer_model=google/gemini-2.5-pro + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_google/gemini-3.1-pro-preview + uv run --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa + --display plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/formatted_llm.py@formatted_solver + --model google/gemini-3.1-pro-preview -T split=test -T scorer_model=google/gemini-3-flash-preview -T excerpt_prompt=False --limit=1000 --retry-on-error=10 --log-shared --no-score; - [[ "google/gemini-2.5-pro-preview-03-25" == */* ]] && mkdir -p test_dvc_logs/solver_outputs/task_sqa_solver_$(dirname - "google/gemini-2.5-pro-preview-03-25"); mv "$(ls -t test_dvc_logs/solver_outputs/*google/gemini-2.5-pro-preview-03-25.eval + [[ "google/gemini-3.1-pro-preview" == */* ]] && mkdir -p test_dvc_logs/solver_outputs/task_sqa_solver_$(dirname + "google/gemini-3.1-pro-preview"); mv "$(ls -t test_dvc_logs/solver_outputs/*google/gemini-3.1-pro-preview.eval 2>/dev/null | head -n1)" - "test_dvc_logs/solver_outputs/task_sqa_solver_google/gemini-2.5-pro-preview-03-25.eval" + "test_dvc_logs/solver_outputs/task_sqa_solver_google/gemini-3.1-pro-preview.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-pro + scorer_model: google/gemini-3-flash-preview sqa_solver_version: may-23-2025 outs: - - path: - test_dvc_logs/solver_outputs/task_sqa_solver_google/gemini-2.5-pro-preview-03-25.eval + - path: test_dvc_logs/solver_outputs/task_sqa_solver_google/gemini-3.1-pro-preview.eval hash: md5 - md5: 8ab9ea1ffa6fc82f71e82a796fae1ab1 - size: 2603396 + md5: 9270f1cbd5f606bd646579fe524ae0f4 + size: 22505579 solve_llm@model0-test: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_openai/o4-mini uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ @@ -3739,22 +3733,22 @@ stages: md5: 1444e580204b243a0ca753f3c13bb97e size: 8435874 solve_sqa@o3_high-test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_sqa_o3_high uv run --extra - sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ - --solver astabench/solvers/sqa/sqa.py@sqa_solver -T split=test -T scorer_model=google/gemini-2.5-flash - -S completion_model=o3_high --limit=1000 --retry-on-error=10 --log-shared --no-score; - mv "$(ls -t test_dvc_logs/solver_outputs/*o3_high.eval 2>/dev/null | head -n1)" - "test_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval" + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_sqa_o3_high uv run --project + agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display + plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/sqa.py@sqa_solver + -T split=test -T scorer_model=google/gemini-3-flash-preview -S completion_model=o3_high + --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*o3_high.eval + 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview sqa_solver_version: may-23-2025 outs: - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval hash: md5 - md5: 711b54ac018c9dc4f2cc190c3029c42e - size: 56666173 + md5: 36aa067ea0463775e8fb73f44750fbef + size: 6508188 solve_openscholar@test: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_openscholar uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ @@ -3848,3 +3842,37 @@ stages: hash: md5 md5: 86bd8ce6f2e7ceb394d8430ff43a6e37 size: 1372454 + solve_sqa@claude-4.6-test: + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_sqa_claude-4.6 uv run --project + agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display + plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/sqa.py@sqa_solver + -T split=test -T scorer_model=google/gemini-3-flash-preview -S completion_model=claude-4.6 + --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*claude-4.6.eval + 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-4.6.eval" + params: + params.yaml: + limit: 1000 + scorer_model: google/gemini-3-flash-preview + sqa_solver_version: may-23-2025 + outs: + - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-4.6.eval + hash: md5 + md5: 88e767e27d8d5a843dc16f5d5744f7d3 + size: 1576969 + solve_sqa@gemini-3.1-pro-preview-test: + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_sqa_gemini-3.1-pro-preview + uv run --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa + --display plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/sqa.py@sqa_solver + -T split=test -T scorer_model=google/gemini-3-flash-preview -S completion_model=gemini-3.1-pro-preview + --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*gemini-3.1-pro-preview.eval + 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval" + params: + params.yaml: + limit: 1000 + scorer_model: google/gemini-3-flash-preview + sqa_solver_version: may-23-2025 + outs: + - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval + hash: md5 + md5: 0c00ce90d2d86bd1aadbc1743aea07a9 + size: 1576951 diff --git a/dvc.yaml b/dvc.yaml index e0afd4e5..cfa5c8de 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -23,10 +23,10 @@ stages: # run the solver then rename the resulting file to have a nice name: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_sqa_${item.model} - uv run --extra sqa + uv run --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item.split}_dvc_logs/solver_outputs/ - --solver astabench/solvers/sqa/sqa.py@sqa_solver + --solver agent-baselines/agent_baselines/solvers/sqa/sqa.py@sqa_solver -T split=${item.split} -T scorer_model=${scorer_model} -S completion_model=${item.model} @@ -50,10 +50,10 @@ stages: do: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_elicit - uv run --extra sqa + uv run --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item}_dvc_logs/solver_outputs/ - --solver astabench/solvers/sqa/elicit/memorized_solver.py@elicit_solver + --solver agent-baselines/agent_baselines/solvers/sqa/elicit/memorized_solver.py@elicit_solver -T split=${item} -T scorer_model=${scorer_model} --limit=${limit} @@ -74,10 +74,10 @@ stages: do: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_you - uv run --extra sqa + uv run --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item}_dvc_logs/solver_outputs/ - --solver astabench/solvers/sqa/formatted_youcom.py@formatted_solver -S api_type='research' + --solver agent-baselines/agent_baselines/solvers/sqa/formatted_youcom.py@formatted_solver -S api_type='research' -T split=${item} -T scorer_model=${scorer_model} --limit=${limit} @@ -98,12 +98,12 @@ stages: do: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_perplexity_dr - uv run --extra sqa + uv run --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item}_dvc_logs/solver_outputs/ -T with_search_tools=False --model 'perplexity/sonar-deep-research' - --solver astabench/solvers/sqa/formatted_perplexity.py@formatted_solver + --solver agent-baselines/agent_baselines/solvers/sqa/formatted_perplexity.py@formatted_solver -T split=${item} -T scorer_model=${scorer_model} --limit=${limit} @@ -124,10 +124,10 @@ stages: do: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_storm - uv run --extra storm --python 3.11 + uv run --project agent-baselines/solvers/storm --python 3.11 --frozen -- inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item}_dvc_logs/solver_outputs/ - --solver astabench/solvers/sqa/storm_solver.py@storm_solver + --solver agent-baselines/agent_baselines/solvers/sqa/storm_solver.py@storm_solver -T split=${item} -T scorer_model=${scorer_model} --limit=${limit} @@ -148,10 +148,10 @@ stages: # run the solver then rename the resulting file to have a nice name: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_scispace - uv run + uv run --project agent-baselines inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item.split}_dvc_logs/solver_outputs/ - --solver astabench/solvers/sqa/scispace/scispace.py@formatted_solver + --solver agent-baselines/agent_baselines/solvers/sqa/scispace/scispace.py@formatted_solver -T scorer_model=${scorer_model} -T split=${item.split} --limit=${limit} @@ -174,11 +174,11 @@ stages: do: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_openscholar - uv run --extra sqa + uv run --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item}_dvc_logs/solver_outputs/ - --solver astabench/solvers/sqa/openscholar/memorized_solver.py - -S path=astabench/solvers/sqa/openscholar/openscholar_cache_${item}.json + --solver agent-baselines/agent_baselines/solvers/sqa/openscholar/memorized_solver.py + -S path=agent-baselines/agent_baselines/solvers/sqa/openscholar/openscholar_cache_${item}.json -T scorer_model=${scorer_model} -T split=${item} --limit=${limit} @@ -217,10 +217,10 @@ stages: # run the solver then rename the resulting file to have a nice name: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_${item.model.llm_name}${item.model.suffix} - uv run --extra sqa + uv run --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item.split}_dvc_logs/solver_outputs/ - --solver astabench/solvers/sqa/formatted_llm.py@formatted_solver + --solver agent-baselines/agent_baselines/solvers/sqa/formatted_llm.py@formatted_solver --model ${item.model.llm_name} ${item.model.llm_args} -T split=${item.split} -T scorer_model=${scorer_model} @@ -251,10 +251,10 @@ stages: # run the solver then rename the resulting file to have a nice name: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_${item.model.name} - uv run --extra futurehouse + uv run --project agent-baselines --extra futurehouse inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item.split}_dvc_logs/solver_outputs/ - --solver astabench/solvers/futurehouse/futurehouse_solver.py + --solver agent-baselines/agent_baselines/solvers/futurehouse/futurehouse_solver.py -T scorer_model=${scorer_model} -T split=${item.split} -S max_wait_time=900 ${item.model.solver_args} @@ -281,10 +281,10 @@ stages: # run the solver then rename the resulting file to have a nice name: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_${item.model.name} - uv run --extra sqa + uv run --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item.split}_dvc_logs/solver_outputs/ - --solver astabench/solvers/sqa/general_memorized/memorized_solver.py@formatted_solver + --solver agent-baselines/agent_baselines/solvers/sqa/general_memorized/memorized_solver.py@formatted_solver -T scorer_model=${scorer_model} -T split=${item.split} -S sys_name_or_path=${item.model.name} ${item.model.solver_args} diff --git a/pyproject.toml b/pyproject.toml index 85500ee9..a3ea681b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ version = "0.5.0" readme = "README.md" requires-python = ">=3.11" dependencies = [ - "inspect_ai==0.3.114", + "inspect_ai==0.3.143", "agent-eval==0.1.44", "openai>=1.78.0", # required by inspect "pydantic>=2.11.4", # required by inspect @@ -65,8 +65,10 @@ conflicts = [ [{extra = "storm"}, {extra = "smolagents"}], ] override-dependencies = [ + "inspect_ai==0.3.143", + # sqa pins openai to a lower version than inspect requires - "openai==1.78.0", + "openai==2.28.0", # STORM pretends to require a lower version, but doesn't actually need it: # https://github.com/allenai/asta-bench/issues/31#issuecomment-3045978008 From e23d08a53005826cc849f987962af477ce9772e3 Mon Sep 17 00:00:00 2001 From: Dany Haddad Date: Wed, 18 Mar 2026 21:36:24 +0000 Subject: [PATCH 03/13] Finish test scoring pipeline updates --- dvc.lock | 535 ++++++++++++++++++++++++++----------------------- dvc.yaml | 9 +- pyproject.toml | 2 + 3 files changed, 291 insertions(+), 255 deletions(-) diff --git a/dvc.lock b/dvc.lock index 94d86627..3ae27191 100644 --- a/dvc.lock +++ b/dvc.lock @@ -879,8 +879,9 @@ stages: size: 13576196 solve_memorized@model0-test: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_openai_deep_research uv run - --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir - test_dvc_logs/solver_outputs/ --solver astabench/solvers/sqa/general_memorized/memorized_solver.py@formatted_solver + --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa + --display plain --log-dir test_dvc_logs/solver_outputs/ --solver + agent-baselines/agent_baselines/solvers/sqa/general_memorized/memorized_solver.py@formatted_solver -T scorer_model=google/gemini-3-flash-preview -T split=test -S sys_name_or_path=openai_deep_research --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*openai_deep_research.eval 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_openai_deep_research.eval" @@ -933,13 +934,13 @@ stages: md5: 9270f1cbd5f606bd646579fe524ae0f4 size: 22505579 solve_llm@model0-test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_openai/o4-mini uv run --extra - sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ - --solver astabench/solvers/sqa/formatted_llm.py@formatted_solver --model openai/o4-mini - --reasoning-effort high --reasoning-tokens 8192 -M responses_store=false --reasoning-history - none -T split=test -T scorer_model=google/gemini-3-flash-preview -T excerpt_prompt=False - --limit=1000 --retry-on-error=10 --log-shared --no-score; [[ "openai/o4-mini" - == */* ]] && mkdir -p test_dvc_logs/solver_outputs/task_sqa_solver_$(dirname + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_openai/o4-mini uv run --project + agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display + plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/formatted_llm.py@formatted_solver + --model openai/o4-mini --reasoning-effort high --reasoning-tokens 8192 -M responses_store=false + --reasoning-history none -T split=test -T scorer_model=google/gemini-3-flash-preview + -T excerpt_prompt=False --limit=1000 --retry-on-error=10 --log-shared --no-score; + [[ "openai/o4-mini" == */* ]] && mkdir -p test_dvc_logs/solver_outputs/task_sqa_solver_$(dirname "openai/o4-mini"); mv "$(ls -t test_dvc_logs/solver_outputs/*openai/o4-mini.eval 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_openai/o4-mini.eval" params: @@ -1008,12 +1009,13 @@ stages: md5: 8e144b969d037a4509f5a8321d5f4092 size: 13573997 solve_elicit@test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_elicit uv run --extra sqa inspect - eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ - --solver astabench/solvers/sqa/elicit/memorized_solver.py@elicit_solver -T split=test - -T scorer_model=google/gemini-3-flash-preview --limit=1000 --log-shared --no-score; - mv "$(ls -t test_dvc_logs/solver_outputs/*task_sqa_solver_elicit.eval 2>/dev/null - | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_elicit.eval" + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_elicit uv run --project agent-baselines + --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir + test_dvc_logs/solver_outputs/ --solver + agent-baselines/agent_baselines/solvers/sqa/elicit/memorized_solver.py@elicit_solver + -T split=test -T scorer_model=google/gemini-3-flash-preview --limit=1000 --log-shared + --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*task_sqa_solver_elicit.eval + 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_elicit.eval" params: params.yaml: limit: 1000 @@ -1219,12 +1221,13 @@ stages: md5: b201f2f7bd9c83f6e3c901805f385d44 size: 1195688 solve_perplexity_dr@test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_perplexity_dr uv run --extra - sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ - -T with_search_tools=False --model 'perplexity/sonar-deep-research' --solver - astabench/solvers/sqa/formatted_perplexity.py@formatted_solver -T split=test - -T scorer_model=google/gemini-3-flash-preview --limit=1000 --log-shared --no-score; - mv "$(ls -t test_dvc_logs/solver_outputs/*task_sqa_solver_perplexity_dr.eval + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_perplexity_dr uv run --project + agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display + plain --log-dir test_dvc_logs/solver_outputs/ -T with_search_tools=False --model + 'perplexity/sonar-deep-research' --solver + agent-baselines/agent_baselines/solvers/sqa/formatted_perplexity.py@formatted_solver + -T split=test -T scorer_model=google/gemini-3-flash-preview --limit=1000 --log-shared + --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*task_sqa_solver_perplexity_dr.eval 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_perplexity_dr.eval" params: params.yaml: @@ -2170,11 +2173,11 @@ stages: md5: b45f5856e3195800b6a4f18ab0fa687e size: 4371131 solve_you@test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_you uv run --extra sqa inspect - eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ - --solver astabench/solvers/sqa/formatted_youcom.py@formatted_solver -S api_type='research' - -T split=test -T scorer_model=google/gemini-3-flash-preview --limit=1000 --log-shared - --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*task_sqa_solver_you.eval + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_you uv run --project agent-baselines + --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir + test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/formatted_youcom.py@formatted_solver + -S api_type='research' -T split=test -T scorer_model=google/gemini-3-flash-preview + --limit=1000 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*task_sqa_solver_you.eval 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_you.eval" params: params.yaml: @@ -2419,9 +2422,9 @@ stages: md5: c527f47dec43ca1a0bec00cb16249e22 size: 7676995 solve_storm@test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_storm uv run --extra storm - --python 3.11 inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir - test_dvc_logs/solver_outputs/ --solver astabench/solvers/sqa/storm_solver.py@storm_solver + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_storm uv run --project agent-baselines/solvers/storm + --python 3.11 --frozen -- inspect eval astabench/evals/sqa/task.py@sqa --display + plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/storm_solver.py@storm_solver -T split=test -T scorer_model=google/gemini-3-flash-preview --limit=1000 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*task_sqa_solver_storm.eval 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_storm.eval" @@ -2806,218 +2809,222 @@ stages: md5: 5982324c37031fd9317260f7b361b476 size: 16622239 score_all_solvers@model12-test: - cmd: echo "Scoring";[[ "storm" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_storm; - cp test_dvc_logs/solver_outputs/task_sqa_solver_storm.eval test_dvc_logs/scored/task_sqa_solver_storm.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-flash -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_storm.eval + cmd: echo "Scoring";[[ "fhouse_falcon" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_fhouse_falcon; + cp test_dvc_logs/solver_outputs/task_sqa_solver_fhouse_falcon.eval test_dvc_logs/scored/task_sqa_solver_fhouse_falcon.eval; + uv run inspect score --action overwrite --overwrite --display plain --scorer + astabench/evals/sqa/task.py@score_all -S scorer_model=google/gemini-3-flash-preview + -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_fhouse_falcon.eval deps: - - path: test_dvc_logs/solver_outputs/task_sqa_solver_storm.eval + - path: test_dvc_logs/solver_outputs/task_sqa_solver_fhouse_falcon.eval hash: md5 - md5: 8726b423e371a6f02ebb8edf044da131 - size: 13192395 + md5: 1444e580204b243a0ca753f3c13bb97e + size: 8435874 params: params.yaml: - scorer_model: google/gemini-2.5-flash - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - - path: test_dvc_logs/scored/task_sqa_solver_storm.eval + - path: test_dvc_logs/scored/task_sqa_solver_fhouse_falcon.eval hash: md5 - md5: b797a0b8f9b0d80449cd01a3e7c8771f - size: 33056563 + md5: 459492b0b2a7089f53616778fc2c1c0d + size: 103015187 score_all_solvers@model8-test: - cmd: echo "Scoring";[[ "sqa_claude-4.0" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_sqa_claude-4.0; - cp test_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-4.0.eval test_dvc_logs/scored/task_sqa_solver_sqa_claude-4.0.eval; + cmd: echo "Scoring";[[ "elicit" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_elicit; + cp test_dvc_logs/solver_outputs/task_sqa_solver_elicit.eval test_dvc_logs/scored/task_sqa_solver_elicit.eval; uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-flash -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_sqa_claude-4.0.eval + -S scorer_model=google/gemini-3-flash-preview -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_elicit.eval deps: - - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-4.0.eval + - path: test_dvc_logs/solver_outputs/task_sqa_solver_elicit.eval hash: md5 - md5: 3a3856bb7ba4c7fd8a6a4503ad86d95e - size: 7334514 + md5: 16c2ebb4e4568a24b97b1742ecb517fc + size: 4091878 params: params.yaml: - scorer_model: google/gemini-2.5-flash - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - - path: test_dvc_logs/scored/task_sqa_solver_sqa_claude-4.0.eval + - path: test_dvc_logs/scored/task_sqa_solver_elicit.eval hash: md5 - md5: f370d1cf06ce6a67539e9958993c426d - size: 33357446 + md5: 51808abf07604ba3a089c67d4cb76ea7 + size: 80175929 score_all_solvers@model11-test: - cmd: echo "Scoring";[[ "elicit" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_elicit; - cp test_dvc_logs/solver_outputs/task_sqa_solver_elicit.eval test_dvc_logs/scored/task_sqa_solver_elicit.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-flash -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_elicit.eval + cmd: echo "Scoring";[[ "fhouse_crow" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_fhouse_crow; + cp test_dvc_logs/solver_outputs/task_sqa_solver_fhouse_crow.eval test_dvc_logs/scored/task_sqa_solver_fhouse_crow.eval; + uv run inspect score --action overwrite --overwrite --display plain --scorer + astabench/evals/sqa/task.py@score_all -S scorer_model=google/gemini-3-flash-preview + -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_fhouse_crow.eval deps: - - path: test_dvc_logs/solver_outputs/task_sqa_solver_elicit.eval + - path: test_dvc_logs/solver_outputs/task_sqa_solver_fhouse_crow.eval hash: md5 - md5: 1fc7402e0fd42a53863c6a6f859b9fa8 - size: 4895087 + md5: 7ce6ee5a36b336b15929a5e35fc3e795 + size: 1443466 params: params.yaml: - scorer_model: google/gemini-2.5-flash - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - - path: test_dvc_logs/scored/task_sqa_solver_elicit.eval + - path: test_dvc_logs/scored/task_sqa_solver_fhouse_crow.eval hash: md5 - md5: eaf8f155d177afe6c3fb8b5913324458 - size: 15975606 + md5: 8d2965af1669a57a7896880bfaac6dc4 + size: 32595013 score_all_solvers@model13-test: - cmd: echo "Scoring";[[ "scispace" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_scispace; - cp test_dvc_logs/solver_outputs/task_sqa_solver_scispace.eval test_dvc_logs/scored/task_sqa_solver_scispace.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-flash -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_scispace.eval + cmd: echo "Scoring";[[ "openai_deep_research" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_openai_deep_research; + cp test_dvc_logs/solver_outputs/task_sqa_solver_openai_deep_research.eval test_dvc_logs/scored/task_sqa_solver_openai_deep_research.eval; + uv run inspect score --action overwrite --overwrite --display plain --scorer + astabench/evals/sqa/task.py@score_all -S scorer_model=google/gemini-3-flash-preview + -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_openai_deep_research.eval deps: - - path: test_dvc_logs/solver_outputs/task_sqa_solver_scispace.eval + - path: test_dvc_logs/solver_outputs/task_sqa_solver_openai_deep_research.eval hash: md5 - md5: e517fa1055570efb347dd6a924e478ae - size: 46039197 + md5: 0ca0403d04913c8d05c012bb013ed19e + size: 23480196 params: params.yaml: - scorer_model: google/gemini-2.5-flash - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - - path: test_dvc_logs/scored/task_sqa_solver_scispace.eval + - path: test_dvc_logs/scored/task_sqa_solver_openai_deep_research.eval hash: md5 - md5: e517fa1055570efb347dd6a924e478ae - size: 46039197 + md5: 7f31f17b142b52f16fad06d77aa4f92d + size: 148101879 score_all_solvers@model14-test: - cmd: echo "Scoring";[[ "fhouse_crow" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_fhouse_crow; - cp test_dvc_logs/solver_outputs/task_sqa_solver_fhouse_crow.eval test_dvc_logs/scored/task_sqa_solver_fhouse_crow.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-flash -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_fhouse_crow.eval + cmd: echo "Scoring";[[ "you" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_you; + cp test_dvc_logs/solver_outputs/task_sqa_solver_you.eval test_dvc_logs/scored/task_sqa_solver_you.eval; + uv run inspect score --action overwrite --overwrite --display plain --scorer + astabench/evals/sqa/task.py@score_all -S scorer_model=google/gemini-3-flash-preview + -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_you.eval deps: - - path: test_dvc_logs/solver_outputs/task_sqa_solver_fhouse_crow.eval + - path: test_dvc_logs/solver_outputs/task_sqa_solver_you.eval hash: md5 - md5: 7ce6ee5a36b336b15929a5e35fc3e795 - size: 1443466 + md5: d3d528dd45b27e5b9508a0087d978d87 + size: 1626349 params: params.yaml: - scorer_model: google/gemini-2.5-flash - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - - path: test_dvc_logs/scored/task_sqa_solver_fhouse_crow.eval + - path: test_dvc_logs/scored/task_sqa_solver_you.eval hash: md5 - md5: d3c2190ee23a647aed113be022cf072a - size: 17219690 + md5: 4bac04abcb7736cedef84ed56f783f14 + size: 44844932 score_all_solvers@model6-test: - cmd: echo "Scoring";[[ "anthropic/claude-3-5-sonnet-20240620" == */* ]] && mkdir - -p test_dvc_logs/scored/task_sqa_solver_anthropic/claude-3-5-sonnet-20240620; - cp - test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-3-5-sonnet-20240620.eval - test_dvc_logs/scored/task_sqa_solver_anthropic/claude-3-5-sonnet-20240620.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-pro -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_anthropic/claude-3-5-sonnet-20240620.eval + cmd: echo "Scoring";[[ "sqa_gemini-3.1-pro-preview" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_sqa_gemini-3.1-pro-preview; + cp test_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval + test_dvc_logs/scored/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval; uv run + inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all + -S scorer_model=google/gemini-3-flash-preview -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval deps: - - path: - test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-3-5-sonnet-20240620.eval + - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval hash: md5 - md5: 10496d398144a7a41e639823823d007f - size: 1068696 + md5: 0c00ce90d2d86bd1aadbc1743aea07a9 + size: 1576951 params: params.yaml: - scorer_model: google/gemini-2.5-pro - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - - path: test_dvc_logs/scored/task_sqa_solver_anthropic/claude-3-5-sonnet-20240620.eval + - path: test_dvc_logs/scored/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval hash: md5 - md5: 5cfa5f8d8daa3baf20773547034ea86f - size: 2103591 + md5: 67835012a4e9586b4bc685cc92d1f1ee + size: 1614108 score_all_solvers@model10-test: - cmd: echo "Scoring";[[ "sqa_o3_high" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_sqa_o3_high; - cp test_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval test_dvc_logs/scored/task_sqa_solver_sqa_o3_high.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-flash -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_sqa_o3_high.eval + cmd: echo "Scoring";[[ "scispace" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_scispace; + cp test_dvc_logs/solver_outputs/task_sqa_solver_scispace.eval test_dvc_logs/scored/task_sqa_solver_scispace.eval; + uv run inspect score --action overwrite --overwrite --display plain --scorer + astabench/evals/sqa/task.py@score_all -S scorer_model=google/gemini-3-flash-preview + -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_scispace.eval deps: - - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval + - path: test_dvc_logs/solver_outputs/task_sqa_solver_scispace.eval hash: md5 - md5: 711b54ac018c9dc4f2cc190c3029c42e - size: 56666173 + md5: e517fa1055570efb347dd6a924e478ae + size: 46039197 params: params.yaml: - scorer_model: google/gemini-2.5-flash - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - - path: test_dvc_logs/scored/task_sqa_solver_sqa_o3_high.eval + - path: test_dvc_logs/scored/task_sqa_solver_scispace.eval hash: md5 - md5: 711b54ac018c9dc4f2cc190c3029c42e - size: 56666173 + md5: 65d4fcc288c888ae6d4489ada3e83b98 + size: 188811614 score_all_solvers@model15-test: - cmd: echo "Scoring";[[ "fhouse_falcon" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_fhouse_falcon; - cp test_dvc_logs/solver_outputs/task_sqa_solver_fhouse_falcon.eval test_dvc_logs/scored/task_sqa_solver_fhouse_falcon.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-flash -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_fhouse_falcon.eval + cmd: echo "Scoring";[[ "perplexity_dr" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_perplexity_dr; + cp test_dvc_logs/solver_outputs/task_sqa_solver_perplexity_dr.eval test_dvc_logs/scored/task_sqa_solver_perplexity_dr.eval; + uv run inspect score --action overwrite --overwrite --display plain --scorer + astabench/evals/sqa/task.py@score_all -S scorer_model=google/gemini-3-flash-preview + -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_perplexity_dr.eval deps: - - path: test_dvc_logs/solver_outputs/task_sqa_solver_fhouse_falcon.eval + - path: test_dvc_logs/solver_outputs/task_sqa_solver_perplexity_dr.eval hash: md5 - md5: 973247fa294552e3807747362ef43b53 - size: 76892171 + md5: c1ba04199b4b8b0b3adefccb02e22956 + size: 2027391 params: params.yaml: - scorer_model: google/gemini-2.5-flash - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - - path: test_dvc_logs/scored/task_sqa_solver_fhouse_falcon.eval + - path: test_dvc_logs/scored/task_sqa_solver_perplexity_dr.eval hash: md5 - md5: 27f869df37299d6895881c95fb5a38d9 - size: 78478412 + md5: 5bfdc636401908df6214c3f4007cfb10 + size: 38757428 score_all_solvers@model9-test: - cmd: echo "Scoring";[[ "sqa_gemini-2.5-pro" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_sqa_gemini-2.5-pro; - cp test_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-2.5-pro.eval test_dvc_logs/scored/task_sqa_solver_sqa_gemini-2.5-pro.eval; + cmd: echo "Scoring";[[ "storm" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_storm; + cp test_dvc_logs/solver_outputs/task_sqa_solver_storm.eval test_dvc_logs/scored/task_sqa_solver_storm.eval; uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-flash -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_sqa_gemini-2.5-pro.eval + -S scorer_model=google/gemini-3-flash-preview -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_storm.eval deps: - - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-2.5-pro.eval + - path: test_dvc_logs/solver_outputs/task_sqa_solver_storm.eval hash: md5 - md5: ab04845b349e04dae643d06d08ab208f - size: 7031013 + md5: e9f0f203bcfba80faf017d7545d35ed6 + size: 4914841 params: params.yaml: - scorer_model: google/gemini-2.5-flash - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - - path: test_dvc_logs/scored/task_sqa_solver_sqa_gemini-2.5-pro.eval + - path: test_dvc_logs/scored/task_sqa_solver_storm.eval hash: md5 - md5: 533fd1f0d777e38f3fe4bf5088bd1b0f - size: 19418222 + md5: b9d05a878d9729fe9e0129e6d06510a0 + size: 110244305 score_all_solvers@model7-test: - cmd: echo "Scoring";[[ "sqa_claude-3.7" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_sqa_claude-3.7; - cp test_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-3.7.eval test_dvc_logs/scored/task_sqa_solver_sqa_claude-3.7.eval; + cmd: echo "Scoring";[[ "sqa_o3_high" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_sqa_o3_high; + cp test_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval test_dvc_logs/scored/task_sqa_solver_sqa_o3_high.eval; uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-flash -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_sqa_claude-3.7.eval + -S scorer_model=google/gemini-3-flash-preview -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_sqa_o3_high.eval deps: - - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-3.7.eval + - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval hash: md5 - md5: 123be282c9eafe56c554aa9b5b1e995c - size: 10829064 + md5: 36aa067ea0463775e8fb73f44750fbef + size: 6508188 params: params.yaml: - scorer_model: google/gemini-2.5-flash - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - - path: test_dvc_logs/scored/task_sqa_solver_sqa_claude-3.7.eval + - path: test_dvc_logs/scored/task_sqa_solver_sqa_o3_high.eval hash: md5 - md5: c8acd76fb628d608bb72a1f9a6a57572 - size: 29443230 + md5: c17ca82096f3c4174f2f406a7206100b + size: 6837685 score_all_solvers@model16-test: - cmd: echo "Scoring";[[ "openai_deep_research" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_openai_deep_research; - cp test_dvc_logs/solver_outputs/task_sqa_solver_openai_deep_research.eval test_dvc_logs/scored/task_sqa_solver_openai_deep_research.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-flash -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_openai_deep_research.eval + cmd: echo "Scoring";[[ "openscholar" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_openscholar; + cp test_dvc_logs/solver_outputs/task_sqa_solver_openscholar.eval test_dvc_logs/scored/task_sqa_solver_openscholar.eval; + uv run inspect score --action overwrite --overwrite --display plain --scorer + astabench/evals/sqa/task.py@score_all -S scorer_model=google/gemini-3-flash-preview + -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_openscholar.eval deps: - - path: test_dvc_logs/solver_outputs/task_sqa_solver_openai_deep_research.eval + - path: test_dvc_logs/solver_outputs/task_sqa_solver_openscholar.eval hash: md5 - md5: e7be9b73b33e99893b170afb853a21ae - size: 1731090 + md5: 2c5b13dc69496592aa3990bbd92cdece + size: 1807380 params: params.yaml: - scorer_model: google/gemini-2.5-flash - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - - path: test_dvc_logs/scored/task_sqa_solver_openai_deep_research.eval + - path: test_dvc_logs/scored/task_sqa_solver_openscholar.eval hash: md5 - md5: 97f9ff45622f40bec85ecdf7de74a68f - size: 3947510 + md5: 218f0f063fc75934d9cc6f80d2f663f6 + size: 25936097 score_all_solvers@model9-dev: cmd: echo "Scoring";[[ "sqa_gemini-2.5-pro" == */* ]] && mkdir -p dev_dvc_logs/scored/task_sqa_solver_sqa_gemini-2.5-pro; cp dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-2.5-pro.eval dev_dvc_logs/scored/task_sqa_solver_sqa_gemini-2.5-pro.eval; @@ -3212,7 +3219,7 @@ stages: cmd: echo "Scoring";[[ "openai/o4-mini" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_openai/o4-mini; cp test_dvc_logs/solver_outputs/task_sqa_solver_openai/o4-mini.eval test_dvc_logs/scored/task_sqa_solver_openai/o4-mini.eval; uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-flash -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_openai/o4-mini.eval + -S scorer_model=google/gemini-3-flash-preview -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_openai/o4-mini.eval deps: - path: test_dvc_logs/solver_outputs/task_sqa_solver_openai/o4-mini.eval hash: md5 @@ -3220,127 +3227,114 @@ stages: size: 1088436 params: params.yaml: - scorer_model: google/gemini-2.5-flash - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - path: test_dvc_logs/scored/task_sqa_solver_openai/o4-mini.eval hash: md5 - md5: 707491ba3f460b10b740acc4f18fc009 - size: 2620767 + md5: 3528e6d68199489705ef51e91170387c + size: 26134288 score_all_solvers@model5-test: - cmd: echo "Scoring";[[ "anthropic/claude-3-7-sonnet-20250219" == */* ]] && mkdir - -p test_dvc_logs/scored/task_sqa_solver_anthropic/claude-3-7-sonnet-20250219; - cp - test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-3-7-sonnet-20250219.eval - test_dvc_logs/scored/task_sqa_solver_anthropic/claude-3-7-sonnet-20250219.eval; + cmd: echo "Scoring";[[ "sqa_claude-4.6" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_sqa_claude-4.6; + cp test_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-4.6.eval test_dvc_logs/scored/task_sqa_solver_sqa_claude-4.6.eval; uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-pro -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_anthropic/claude-3-7-sonnet-20250219.eval + -S scorer_model=google/gemini-3-flash-preview -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_sqa_claude-4.6.eval deps: - - path: - test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-3-7-sonnet-20250219.eval + - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-4.6.eval hash: md5 - md5: a8fbef7fa3689f6859d79611e64981ed - size: 1850390 + md5: 88e767e27d8d5a843dc16f5d5744f7d3 + size: 1576969 params: params.yaml: - scorer_model: google/gemini-2.5-pro - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - - path: test_dvc_logs/scored/task_sqa_solver_anthropic/claude-3-7-sonnet-20250219.eval + - path: test_dvc_logs/scored/task_sqa_solver_sqa_claude-4.6.eval hash: md5 - md5: c284c5abc8f064b1a43eadfb1d743639 - size: 3734749 + md5: 4677c9f160a9eae4cc068a0e83c9aee4 + size: 1614239 score_all_solvers@model3-test: - cmd: echo "Scoring";[[ "google/gemini-2.5-pro-preview-03-25" == */* ]] && mkdir - -p test_dvc_logs/scored/task_sqa_solver_google/gemini-2.5-pro-preview-03-25; - cp - test_dvc_logs/solver_outputs/task_sqa_solver_google/gemini-2.5-pro-preview-03-25.eval - test_dvc_logs/scored/task_sqa_solver_google/gemini-2.5-pro-preview-03-25.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-flash -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_google/gemini-2.5-pro-preview-03-25.eval + cmd: echo "Scoring";[[ "google/gemini-3.1-pro-preview" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_google/gemini-3.1-pro-preview; + cp test_dvc_logs/solver_outputs/task_sqa_solver_google/gemini-3.1-pro-preview.eval + test_dvc_logs/scored/task_sqa_solver_google/gemini-3.1-pro-preview.eval; uv + run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all + -S scorer_model=google/gemini-3-flash-preview -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_google/gemini-3.1-pro-preview.eval deps: - - path: - test_dvc_logs/solver_outputs/task_sqa_solver_google/gemini-2.5-pro-preview-03-25.eval + - path: test_dvc_logs/solver_outputs/task_sqa_solver_google/gemini-3.1-pro-preview.eval hash: md5 - md5: 8ab9ea1ffa6fc82f71e82a796fae1ab1 - size: 2603396 + md5: 9270f1cbd5f606bd646579fe524ae0f4 + size: 22505579 params: params.yaml: - scorer_model: google/gemini-2.5-flash - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - - path: test_dvc_logs/scored/task_sqa_solver_google/gemini-2.5-pro-preview-03-25.eval + - path: test_dvc_logs/scored/task_sqa_solver_google/gemini-3.1-pro-preview.eval hash: md5 - md5: 0b1ca179eb7130488957688c0f1d45b3 - size: 3855948 + md5: 7c3c78380c1953a6ce60ca8a81e75bdb + size: 53929324 score_all_solvers@model1-test: cmd: echo "Scoring";[[ "openai/o3" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_openai/o3; cp test_dvc_logs/solver_outputs/task_sqa_solver_openai/o3.eval test_dvc_logs/scored/task_sqa_solver_openai/o3.eval; uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-flash -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_openai/o3.eval + -S scorer_model=google/gemini-3-flash-preview -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_openai/o3.eval deps: - path: test_dvc_logs/solver_outputs/task_sqa_solver_openai/o3.eval hash: md5 - md5: 9b53b134e0185678f65552a598ed13b9 - size: 1558577 + md5: 5857c63cdab2b62a6f907c9a27a811ec + size: 17943556 params: params.yaml: - scorer_model: google/gemini-2.5-flash - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - path: test_dvc_logs/scored/task_sqa_solver_openai/o3.eval hash: md5 - md5: 335d405c1d2a62ae5a02be07dd9b27aa - size: 3231962 + md5: 0c6131316b8cc2004b48de09885b1ff8 + size: 64063525 score_all_solvers@model4-test: - cmd: echo "Scoring";[[ "anthropic/claude-sonnet-4-20250514" == */* ]] && mkdir - -p test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-20250514; - cp - test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-sonnet-4-20250514.eval - test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-20250514.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-flash -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-20250514.eval + cmd: echo "Scoring";[[ "anthropic/claude-sonnet-4-6" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-6; + cp test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-sonnet-4-6.eval + test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-6.eval; uv run + inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all + -S scorer_model=google/gemini-3-flash-preview -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-6.eval deps: - - path: - test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-sonnet-4-20250514.eval + - path: test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-sonnet-4-6.eval hash: md5 - md5: c509c6bcdd7f0692029d00e3f6187139 - size: 8161774 + md5: e521c24cb487f904c7d9aa56e8ab9ff9 + size: 13576196 params: params.yaml: - scorer_model: google/gemini-2.5-flash - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - - path: test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-20250514.eval + - path: test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-6.eval hash: md5 - md5: 504299ea10b129ab40af8d9842f6f38d - size: 10008164 + md5: 28ffc1ba7ce9a349ec95ca854b5f1b3d + size: 51159281 score_all_solvers@model2-test: - cmd: echo "Scoring";[[ "anthropic/claude-sonnet-4-20250514-thinking" == */* ]] - && mkdir -p - test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-20250514-thinking; + cmd: echo "Scoring";[[ "anthropic/claude-sonnet-4-6-thinking" == */* ]] && mkdir + -p test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking; cp - test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-sonnet-4-20250514-thinking.eval - test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-20250514-thinking.eval; + test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking.eval + test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking.eval; uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-flash -S is_retrieverless=true - test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-20250514-thinking.eval + -S scorer_model=google/gemini-3-flash-preview -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking.eval deps: - path: - test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-sonnet-4-20250514-thinking.eval + test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking.eval hash: md5 - md5: 16646547ee30427da219b0983ca788de - size: 1923240 + md5: 3107b1f474692f8919a571cee6e6cb3f + size: 14844232 params: params.yaml: - scorer_model: google/gemini-2.5-flash - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - - path: - test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-20250514-thinking.eval + - path: test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking.eval hash: md5 - md5: 976cfd4e91bc6d97827ed4c6264f146c - size: 3797794 + md5: 89dff466d5b840c822f1730509db6c7e + size: 61631233 log_any_remaining_errors_and_record_scores@sqa_claude-4.0-dev: cmd: echo "Collecting errors";[[ "sqa_claude-4.0" == */* ]] && mkdir -p dev_dvc_logs/errors/task_sqa_solver_$(dirname "sqa_claude-4.0"); mkdir -p dev_dvc_logs/scores/task_sqa_solver_$(dirname "sqa_claude-4.0"); @@ -3630,11 +3624,13 @@ stages: md5: 071ed9683a227563dfd4630061740e98 size: 258 solve_scispace@test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_scispace uv run inspect eval - astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ - --solver astabench/solvers/sqa/scispace/scispace.py@formatted_solver -T scorer_model=google/gemini-3-flash-preview - -T split=test --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls - -t test_dvc_logs/solver_outputs/*scispace.eval 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_scispace.eval" + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_scispace uv run --project agent-baselines + inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ + --solver + agent-baselines/agent_baselines/solvers/sqa/scispace/scispace.py@formatted_solver + -T scorer_model=google/gemini-3-flash-preview -T split=test --limit=1000 --retry-on-error=10 + --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*scispace.eval + 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_scispace.eval" params: params.yaml: limit: 1000 @@ -3698,9 +3694,9 @@ stages: md5: eb8e7ffc96608f726ce82345c217179f size: 3783641 solve_futurehouse@model0-test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_fhouse_crow uv run --extra - futurehouse inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir - test_dvc_logs/solver_outputs/ --solver astabench/solvers/futurehouse/futurehouse_solver.py + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_fhouse_crow uv run --project + agent-baselines --extra futurehouse inspect eval astabench/evals/sqa/task.py@sqa + --display plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/futurehouse/futurehouse_solver.py -T scorer_model=google/gemini-3-flash-preview -T split=test -S max_wait_time=900 -S agent=CROW --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*fhouse_crow.eval 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_fhouse_crow.eval" @@ -3715,9 +3711,9 @@ stages: md5: 7ce6ee5a36b336b15929a5e35fc3e795 size: 1443466 solve_futurehouse@model1-test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_fhouse_falcon uv run --extra - futurehouse inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir - test_dvc_logs/solver_outputs/ --solver astabench/solvers/futurehouse/futurehouse_solver.py + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_fhouse_falcon uv run --project + agent-baselines --extra futurehouse inspect eval astabench/evals/sqa/task.py@sqa + --display plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/futurehouse/futurehouse_solver.py -T scorer_model=google/gemini-3-flash-preview -T split=test -S max_wait_time=900 -S agent=FALCON --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*fhouse_falcon.eval 2>/dev/null | head @@ -3750,9 +3746,11 @@ stages: md5: 36aa067ea0463775e8fb73f44750fbef size: 6508188 solve_openscholar@test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_openscholar uv run --extra - sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ - --solver astabench/solvers/sqa/openscholar/memorized_solver.py -S path=astabench/solvers/sqa/openscholar/openscholar_cache_test.json + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_openscholar uv run --project + agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display + plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/openscholar/memorized_solver.py + -S + path=agent-baselines/agent_baselines/solvers/sqa/openscholar/openscholar_cache_test.json -T scorer_model=google/gemini-3-flash-preview -T split=test --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*openscholar.eval 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_openscholar.eval" @@ -3876,3 +3874,38 @@ stages: hash: md5 md5: 0c00ce90d2d86bd1aadbc1743aea07a9 size: 1576951 + solve_futurehouse@model0-dev: + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_fhouse_crow uv run --project + agent-baselines --extra futurehouse inspect eval astabench/evals/sqa/task.py@sqa + --display plain --log-dir dev_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/futurehouse/futurehouse_solver.py + -T scorer_model=google/gemini-3-flash-preview -T split=dev -S max_wait_time=900 + -S agent=CROW --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls + -t dev_dvc_logs/solver_outputs/*fhouse_crow.eval 2>/dev/null | head -n1)" "dev_dvc_logs/solver_outputs/task_sqa_solver_fhouse_crow.eval" + params: + params.yaml: + limit: 1000 + scorer_model: google/gemini-3-flash-preview + sqa_solver_version: may-23-2025 + outs: + - path: dev_dvc_logs/solver_outputs/task_sqa_solver_fhouse_crow.eval + hash: md5 + md5: 4ff53a9992d562521979130e3bb36fcf + size: 1195651 + solve_futurehouse@model1-dev: + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_fhouse_falcon uv run --project + agent-baselines --extra futurehouse inspect eval astabench/evals/sqa/task.py@sqa + --display plain --log-dir dev_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/futurehouse/futurehouse_solver.py + -T scorer_model=google/gemini-3-flash-preview -T split=dev -S max_wait_time=900 + -S agent=FALCON --limit=1000 --retry-on-error=10 --log-shared --no-score; mv + "$(ls -t dev_dvc_logs/solver_outputs/*fhouse_falcon.eval 2>/dev/null | head + -n1)" "dev_dvc_logs/solver_outputs/task_sqa_solver_fhouse_falcon.eval" + params: + params.yaml: + limit: 1000 + scorer_model: google/gemini-3-flash-preview + sqa_solver_version: may-23-2025 + outs: + - path: dev_dvc_logs/solver_outputs/task_sqa_solver_fhouse_falcon.eval + hash: md5 + md5: 2e13a36648af258fdb5bc75d71b00742 + size: 1195707 diff --git a/dvc.yaml b/dvc.yaml index cfa5c8de..8b4317d0 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -317,7 +317,7 @@ stages: # sqa solvers: - name: sqa_claude-4.6 is_retrieverless: false - - name: sqa_gemini-3.1 + - name: sqa_gemini-3.1-pro-preview is_retrieverless: false - name: sqa_o3_high is_retrieverless: false @@ -349,6 +349,7 @@ stages: cp ${item.split}_dvc_logs/solver_outputs/task_sqa_solver_${item.model.name}.eval ${item.split}_dvc_logs/scored/task_sqa_solver_${item.model.name}.eval; uv run inspect score + --action overwrite --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all @@ -373,7 +374,7 @@ stages: - google/gemini-3.1-pro-preview - anthropic/claude-sonnet-4-6 - sqa_claude-4.6 - - sqa_gemini-3.1 + - sqa_gemini-3.1-pro-preview - sqa_o3_high - elicit - storm @@ -407,7 +408,7 @@ stages: - google/gemini-3.1-pro-preview - anthropic/claude-sonnet-4-6 - sqa_claude-4.6 - - sqa_gemini-3.1 + - sqa_gemini-3.1-pro-preview - sqa_o3_high - elicit - storm @@ -439,7 +440,7 @@ stages: - google/gemini-3.1-pro-preview - anthropic/claude-sonnet-4-6 - sqa_claude-4.6 - - sqa_gemini-3.1 + - sqa_gemini-3.1-pro-preview - sqa_o3_high - elicit - storm diff --git a/pyproject.toml b/pyproject.toml index a3ea681b..6d535920 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,6 +66,8 @@ conflicts = [ ] override-dependencies = [ "inspect_ai==0.3.143", + "anthropic==0.85.0", + "google-genai==1.67.0", # sqa pins openai to a lower version than inspect requires "openai==2.28.0", From 9b7b4c932569ae9a0a9b806821ec5c4a69d671fe Mon Sep 17 00:00:00 2001 From: Dany Haddad Date: Thu, 19 Mar 2026 17:05:21 +0000 Subject: [PATCH 04/13] Restore shared uv env for DVC pipeline --- dvc.lock | 136 ++++++++++++++++++++++++++----------------------- dvc.yaml | 20 ++++---- pyproject.toml | 10 ++++ 3 files changed, 91 insertions(+), 75 deletions(-) diff --git a/dvc.lock b/dvc.lock index 3ae27191..d585a193 100644 --- a/dvc.lock +++ b/dvc.lock @@ -816,9 +816,9 @@ stages: md5: 10496d398144a7a41e639823823d007f size: 1068696 solve_llm@model4-test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_openai/o3 uv run --project - agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display - plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/formatted_llm.py@formatted_solver + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_openai/o3 uv run --extra sqa + inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ + --solver agent-baselines/agent_baselines/solvers/sqa/formatted_llm.py@formatted_solver --model openai/o3 --reasoning-effort high --reasoning-tokens 8192 -M responses_store=false --reasoning-history none -T split=test -T scorer_model=google/gemini-3-flash-preview -T excerpt_prompt=False --limit=1000 --retry-on-error=10 --log-shared --no-score; @@ -838,8 +838,8 @@ stages: solve_llm@model1-test: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_anthropic/claude-sonnet-4-6-thinking - uv run --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa - --display plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/formatted_llm.py@formatted_solver + uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain + --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/formatted_llm.py@formatted_solver --model anthropic/claude-sonnet-4-6 --reasoning-tokens 8192 -T split=test -T scorer_model=google/gemini-3-flash-preview -T excerpt_prompt=False --limit=1000 --retry-on-error=10 --log-shared --no-score; [[ "anthropic/claude-sonnet-4-6" @@ -860,8 +860,8 @@ stages: size: 14844232 solve_llm@model3-test: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_anthropic/claude-sonnet-4-6 - uv run --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa - --display plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/formatted_llm.py@formatted_solver + uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain + --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/formatted_llm.py@formatted_solver --model anthropic/claude-sonnet-4-6 -T split=test -T scorer_model=google/gemini-3-flash-preview -T excerpt_prompt=False --limit=1000 --retry-on-error=10 --log-shared --no-score; [[ "anthropic/claude-sonnet-4-6" == */* ]] && mkdir -p test_dvc_logs/solver_outputs/task_sqa_solver_$(dirname @@ -879,8 +879,8 @@ stages: size: 13576196 solve_memorized@model0-test: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_openai_deep_research uv run - --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa - --display plain --log-dir test_dvc_logs/solver_outputs/ --solver + --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir + test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/general_memorized/memorized_solver.py@formatted_solver -T scorer_model=google/gemini-3-flash-preview -T split=test -S sys_name_or_path=openai_deep_research --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*openai_deep_research.eval @@ -915,8 +915,8 @@ stages: size: 8435874 solve_llm@model2-test: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_google/gemini-3.1-pro-preview - uv run --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa - --display plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/formatted_llm.py@formatted_solver + uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain + --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/formatted_llm.py@formatted_solver --model google/gemini-3.1-pro-preview -T split=test -T scorer_model=google/gemini-3-flash-preview -T excerpt_prompt=False --limit=1000 --retry-on-error=10 --log-shared --no-score; [[ "google/gemini-3.1-pro-preview" == */* ]] && mkdir -p test_dvc_logs/solver_outputs/task_sqa_solver_$(dirname @@ -934,9 +934,9 @@ stages: md5: 9270f1cbd5f606bd646579fe524ae0f4 size: 22505579 solve_llm@model0-test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_openai/o4-mini uv run --project - agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display - plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/formatted_llm.py@formatted_solver + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_openai/o4-mini uv run --extra + sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ + --solver agent-baselines/agent_baselines/solvers/sqa/formatted_llm.py@formatted_solver --model openai/o4-mini --reasoning-effort high --reasoning-tokens 8192 -M responses_store=false --reasoning-history none -T split=test -T scorer_model=google/gemini-3-flash-preview -T excerpt_prompt=False --limit=1000 --retry-on-error=10 --log-shared --no-score; @@ -1009,9 +1009,9 @@ stages: md5: 8e144b969d037a4509f5a8321d5f4092 size: 13573997 solve_elicit@test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_elicit uv run --project agent-baselines - --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir - test_dvc_logs/solver_outputs/ --solver + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_elicit uv run --extra sqa inspect + eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ + --solver agent-baselines/agent_baselines/solvers/sqa/elicit/memorized_solver.py@elicit_solver -T split=test -T scorer_model=google/gemini-3-flash-preview --limit=1000 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*task_sqa_solver_elicit.eval @@ -1221,10 +1221,9 @@ stages: md5: b201f2f7bd9c83f6e3c901805f385d44 size: 1195688 solve_perplexity_dr@test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_perplexity_dr uv run --project - agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display - plain --log-dir test_dvc_logs/solver_outputs/ -T with_search_tools=False --model - 'perplexity/sonar-deep-research' --solver + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_perplexity_dr uv run --extra + sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ + -T with_search_tools=False --model 'perplexity/sonar-deep-research' --solver agent-baselines/agent_baselines/solvers/sqa/formatted_perplexity.py@formatted_solver -T split=test -T scorer_model=google/gemini-3-flash-preview --limit=1000 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*task_sqa_solver_perplexity_dr.eval @@ -2173,9 +2172,9 @@ stages: md5: b45f5856e3195800b6a4f18ab0fa687e size: 4371131 solve_you@test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_you uv run --project agent-baselines - --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir - test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/formatted_youcom.py@formatted_solver + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_you uv run --extra sqa inspect + eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ + --solver agent-baselines/agent_baselines/solvers/sqa/formatted_youcom.py@formatted_solver -S api_type='research' -T split=test -T scorer_model=google/gemini-3-flash-preview --limit=1000 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*task_sqa_solver_you.eval 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_you.eval" @@ -2422,9 +2421,9 @@ stages: md5: c527f47dec43ca1a0bec00cb16249e22 size: 7676995 solve_storm@test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_storm uv run --project agent-baselines/solvers/storm - --python 3.11 --frozen -- inspect eval astabench/evals/sqa/task.py@sqa --display - plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/storm_solver.py@storm_solver + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_storm uv run --extra storm + --python 3.11 inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir + test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/storm_solver.py@storm_solver -T split=test -T scorer_model=google/gemini-3-flash-preview --limit=1000 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*task_sqa_solver_storm.eval 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_storm.eval" @@ -2831,8 +2830,9 @@ stages: score_all_solvers@model8-test: cmd: echo "Scoring";[[ "elicit" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_elicit; cp test_dvc_logs/solver_outputs/task_sqa_solver_elicit.eval test_dvc_logs/scored/task_sqa_solver_elicit.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-3-flash-preview -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_elicit.eval + uv run inspect score --action overwrite --overwrite --display plain --scorer + astabench/evals/sqa/task.py@score_all -S scorer_model=google/gemini-3-flash-preview + -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_elicit.eval deps: - path: test_dvc_logs/solver_outputs/task_sqa_solver_elicit.eval hash: md5 @@ -2911,7 +2911,7 @@ stages: cmd: echo "Scoring";[[ "sqa_gemini-3.1-pro-preview" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_sqa_gemini-3.1-pro-preview; cp test_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval test_dvc_logs/scored/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval; uv run - inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all + inspect score --action overwrite --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all -S scorer_model=google/gemini-3-flash-preview -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval deps: - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval @@ -2970,8 +2970,9 @@ stages: score_all_solvers@model9-test: cmd: echo "Scoring";[[ "storm" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_storm; cp test_dvc_logs/solver_outputs/task_sqa_solver_storm.eval test_dvc_logs/scored/task_sqa_solver_storm.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-3-flash-preview -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_storm.eval + uv run inspect score --action overwrite --overwrite --display plain --scorer + astabench/evals/sqa/task.py@score_all -S scorer_model=google/gemini-3-flash-preview + -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_storm.eval deps: - path: test_dvc_logs/solver_outputs/task_sqa_solver_storm.eval hash: md5 @@ -2989,8 +2990,9 @@ stages: score_all_solvers@model7-test: cmd: echo "Scoring";[[ "sqa_o3_high" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_sqa_o3_high; cp test_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval test_dvc_logs/scored/task_sqa_solver_sqa_o3_high.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-3-flash-preview -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_sqa_o3_high.eval + uv run inspect score --action overwrite --overwrite --display plain --scorer + astabench/evals/sqa/task.py@score_all -S scorer_model=google/gemini-3-flash-preview + -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_sqa_o3_high.eval deps: - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval hash: md5 @@ -3218,8 +3220,9 @@ stages: score_all_solvers@model0-test: cmd: echo "Scoring";[[ "openai/o4-mini" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_openai/o4-mini; cp test_dvc_logs/solver_outputs/task_sqa_solver_openai/o4-mini.eval test_dvc_logs/scored/task_sqa_solver_openai/o4-mini.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-3-flash-preview -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_openai/o4-mini.eval + uv run inspect score --action overwrite --overwrite --display plain --scorer + astabench/evals/sqa/task.py@score_all -S scorer_model=google/gemini-3-flash-preview + -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_openai/o4-mini.eval deps: - path: test_dvc_logs/solver_outputs/task_sqa_solver_openai/o4-mini.eval hash: md5 @@ -3237,8 +3240,9 @@ stages: score_all_solvers@model5-test: cmd: echo "Scoring";[[ "sqa_claude-4.6" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_sqa_claude-4.6; cp test_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-4.6.eval test_dvc_logs/scored/task_sqa_solver_sqa_claude-4.6.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-3-flash-preview -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_sqa_claude-4.6.eval + uv run inspect score --action overwrite --overwrite --display plain --scorer + astabench/evals/sqa/task.py@score_all -S scorer_model=google/gemini-3-flash-preview + -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_sqa_claude-4.6.eval deps: - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-4.6.eval hash: md5 @@ -3257,7 +3261,7 @@ stages: cmd: echo "Scoring";[[ "google/gemini-3.1-pro-preview" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_google/gemini-3.1-pro-preview; cp test_dvc_logs/solver_outputs/task_sqa_solver_google/gemini-3.1-pro-preview.eval test_dvc_logs/scored/task_sqa_solver_google/gemini-3.1-pro-preview.eval; uv - run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all + run inspect score --action overwrite --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all -S scorer_model=google/gemini-3-flash-preview -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_google/gemini-3.1-pro-preview.eval deps: - path: test_dvc_logs/solver_outputs/task_sqa_solver_google/gemini-3.1-pro-preview.eval @@ -3276,8 +3280,9 @@ stages: score_all_solvers@model1-test: cmd: echo "Scoring";[[ "openai/o3" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_openai/o3; cp test_dvc_logs/solver_outputs/task_sqa_solver_openai/o3.eval test_dvc_logs/scored/task_sqa_solver_openai/o3.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-3-flash-preview -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_openai/o3.eval + uv run inspect score --action overwrite --overwrite --display plain --scorer + astabench/evals/sqa/task.py@score_all -S scorer_model=google/gemini-3-flash-preview + -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_openai/o3.eval deps: - path: test_dvc_logs/solver_outputs/task_sqa_solver_openai/o3.eval hash: md5 @@ -3296,7 +3301,7 @@ stages: cmd: echo "Scoring";[[ "anthropic/claude-sonnet-4-6" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-6; cp test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-sonnet-4-6.eval test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-6.eval; uv run - inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all + inspect score --action overwrite --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all -S scorer_model=google/gemini-3-flash-preview -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-6.eval deps: - path: test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-sonnet-4-6.eval @@ -3318,8 +3323,9 @@ stages: cp test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking.eval test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-3-flash-preview -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking.eval + uv run inspect score --action overwrite --overwrite --display plain --scorer + astabench/evals/sqa/task.py@score_all -S scorer_model=google/gemini-3-flash-preview + -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking.eval deps: - path: test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking.eval @@ -3624,7 +3630,7 @@ stages: md5: 071ed9683a227563dfd4630061740e98 size: 258 solve_scispace@test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_scispace uv run --project agent-baselines + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_scispace uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/scispace/scispace.py@formatted_solver @@ -3694,9 +3700,9 @@ stages: md5: eb8e7ffc96608f726ce82345c217179f size: 3783641 solve_futurehouse@model0-test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_fhouse_crow uv run --project - agent-baselines --extra futurehouse inspect eval astabench/evals/sqa/task.py@sqa - --display plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/futurehouse/futurehouse_solver.py + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_fhouse_crow uv run --extra + futurehouse inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir + test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/futurehouse/futurehouse_solver.py -T scorer_model=google/gemini-3-flash-preview -T split=test -S max_wait_time=900 -S agent=CROW --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*fhouse_crow.eval 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_fhouse_crow.eval" @@ -3711,9 +3717,9 @@ stages: md5: 7ce6ee5a36b336b15929a5e35fc3e795 size: 1443466 solve_futurehouse@model1-test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_fhouse_falcon uv run --project - agent-baselines --extra futurehouse inspect eval astabench/evals/sqa/task.py@sqa - --display plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/futurehouse/futurehouse_solver.py + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_fhouse_falcon uv run --extra + futurehouse inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir + test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/futurehouse/futurehouse_solver.py -T scorer_model=google/gemini-3-flash-preview -T split=test -S max_wait_time=900 -S agent=FALCON --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*fhouse_falcon.eval 2>/dev/null | head @@ -3729,11 +3735,11 @@ stages: md5: 1444e580204b243a0ca753f3c13bb97e size: 8435874 solve_sqa@o3_high-test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_sqa_o3_high uv run --project - agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display - plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/sqa.py@sqa_solver - -T split=test -T scorer_model=google/gemini-3-flash-preview -S completion_model=o3_high - --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*o3_high.eval + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_sqa_o3_high uv run --extra + sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ + --solver agent-baselines/agent_baselines/solvers/sqa/sqa.py@sqa_solver -T split=test + -T scorer_model=google/gemini-3-flash-preview -S completion_model=o3_high --limit=1000 + --retry-on-error=10 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*o3_high.eval 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval" params: params.yaml: @@ -3746,9 +3752,9 @@ stages: md5: 36aa067ea0463775e8fb73f44750fbef size: 6508188 solve_openscholar@test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_openscholar uv run --project - agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display - plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/openscholar/memorized_solver.py + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_openscholar uv run --extra + sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ + --solver agent-baselines/agent_baselines/solvers/sqa/openscholar/memorized_solver.py -S path=agent-baselines/agent_baselines/solvers/sqa/openscholar/openscholar_cache_test.json -T scorer_model=google/gemini-3-flash-preview -T split=test --limit=1000 --retry-on-error=10 @@ -3841,10 +3847,10 @@ stages: md5: 86bd8ce6f2e7ceb394d8430ff43a6e37 size: 1372454 solve_sqa@claude-4.6-test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_sqa_claude-4.6 uv run --project - agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display - plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/sqa.py@sqa_solver - -T split=test -T scorer_model=google/gemini-3-flash-preview -S completion_model=claude-4.6 + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_sqa_claude-4.6 uv run --extra + sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ + --solver agent-baselines/agent_baselines/solvers/sqa/sqa.py@sqa_solver -T split=test + -T scorer_model=google/gemini-3-flash-preview -S completion_model=claude-4.6 --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*claude-4.6.eval 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-4.6.eval" params: @@ -3859,8 +3865,8 @@ stages: size: 1576969 solve_sqa@gemini-3.1-pro-preview-test: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_sqa_gemini-3.1-pro-preview - uv run --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa - --display plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/sqa.py@sqa_solver + uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain + --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/sqa.py@sqa_solver -T split=test -T scorer_model=google/gemini-3-flash-preview -S completion_model=gemini-3.1-pro-preview --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*gemini-3.1-pro-preview.eval 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval" diff --git a/dvc.yaml b/dvc.yaml index 8b4317d0..9a116aad 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -23,7 +23,7 @@ stages: # run the solver then rename the resulting file to have a nice name: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_sqa_${item.model} - uv run --project agent-baselines --extra sqa + uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item.split}_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/sqa.py@sqa_solver @@ -50,7 +50,7 @@ stages: do: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_elicit - uv run --project agent-baselines --extra sqa + uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item}_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/elicit/memorized_solver.py@elicit_solver @@ -74,7 +74,7 @@ stages: do: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_you - uv run --project agent-baselines --extra sqa + uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item}_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/formatted_youcom.py@formatted_solver -S api_type='research' @@ -98,7 +98,7 @@ stages: do: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_perplexity_dr - uv run --project agent-baselines --extra sqa + uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item}_dvc_logs/solver_outputs/ -T with_search_tools=False @@ -124,7 +124,7 @@ stages: do: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_storm - uv run --project agent-baselines/solvers/storm --python 3.11 --frozen -- + uv run --extra storm --python 3.11 inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item}_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/storm_solver.py@storm_solver @@ -148,7 +148,7 @@ stages: # run the solver then rename the resulting file to have a nice name: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_scispace - uv run --project agent-baselines + uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item.split}_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/scispace/scispace.py@formatted_solver @@ -174,7 +174,7 @@ stages: do: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_openscholar - uv run --project agent-baselines --extra sqa + uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item}_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/openscholar/memorized_solver.py @@ -217,7 +217,7 @@ stages: # run the solver then rename the resulting file to have a nice name: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_${item.model.llm_name}${item.model.suffix} - uv run --project agent-baselines --extra sqa + uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item.split}_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/formatted_llm.py@formatted_solver @@ -251,7 +251,7 @@ stages: # run the solver then rename the resulting file to have a nice name: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_${item.model.name} - uv run --project agent-baselines --extra futurehouse + uv run --extra futurehouse inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item.split}_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/futurehouse/futurehouse_solver.py @@ -281,7 +281,7 @@ stages: # run the solver then rename the resulting file to have a nice name: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_${item.model.name} - uv run --project agent-baselines --extra sqa + uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item.split}_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/general_memorized/memorized_solver.py@formatted_solver diff --git a/pyproject.toml b/pyproject.toml index 6d535920..8f37b7af 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,6 +46,15 @@ dev = [ azure = [ "azure-ai-inference" ] +sqa = [ + "agent_baselines[sqa]", +] +futurehouse = [ + "agent_baselines[futurehouse]", +] +storm = [ + "agent_baselines[storm]", +] [project.scripts] astabench = "astabench.cli:cli" @@ -115,6 +124,7 @@ disable = [ ] [tool.uv.sources] +agent_baselines = { path = "agent-baselines" } knowledge-storm = { git = "https://github.com/gituser768/storm", branch = "dh-fix-youcom" } [tool.pytest.ini_options] From 6974253fba89a577708b66bfdbf99521439df803 Mon Sep 17 00:00:00 2001 From: Dany Haddad Date: Thu, 19 Mar 2026 20:15:33 +0000 Subject: [PATCH 05/13] Avoid local uv source in shared astabench config --- dvc.yaml | 20 ++++++++++---------- pyproject.toml | 10 ---------- 2 files changed, 10 insertions(+), 20 deletions(-) diff --git a/dvc.yaml b/dvc.yaml index 9a116aad..8b4317d0 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -23,7 +23,7 @@ stages: # run the solver then rename the resulting file to have a nice name: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_sqa_${item.model} - uv run --extra sqa + uv run --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item.split}_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/sqa.py@sqa_solver @@ -50,7 +50,7 @@ stages: do: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_elicit - uv run --extra sqa + uv run --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item}_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/elicit/memorized_solver.py@elicit_solver @@ -74,7 +74,7 @@ stages: do: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_you - uv run --extra sqa + uv run --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item}_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/formatted_youcom.py@formatted_solver -S api_type='research' @@ -98,7 +98,7 @@ stages: do: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_perplexity_dr - uv run --extra sqa + uv run --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item}_dvc_logs/solver_outputs/ -T with_search_tools=False @@ -124,7 +124,7 @@ stages: do: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_storm - uv run --extra storm --python 3.11 + uv run --project agent-baselines/solvers/storm --python 3.11 --frozen -- inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item}_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/storm_solver.py@storm_solver @@ -148,7 +148,7 @@ stages: # run the solver then rename the resulting file to have a nice name: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_scispace - uv run --extra sqa + uv run --project agent-baselines inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item.split}_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/scispace/scispace.py@formatted_solver @@ -174,7 +174,7 @@ stages: do: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_openscholar - uv run --extra sqa + uv run --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item}_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/openscholar/memorized_solver.py @@ -217,7 +217,7 @@ stages: # run the solver then rename the resulting file to have a nice name: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_${item.model.llm_name}${item.model.suffix} - uv run --extra sqa + uv run --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item.split}_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/formatted_llm.py@formatted_solver @@ -251,7 +251,7 @@ stages: # run the solver then rename the resulting file to have a nice name: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_${item.model.name} - uv run --extra futurehouse + uv run --project agent-baselines --extra futurehouse inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item.split}_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/futurehouse/futurehouse_solver.py @@ -281,7 +281,7 @@ stages: # run the solver then rename the resulting file to have a nice name: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_${item.model.name} - uv run --extra sqa + uv run --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item.split}_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/general_memorized/memorized_solver.py@formatted_solver diff --git a/pyproject.toml b/pyproject.toml index 8f37b7af..6d535920 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,15 +46,6 @@ dev = [ azure = [ "azure-ai-inference" ] -sqa = [ - "agent_baselines[sqa]", -] -futurehouse = [ - "agent_baselines[futurehouse]", -] -storm = [ - "agent_baselines[storm]", -] [project.scripts] astabench = "astabench.cli:cli" @@ -124,7 +115,6 @@ disable = [ ] [tool.uv.sources] -agent_baselines = { path = "agent-baselines" } knowledge-storm = { git = "https://github.com/gituser768/storm", branch = "dh-fix-youcom" } [tool.pytest.ini_options] From d54351289b8a60f6e7e8e33acbda2f6b427bb39a Mon Sep 17 00:00:00 2001 From: Dany Haddad Date: Tue, 24 Mar 2026 00:34:06 +0000 Subject: [PATCH 06/13] dvc: invalidate solve_sqa on solver changes --- dvc.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dvc.yaml b/dvc.yaml index 8b4317d0..2b874461 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -39,6 +39,9 @@ stages: - limit - scorer_model - sqa_solver_version + deps: + - agent-baselines/agent_baselines/solvers/sqa/sqa.py + - agent-baselines/agent_baselines/solvers/sqa/sqa_subprocess.py outs: - ${item.split}_dvc_logs/solver_outputs/task_sqa_solver_sqa_${item.model}.eval From f2fcb7c6cac8de9a78d97622814cb0c988ce8c40 Mon Sep 17 00:00:00 2001 From: Dany Haddad Date: Tue, 24 Mar 2026 00:48:20 +0000 Subject: [PATCH 07/13] run --- dev_dvc_logs/debug_logs/.gitignore | 9 + dvc.lock | 577 ++++++++++++++++++++++++++--- 2 files changed, 526 insertions(+), 60 deletions(-) diff --git a/dev_dvc_logs/debug_logs/.gitignore b/dev_dvc_logs/debug_logs/.gitignore index d035a97e..5707ecfd 100644 --- a/dev_dvc_logs/debug_logs/.gitignore +++ b/dev_dvc_logs/debug_logs/.gitignore @@ -16,3 +16,12 @@ /task_sqa_solver_openscholar_rubric_eval.csv /task_sqa_solver_openscholar_citation_eval.csv /task_sqa_solver_openscholar_answer_precision_eval.csv +/task_sqa_solver_sqa_claude-4.6_rubric_eval.csv +/task_sqa_solver_sqa_claude-4.6_citation_eval.csv +/task_sqa_solver_sqa_claude-4.6_answer_precision_eval.csv +/task_sqa_solver_sqa_o3_high_rubric_eval.csv +/task_sqa_solver_sqa_o3_high_citation_eval.csv +/task_sqa_solver_sqa_o3_high_answer_precision_eval.csv +/task_sqa_solver_sqa_gemini-3.1-pro-preview_rubric_eval.csv +/task_sqa_solver_sqa_gemini-3.1-pro-preview_citation_eval.csv +/task_sqa_solver_sqa_gemini-3.1-pro-preview_answer_precision_eval.csv diff --git a/dvc.lock b/dvc.lock index d585a193..c3a28d76 100644 --- a/dvc.lock +++ b/dvc.lock @@ -2916,8 +2916,8 @@ stages: deps: - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval hash: md5 - md5: 0c00ce90d2d86bd1aadbc1743aea07a9 - size: 1576951 + md5: f0704bb08be81e64f61de8bad8869c57 + size: 6218859 params: params.yaml: scorer_model: google/gemini-3-flash-preview @@ -2925,8 +2925,8 @@ stages: outs: - path: test_dvc_logs/scored/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval hash: md5 - md5: 67835012a4e9586b4bc685cc92d1f1ee - size: 1614108 + md5: 509c201daa64d147a1b28a9a8096e968 + size: 78679963 score_all_solvers@model10-test: cmd: echo "Scoring";[[ "scispace" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_scispace; cp test_dvc_logs/solver_outputs/task_sqa_solver_scispace.eval test_dvc_logs/scored/task_sqa_solver_scispace.eval; @@ -2996,8 +2996,8 @@ stages: deps: - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval hash: md5 - md5: 36aa067ea0463775e8fb73f44750fbef - size: 6508188 + md5: b84fd12987ed62bdff79a241f2c180d5 + size: 11657084 params: params.yaml: scorer_model: google/gemini-3-flash-preview @@ -3005,8 +3005,8 @@ stages: outs: - path: test_dvc_logs/scored/task_sqa_solver_sqa_o3_high.eval hash: md5 - md5: c17ca82096f3c4174f2f406a7206100b - size: 6837685 + md5: 08e87f32258505b156084691a4777e57 + size: 134542173 score_all_solvers@model16-test: cmd: echo "Scoring";[[ "openscholar" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_openscholar; cp test_dvc_logs/solver_outputs/task_sqa_solver_openscholar.eval test_dvc_logs/scored/task_sqa_solver_openscholar.eval; @@ -3066,24 +3066,25 @@ stages: md5: 9f23b21841edbbdbaa9e4a67d4404377 size: 15423266 score_all_solvers@model7-dev: - cmd: echo "Scoring";[[ "sqa_claude-3.7" == */* ]] && mkdir -p dev_dvc_logs/scored/task_sqa_solver_sqa_claude-3.7; - cp dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-3.7.eval dev_dvc_logs/scored/task_sqa_solver_sqa_claude-3.7.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-pro -S is_retrieverless=false dev_dvc_logs/scored/task_sqa_solver_sqa_claude-3.7.eval + cmd: echo "Scoring";[[ "sqa_o3_high" == */* ]] && mkdir -p dev_dvc_logs/scored/task_sqa_solver_sqa_o3_high; + cp dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval dev_dvc_logs/scored/task_sqa_solver_sqa_o3_high.eval; + uv run inspect score --action overwrite --overwrite --display plain --scorer + astabench/evals/sqa/task.py@score_all -S scorer_model=google/gemini-3-flash-preview + -S is_retrieverless=false dev_dvc_logs/scored/task_sqa_solver_sqa_o3_high.eval deps: - - path: dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-3.7.eval + - path: dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval hash: md5 - md5: f7d2bdc765e3dfdac4098687a48acdb2 - size: 6977126 + md5: af8988dccb45671721dead59433ea82d + size: 10237840 params: params.yaml: - scorer_model: google/gemini-2.5-pro - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - - path: dev_dvc_logs/scored/task_sqa_solver_sqa_claude-3.7.eval + - path: dev_dvc_logs/scored/task_sqa_solver_sqa_o3_high.eval hash: md5 - md5: ae4fbca2e9345d6573b8dad158a696bc - size: 17947538 + md5: a8a477a26e82358281bd4264e2c66d3a + size: 129493763 score_all_solvers@model16-dev: cmd: echo "Scoring";[[ "you" == */* ]] && mkdir -p dev_dvc_logs/scored/task_sqa_solver_you; cp dev_dvc_logs/solver_outputs/task_sqa_solver_you.eval dev_dvc_logs/scored/task_sqa_solver_you.eval; @@ -3180,24 +3181,25 @@ stages: md5: d555c72e642f5446c964ceec9228a4ce size: 7663908 score_all_solvers@model6-dev: - cmd: echo "Scoring";[[ "sqa_claude-3.7" == */* ]] && mkdir -p dev_dvc_logs/scored/task_sqa_solver_sqa_claude-3.7; - cp dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-3.7.eval dev_dvc_logs/scored/task_sqa_solver_sqa_claude-3.7.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-pro -S is_retrieverless=false dev_dvc_logs/scored/task_sqa_solver_sqa_claude-3.7.eval + cmd: echo "Scoring";[[ "sqa_gemini-3.1-pro-preview" == */* ]] && mkdir -p dev_dvc_logs/scored/task_sqa_solver_sqa_gemini-3.1-pro-preview; + cp dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval + dev_dvc_logs/scored/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval; uv run + inspect score --action overwrite --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all + -S scorer_model=google/gemini-3-flash-preview -S is_retrieverless=false dev_dvc_logs/scored/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval deps: - - path: dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-3.7.eval + - path: dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval hash: md5 - md5: f7d2bdc765e3dfdac4098687a48acdb2 - size: 6977126 + md5: cbed474ff0e51bccf71f70d8d9d5d0c8 + size: 6467019 params: params.yaml: - scorer_model: google/gemini-2.5-pro - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - - path: dev_dvc_logs/scored/task_sqa_solver_sqa_claude-3.7.eval + - path: dev_dvc_logs/scored/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval hash: md5 - md5: ae4fbca2e9345d6573b8dad158a696bc - size: 17947538 + md5: 4fdbc777c1b48ce65cf7039f2b280cdd + size: 88626955 score_all_solvers@model14-dev: cmd: echo "Scoring";[[ "fhouse_falcon" == */* ]] && mkdir -p dev_dvc_logs/scored/task_sqa_solver_fhouse_falcon; cp dev_dvc_logs/solver_outputs/task_sqa_solver_fhouse_falcon.eval dev_dvc_logs/scored/task_sqa_solver_fhouse_falcon.eval; @@ -3246,8 +3248,8 @@ stages: deps: - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-4.6.eval hash: md5 - md5: 88e767e27d8d5a843dc16f5d5744f7d3 - size: 1576969 + md5: ef3ac8ff8f95d0fc5028e29002f8d46c + size: 7482873 params: params.yaml: scorer_model: google/gemini-3-flash-preview @@ -3255,8 +3257,8 @@ stages: outs: - path: test_dvc_logs/scored/task_sqa_solver_sqa_claude-4.6.eval hash: md5 - md5: 4677c9f160a9eae4cc068a0e83c9aee4 - size: 1614239 + md5: 696310ea4b179f4511f0cffb02a86af4 + size: 103868942 score_all_solvers@model3-test: cmd: echo "Scoring";[[ "google/gemini-3.1-pro-preview" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_google/gemini-3.1-pro-preview; cp test_dvc_logs/solver_outputs/task_sqa_solver_google/gemini-3.1-pro-preview.eval @@ -3655,8 +3657,8 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_sqa_o3_high.eval hash: md5 - md5: 711b54ac018c9dc4f2cc190c3029c42e - size: 56666173 + md5: 08e87f32258505b156084691a4777e57 + size: 134542173 outs: - path: test_dvc_logs/errors/task_sqa_solver_sqa_o3_high.md hash: md5 @@ -3664,7 +3666,7 @@ stages: size: 63 - path: test_dvc_logs/scores/task_sqa_solver_sqa_o3_high.md hash: md5 - md5: 8fd06b654672a9a890c1949d71418507 + md5: 59c6df10258940cad8e0676c0fa662cd size: 255 extract_model_responses@sqa_o3_high-test: cmd: echo "Extracting responses"; [[ "sqa_o3_high" == */* ]] && mkdir -p test_dvc_logs/model_responses/task_sqa_solver_$(dirname @@ -3673,13 +3675,13 @@ stages: deps: - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval hash: md5 - md5: 711b54ac018c9dc4f2cc190c3029c42e - size: 56666173 + md5: b84fd12987ed62bdff79a241f2c180d5 + size: 11657084 outs: - path: test_dvc_logs/model_responses/task_sqa_solver_sqa_o3_high_responses.csv hash: md5 - md5: 3b46e7a92b471a879d146238971303c3 - size: 17090764 + md5: c9744f72c4f17b5fdab08c385566bb9c + size: 16625320 score_all_solvers@model18-test: cmd: echo "Scoring";[[ "perplexity_dr" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_perplexity_dr; cp test_dvc_logs/solver_outputs/task_sqa_solver_perplexity_dr.eval test_dvc_logs/scored/task_sqa_solver_perplexity_dr.eval; @@ -3735,12 +3737,21 @@ stages: md5: 1444e580204b243a0ca753f3c13bb97e size: 8435874 solve_sqa@o3_high-test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_sqa_o3_high uv run --extra - sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ - --solver agent-baselines/agent_baselines/solvers/sqa/sqa.py@sqa_solver -T split=test - -T scorer_model=google/gemini-3-flash-preview -S completion_model=o3_high --limit=1000 - --retry-on-error=10 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*o3_high.eval + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_sqa_o3_high uv run --project + agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display + plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/sqa.py@sqa_solver + -T split=test -T scorer_model=google/gemini-3-flash-preview -S completion_model=o3_high + --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*o3_high.eval 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval" + deps: + - path: agent-baselines/agent_baselines/solvers/sqa/sqa.py + hash: md5 + md5: 45aeed65c5f153adcacc3d447ea98238 + size: 9957 + - path: agent-baselines/agent_baselines/solvers/sqa/sqa_subprocess.py + hash: md5 + md5: a127d3e76be6bb990bf79d0a610c9266 + size: 712 params: params.yaml: limit: 1000 @@ -3749,8 +3760,8 @@ stages: outs: - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval hash: md5 - md5: 36aa067ea0463775e8fb73f44750fbef - size: 6508188 + md5: b84fd12987ed62bdff79a241f2c180d5 + size: 11657084 solve_openscholar@test: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_openscholar uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ @@ -3847,12 +3858,21 @@ stages: md5: 86bd8ce6f2e7ceb394d8430ff43a6e37 size: 1372454 solve_sqa@claude-4.6-test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_sqa_claude-4.6 uv run --extra - sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ - --solver agent-baselines/agent_baselines/solvers/sqa/sqa.py@sqa_solver -T split=test - -T scorer_model=google/gemini-3-flash-preview -S completion_model=claude-4.6 + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_sqa_claude-4.6 uv run --project + agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display + plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/sqa.py@sqa_solver + -T split=test -T scorer_model=google/gemini-3-flash-preview -S completion_model=claude-4.6 --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*claude-4.6.eval 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-4.6.eval" + deps: + - path: agent-baselines/agent_baselines/solvers/sqa/sqa.py + hash: md5 + md5: 45aeed65c5f153adcacc3d447ea98238 + size: 9957 + - path: agent-baselines/agent_baselines/solvers/sqa/sqa_subprocess.py + hash: md5 + md5: a127d3e76be6bb990bf79d0a610c9266 + size: 712 params: params.yaml: limit: 1000 @@ -3861,15 +3881,24 @@ stages: outs: - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-4.6.eval hash: md5 - md5: 88e767e27d8d5a843dc16f5d5744f7d3 - size: 1576969 + md5: ef3ac8ff8f95d0fc5028e29002f8d46c + size: 7482873 solve_sqa@gemini-3.1-pro-preview-test: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_sqa_gemini-3.1-pro-preview - uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain - --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/sqa.py@sqa_solver + uv run --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa + --display plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/sqa.py@sqa_solver -T split=test -T scorer_model=google/gemini-3-flash-preview -S completion_model=gemini-3.1-pro-preview --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*gemini-3.1-pro-preview.eval 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval" + deps: + - path: agent-baselines/agent_baselines/solvers/sqa/sqa.py + hash: md5 + md5: 45aeed65c5f153adcacc3d447ea98238 + size: 9957 + - path: agent-baselines/agent_baselines/solvers/sqa/sqa_subprocess.py + hash: md5 + md5: a127d3e76be6bb990bf79d0a610c9266 + size: 712 params: params.yaml: limit: 1000 @@ -3878,8 +3907,8 @@ stages: outs: - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval hash: md5 - md5: 0c00ce90d2d86bd1aadbc1743aea07a9 - size: 1576951 + md5: f0704bb08be81e64f61de8bad8869c57 + size: 6218859 solve_futurehouse@model0-dev: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_fhouse_crow uv run --project agent-baselines --extra futurehouse inspect eval astabench/evals/sqa/task.py@sqa @@ -3915,3 +3944,431 @@ stages: hash: md5 md5: 2e13a36648af258fdb5bc75d71b00742 size: 1195707 + solve_sqa@o3_high-dev: + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_sqa_o3_high uv run --project + agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display + plain --log-dir dev_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/sqa.py@sqa_solver + -T split=dev -T scorer_model=google/gemini-3-flash-preview -S completion_model=o3_high + --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls -t dev_dvc_logs/solver_outputs/*o3_high.eval + 2>/dev/null | head -n1)" "dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval" + deps: + - path: agent-baselines/agent_baselines/solvers/sqa/sqa.py + hash: md5 + md5: 45aeed65c5f153adcacc3d447ea98238 + size: 9957 + - path: agent-baselines/agent_baselines/solvers/sqa/sqa_subprocess.py + hash: md5 + md5: a127d3e76be6bb990bf79d0a610c9266 + size: 712 + params: + params.yaml: + limit: 1000 + scorer_model: google/gemini-3-flash-preview + sqa_solver_version: may-23-2025 + outs: + - path: dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval + hash: md5 + md5: af8988dccb45671721dead59433ea82d + size: 10237840 + solve_sqa@claude-4.6-dev: + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_sqa_claude-4.6 uv run --project + agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display + plain --log-dir dev_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/sqa.py@sqa_solver + -T split=dev -T scorer_model=google/gemini-3-flash-preview -S completion_model=claude-4.6 + --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls -t dev_dvc_logs/solver_outputs/*claude-4.6.eval + 2>/dev/null | head -n1)" "dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-4.6.eval" + deps: + - path: agent-baselines/agent_baselines/solvers/sqa/sqa.py + hash: md5 + md5: 45aeed65c5f153adcacc3d447ea98238 + size: 9957 + - path: agent-baselines/agent_baselines/solvers/sqa/sqa_subprocess.py + hash: md5 + md5: a127d3e76be6bb990bf79d0a610c9266 + size: 712 + params: + params.yaml: + limit: 1000 + scorer_model: google/gemini-3-flash-preview + sqa_solver_version: may-23-2025 + outs: + - path: dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-4.6.eval + hash: md5 + md5: 1e9a28fec72a6077240104f2ce1cc1cf + size: 7465958 + score_all_solvers@model5-dev: + cmd: echo "Scoring";[[ "sqa_claude-4.6" == */* ]] && mkdir -p dev_dvc_logs/scored/task_sqa_solver_sqa_claude-4.6; + cp dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-4.6.eval dev_dvc_logs/scored/task_sqa_solver_sqa_claude-4.6.eval; + uv run inspect score --action overwrite --overwrite --display plain --scorer + astabench/evals/sqa/task.py@score_all -S scorer_model=google/gemini-3-flash-preview + -S is_retrieverless=false dev_dvc_logs/scored/task_sqa_solver_sqa_claude-4.6.eval + deps: + - path: dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-4.6.eval + hash: md5 + md5: 1e9a28fec72a6077240104f2ce1cc1cf + size: 7465958 + params: + params.yaml: + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 + outs: + - path: dev_dvc_logs/scored/task_sqa_solver_sqa_claude-4.6.eval + hash: md5 + md5: 56b55f652bf08d6a927f1d5b5740b255 + size: 106144422 + create_nice_logs@sqa_claude-4.6-dev: + cmd: echo "Creating logs"; [[ "sqa_claude-4.6" == */* ]] && mkdir -p dev_dvc_logs/debug_logs/task_sqa_solver_$(dirname + "sqa_claude-4.6"); uv run scripts/create_debug_logs.py dev_dvc_logs/scored/ + task_sqa_solver_sqa_claude-4.6.eval dev_dvc_logs/debug_logs/ + deps: + - path: dev_dvc_logs/scored/task_sqa_solver_sqa_claude-4.6.eval + hash: md5 + md5: 56b55f652bf08d6a927f1d5b5740b255 + size: 106144422 + params: + params.yaml: + scorer_model: google/gemini-3-flash-preview + outs: + - path: dev_dvc_logs/debug_logs/task_sqa_solver_sqa_claude-4.6_answer_precision_eval.csv + hash: md5 + md5: 8476eab180a144e5d8b9e243b5b6e4f5 + size: 15194608 + - path: dev_dvc_logs/debug_logs/task_sqa_solver_sqa_claude-4.6_citation_eval.csv + hash: md5 + md5: 0066c1d4998f7e6c2e511bca70887b9d + size: 91757785 + - path: dev_dvc_logs/debug_logs/task_sqa_solver_sqa_claude-4.6_rubric_eval.csv + hash: md5 + md5: a870b384bcf04f29fdf3fbb2f080d48f + size: 14844789 + create_nice_logs@sqa_o3_high-dev: + cmd: echo "Creating logs"; [[ "sqa_o3_high" == */* ]] && mkdir -p dev_dvc_logs/debug_logs/task_sqa_solver_$(dirname + "sqa_o3_high"); uv run scripts/create_debug_logs.py dev_dvc_logs/scored/ task_sqa_solver_sqa_o3_high.eval + dev_dvc_logs/debug_logs/ + deps: + - path: dev_dvc_logs/scored/task_sqa_solver_sqa_o3_high.eval + hash: md5 + md5: a8a477a26e82358281bd4264e2c66d3a + size: 129493763 + params: + params.yaml: + scorer_model: google/gemini-3-flash-preview + outs: + - path: dev_dvc_logs/debug_logs/task_sqa_solver_sqa_o3_high_answer_precision_eval.csv + hash: md5 + md5: ac887eb880a3273d2e5380304046736b + size: 20150178 + - path: dev_dvc_logs/debug_logs/task_sqa_solver_sqa_o3_high_citation_eval.csv + hash: md5 + md5: f530b493d460dc9a11be8abdf6356024 + size: 143239090 + - path: dev_dvc_logs/debug_logs/task_sqa_solver_sqa_o3_high_rubric_eval.csv + hash: md5 + md5: 8be632f12a365cf6840a935576b6577b + size: 19154600 + log_any_remaining_errors_and_record_scores@sqa_o3_high-dev: + cmd: echo "Collecting errors";[[ "sqa_o3_high" == */* ]] && mkdir -p dev_dvc_logs/errors/task_sqa_solver_$(dirname + "sqa_o3_high"); mkdir -p dev_dvc_logs/scores/task_sqa_solver_$(dirname "sqa_o3_high"); + uv run scripts/log_errors.py dev_dvc_logs/scored/task_sqa_solver_sqa_o3_high.eval + dev_dvc_logs/errors/task_sqa_solver_sqa_o3_high.md dev_dvc_logs/scores/task_sqa_solver_sqa_o3_high.md + deps: + - path: dev_dvc_logs/scored/task_sqa_solver_sqa_o3_high.eval + hash: md5 + md5: a8a477a26e82358281bd4264e2c66d3a + size: 129493763 + outs: + - path: dev_dvc_logs/errors/task_sqa_solver_sqa_o3_high.md + hash: md5 + md5: d51d4f783de486edccf22aea1a28d40d + size: 63 + - path: dev_dvc_logs/scores/task_sqa_solver_sqa_o3_high.md + hash: md5 + md5: 361a16c88aa64c1da7c769091d008b04 + size: 256 + solve_sqa@gemini-3.1-pro-preview-dev: + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_sqa_gemini-3.1-pro-preview + uv run --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa + --display plain --log-dir dev_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/sqa.py@sqa_solver + -T split=dev -T scorer_model=google/gemini-3-flash-preview -S completion_model=gemini-3.1-pro-preview + --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls -t dev_dvc_logs/solver_outputs/*gemini-3.1-pro-preview.eval + 2>/dev/null | head -n1)" "dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval" + deps: + - path: agent-baselines/agent_baselines/solvers/sqa/sqa.py + hash: md5 + md5: 45aeed65c5f153adcacc3d447ea98238 + size: 9957 + - path: agent-baselines/agent_baselines/solvers/sqa/sqa_subprocess.py + hash: md5 + md5: a127d3e76be6bb990bf79d0a610c9266 + size: 712 + params: + params.yaml: + limit: 1000 + scorer_model: google/gemini-3-flash-preview + sqa_solver_version: may-23-2025 + outs: + - path: dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval + hash: md5 + md5: cbed474ff0e51bccf71f70d8d9d5d0c8 + size: 6467019 + log_any_remaining_errors_and_record_scores@sqa_gemini-3.1-pro-preview-test: + cmd: echo "Collecting errors";[[ "sqa_gemini-3.1-pro-preview" == */* ]] && mkdir + -p test_dvc_logs/errors/task_sqa_solver_$(dirname "sqa_gemini-3.1-pro-preview"); + mkdir -p test_dvc_logs/scores/task_sqa_solver_$(dirname "sqa_gemini-3.1-pro-preview"); + uv run scripts/log_errors.py test_dvc_logs/scored/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval + test_dvc_logs/errors/task_sqa_solver_sqa_gemini-3.1-pro-preview.md test_dvc_logs/scores/task_sqa_solver_sqa_gemini-3.1-pro-preview.md + deps: + - path: test_dvc_logs/scored/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval + hash: md5 + md5: 509c201daa64d147a1b28a9a8096e968 + size: 78679963 + outs: + - path: test_dvc_logs/errors/task_sqa_solver_sqa_gemini-3.1-pro-preview.md + hash: md5 + md5: a3bc09c12b07468c5f0d55dfa997996c + size: 30307 + - path: test_dvc_logs/scores/task_sqa_solver_sqa_gemini-3.1-pro-preview.md + hash: md5 + md5: 63bd872fbfe5144f5f03028c2e38f622 + size: 255 + extract_model_responses@sqa_gemini-3.1-pro-preview-test: + cmd: echo "Extracting responses"; [[ "sqa_gemini-3.1-pro-preview" == */* ]] && + mkdir -p test_dvc_logs/model_responses/task_sqa_solver_$(dirname "sqa_gemini-3.1-pro-preview"); + uv run scripts/extract_model_responses.py test_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval + test_dvc_logs/model_responses/task_sqa_solver_sqa_gemini-3.1-pro-preview_responses.csv + deps: + - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval + hash: md5 + md5: f0704bb08be81e64f61de8bad8869c57 + size: 6218859 + outs: + - path: + test_dvc_logs/model_responses/task_sqa_solver_sqa_gemini-3.1-pro-preview_responses.csv + hash: md5 + md5: 340b4f59929ea22254ac228b59163219 + size: 8492028 + log_any_remaining_errors_and_record_scores@sqa_claude-4.6-dev: + cmd: echo "Collecting errors";[[ "sqa_claude-4.6" == */* ]] && mkdir -p dev_dvc_logs/errors/task_sqa_solver_$(dirname + "sqa_claude-4.6"); mkdir -p dev_dvc_logs/scores/task_sqa_solver_$(dirname "sqa_claude-4.6"); + uv run scripts/log_errors.py dev_dvc_logs/scored/task_sqa_solver_sqa_claude-4.6.eval + dev_dvc_logs/errors/task_sqa_solver_sqa_claude-4.6.md dev_dvc_logs/scores/task_sqa_solver_sqa_claude-4.6.md + deps: + - path: dev_dvc_logs/scored/task_sqa_solver_sqa_claude-4.6.eval + hash: md5 + md5: 56b55f652bf08d6a927f1d5b5740b255 + size: 106144422 + outs: + - path: dev_dvc_logs/errors/task_sqa_solver_sqa_claude-4.6.md + hash: md5 + md5: d51d4f783de486edccf22aea1a28d40d + size: 63 + - path: dev_dvc_logs/scores/task_sqa_solver_sqa_claude-4.6.md + hash: md5 + md5: 1325c56a874ce8e4daeaa5e81269b1e4 + size: 256 + create_nice_logs@sqa_o3_high-test: + cmd: echo "Creating logs"; [[ "sqa_o3_high" == */* ]] && mkdir -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname + "sqa_o3_high"); uv run scripts/create_debug_logs.py test_dvc_logs/scored/ task_sqa_solver_sqa_o3_high.eval + test_dvc_logs/debug_logs/ + deps: + - path: test_dvc_logs/scored/task_sqa_solver_sqa_o3_high.eval + hash: md5 + md5: 08e87f32258505b156084691a4777e57 + size: 134542173 + params: + params.yaml: + scorer_model: google/gemini-3-flash-preview + outs: + - path: test_dvc_logs/debug_logs/task_sqa_solver_sqa_o3_high_answer_precision_eval.csv + hash: md5 + md5: d9d7fad75cba22b0e00cad964a82ee09 + size: 23977952 + - path: test_dvc_logs/debug_logs/task_sqa_solver_sqa_o3_high_citation_eval.csv + hash: md5 + md5: d027dfeb1e582a083b3aa403a4fdbe0e + size: 184361555 + - path: test_dvc_logs/debug_logs/task_sqa_solver_sqa_o3_high_rubric_eval.csv + hash: md5 + md5: 6277fc48ce5a9d1c6a3a8336f0b8fe2c + size: 22253619 + extract_model_responses@sqa_claude-4.6-dev: + cmd: echo "Extracting responses"; [[ "sqa_claude-4.6" == */* ]] && mkdir -p dev_dvc_logs/model_responses/task_sqa_solver_$(dirname + "sqa_claude-4.6"); uv run scripts/extract_model_responses.py dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-4.6.eval + dev_dvc_logs/model_responses/task_sqa_solver_sqa_claude-4.6_responses.csv + deps: + - path: dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-4.6.eval + hash: md5 + md5: 1e9a28fec72a6077240104f2ce1cc1cf + size: 7465958 + outs: + - path: dev_dvc_logs/model_responses/task_sqa_solver_sqa_claude-4.6_responses.csv + hash: md5 + md5: 215866cacf7a1a8d777d33e83cdc0d42 + size: 10190157 + extract_model_responses@sqa_claude-4.6-test: + cmd: echo "Extracting responses"; [[ "sqa_claude-4.6" == */* ]] && mkdir -p test_dvc_logs/model_responses/task_sqa_solver_$(dirname + "sqa_claude-4.6"); uv run scripts/extract_model_responses.py test_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-4.6.eval + test_dvc_logs/model_responses/task_sqa_solver_sqa_claude-4.6_responses.csv + deps: + - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-4.6.eval + hash: md5 + md5: ef3ac8ff8f95d0fc5028e29002f8d46c + size: 7482873 + outs: + - path: test_dvc_logs/model_responses/task_sqa_solver_sqa_claude-4.6_responses.csv + hash: md5 + md5: 11056e2841066f910c9a20cdbe57c2d6 + size: 10108274 + create_nice_logs@sqa_claude-4.6-test: + cmd: echo "Creating logs"; [[ "sqa_claude-4.6" == */* ]] && mkdir -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname + "sqa_claude-4.6"); uv run scripts/create_debug_logs.py test_dvc_logs/scored/ + task_sqa_solver_sqa_claude-4.6.eval test_dvc_logs/debug_logs/ + deps: + - path: test_dvc_logs/scored/task_sqa_solver_sqa_claude-4.6.eval + hash: md5 + md5: 696310ea4b179f4511f0cffb02a86af4 + size: 103868942 + params: + params.yaml: + scorer_model: google/gemini-3-flash-preview + outs: + - path: + test_dvc_logs/debug_logs/task_sqa_solver_sqa_claude-4.6_answer_precision_eval.csv + hash: md5 + md5: a7c482283024eac90b33975e5e833c2f + size: 15052564 + - path: test_dvc_logs/debug_logs/task_sqa_solver_sqa_claude-4.6_citation_eval.csv + hash: md5 + md5: 53d094e655f6e31c4f0e4e1949a2cc1f + size: 90802762 + - path: test_dvc_logs/debug_logs/task_sqa_solver_sqa_claude-4.6_rubric_eval.csv + hash: md5 + md5: 9fea6589050fd031ea658b5f57ab9f4f + size: 14660996 + create_nice_logs@sqa_gemini-3.1-pro-preview-test: + cmd: echo "Creating logs"; [[ "sqa_gemini-3.1-pro-preview" == */* ]] && mkdir + -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname "sqa_gemini-3.1-pro-preview"); + uv run scripts/create_debug_logs.py test_dvc_logs/scored/ task_sqa_solver_sqa_gemini-3.1-pro-preview.eval + test_dvc_logs/debug_logs/ + deps: + - path: test_dvc_logs/scored/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval + hash: md5 + md5: 509c201daa64d147a1b28a9a8096e968 + size: 78679963 + params: + params.yaml: + scorer_model: google/gemini-3-flash-preview + outs: + - path: + test_dvc_logs/debug_logs/task_sqa_solver_sqa_gemini-3.1-pro-preview_answer_precision_eval.csv + hash: md5 + md5: 202667a438fa88c60c851f1435d6cc74 + size: 12993016 + - path: + test_dvc_logs/debug_logs/task_sqa_solver_sqa_gemini-3.1-pro-preview_citation_eval.csv + hash: md5 + md5: d80c9bde6669317c8e90377d2c9e39bf + size: 70733966 + - path: + test_dvc_logs/debug_logs/task_sqa_solver_sqa_gemini-3.1-pro-preview_rubric_eval.csv + hash: md5 + md5: a323039768948c6620b3bc5c0ddf1562 + size: 12781983 + log_any_remaining_errors_and_record_scores@sqa_gemini-3.1-pro-preview-dev: + cmd: echo "Collecting errors";[[ "sqa_gemini-3.1-pro-preview" == */* ]] && mkdir + -p dev_dvc_logs/errors/task_sqa_solver_$(dirname "sqa_gemini-3.1-pro-preview"); + mkdir -p dev_dvc_logs/scores/task_sqa_solver_$(dirname "sqa_gemini-3.1-pro-preview"); + uv run scripts/log_errors.py dev_dvc_logs/scored/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval + dev_dvc_logs/errors/task_sqa_solver_sqa_gemini-3.1-pro-preview.md dev_dvc_logs/scores/task_sqa_solver_sqa_gemini-3.1-pro-preview.md + deps: + - path: dev_dvc_logs/scored/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval + hash: md5 + md5: 4fdbc777c1b48ce65cf7039f2b280cdd + size: 88626955 + outs: + - path: dev_dvc_logs/errors/task_sqa_solver_sqa_gemini-3.1-pro-preview.md + hash: md5 + md5: 8724eaf86f64b1311df733a31b2a23f2 + size: 45811 + - path: dev_dvc_logs/scores/task_sqa_solver_sqa_gemini-3.1-pro-preview.md + hash: md5 + md5: d813e4e2728ca1c9c63e46ad793fc82b + size: 255 + extract_model_responses@sqa_o3_high-dev: + cmd: echo "Extracting responses"; [[ "sqa_o3_high" == */* ]] && mkdir -p dev_dvc_logs/model_responses/task_sqa_solver_$(dirname + "sqa_o3_high"); uv run scripts/extract_model_responses.py dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval + dev_dvc_logs/model_responses/task_sqa_solver_sqa_o3_high_responses.csv + deps: + - path: dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval + hash: md5 + md5: af8988dccb45671721dead59433ea82d + size: 10237840 + outs: + - path: dev_dvc_logs/model_responses/task_sqa_solver_sqa_o3_high_responses.csv + hash: md5 + md5: c789e14f0b3ec9e09b189723a27c4203 + size: 13934544 + create_nice_logs@sqa_gemini-3.1-pro-preview-dev: + cmd: echo "Creating logs"; [[ "sqa_gemini-3.1-pro-preview" == */* ]] && mkdir + -p dev_dvc_logs/debug_logs/task_sqa_solver_$(dirname "sqa_gemini-3.1-pro-preview"); + uv run scripts/create_debug_logs.py dev_dvc_logs/scored/ task_sqa_solver_sqa_gemini-3.1-pro-preview.eval + dev_dvc_logs/debug_logs/ + deps: + - path: dev_dvc_logs/scored/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval + hash: md5 + md5: 4fdbc777c1b48ce65cf7039f2b280cdd + size: 88626955 + params: + params.yaml: + scorer_model: google/gemini-3-flash-preview + outs: + - path: + dev_dvc_logs/debug_logs/task_sqa_solver_sqa_gemini-3.1-pro-preview_answer_precision_eval.csv + hash: md5 + md5: 41a0931d5caba776709fa2a8e139999f + size: 14292940 + - path: + dev_dvc_logs/debug_logs/task_sqa_solver_sqa_gemini-3.1-pro-preview_citation_eval.csv + hash: md5 + md5: 2aea34650fcf7c3888592a6b4608484c + size: 79977272 + - path: + dev_dvc_logs/debug_logs/task_sqa_solver_sqa_gemini-3.1-pro-preview_rubric_eval.csv + hash: md5 + md5: 7067909219048d538c53daada5a0c6e3 + size: 13789837 + extract_model_responses@sqa_gemini-3.1-pro-preview-dev: + cmd: echo "Extracting responses"; [[ "sqa_gemini-3.1-pro-preview" == */* ]] && + mkdir -p dev_dvc_logs/model_responses/task_sqa_solver_$(dirname "sqa_gemini-3.1-pro-preview"); + uv run scripts/extract_model_responses.py dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval + dev_dvc_logs/model_responses/task_sqa_solver_sqa_gemini-3.1-pro-preview_responses.csv + deps: + - path: dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval + hash: md5 + md5: cbed474ff0e51bccf71f70d8d9d5d0c8 + size: 6467019 + outs: + - path: + dev_dvc_logs/model_responses/task_sqa_solver_sqa_gemini-3.1-pro-preview_responses.csv + hash: md5 + md5: 9a275b862ae580682e776e3765906097 + size: 9171862 + log_any_remaining_errors_and_record_scores@sqa_claude-4.6-test: + cmd: echo "Collecting errors";[[ "sqa_claude-4.6" == */* ]] && mkdir -p test_dvc_logs/errors/task_sqa_solver_$(dirname + "sqa_claude-4.6"); mkdir -p test_dvc_logs/scores/task_sqa_solver_$(dirname "sqa_claude-4.6"); + uv run scripts/log_errors.py test_dvc_logs/scored/task_sqa_solver_sqa_claude-4.6.eval + test_dvc_logs/errors/task_sqa_solver_sqa_claude-4.6.md test_dvc_logs/scores/task_sqa_solver_sqa_claude-4.6.md + deps: + - path: test_dvc_logs/scored/task_sqa_solver_sqa_claude-4.6.eval + hash: md5 + md5: 696310ea4b179f4511f0cffb02a86af4 + size: 103868942 + outs: + - path: test_dvc_logs/errors/task_sqa_solver_sqa_claude-4.6.md + hash: md5 + md5: 94a1c1680717a158e7f16dfc642bc55c + size: 15167 + - path: test_dvc_logs/scores/task_sqa_solver_sqa_claude-4.6.md + hash: md5 + md5: e597800f5c4eac984aad9a6f16bef594 + size: 255 From d70637681d7b7a3fcb0d9d969a9edd7266efe212 Mon Sep 17 00:00:00 2001 From: Dany Haddad Date: Fri, 27 Mar 2026 16:16:40 +0000 Subject: [PATCH 08/13] pull eval files from hf --- dvc.lock | 93 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 47 insertions(+), 46 deletions(-) diff --git a/dvc.lock b/dvc.lock index c3a28d76..5f3d99ab 100644 --- a/dvc.lock +++ b/dvc.lock @@ -879,8 +879,8 @@ stages: size: 13576196 solve_memorized@model0-test: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_openai_deep_research uv run - --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir - test_dvc_logs/solver_outputs/ --solver + --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa + --display plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/general_memorized/memorized_solver.py@formatted_solver -T scorer_model=google/gemini-3-flash-preview -T split=test -S sys_name_or_path=openai_deep_research --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*openai_deep_research.eval @@ -893,8 +893,8 @@ stages: outs: - path: test_dvc_logs/solver_outputs/task_sqa_solver_openai_deep_research.eval hash: md5 - md5: 0ca0403d04913c8d05c012bb013ed19e - size: 23480196 + md5: e64efa83d59f1e39f5f44f7472b7f92d + size: 20770084 solve_memorized@model1-test: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_fhouse_falcon uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ @@ -1009,9 +1009,9 @@ stages: md5: 8e144b969d037a4509f5a8321d5f4092 size: 13573997 solve_elicit@test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_elicit uv run --extra sqa inspect - eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ - --solver + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_elicit uv run --project agent-baselines + --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir + test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/elicit/memorized_solver.py@elicit_solver -T split=test -T scorer_model=google/gemini-3-flash-preview --limit=1000 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*task_sqa_solver_elicit.eval @@ -1023,8 +1023,8 @@ stages: outs: - path: test_dvc_logs/solver_outputs/task_sqa_solver_elicit.eval hash: md5 - md5: 16c2ebb4e4568a24b97b1742ecb517fc - size: 4091878 + md5: eaf8f155d177afe6c3fb8b5913324458 + size: 15975606 score_all_solvers@elicit-test: cmd: echo "Scoring";[[ "elicit" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_elicit; cp test_dvc_logs/solver_outputs/task_sqa_solver_elicit.eval test_dvc_logs/scored/task_sqa_solver_elicit.eval; @@ -1221,9 +1221,10 @@ stages: md5: b201f2f7bd9c83f6e3c901805f385d44 size: 1195688 solve_perplexity_dr@test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_perplexity_dr uv run --extra - sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ - -T with_search_tools=False --model 'perplexity/sonar-deep-research' --solver + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_perplexity_dr uv run --project + agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display + plain --log-dir test_dvc_logs/solver_outputs/ -T with_search_tools=False --model + 'perplexity/sonar-deep-research' --solver agent-baselines/agent_baselines/solvers/sqa/formatted_perplexity.py@formatted_solver -T split=test -T scorer_model=google/gemini-3-flash-preview --limit=1000 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*task_sqa_solver_perplexity_dr.eval @@ -1235,8 +1236,8 @@ stages: outs: - path: test_dvc_logs/solver_outputs/task_sqa_solver_perplexity_dr.eval hash: md5 - md5: c1ba04199b4b8b0b3adefccb02e22956 - size: 2027391 + md5: 4b19505fd7cab3fd73e60f065c57ac7c + size: 52745393 solve_perplexity_dr@dev: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_perplexity_dr uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir dev_dvc_logs/solver_outputs/ @@ -2172,9 +2173,9 @@ stages: md5: b45f5856e3195800b6a4f18ab0fa687e size: 4371131 solve_you@test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_you uv run --extra sqa inspect - eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ - --solver agent-baselines/agent_baselines/solvers/sqa/formatted_youcom.py@formatted_solver + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_you uv run --project agent-baselines + --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir + test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/formatted_youcom.py@formatted_solver -S api_type='research' -T split=test -T scorer_model=google/gemini-3-flash-preview --limit=1000 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*task_sqa_solver_you.eval 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_you.eval" @@ -2185,8 +2186,8 @@ stages: outs: - path: test_dvc_logs/solver_outputs/task_sqa_solver_you.eval hash: md5 - md5: d3d528dd45b27e5b9508a0087d978d87 - size: 1626349 + md5: dffe67db7f27087923ec47f447e5006f + size: 3981342 create_nice_logs@google/gemini-2.5-pro-preview-03-25-test: cmd: echo "Creating logs"; [[ "google/gemini-2.5-pro-preview-03-25" == */* ]] && mkdir -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname "google/gemini-2.5-pro-preview-03-25"); @@ -2421,9 +2422,9 @@ stages: md5: c527f47dec43ca1a0bec00cb16249e22 size: 7676995 solve_storm@test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_storm uv run --extra storm - --python 3.11 inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir - test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/storm_solver.py@storm_solver + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_storm uv run --project agent-baselines/solvers/storm + --python 3.11 --frozen -- inspect eval astabench/evals/sqa/task.py@sqa --display + plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/storm_solver.py@storm_solver -T split=test -T scorer_model=google/gemini-3-flash-preview --limit=1000 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*task_sqa_solver_storm.eval 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_storm.eval" @@ -2434,8 +2435,8 @@ stages: outs: - path: test_dvc_logs/solver_outputs/task_sqa_solver_storm.eval hash: md5 - md5: e9f0f203bcfba80faf017d7545d35ed6 - size: 4914841 + md5: 2a3a5781d41b7d77ade04fe0cc6e8347 + size: 15065365 create_nice_logs@anthropic/claude-sonnet-4-20250514-test: cmd: echo "Creating logs"; [[ "anthropic/claude-sonnet-4-20250514" == */* ]] && mkdir -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname "anthropic/claude-sonnet-4-20250514"); @@ -3632,7 +3633,7 @@ stages: md5: 071ed9683a227563dfd4630061740e98 size: 258 solve_scispace@test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_scispace uv run --extra sqa + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_scispace uv run --project agent-baselines inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/scispace/scispace.py@formatted_solver @@ -3647,8 +3648,8 @@ stages: outs: - path: test_dvc_logs/solver_outputs/task_sqa_solver_scispace.eval hash: md5 - md5: e517fa1055570efb347dd6a924e478ae - size: 46039197 + md5: 31ee4f9ea8f0301cb6bd65cdc5cc0d23 + size: 33097436 log_any_remaining_errors_and_record_scores@sqa_o3_high-test: cmd: echo "Collecting errors";[[ "sqa_o3_high" == */* ]] && mkdir -p test_dvc_logs/errors/task_sqa_solver_$(dirname "sqa_o3_high"); mkdir -p test_dvc_logs/scores/task_sqa_solver_$(dirname "sqa_o3_high"); @@ -3702,9 +3703,9 @@ stages: md5: eb8e7ffc96608f726ce82345c217179f size: 3783641 solve_futurehouse@model0-test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_fhouse_crow uv run --extra - futurehouse inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir - test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/futurehouse/futurehouse_solver.py + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_fhouse_crow uv run --project + agent-baselines --extra futurehouse inspect eval astabench/evals/sqa/task.py@sqa + --display plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/futurehouse/futurehouse_solver.py -T scorer_model=google/gemini-3-flash-preview -T split=test -S max_wait_time=900 -S agent=CROW --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*fhouse_crow.eval 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_fhouse_crow.eval" @@ -3716,12 +3717,12 @@ stages: outs: - path: test_dvc_logs/solver_outputs/task_sqa_solver_fhouse_crow.eval hash: md5 - md5: 7ce6ee5a36b336b15929a5e35fc3e795 - size: 1443466 + md5: d3c2190ee23a647aed113be022cf072a + size: 17219690 solve_futurehouse@model1-test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_fhouse_falcon uv run --extra - futurehouse inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir - test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/futurehouse/futurehouse_solver.py + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_fhouse_falcon uv run --project + agent-baselines --extra futurehouse inspect eval astabench/evals/sqa/task.py@sqa + --display plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/futurehouse/futurehouse_solver.py -T scorer_model=google/gemini-3-flash-preview -T split=test -S max_wait_time=900 -S agent=FALCON --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*fhouse_falcon.eval 2>/dev/null | head @@ -3734,8 +3735,8 @@ stages: outs: - path: test_dvc_logs/solver_outputs/task_sqa_solver_fhouse_falcon.eval hash: md5 - md5: 1444e580204b243a0ca753f3c13bb97e - size: 8435874 + md5: 27f869df37299d6895881c95fb5a38d9 + size: 78478412 solve_sqa@o3_high-test: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_sqa_o3_high uv run --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display @@ -3746,8 +3747,8 @@ stages: deps: - path: agent-baselines/agent_baselines/solvers/sqa/sqa.py hash: md5 - md5: 45aeed65c5f153adcacc3d447ea98238 - size: 9957 + md5: f4da14bad73326e7cddbef9b1a243cbd + size: 9065 - path: agent-baselines/agent_baselines/solvers/sqa/sqa_subprocess.py hash: md5 md5: a127d3e76be6bb990bf79d0a610c9266 @@ -3760,12 +3761,12 @@ stages: outs: - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval hash: md5 - md5: b84fd12987ed62bdff79a241f2c180d5 - size: 11657084 + md5: 711b54ac018c9dc4f2cc190c3029c42e + size: 56666173 solve_openscholar@test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_openscholar uv run --extra - sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ - --solver agent-baselines/agent_baselines/solvers/sqa/openscholar/memorized_solver.py + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_openscholar uv run --project + agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display + plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/openscholar/memorized_solver.py -S path=agent-baselines/agent_baselines/solvers/sqa/openscholar/openscholar_cache_test.json -T scorer_model=google/gemini-3-flash-preview -T split=test --limit=1000 --retry-on-error=10 @@ -3778,8 +3779,8 @@ stages: outs: - path: test_dvc_logs/solver_outputs/task_sqa_solver_openscholar.eval hash: md5 - md5: 2c5b13dc69496592aa3990bbd92cdece - size: 1807380 + md5: 61ff1d386b502863144b85cc37a0b0a4 + size: 8231693 score_all_solvers@model19-test: cmd: echo "Scoring";[[ "openscholar" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_openscholar; cp test_dvc_logs/solver_outputs/task_sqa_solver_openscholar.eval test_dvc_logs/scored/task_sqa_solver_openscholar.eval; From 7ab988d2e5485eed61f0190ca4a5ac496a2231c0 Mon Sep 17 00:00:00 2001 From: Dany Haddad Date: Fri, 27 Mar 2026 16:27:22 +0000 Subject: [PATCH 09/13] force commit --- dvc.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dvc.lock b/dvc.lock index 5f3d99ab..a91a5095 100644 --- a/dvc.lock +++ b/dvc.lock @@ -3868,8 +3868,8 @@ stages: deps: - path: agent-baselines/agent_baselines/solvers/sqa/sqa.py hash: md5 - md5: 45aeed65c5f153adcacc3d447ea98238 - size: 9957 + md5: f4da14bad73326e7cddbef9b1a243cbd + size: 9065 - path: agent-baselines/agent_baselines/solvers/sqa/sqa_subprocess.py hash: md5 md5: a127d3e76be6bb990bf79d0a610c9266 @@ -3894,8 +3894,8 @@ stages: deps: - path: agent-baselines/agent_baselines/solvers/sqa/sqa.py hash: md5 - md5: 45aeed65c5f153adcacc3d447ea98238 - size: 9957 + md5: f4da14bad73326e7cddbef9b1a243cbd + size: 9065 - path: agent-baselines/agent_baselines/solvers/sqa/sqa_subprocess.py hash: md5 md5: a127d3e76be6bb990bf79d0a610c9266 From 95eaeef35a39fcd17da396a0f158a285bd389d6a Mon Sep 17 00:00:00 2001 From: Dany Haddad Date: Tue, 31 Mar 2026 21:32:26 +0000 Subject: [PATCH 10/13] fix retry logic and rerun --- astabench/evals/sqa/retry_utils.py | 5 +- astabench/evals/sqa/rubric.py | 55 ++- dvc.lock | 524 ++++++++++++++++++----------- tests/test_sqa_retry_utils.py | 105 ++++++ 4 files changed, 499 insertions(+), 190 deletions(-) create mode 100644 tests/test_sqa_retry_utils.py diff --git a/astabench/evals/sqa/retry_utils.py b/astabench/evals/sqa/retry_utils.py index c00ee24a..1ad355f4 100644 --- a/astabench/evals/sqa/retry_utils.py +++ b/astabench/evals/sqa/retry_utils.py @@ -4,7 +4,7 @@ import logging import sqlite3 from datetime import datetime, timedelta -from typing import Any, Dict, Optional, Union, List, Tuple +from typing import Any, Callable, Dict, Optional, Union, List, Tuple from inspect_ai.model import Model, GenerateConfig @@ -19,6 +19,7 @@ async def generate_with_retry( max_retries: int = 20, base_delay: float = 2.0, desired_schema=None, + parsed_validator: Optional[Callable[[Dict[str, Any]], None]] = None, ) -> Any: """ Generate response with retry logic and optional JSON parsing. @@ -57,6 +58,8 @@ async def generate_with_retry( if desired_schema: parsed = desired_schema(**parsed) parsed = parsed.model_dump(mode="json") + if parsed_validator: + parsed_validator(parsed) return result, parsed, attempt else: diff --git a/astabench/evals/sqa/rubric.py b/astabench/evals/sqa/rubric.py index 6e03d04b..9e2bd37e 100644 --- a/astabench/evals/sqa/rubric.py +++ b/astabench/evals/sqa/rubric.py @@ -368,6 +368,47 @@ async def _assess_properties_independently(self, response, properties): score_components[x.name] = assessment return score_components, prompt_logs + @staticmethod + def _validate_joint_assessment_payload( + parsed: Dict[str, Any], expected_criteria_count: int + ) -> None: + scores = parsed.get("scores") + if not isinstance(scores, list): + raise ValueError("Joint rubric scorer output is missing a 'scores' list") + + seen_indices = [] + for score in scores: + if not isinstance(score, dict): + raise ValueError( + "Joint rubric scorer output contains a non-object score" + ) + criteria_idx = score.get("criteria_idx") + if not isinstance(criteria_idx, int): + raise ValueError( + "Joint rubric scorer output contains a non-integer criteria_idx" + ) + if criteria_idx < 1 or criteria_idx > expected_criteria_count: + raise ValueError( + f"Joint rubric scorer output contains out-of-range criteria_idx={criteria_idx}" + ) + seen_indices.append(criteria_idx) + + expected_indices = set(range(1, expected_criteria_count + 1)) + actual_indices = set(seen_indices) + if ( + actual_indices != expected_indices + or len(seen_indices) != expected_criteria_count + ): + missing = sorted(expected_indices - actual_indices) + duplicate = sorted( + {idx for idx in seen_indices if seen_indices.count(idx) > 1} + ) + raise ValueError( + "Joint rubric scorer output did not cover every criterion exactly once. " + f"expected={expected_criteria_count} actual={len(seen_indices)} " + f"missing={missing} duplicate={duplicate}" + ) + async def _assess_properties_jointly(self, response, properties): info = { "step_name": "score_property", @@ -461,6 +502,10 @@ async def _assess_properties_jointly(self, response, properties): temperature=self.temperature, top_p=self.top_p, ), + desired_schema=ResponseCriteriaScores, + parsed_validator=lambda parsed: self._validate_joint_assessment_payload( + parsed, len(has_criterion) + ), ) info["system_prompt"] = system_prompt info["user_prompt"] = user_prompt @@ -508,7 +553,15 @@ async def score_output_simplified( ) score_components.update(assessments) - assert set(score_components.keys()) == set(score_weights.keys()) + if set(score_components.keys()) != set(score_weights.keys()): + missing = sorted(set(score_weights.keys()) - set(score_components.keys())) + unexpected = sorted( + set(score_components.keys()) - set(score_weights.keys()) + ) + raise ValueError( + "Simplified rubric scoring produced mismatched components. " + f"missing={missing} unexpected={unexpected}" + ) ann_score = sum( score_weights[key] * score_components[key] for key in score_weights ) diff --git a/dvc.lock b/dvc.lock index a91a5095..ba3ee635 100644 --- a/dvc.lock +++ b/dvc.lock @@ -1362,8 +1362,8 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_openai/o4-mini.eval hash: md5 - md5: 707491ba3f460b10b740acc4f18fc009 - size: 2620767 + md5: 3528e6d68199489705ef51e91170387c + size: 26134288 outs: - path: test_dvc_logs/errors/task_sqa_solver_openai/o4-mini.md hash: md5 @@ -1371,7 +1371,7 @@ stages: size: 63 - path: test_dvc_logs/scores/task_sqa_solver_openai/o4-mini.md hash: md5 - md5: 51a6ebb6302c43db2ee01e0271b39377 + md5: cb895dc77c974183e26442188fc736b6 size: 254 log_any_remaining_errors_and_record_scores@google/gemini-2.5-pro-preview-03-25-test: cmd: echo "Collecting errors";[[ "google/gemini-2.5-pro-preview-03-25" == */* @@ -1540,8 +1540,8 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_elicit.eval hash: md5 - md5: eaf8f155d177afe6c3fb8b5913324458 - size: 15975606 + md5: d671bcb2fd90de540c69ef5bcccfb45a + size: 83436121 outs: - path: test_dvc_logs/errors/task_sqa_solver_elicit.md hash: md5 @@ -1549,7 +1549,7 @@ stages: size: 63 - path: test_dvc_logs/scores/task_sqa_solver_elicit.md hash: md5 - md5: 049dad5aaf612704ca160fdd49710a3e + md5: d8b5a93675f21c1eb409178be0c641c7 size: 259 log_any_remaining_errors_and_record_scores@storm-dev: cmd: echo "Collecting errors";[[ "storm" == */* ]] && mkdir -p dev_dvc_logs/errors/task_sqa_solver_$(dirname @@ -1578,8 +1578,8 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_scispace.eval hash: md5 - md5: b797a0b8f9b0d80449cd01a3e7c8771f - size: 33056563 + md5: 4ddb3772fac4198d986a45acab9ef587 + size: 188249490 outs: - path: test_dvc_logs/errors/task_sqa_solver_scispace.md hash: md5 @@ -1587,7 +1587,7 @@ stages: size: 63 - path: test_dvc_logs/scores/task_sqa_solver_scispace.md hash: md5 - md5: 3047259800df7c377569f4f7e125ae12 + md5: bd720bd2a3d2187b39848bcf6581ab4f size: 255 log_any_remaining_errors_and_record_scores@fhouse_crow-test: cmd: echo "Collecting errors";[[ "fhouse_crow" == */* ]] && mkdir -p test_dvc_logs/errors/task_sqa_solver_$(dirname @@ -1597,8 +1597,8 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_fhouse_crow.eval hash: md5 - md5: c46c37260737c0dc447ea49d194a1646 - size: 16755260 + md5: 02d170907f472a80e8fdc2a4604c57bd + size: 47671586 outs: - path: test_dvc_logs/errors/task_sqa_solver_fhouse_crow.md hash: md5 @@ -1606,8 +1606,8 @@ stages: size: 63 - path: test_dvc_logs/scores/task_sqa_solver_fhouse_crow.md hash: md5 - md5: 7d50a7a2afb1c892fd0b9a7172ba4e67 - size: 264 + md5: b9a9b82333f170a027755e52a73305f8 + size: 274 log_any_remaining_errors_and_record_scores@fhouse_falcon-dev: cmd: echo "Collecting errors";[[ "fhouse_falcon" == */* ]] && mkdir -p dev_dvc_logs/errors/task_sqa_solver_$(dirname "fhouse_falcon"); mkdir -p dev_dvc_logs/scores/task_sqa_solver_$(dirname "fhouse_falcon"); @@ -1635,8 +1635,8 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_fhouse_falcon.eval hash: md5 - md5: 973247fa294552e3807747362ef43b53 - size: 76892171 + md5: 19a1d2d9827872b4de45bd3070051135 + size: 196232443 outs: - path: test_dvc_logs/errors/task_sqa_solver_fhouse_falcon.md hash: md5 @@ -1644,8 +1644,8 @@ stages: size: 63 - path: test_dvc_logs/scores/task_sqa_solver_fhouse_falcon.md hash: md5 - md5: 24dfaa60688754273b11987ce25e8050 - size: 265 + md5: 0115f8e85682c50b1fbef8f0fa10d512 + size: 275 log_any_remaining_errors_and_record_scores@openai_deep_research-dev: cmd: echo "Collecting errors";[[ "openai_deep_research" == */* ]] && mkdir -p dev_dvc_logs/errors/task_sqa_solver_$(dirname "openai_deep_research"); mkdir @@ -1675,8 +1675,8 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_openai_deep_research.eval hash: md5 - md5: 0b9c6893461a6a71200faca09d42cf0b - size: 30114985 + md5: 522b75668b14dbaa609e5ef4cc5c8f8e + size: 115070053 outs: - path: test_dvc_logs/errors/task_sqa_solver_openai_deep_research.md hash: md5 @@ -1684,7 +1684,7 @@ stages: size: 63 - path: test_dvc_logs/scores/task_sqa_solver_openai_deep_research.md hash: md5 - md5: 7edf8195b55073641d2e3baa95574894 + md5: 09fd9c4363a1ad201d5df801e983ef4c size: 255 log_any_remaining_errors_and_record_scores@perplexity_dr-test: cmd: echo "Collecting errors";[[ "perplexity_dr" == */* ]] && mkdir -p test_dvc_logs/errors/task_sqa_solver_$(dirname @@ -1694,8 +1694,8 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_perplexity_dr.eval hash: md5 - md5: eb8e7ffc96608f726ce82345c217179f - size: 3783641 + md5: cb364e482a7e30f59965106a1ee417db + size: 143267910 outs: - path: test_dvc_logs/errors/task_sqa_solver_perplexity_dr.md hash: md5 @@ -1703,8 +1703,8 @@ stages: size: 63 - path: test_dvc_logs/scores/task_sqa_solver_perplexity_dr.md hash: md5 - md5: 3341f9837634fd03ab2bb537b2b6800b - size: 273 + md5: 5bd74f0abd0a5426f922b5474003d183 + size: 262 extract_model_responses@sqa_claude-3.7-dev: cmd: echo "Extracting responses"; [[ "sqa_claude-3.7" == */* ]] && mkdir -p dev_dvc_logs/model_responses/task_sqa_solver_$(dirname "sqa_claude-3.7"); uv run scripts/extract_model_responses.py dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-3.7.eval @@ -2028,24 +2028,24 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_elicit.eval hash: md5 - md5: eaf8f155d177afe6c3fb8b5913324458 - size: 15975606 + md5: d671bcb2fd90de540c69ef5bcccfb45a + size: 83436121 params: params.yaml: - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview outs: - path: test_dvc_logs/debug_logs/task_sqa_solver_elicit_answer_precision_eval.csv hash: md5 - md5: c8f550d2e3813a2d4bd73428531249cd - size: 27561214 + md5: 6fe55584fb19caabeb2f83b4711affdc + size: 26795074 - path: test_dvc_logs/debug_logs/task_sqa_solver_elicit_citation_eval.csv hash: md5 - md5: d76ef432235de34e9508d1327f5604dd - size: 100932205 + md5: 0c5f13a89d433dea784d304ddf5ac621 + size: 101106612 - path: test_dvc_logs/debug_logs/task_sqa_solver_elicit_rubric_eval.csv hash: md5 - md5: 9dd49e51140b72b4868d9ef01b4d7eeb - size: 22861958 + md5: b2dfcdb37c2e5795ac0b6c1aee816c22 + size: 22438331 create_nice_logs@storm-dev: cmd: echo "Creating logs"; [[ "storm" == */* ]] && mkdir -p dev_dvc_logs/debug_logs/task_sqa_solver_$(dirname "storm"); uv run scripts/create_debug_logs.py dev_dvc_logs/scored/ task_sqa_solver_storm.eval @@ -2078,24 +2078,24 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_fhouse_crow.eval hash: md5 - md5: 05ff5ed44795c23f8d31b83a559e3ac0 - size: 2964980 + md5: 02d170907f472a80e8fdc2a4604c57bd + size: 47671586 params: params.yaml: - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview outs: - path: test_dvc_logs/debug_logs/task_sqa_solver_fhouse_crow_answer_precision_eval.csv hash: md5 - md5: 989c4fc1aa6b57445b33547bd6bc36a6 - size: 3132753 + md5: 62c13d95dc2f072ce3bd7e1d8444de21 + size: 7565596 - path: test_dvc_logs/debug_logs/task_sqa_solver_fhouse_crow_citation_eval.csv hash: md5 - md5: 6bbd8e26a754b4950a5fdaded0e6fb51 - size: 10221124 + md5: 72ab5aa46f1b01186cb2b1f4b9847daf + size: 20315492 - path: test_dvc_logs/debug_logs/task_sqa_solver_fhouse_crow_rubric_eval.csv hash: md5 - md5: 7d6bd35aa25ce84835aafdf813790d17 - size: 3795058 + md5: f9302743cdca7f064e531d9ae4fed237 + size: 9200221 create_nice_logs@fhouse_falcon-test: cmd: echo "Creating logs"; [[ "fhouse_falcon" == */* ]] && mkdir -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname "fhouse_falcon"); uv run scripts/create_debug_logs.py test_dvc_logs/scored/ @@ -2103,24 +2103,24 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_fhouse_falcon.eval hash: md5 - md5: 0aad0b8d622b09e4ab9812b941beefbc - size: 14628117 + md5: 19a1d2d9827872b4de45bd3070051135 + size: 196232443 params: params.yaml: - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview outs: - path: test_dvc_logs/debug_logs/task_sqa_solver_fhouse_falcon_answer_precision_eval.csv hash: md5 - md5: e64bb7613ce2606912102a937ef42fe7 - size: 13252236 + md5: 37db319201463d9c0d95944f3c36bb10 + size: 33876044 - path: test_dvc_logs/debug_logs/task_sqa_solver_fhouse_falcon_citation_eval.csv hash: md5 - md5: 5b15b9771795e7c973a38a2ae3057477 - size: 108591713 + md5: cb0517c0a218b96bacaa16e96a864b4b + size: 347133727 - path: test_dvc_logs/debug_logs/task_sqa_solver_fhouse_falcon_rubric_eval.csv hash: md5 - md5: 7a2a72041e33fd16700d4c1d4b731ea0 - size: 12309141 + md5: 174bc8c7cf47e1bc369220fbdede4d24 + size: 33308962 create_nice_logs@openai_deep_research-test: cmd: echo "Creating logs"; [[ "openai_deep_research" == */* ]] && mkdir -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname "openai_deep_research"); uv run scripts/create_debug_logs.py test_dvc_logs/scored/ @@ -2128,25 +2128,25 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_openai_deep_research.eval hash: md5 - md5: 0b9c6893461a6a71200faca09d42cf0b - size: 30114985 + md5: 522b75668b14dbaa609e5ef4cc5c8f8e + size: 115070053 params: params.yaml: - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview outs: - path: test_dvc_logs/debug_logs/task_sqa_solver_openai_deep_research_answer_precision_eval.csv hash: md5 - md5: ed42a270972b76b6892fadd423a4d146 - size: 23972174 + md5: a2344ebb69bb62fcb3631e9ec7ebee05 + size: 13769615 - path: test_dvc_logs/debug_logs/task_sqa_solver_openai_deep_research_citation_eval.csv hash: md5 - md5: 152e59ee8bfda0ccc4fd2bb35dbd44a6 - size: 128934273 + md5: cc030bf5e8d82ec17a14fb3f13dd7e6a + size: 74932006 - path: test_dvc_logs/debug_logs/task_sqa_solver_openai_deep_research_rubric_eval.csv hash: md5 - md5: 72484abe43058939f9003f8df89322c0 - size: 22888136 + md5: 6ab5c8a01c4fa86ff90f47051d3cc531 + size: 13566123 create_nice_logs@perplexity_dr-test: cmd: echo "Creating logs"; [[ "perplexity_dr" == */* ]] && mkdir -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname "perplexity_dr"); uv run scripts/create_debug_logs.py test_dvc_logs/scored/ @@ -2154,24 +2154,24 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_perplexity_dr.eval hash: md5 - md5: eb8e7ffc96608f726ce82345c217179f - size: 3783641 + md5: cb364e482a7e30f59965106a1ee417db + size: 143267910 params: params.yaml: - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview outs: - path: test_dvc_logs/debug_logs/task_sqa_solver_perplexity_dr_answer_precision_eval.csv hash: md5 - md5: f8504e5f0c7aef86b9f785c44f1385f2 - size: 2380106 + md5: d9525de6997367f68baf41a9a319e6a9 + size: 21877548 - path: test_dvc_logs/debug_logs/task_sqa_solver_perplexity_dr_citation_eval.csv hash: md5 - md5: b80d4fa20bac3e0d5dc9e2ba8fe50588 - size: 6825224 + md5: 855e1fbb1fe3b7143bb80b03ebf877b9 + size: 138212606 - path: test_dvc_logs/debug_logs/task_sqa_solver_perplexity_dr_rubric_eval.csv hash: md5 - md5: b45f5856e3195800b6a4f18ab0fa687e - size: 4371131 + md5: c5eaf89b3a46a1082be673efa7e46acf + size: 19231579 solve_you@test: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_you uv run --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir @@ -2243,24 +2243,24 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_you.eval hash: md5 - md5: 97f9ff45622f40bec85ecdf7de74a68f - size: 3947510 + md5: 9e838e47e328b92f7c0e50e39e77acb5 + size: 46907880 params: params.yaml: - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview outs: - path: test_dvc_logs/debug_logs/task_sqa_solver_you_answer_precision_eval.csv hash: md5 - md5: 3cfa316dbb65a767f448672b4f21f903 - size: 3599826 + md5: 20398962c534bdf847f44d45d5c0c739 + size: 3642395 - path: test_dvc_logs/debug_logs/task_sqa_solver_you_citation_eval.csv hash: md5 - md5: e349b629bc72a358f5518366d3d1ea84 - size: 11149105 + md5: ac233a5d818e4b248bcbf800dcdd0ad4 + size: 11270935 - path: test_dvc_logs/debug_logs/task_sqa_solver_you_rubric_eval.csv hash: md5 - md5: 34c9628df1883bcacca0536d907d162c - size: 5212658 + md5: 577c2c58b07bb00653f527ad115b3f40 + size: 5000608 solve_you@dev: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_you uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir dev_dvc_logs/solver_outputs/ @@ -2357,8 +2357,8 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_you.eval hash: md5 - md5: 97f9ff45622f40bec85ecdf7de74a68f - size: 3947510 + md5: 9e838e47e328b92f7c0e50e39e77acb5 + size: 46907880 outs: - path: test_dvc_logs/errors/task_sqa_solver_you.md hash: md5 @@ -2366,7 +2366,7 @@ stages: size: 63 - path: test_dvc_logs/scores/task_sqa_solver_you.md hash: md5 - md5: 92b92fe4376c3460057d3b906a5bc6ec + md5: f9b4f385282e3fb74b73d5711b26c068 size: 255 log_any_remaining_errors_and_record_scores@you-dev: cmd: echo "Collecting errors";[[ "you" == */* ]] && mkdir -p dev_dvc_logs/errors/task_sqa_solver_$(dirname @@ -2531,25 +2531,25 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_openai/o4-mini.eval hash: md5 - md5: 707491ba3f460b10b740acc4f18fc009 - size: 2620767 + md5: 3528e6d68199489705ef51e91170387c + size: 26134288 params: params.yaml: - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview outs: - path: test_dvc_logs/debug_logs/task_sqa_solver_openai/o4-mini_answer_precision_eval.csv hash: md5 - md5: 0967bccd7b4718d2163cfd617c487d18 - size: 2355952 + md5: a564e7ffb71f64a6b574a89dfd6e7025 + size: 2405714 - path: test_dvc_logs/debug_logs/task_sqa_solver_openai/o4-mini_citation_eval.csv hash: md5 - md5: 4419d87bb49fd33f4900f16b40c1401b - size: 9399236 + md5: db82fa2dda4031932aa79c4ce837fbe9 + size: 9411475 - path: test_dvc_logs/debug_logs/task_sqa_solver_openai/o4-mini_rubric_eval.csv hash: md5 - md5: 536389aee717408941eb0ecc69556cb7 - size: 3063547 + md5: f237d1b0e8447f38e7502ca5c6b423f9 + size: 2876345 create_nice_logs@anthropic/claude-sonnet-4-20250514-thinking-test: cmd: echo "Creating logs"; [[ "anthropic/claude-sonnet-4-20250514-thinking" == */* ]] && mkdir -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname "anthropic/claude-sonnet-4-20250514-thinking"); @@ -2772,8 +2772,8 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_storm.eval hash: md5 - md5: e70ded32703f2c4f02e6d9e6aca2daf6 - size: 15020405 + md5: 3be48b34599f244761b63ab3887a0bac + size: 114089951 outs: - path: test_dvc_logs/errors/task_sqa_solver_storm.md hash: md5 @@ -2781,7 +2781,7 @@ stages: size: 63 - path: test_dvc_logs/scores/task_sqa_solver_storm.md hash: md5 - md5: ad271c9a6a27255e437aa8f25b128437 + md5: 270eba7766f31f99547266811d9729d4 size: 255 create_nice_logs@storm-test: cmd: echo "Creating logs"; [[ "storm" == */* ]] && mkdir -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname @@ -2790,24 +2790,24 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_storm.eval hash: md5 - md5: e70ded32703f2c4f02e6d9e6aca2daf6 - size: 15020405 + md5: 3be48b34599f244761b63ab3887a0bac + size: 114089951 params: params.yaml: - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview outs: - path: test_dvc_logs/debug_logs/task_sqa_solver_storm_answer_precision_eval.csv hash: md5 - md5: b88cd9438a76b2dc64efeec6841a7ab8 - size: 16010341 + md5: a9b310a74ddb497809adef3b347f5758 + size: 15887439 - path: test_dvc_logs/debug_logs/task_sqa_solver_storm_citation_eval.csv hash: md5 - md5: 11476181ed7d144a78881722df1417eb - size: 114417509 + md5: 80d8e2a50b39399da1f213c53a33e7ec + size: 114538286 - path: test_dvc_logs/debug_logs/task_sqa_solver_storm_rubric_eval.csv hash: md5 - md5: 5982324c37031fd9317260f7b361b476 - size: 16622239 + md5: 8700cb63d34eb03a9cca1d20a815aac9 + size: 16106436 score_all_solvers@model12-test: cmd: echo "Scoring";[[ "fhouse_falcon" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_fhouse_falcon; cp test_dvc_logs/solver_outputs/task_sqa_solver_fhouse_falcon.eval test_dvc_logs/scored/task_sqa_solver_fhouse_falcon.eval; @@ -2817,8 +2817,8 @@ stages: deps: - path: test_dvc_logs/solver_outputs/task_sqa_solver_fhouse_falcon.eval hash: md5 - md5: 1444e580204b243a0ca753f3c13bb97e - size: 8435874 + md5: 27f869df37299d6895881c95fb5a38d9 + size: 78478412 params: params.yaml: scorer_model: google/gemini-3-flash-preview @@ -2826,8 +2826,8 @@ stages: outs: - path: test_dvc_logs/scored/task_sqa_solver_fhouse_falcon.eval hash: md5 - md5: 459492b0b2a7089f53616778fc2c1c0d - size: 103015187 + md5: 19a1d2d9827872b4de45bd3070051135 + size: 196232443 score_all_solvers@model8-test: cmd: echo "Scoring";[[ "elicit" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_elicit; cp test_dvc_logs/solver_outputs/task_sqa_solver_elicit.eval test_dvc_logs/scored/task_sqa_solver_elicit.eval; @@ -2837,8 +2837,8 @@ stages: deps: - path: test_dvc_logs/solver_outputs/task_sqa_solver_elicit.eval hash: md5 - md5: 16c2ebb4e4568a24b97b1742ecb517fc - size: 4091878 + md5: eaf8f155d177afe6c3fb8b5913324458 + size: 15975606 params: params.yaml: scorer_model: google/gemini-3-flash-preview @@ -2846,8 +2846,8 @@ stages: outs: - path: test_dvc_logs/scored/task_sqa_solver_elicit.eval hash: md5 - md5: 51808abf07604ba3a089c67d4cb76ea7 - size: 80175929 + md5: d671bcb2fd90de540c69ef5bcccfb45a + size: 83436121 score_all_solvers@model11-test: cmd: echo "Scoring";[[ "fhouse_crow" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_fhouse_crow; cp test_dvc_logs/solver_outputs/task_sqa_solver_fhouse_crow.eval test_dvc_logs/scored/task_sqa_solver_fhouse_crow.eval; @@ -2857,8 +2857,8 @@ stages: deps: - path: test_dvc_logs/solver_outputs/task_sqa_solver_fhouse_crow.eval hash: md5 - md5: 7ce6ee5a36b336b15929a5e35fc3e795 - size: 1443466 + md5: d3c2190ee23a647aed113be022cf072a + size: 17219690 params: params.yaml: scorer_model: google/gemini-3-flash-preview @@ -2866,8 +2866,8 @@ stages: outs: - path: test_dvc_logs/scored/task_sqa_solver_fhouse_crow.eval hash: md5 - md5: 8d2965af1669a57a7896880bfaac6dc4 - size: 32595013 + md5: 02d170907f472a80e8fdc2a4604c57bd + size: 47671586 score_all_solvers@model13-test: cmd: echo "Scoring";[[ "openai_deep_research" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_openai_deep_research; cp test_dvc_logs/solver_outputs/task_sqa_solver_openai_deep_research.eval test_dvc_logs/scored/task_sqa_solver_openai_deep_research.eval; @@ -2877,8 +2877,8 @@ stages: deps: - path: test_dvc_logs/solver_outputs/task_sqa_solver_openai_deep_research.eval hash: md5 - md5: 0ca0403d04913c8d05c012bb013ed19e - size: 23480196 + md5: e64efa83d59f1e39f5f44f7472b7f92d + size: 20770084 params: params.yaml: scorer_model: google/gemini-3-flash-preview @@ -2886,8 +2886,8 @@ stages: outs: - path: test_dvc_logs/scored/task_sqa_solver_openai_deep_research.eval hash: md5 - md5: 7f31f17b142b52f16fad06d77aa4f92d - size: 148101879 + md5: 522b75668b14dbaa609e5ef4cc5c8f8e + size: 115070053 score_all_solvers@model14-test: cmd: echo "Scoring";[[ "you" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_you; cp test_dvc_logs/solver_outputs/task_sqa_solver_you.eval test_dvc_logs/scored/task_sqa_solver_you.eval; @@ -2897,8 +2897,8 @@ stages: deps: - path: test_dvc_logs/solver_outputs/task_sqa_solver_you.eval hash: md5 - md5: d3d528dd45b27e5b9508a0087d978d87 - size: 1626349 + md5: dffe67db7f27087923ec47f447e5006f + size: 3981342 params: params.yaml: scorer_model: google/gemini-3-flash-preview @@ -2906,8 +2906,8 @@ stages: outs: - path: test_dvc_logs/scored/task_sqa_solver_you.eval hash: md5 - md5: 4bac04abcb7736cedef84ed56f783f14 - size: 44844932 + md5: 9e838e47e328b92f7c0e50e39e77acb5 + size: 46907880 score_all_solvers@model6-test: cmd: echo "Scoring";[[ "sqa_gemini-3.1-pro-preview" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_sqa_gemini-3.1-pro-preview; cp test_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval @@ -2937,8 +2937,8 @@ stages: deps: - path: test_dvc_logs/solver_outputs/task_sqa_solver_scispace.eval hash: md5 - md5: e517fa1055570efb347dd6a924e478ae - size: 46039197 + md5: 31ee4f9ea8f0301cb6bd65cdc5cc0d23 + size: 33097436 params: params.yaml: scorer_model: google/gemini-3-flash-preview @@ -2946,8 +2946,8 @@ stages: outs: - path: test_dvc_logs/scored/task_sqa_solver_scispace.eval hash: md5 - md5: 65d4fcc288c888ae6d4489ada3e83b98 - size: 188811614 + md5: 4ddb3772fac4198d986a45acab9ef587 + size: 188249490 score_all_solvers@model15-test: cmd: echo "Scoring";[[ "perplexity_dr" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_perplexity_dr; cp test_dvc_logs/solver_outputs/task_sqa_solver_perplexity_dr.eval test_dvc_logs/scored/task_sqa_solver_perplexity_dr.eval; @@ -2957,8 +2957,8 @@ stages: deps: - path: test_dvc_logs/solver_outputs/task_sqa_solver_perplexity_dr.eval hash: md5 - md5: c1ba04199b4b8b0b3adefccb02e22956 - size: 2027391 + md5: 4b19505fd7cab3fd73e60f065c57ac7c + size: 52745393 params: params.yaml: scorer_model: google/gemini-3-flash-preview @@ -2966,8 +2966,8 @@ stages: outs: - path: test_dvc_logs/scored/task_sqa_solver_perplexity_dr.eval hash: md5 - md5: 5bfdc636401908df6214c3f4007cfb10 - size: 38757428 + md5: cb364e482a7e30f59965106a1ee417db + size: 143267910 score_all_solvers@model9-test: cmd: echo "Scoring";[[ "storm" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_storm; cp test_dvc_logs/solver_outputs/task_sqa_solver_storm.eval test_dvc_logs/scored/task_sqa_solver_storm.eval; @@ -2977,8 +2977,8 @@ stages: deps: - path: test_dvc_logs/solver_outputs/task_sqa_solver_storm.eval hash: md5 - md5: e9f0f203bcfba80faf017d7545d35ed6 - size: 4914841 + md5: 2a3a5781d41b7d77ade04fe0cc6e8347 + size: 15065365 params: params.yaml: scorer_model: google/gemini-3-flash-preview @@ -2986,8 +2986,8 @@ stages: outs: - path: test_dvc_logs/scored/task_sqa_solver_storm.eval hash: md5 - md5: b9d05a878d9729fe9e0129e6d06510a0 - size: 110244305 + md5: 3be48b34599f244761b63ab3887a0bac + size: 114089951 score_all_solvers@model7-test: cmd: echo "Scoring";[[ "sqa_o3_high" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_sqa_o3_high; cp test_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval test_dvc_logs/scored/task_sqa_solver_sqa_o3_high.eval; @@ -2997,8 +2997,8 @@ stages: deps: - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval hash: md5 - md5: b84fd12987ed62bdff79a241f2c180d5 - size: 11657084 + md5: 711b54ac018c9dc4f2cc190c3029c42e + size: 56666173 params: params.yaml: scorer_model: google/gemini-3-flash-preview @@ -3006,8 +3006,8 @@ stages: outs: - path: test_dvc_logs/scored/task_sqa_solver_sqa_o3_high.eval hash: md5 - md5: 08e87f32258505b156084691a4777e57 - size: 134542173 + md5: de7965340653bbdfbac162c88a77d62a + size: 148894425 score_all_solvers@model16-test: cmd: echo "Scoring";[[ "openscholar" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_openscholar; cp test_dvc_logs/solver_outputs/task_sqa_solver_openscholar.eval test_dvc_logs/scored/task_sqa_solver_openscholar.eval; @@ -3017,8 +3017,8 @@ stages: deps: - path: test_dvc_logs/solver_outputs/task_sqa_solver_openscholar.eval hash: md5 - md5: 2c5b13dc69496592aa3990bbd92cdece - size: 1807380 + md5: 61ff1d386b502863144b85cc37a0b0a4 + size: 8231693 params: params.yaml: scorer_model: google/gemini-3-flash-preview @@ -3026,8 +3026,8 @@ stages: outs: - path: test_dvc_logs/scored/task_sqa_solver_openscholar.eval hash: md5 - md5: 218f0f063fc75934d9cc6f80d2f663f6 - size: 25936097 + md5: 2df72bb22d5ec61bedef231f58658fcc + size: 31959611 score_all_solvers@model9-dev: cmd: echo "Scoring";[[ "sqa_gemini-2.5-pro" == */* ]] && mkdir -p dev_dvc_logs/scored/task_sqa_solver_sqa_gemini-2.5-pro; cp dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-2.5-pro.eval dev_dvc_logs/scored/task_sqa_solver_sqa_gemini-2.5-pro.eval; @@ -3389,24 +3389,24 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_scispace.eval hash: md5 - md5: b797a0b8f9b0d80449cd01a3e7c8771f - size: 33056563 + md5: 4ddb3772fac4198d986a45acab9ef587 + size: 188249490 params: params.yaml: - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview outs: - path: test_dvc_logs/debug_logs/task_sqa_solver_scispace_answer_precision_eval.csv hash: md5 - md5: ae2535020abf63dff6dbe2f7519bba65 - size: 24127154 + md5: 2da6012bc261ba26a6f22e70451fef2a + size: 24025759 - path: test_dvc_logs/debug_logs/task_sqa_solver_scispace_citation_eval.csv hash: md5 - md5: cc449723abafe0752eaf4bc20ec113f4 - size: 629170068 + md5: 6e89b26a8211c3c26efe1573131470f9 + size: 629204683 - path: test_dvc_logs/debug_logs/task_sqa_solver_scispace_rubric_eval.csv hash: md5 - md5: dab63ec769768be0927c4bd6a4c8b6e5 - size: 25049300 + md5: 88497a000bf73df14edbb1a397d9617e + size: 24536842 create_nice_logs@sqa_gemini-2.5-pro-test: cmd: echo "Creating logs"; [[ "sqa_gemini-2.5-pro" == */* ]] && mkdir -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname "sqa_gemini-2.5-pro"); uv run scripts/create_debug_logs.py test_dvc_logs/scored/ @@ -3475,8 +3475,8 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_openai/o3.eval hash: md5 - md5: 335d405c1d2a62ae5a02be07dd9b27aa - size: 3231962 + md5: 0c6131316b8cc2004b48de09885b1ff8 + size: 64063525 outs: - path: test_dvc_logs/errors/task_sqa_solver_openai/o3.md hash: md5 @@ -3484,8 +3484,8 @@ stages: size: 63 - path: test_dvc_logs/scores/task_sqa_solver_openai/o3.md hash: md5 - md5: 57a8d34df60af9a3aedf83c87fdf31ae - size: 248 + md5: dec5c11490332da95212a50822ffedcd + size: 255 create_nice_logs@openai/o3-test: cmd: echo "Creating logs"; [[ "openai/o3" == */* ]] && mkdir -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname "openai/o3"); uv run scripts/create_debug_logs.py test_dvc_logs/scored/ task_sqa_solver_openai/o3.eval @@ -3493,24 +3493,24 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_openai/o3.eval hash: md5 - md5: 335d405c1d2a62ae5a02be07dd9b27aa - size: 3231962 + md5: 0c6131316b8cc2004b48de09885b1ff8 + size: 64063525 params: params.yaml: - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview outs: - path: test_dvc_logs/debug_logs/task_sqa_solver_openai/o3_answer_precision_eval.csv hash: md5 - md5: d08cc53367e91e9d19af2aa3a739116f - size: 3357861 + md5: 0aedf1e52fdede7fcf728bacc7d3d979 + size: 3070960 - path: test_dvc_logs/debug_logs/task_sqa_solver_openai/o3_citation_eval.csv hash: md5 - md5: 2d41fbaa6791b963d1d52a6618071b45 - size: 15846570 + md5: 0eeabb899a7e0bf467de2c1380d4be07 + size: 12740876 - path: test_dvc_logs/debug_logs/task_sqa_solver_openai/o3_rubric_eval.csv hash: md5 - md5: 32209a17d1f07ba85c4dd76b2662cf0b - size: 3989258 + md5: c1c777c964929e2aba2d034a56629762 + size: 4696231 score_all_solvers@model17-test: cmd: echo "Scoring";[[ "you" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_you; cp test_dvc_logs/solver_outputs/task_sqa_solver_you.eval test_dvc_logs/scored/task_sqa_solver_you.eval; @@ -3658,8 +3658,8 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_sqa_o3_high.eval hash: md5 - md5: 08e87f32258505b156084691a4777e57 - size: 134542173 + md5: de7965340653bbdfbac162c88a77d62a + size: 148894425 outs: - path: test_dvc_logs/errors/task_sqa_solver_sqa_o3_high.md hash: md5 @@ -3667,7 +3667,7 @@ stages: size: 63 - path: test_dvc_logs/scores/task_sqa_solver_sqa_o3_high.md hash: md5 - md5: 59c6df10258940cad8e0676c0fa662cd + md5: 68db553d9f2385a47104ce032b7d4ba9 size: 255 extract_model_responses@sqa_o3_high-test: cmd: echo "Extracting responses"; [[ "sqa_o3_high" == */* ]] && mkdir -p test_dvc_logs/model_responses/task_sqa_solver_$(dirname @@ -3808,8 +3808,8 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_openscholar.eval hash: md5 - md5: 8b382c2ac78c6d0d4a986ce29d91f81a - size: 4169841 + md5: 2df72bb22d5ec61bedef231f58658fcc + size: 31959611 outs: - path: test_dvc_logs/errors/task_sqa_solver_openscholar.md hash: md5 @@ -3817,8 +3817,8 @@ stages: size: 63 - path: test_dvc_logs/scores/task_sqa_solver_openscholar.md hash: md5 - md5: 261a9ce3237be6729dbe06cd225a38ee - size: 255 + md5: 751b4ae33c6fbf2ae2c69285488911ce + size: 264 create_nice_logs@openscholar-test: cmd: echo "Creating logs"; [[ "openscholar" == */* ]] && mkdir -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname "openscholar"); uv run scripts/create_debug_logs.py test_dvc_logs/scored/ task_sqa_solver_openscholar.eval @@ -3826,24 +3826,24 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_openscholar.eval hash: md5 - md5: 8b382c2ac78c6d0d4a986ce29d91f81a - size: 4169841 + md5: 2df72bb22d5ec61bedef231f58658fcc + size: 31959611 params: params.yaml: - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview outs: - path: test_dvc_logs/debug_logs/task_sqa_solver_openscholar_answer_precision_eval.csv hash: md5 - md5: fdb84ee559cb4efd25b1eaf1f280b2c8 - size: 2134042 + md5: b265659404c464356fbce005e362e2c6 + size: 3520781 - path: test_dvc_logs/debug_logs/task_sqa_solver_openscholar_citation_eval.csv hash: md5 - md5: cfecf88227640aeb201c402168f56569 - size: 3689118 + md5: 3af39a804f1471261586bd673b9d41e0 + size: 5538888 - path: test_dvc_logs/debug_logs/task_sqa_solver_openscholar_rubric_eval.csv hash: md5 - md5: 0f65b999cfd54471555b33f002f8c0c8 - size: 4089765 + md5: 40352702bf118fa9cf475de442681e9b + size: 4863839 extract_model_responses@openscholar-test: cmd: echo "Extracting responses"; [[ "openscholar" == */* ]] && mkdir -p test_dvc_logs/model_responses/task_sqa_solver_$(dirname "openscholar"); uv run scripts/extract_model_responses.py test_dvc_logs/solver_outputs/task_sqa_solver_openscholar.eval @@ -4174,24 +4174,24 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_sqa_o3_high.eval hash: md5 - md5: 08e87f32258505b156084691a4777e57 - size: 134542173 + md5: de7965340653bbdfbac162c88a77d62a + size: 148894425 params: params.yaml: scorer_model: google/gemini-3-flash-preview outs: - path: test_dvc_logs/debug_logs/task_sqa_solver_sqa_o3_high_answer_precision_eval.csv hash: md5 - md5: d9d7fad75cba22b0e00cad964a82ee09 - size: 23977952 + md5: 38c29755259e07ad568d0b20d8c87957 + size: 24595597 - path: test_dvc_logs/debug_logs/task_sqa_solver_sqa_o3_high_citation_eval.csv hash: md5 - md5: d027dfeb1e582a083b3aa403a4fdbe0e - size: 184361555 + md5: b5a199e69a9416dd2c5dee352a9e080a + size: 199803730 - path: test_dvc_logs/debug_logs/task_sqa_solver_sqa_o3_high_rubric_eval.csv hash: md5 - md5: 6277fc48ce5a9d1c6a3a8336f0b8fe2c - size: 22253619 + md5: b54f40c36bc2c406f937860fb41e0d6f + size: 22754983 extract_model_responses@sqa_claude-4.6-dev: cmd: echo "Extracting responses"; [[ "sqa_claude-4.6" == */* ]] && mkdir -p dev_dvc_logs/model_responses/task_sqa_solver_$(dirname "sqa_claude-4.6"); uv run scripts/extract_model_responses.py dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-4.6.eval @@ -4373,3 +4373,151 @@ stages: hash: md5 md5: e597800f5c4eac984aad9a6f16bef594 size: 255 + create_nice_logs@anthropic/claude-sonnet-4-6-thinking-test: + cmd: echo "Creating logs"; [[ "anthropic/claude-sonnet-4-6-thinking" == */* ]] + && mkdir -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname "anthropic/claude-sonnet-4-6-thinking"); + uv run scripts/create_debug_logs.py test_dvc_logs/scored/ task_sqa_solver_anthropic/claude-sonnet-4-6-thinking.eval + test_dvc_logs/debug_logs/ + deps: + - path: test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking.eval + hash: md5 + md5: 89dff466d5b840c822f1730509db6c7e + size: 61631233 + params: + params.yaml: + scorer_model: google/gemini-3-flash-preview + outs: + - path: + test_dvc_logs/debug_logs/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking_answer_precision_eval.csv + hash: md5 + md5: a0e4fb47666252560e09588824a5e9df + size: 5049156 + - path: + test_dvc_logs/debug_logs/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking_citation_eval.csv + hash: md5 + md5: 7e2b481df9feb79f2db794ede95c5c7b + size: 21829229 + - path: + test_dvc_logs/debug_logs/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking_rubric_eval.csv + hash: md5 + md5: e381ec30fc4da34f111af2cfaf415c70 + size: 6143276 + create_nice_logs@anthropic/claude-sonnet-4-6-test: + cmd: echo "Creating logs"; [[ "anthropic/claude-sonnet-4-6" == */* ]] && mkdir + -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname "anthropic/claude-sonnet-4-6"); + uv run scripts/create_debug_logs.py test_dvc_logs/scored/ task_sqa_solver_anthropic/claude-sonnet-4-6.eval + test_dvc_logs/debug_logs/ + deps: + - path: test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-6.eval + hash: md5 + md5: 28ffc1ba7ce9a349ec95ca854b5f1b3d + size: 51159281 + params: + params.yaml: + scorer_model: google/gemini-3-flash-preview + outs: + - path: + test_dvc_logs/debug_logs/task_sqa_solver_anthropic/claude-sonnet-4-6_answer_precision_eval.csv + hash: md5 + md5: 9a3e7d57814ac1dfa181843fe90579da + size: 4118799 + - path: + test_dvc_logs/debug_logs/task_sqa_solver_anthropic/claude-sonnet-4-6_citation_eval.csv + hash: md5 + md5: 231e97ff7e987f086b670826c981f249 + size: 18710123 + - path: + test_dvc_logs/debug_logs/task_sqa_solver_anthropic/claude-sonnet-4-6_rubric_eval.csv + hash: md5 + md5: d66802329bfa67070d48a3ef983baf6a + size: 5279645 + log_any_remaining_errors_and_record_scores@google/gemini-3.1-pro-preview-test: + cmd: echo "Collecting errors";[[ "google/gemini-3.1-pro-preview" == */* ]] && + mkdir -p test_dvc_logs/errors/task_sqa_solver_$(dirname "google/gemini-3.1-pro-preview"); + mkdir -p test_dvc_logs/scores/task_sqa_solver_$(dirname "google/gemini-3.1-pro-preview"); + uv run scripts/log_errors.py test_dvc_logs/scored/task_sqa_solver_google/gemini-3.1-pro-preview.eval + test_dvc_logs/errors/task_sqa_solver_google/gemini-3.1-pro-preview.md test_dvc_logs/scores/task_sqa_solver_google/gemini-3.1-pro-preview.md + deps: + - path: test_dvc_logs/scored/task_sqa_solver_google/gemini-3.1-pro-preview.eval + hash: md5 + md5: 7c3c78380c1953a6ce60ca8a81e75bdb + size: 53929324 + outs: + - path: test_dvc_logs/errors/task_sqa_solver_google/gemini-3.1-pro-preview.md + hash: md5 + md5: d51d4f783de486edccf22aea1a28d40d + size: 63 + - path: test_dvc_logs/scores/task_sqa_solver_google/gemini-3.1-pro-preview.md + hash: md5 + md5: 1b722f0d3c75681b9268257c29e6c599 + size: 255 + create_nice_logs@google/gemini-3.1-pro-preview-test: + cmd: echo "Creating logs"; [[ "google/gemini-3.1-pro-preview" == */* ]] && mkdir + -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname "google/gemini-3.1-pro-preview"); + uv run scripts/create_debug_logs.py test_dvc_logs/scored/ task_sqa_solver_google/gemini-3.1-pro-preview.eval + test_dvc_logs/debug_logs/ + deps: + - path: test_dvc_logs/scored/task_sqa_solver_google/gemini-3.1-pro-preview.eval + hash: md5 + md5: 7c3c78380c1953a6ce60ca8a81e75bdb + size: 53929324 + params: + params.yaml: + scorer_model: google/gemini-3-flash-preview + outs: + - path: + test_dvc_logs/debug_logs/task_sqa_solver_google/gemini-3.1-pro-preview_answer_precision_eval.csv + hash: md5 + md5: 9be4c3338c3597fb4edd8acc1c65ede9 + size: 2289533 + - path: + test_dvc_logs/debug_logs/task_sqa_solver_google/gemini-3.1-pro-preview_citation_eval.csv + hash: md5 + md5: 2da75d6105f760753671ff82b5164610 + size: 6702396 + - path: + test_dvc_logs/debug_logs/task_sqa_solver_google/gemini-3.1-pro-preview_rubric_eval.csv + hash: md5 + md5: b1e5a66e405419e80d56d6fb83d3d9f0 + size: 4027808 + log_any_remaining_errors_and_record_scores@anthropic/claude-sonnet-4-6-thinking-test: + cmd: echo "Collecting errors";[[ "anthropic/claude-sonnet-4-6-thinking" == */* + ]] && mkdir -p test_dvc_logs/errors/task_sqa_solver_$(dirname "anthropic/claude-sonnet-4-6-thinking"); + mkdir -p test_dvc_logs/scores/task_sqa_solver_$(dirname "anthropic/claude-sonnet-4-6-thinking"); + uv run scripts/log_errors.py test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking.eval + test_dvc_logs/errors/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking.md + test_dvc_logs/scores/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking.md + deps: + - path: test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking.eval + hash: md5 + md5: 89dff466d5b840c822f1730509db6c7e + size: 61631233 + outs: + - path: test_dvc_logs/errors/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking.md + hash: md5 + md5: d51d4f783de486edccf22aea1a28d40d + size: 63 + - path: test_dvc_logs/scores/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking.md + hash: md5 + md5: dabd9b8e8fa8e96191e68cf6b8226f22 + size: 255 + log_any_remaining_errors_and_record_scores@anthropic/claude-sonnet-4-6-test: + cmd: echo "Collecting errors";[[ "anthropic/claude-sonnet-4-6" == */* ]] && mkdir + -p test_dvc_logs/errors/task_sqa_solver_$(dirname "anthropic/claude-sonnet-4-6"); + mkdir -p test_dvc_logs/scores/task_sqa_solver_$(dirname "anthropic/claude-sonnet-4-6"); + uv run scripts/log_errors.py test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-6.eval + test_dvc_logs/errors/task_sqa_solver_anthropic/claude-sonnet-4-6.md test_dvc_logs/scores/task_sqa_solver_anthropic/claude-sonnet-4-6.md + deps: + - path: test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-6.eval + hash: md5 + md5: 28ffc1ba7ce9a349ec95ca854b5f1b3d + size: 51159281 + outs: + - path: test_dvc_logs/errors/task_sqa_solver_anthropic/claude-sonnet-4-6.md + hash: md5 + md5: d51d4f783de486edccf22aea1a28d40d + size: 63 + - path: test_dvc_logs/scores/task_sqa_solver_anthropic/claude-sonnet-4-6.md + hash: md5 + md5: f276952fdb1cba923fd12296aa7e5649 + size: 255 diff --git a/tests/test_sqa_retry_utils.py b/tests/test_sqa_retry_utils.py new file mode 100644 index 00000000..24069979 --- /dev/null +++ b/tests/test_sqa_retry_utils.py @@ -0,0 +1,105 @@ +from types import SimpleNamespace + +import pytest + +from astabench.evals.sqa.retry_utils import generate_with_retry +from astabench.evals.sqa.rubric import RubricCorpusQaGenericMetric + + +class FakeModel: + def __init__(self, completions): + self._completions = iter(completions) + + async def generate(self, prompt_or_messages, config): + return SimpleNamespace(completion=next(self._completions)) + + +@pytest.mark.asyncio +async def test_generate_with_retry_retries_on_parsed_validator_failure(): + model = FakeModel( + [ + '{"scores": [{"criteria_idx": 1}]}', + '{"scores": [{"criteria_idx": 1}, {"criteria_idx": 2}]}', + ] + ) + + def validate(parsed): + indices = [score["criteria_idx"] for score in parsed["scores"]] + if indices != [1, 2]: + raise ValueError(f"incomplete criteria indices: {indices}") + + _, parsed, num_retries = await generate_with_retry( + model=model, + prompt_or_messages=[], + config=SimpleNamespace(), + max_retries=1, + base_delay=0, + parsed_validator=validate, + ) + + assert [score["criteria_idx"] for score in parsed["scores"]] == [1, 2] + assert num_retries == 1 + + +@pytest.mark.asyncio +async def test_joint_rubric_assessment_retries_until_all_criteria_are_scored(): + model = FakeModel( + [ + """{ + "scores": [ + { + "criteria": "criterion 1", + "criteria_idx": 1, + "reasoning": "partial response", + "score": 2, + "evidence": "evidence 1" + } + ] + }""", + """{ + "scores": [ + { + "criteria": "criterion 1", + "criteria_idx": 1, + "reasoning": "covers criterion 1", + "score": 2, + "evidence": "evidence 1" + }, + { + "criteria": "criterion 2", + "criteria_idx": 2, + "reasoning": "covers criterion 2", + "score": 1, + "evidence": "evidence 2" + } + ] + }""", + ] + ) + metric = RubricCorpusQaGenericMetric( + config={ + "question": "Test question", + "ingredients": [ + { + "name": "criterion_a", + "criterion": "Assess criterion A", + "weight": 0.5, + "examples": ["example A"], + }, + { + "name": "criterion_b", + "criterion": "Assess criterion B", + "weight": 0.5, + "examples": ["example B"], + }, + ], + }, + model=model, + ) + + score_components, prompt_logs = await metric._assess_properties_jointly( + "candidate response", metric.config.ingredients + ) + + assert score_components == {"criterion_a": 1.0, "criterion_b": 0.5} + assert prompt_logs["num_retries"][0]["data"]["num_retries"] == 1 From fd5e7c4974ddc5d6e69b53d84843c350c49c8f0e Mon Sep 17 00:00:00 2001 From: Dany Haddad Date: Tue, 31 Mar 2026 22:29:23 +0000 Subject: [PATCH 11/13] Refresh scorer lock and override notes --- pyproject.toml | 11 ++++++++--- solvers/scorer/README.md | 2 +- solvers/scorer/pyproject.toml | 3 ++- solvers/scorer/uv.lock | 12 ++++++------ 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index cc46f468..7bb840a1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,15 +65,20 @@ conflicts = [ [{extra = "storm"}, {extra = "smolagents"}], ] override-dependencies = [ + # Shared root-env stability pins. These are not known hard resolver + # conflicts today; they keep the local dev environment on the provider SDK + # versions we have validated with inspect_ai==0.3.143. Revisit on the next + # Inspect/provider bump. "inspect_ai==0.3.143", "anthropic==0.85.0", "google-genai==1.67.0", - # sqa pins openai to a lower version than inspect requires + # litellm currently requires openai>=1.99.5, so keep the shared root env on + # a 2.x SDK even though astabench itself only needs openai>=1.78.0. "openai==2.28.0", - # STORM pretends to require a lower version, but doesn't actually need it: - # https://github.com/allenai/asta-bench/issues/31#issuecomment-3045978008 + # Keep the shared env on the same datasets line as astabench. This also + # matches the STORM workflow we have been validating locally. "datasets~=3.2.0", ] diff --git a/solvers/scorer/README.md b/solvers/scorer/README.md index 63281680..6bc5307e 100644 --- a/solvers/scorer/README.md +++ b/solvers/scorer/README.md @@ -10,7 +10,7 @@ This environment is configured in `solvers/scorer/pyproject.toml`. - It installs local `astabench` from this repo. - It pins `inspect_ai==0.3.179`. - It uses a uv override so scoring stays pinned even though local `astabench` - depends on `inspect_ai==0.3.114`. + depends on `inspect_ai==0.3.143`. Install deps with: diff --git a/solvers/scorer/pyproject.toml b/solvers/scorer/pyproject.toml index 14c55bf5..2fbffd9a 100644 --- a/solvers/scorer/pyproject.toml +++ b/solvers/scorer/pyproject.toml @@ -17,7 +17,8 @@ dependencies = [ astabench = { path = "../..", editable = true } [tool.uv] -# Override needed because local astabench pins inspect_ai==0.3.114. +# Override needed because local astabench now pins inspect_ai==0.3.143, while +# the scorer stays on 0.3.179 for broader log-compatibility testing. override-dependencies = [ "inspect_ai==0.3.179", ] diff --git a/solvers/scorer/uv.lock b/solvers/scorer/uv.lock index 4d1a8430..417d5dae 100644 --- a/solvers/scorer/uv.lock +++ b/solvers/scorer/uv.lock @@ -261,7 +261,7 @@ wheels = [ [[package]] name = "astabench" -version = "0.3.1" +version = "0.5.1" source = { editable = "../../" } dependencies = [ { name = "agent-eval" }, @@ -303,9 +303,9 @@ requires-dist = [ { name = "httpx", specifier = "~=0.28.1" }, { name = "httpx-sse", specifier = ">=0.4.2" }, { name = "huggingface-hub" }, - { name = "inspect-ai", specifier = "==0.3.114" }, + { name = "inspect-ai", specifier = "==0.3.143" }, { name = "isort", marker = "extra == 'dev'" }, - { name = "litellm" }, + { name = "litellm", specifier = "==1.75.8" }, { name = "mcp", specifier = "~=1.10" }, { name = "mypy", marker = "extra == 'dev'", specifier = "==1.15" }, { name = "nltk" }, @@ -3116,8 +3116,8 @@ name = "uvicorn" version = "0.41.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "click" }, - { name = "h11" }, + { name = "click", marker = "sys_platform != 'emscripten'" }, + { name = "h11", marker = "sys_platform != 'emscripten'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/32/ce/eeb58ae4ac36fe09e3842eb02e0eb676bf2c53ae062b98f1b2531673efdd/uvicorn-0.41.0.tar.gz", hash = "sha256:09d11cf7008da33113824ee5a1c6422d89fbc2ff476540d69a34c87fab8b571a", size = 82633 } wheels = [ @@ -3443,7 +3443,7 @@ name = "zipfile-zstd" version = "0.0.4" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "zstandard" }, + { name = "zstandard", marker = "python_full_version < '3.14'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/f7/2a/2e0941bc0058d10ab37d8c578b94a19f611f6ae54f124140f2fb451f0932/zipfile-zstd-0.0.4.tar.gz", hash = "sha256:c1498e15b7922a3d1af0ea55df8b11b2af4e8f7e0e80e414e25d66899f7def89", size = 4603 } wheels = [ From 5f4b198d9f8e98470f14ea4a6466ba2eea7a6b9a Mon Sep 17 00:00:00 2001 From: Dany Haddad Date: Wed, 1 Apr 2026 15:29:37 +0000 Subject: [PATCH 12/13] oops --- pyproject.toml | 9 +- solvers/scorer/pyproject.toml | 3 +- solvers/scorer/uv.lock | 506 +--------------------------------- 3 files changed, 17 insertions(+), 501 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index fdfecd05..01d37c2f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,14 +65,11 @@ conflicts = [ [{extra = "storm"}, {extra = "smolagents"}], ] override-dependencies = [ - # Hard conflict for the shared root env: SQA-related deps still require the - # older OpenAI SDK line, so keep the repo-level environment pinned here - # until those packages move. + # sqa pins openai to a lower version than inspect requires "openai==1.78.0", - # Compatibility pin for the shared root env; this is not a known hard - # resolver conflict, but it keeps the STORM workflow on the datasets line - # we validate locally. + # STORM pretends to require a lower version, but doesn't actually need it: + # https://github.com/allenai/asta-bench/issues/31#issuecomment-3045978008 "datasets~=3.2.0", ] diff --git a/solvers/scorer/pyproject.toml b/solvers/scorer/pyproject.toml index f1fbfdea..14c55bf5 100644 --- a/solvers/scorer/pyproject.toml +++ b/solvers/scorer/pyproject.toml @@ -17,8 +17,7 @@ dependencies = [ astabench = { path = "../..", editable = true } [tool.uv] -# Override needed because local astabench pins inspect_ai==0.3.114, while -# the scorer stays on 0.3.179 for broader log-compatibility testing. +# Override needed because local astabench pins inspect_ai==0.3.114. override-dependencies = [ "inspect_ai==0.3.179", ] diff --git a/solvers/scorer/uv.lock b/solvers/scorer/uv.lock index 0771ba95..4d1a8430 100644 --- a/solvers/scorer/uv.lock +++ b/solvers/scorer/uv.lock @@ -15,7 +15,7 @@ overrides = [{ name = "inspect-ai", specifier = "==0.3.179" }] [[package]] name = "agent-eval" -version = "0.1.46" +version = "0.1.44" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "click" }, @@ -26,16 +26,9 @@ dependencies = [ { name = "pyarrow" }, { name = "pydantic" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/7a/1f/1f56d3659ed70b4d6596e7c6cc41dfe81206f2f0bfe912191fb02109021a/agent_eval-0.1.46.tar.gz", hash = "sha256:c677a1ea778a9b5af7e384232b873d417ca6cd68d97e7b6cd876434c2a53727d", size = 49485 } +sdist = { url = "https://files.pythonhosted.org/packages/5d/29/4f08b7aed93ae47cfa01dc04d824ce20f1e5f83ca4ae5e082a30103de0d2/agent_eval-0.1.44.tar.gz", hash = "sha256:56a6924239ada037a0651d4e9ba21d14d1b3c9dc425d8101972db11c5f90e21f", size = 49213 } wheels = [ - { url = "https://files.pythonhosted.org/packages/7a/5c/06c53c32dd7f589a81614fccd10992ed7e2eaa57fff73ba2f956f8ac6f53/agent_eval-0.1.46-py3-none-any.whl", hash = "sha256:c32a3c29c3ff9328d227fda1cb2d270e84e3dc5243a549961a9c4bf136b4bae8", size = 44313 }, -] - -[package.optional-dependencies] -leaderboard = [ - { name = "matplotlib" }, - { name = "pandas" }, - { name = "seaborn" }, + { url = "https://files.pythonhosted.org/packages/fb/88/097dfb01b3c315a4bf467cacbc64bf7cd39c71b476737c05c084a2e3ec42/agent_eval-0.1.44-py3-none-any.whl", hash = "sha256:c9592a55c549a1ecace72798f301c2189a8131668c27495f40eb479bf839ba81", size = 43769 }, ] [[package]] @@ -268,10 +261,10 @@ wheels = [ [[package]] name = "astabench" -version = "0.5.2" +version = "0.3.1" source = { editable = "../../" } dependencies = [ - { name = "agent-eval", extra = ["leaderboard"] }, + { name = "agent-eval" }, { name = "anthropic" }, { name = "click" }, { name = "datasets" }, @@ -296,7 +289,7 @@ dependencies = [ [package.metadata] requires-dist = [ - { name = "agent-eval", extras = ["leaderboard"], specifier = "==0.1.46" }, + { name = "agent-eval", specifier = "==0.1.44" }, { name = "anthropic", specifier = ">=0.52.0" }, { name = "autoflake", marker = "extra == 'dev'" }, { name = "azure-ai-inference", marker = "extra == 'azure'" }, @@ -312,7 +305,7 @@ requires-dist = [ { name = "huggingface-hub" }, { name = "inspect-ai", specifier = "==0.3.114" }, { name = "isort", marker = "extra == 'dev'" }, - { name = "litellm", specifier = "==1.82.3" }, + { name = "litellm" }, { name = "mcp", specifier = "~=1.10" }, { name = "mypy", marker = "extra == 'dev'", specifier = "==1.15" }, { name = "nltk" }, @@ -580,88 +573,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 }, ] -[[package]] -name = "contourpy" -version = "1.3.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/58/01/1253e6698a07380cd31a736d248a3f2a50a7c88779a1813da27503cadc2a/contourpy-1.3.3.tar.gz", hash = "sha256:083e12155b210502d0bca491432bb04d56dc3432f95a979b429f2848c3dbe880", size = 13466174 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/91/2e/c4390a31919d8a78b90e8ecf87cd4b4c4f05a5b48d05ec17db8e5404c6f4/contourpy-1.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:709a48ef9a690e1343202916450bc48b9e51c049b089c7f79a267b46cffcdaa1", size = 288773 }, - { url = "https://files.pythonhosted.org/packages/0d/44/c4b0b6095fef4dc9c420e041799591e3b63e9619e3044f7f4f6c21c0ab24/contourpy-1.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:23416f38bfd74d5d28ab8429cc4d63fa67d5068bd711a85edb1c3fb0c3e2f381", size = 270149 }, - { url = "https://files.pythonhosted.org/packages/30/2e/dd4ced42fefac8470661d7cb7e264808425e6c5d56d175291e93890cce09/contourpy-1.3.3-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:929ddf8c4c7f348e4c0a5a3a714b5c8542ffaa8c22954862a46ca1813b667ee7", size = 329222 }, - { url = "https://files.pythonhosted.org/packages/f2/74/cc6ec2548e3d276c71389ea4802a774b7aa3558223b7bade3f25787fafc2/contourpy-1.3.3-cp311-cp311-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:9e999574eddae35f1312c2b4b717b7885d4edd6cb46700e04f7f02db454e67c1", size = 377234 }, - { url = "https://files.pythonhosted.org/packages/03/b3/64ef723029f917410f75c09da54254c5f9ea90ef89b143ccadb09df14c15/contourpy-1.3.3-cp311-cp311-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0bf67e0e3f482cb69779dd3061b534eb35ac9b17f163d851e2a547d56dba0a3a", size = 380555 }, - { url = "https://files.pythonhosted.org/packages/5f/4b/6157f24ca425b89fe2eb7e7be642375711ab671135be21e6faa100f7448c/contourpy-1.3.3-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:51e79c1f7470158e838808d4a996fa9bac72c498e93d8ebe5119bc1e6becb0db", size = 355238 }, - { url = "https://files.pythonhosted.org/packages/98/56/f914f0dd678480708a04cfd2206e7c382533249bc5001eb9f58aa693e200/contourpy-1.3.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:598c3aaece21c503615fd59c92a3598b428b2f01bfb4b8ca9c4edeecc2438620", size = 1326218 }, - { url = "https://files.pythonhosted.org/packages/fb/d7/4a972334a0c971acd5172389671113ae82aa7527073980c38d5868ff1161/contourpy-1.3.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:322ab1c99b008dad206d406bb61d014cf0174df491ae9d9d0fac6a6fda4f977f", size = 1392867 }, - { url = "https://files.pythonhosted.org/packages/75/3e/f2cc6cd56dc8cff46b1a56232eabc6feea52720083ea71ab15523daab796/contourpy-1.3.3-cp311-cp311-win32.whl", hash = "sha256:fd907ae12cd483cd83e414b12941c632a969171bf90fc937d0c9f268a31cafff", size = 183677 }, - { url = "https://files.pythonhosted.org/packages/98/4b/9bd370b004b5c9d8045c6c33cf65bae018b27aca550a3f657cdc99acdbd8/contourpy-1.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:3519428f6be58431c56581f1694ba8e50626f2dd550af225f82fb5f5814d2a42", size = 225234 }, - { url = "https://files.pythonhosted.org/packages/d9/b6/71771e02c2e004450c12b1120a5f488cad2e4d5b590b1af8bad060360fe4/contourpy-1.3.3-cp311-cp311-win_arm64.whl", hash = "sha256:15ff10bfada4bf92ec8b31c62bf7c1834c244019b4a33095a68000d7075df470", size = 193123 }, - { url = "https://files.pythonhosted.org/packages/be/45/adfee365d9ea3d853550b2e735f9d66366701c65db7855cd07621732ccfc/contourpy-1.3.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b08a32ea2f8e42cf1d4be3169a98dd4be32bafe4f22b6c4cb4ba810fa9e5d2cb", size = 293419 }, - { url = "https://files.pythonhosted.org/packages/53/3e/405b59cfa13021a56bba395a6b3aca8cec012b45bf177b0eaf7a202cde2c/contourpy-1.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:556dba8fb6f5d8742f2923fe9457dbdd51e1049c4a43fd3986a0b14a1d815fc6", size = 273979 }, - { url = "https://files.pythonhosted.org/packages/d4/1c/a12359b9b2ca3a845e8f7f9ac08bdf776114eb931392fcad91743e2ea17b/contourpy-1.3.3-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:92d9abc807cf7d0e047b95ca5d957cf4792fcd04e920ca70d48add15c1a90ea7", size = 332653 }, - { url = "https://files.pythonhosted.org/packages/63/12/897aeebfb475b7748ea67b61e045accdfcf0d971f8a588b67108ed7f5512/contourpy-1.3.3-cp312-cp312-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b2e8faa0ed68cb29af51edd8e24798bb661eac3bd9f65420c1887b6ca89987c8", size = 379536 }, - { url = "https://files.pythonhosted.org/packages/43/8a/a8c584b82deb248930ce069e71576fc09bd7174bbd35183b7943fb1064fd/contourpy-1.3.3-cp312-cp312-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:626d60935cf668e70a5ce6ff184fd713e9683fb458898e4249b63be9e28286ea", size = 384397 }, - { url = "https://files.pythonhosted.org/packages/cc/8f/ec6289987824b29529d0dfda0d74a07cec60e54b9c92f3c9da4c0ac732de/contourpy-1.3.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4d00e655fcef08aba35ec9610536bfe90267d7ab5ba944f7032549c55a146da1", size = 362601 }, - { url = "https://files.pythonhosted.org/packages/05/0a/a3fe3be3ee2dceb3e615ebb4df97ae6f3828aa915d3e10549ce016302bd1/contourpy-1.3.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:451e71b5a7d597379ef572de31eeb909a87246974d960049a9848c3bc6c41bf7", size = 1331288 }, - { url = "https://files.pythonhosted.org/packages/33/1d/acad9bd4e97f13f3e2b18a3977fe1b4a37ecf3d38d815333980c6c72e963/contourpy-1.3.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:459c1f020cd59fcfe6650180678a9993932d80d44ccde1fa1868977438f0b411", size = 1403386 }, - { url = "https://files.pythonhosted.org/packages/cf/8f/5847f44a7fddf859704217a99a23a4f6417b10e5ab1256a179264561540e/contourpy-1.3.3-cp312-cp312-win32.whl", hash = "sha256:023b44101dfe49d7d53932be418477dba359649246075c996866106da069af69", size = 185018 }, - { url = "https://files.pythonhosted.org/packages/19/e8/6026ed58a64563186a9ee3f29f41261fd1828f527dd93d33b60feca63352/contourpy-1.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:8153b8bfc11e1e4d75bcb0bff1db232f9e10b274e0929de9d608027e0d34ff8b", size = 226567 }, - { url = "https://files.pythonhosted.org/packages/d1/e2/f05240d2c39a1ed228d8328a78b6f44cd695f7ef47beb3e684cf93604f86/contourpy-1.3.3-cp312-cp312-win_arm64.whl", hash = "sha256:07ce5ed73ecdc4a03ffe3e1b3e3c1166db35ae7584be76f65dbbe28a7791b0cc", size = 193655 }, - { url = "https://files.pythonhosted.org/packages/68/35/0167aad910bbdb9599272bd96d01a9ec6852f36b9455cf2ca67bd4cc2d23/contourpy-1.3.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:177fb367556747a686509d6fef71d221a4b198a3905fe824430e5ea0fda54eb5", size = 293257 }, - { url = "https://files.pythonhosted.org/packages/96/e4/7adcd9c8362745b2210728f209bfbcf7d91ba868a2c5f40d8b58f54c509b/contourpy-1.3.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d002b6f00d73d69333dac9d0b8d5e84d9724ff9ef044fd63c5986e62b7c9e1b1", size = 274034 }, - { url = "https://files.pythonhosted.org/packages/73/23/90e31ceeed1de63058a02cb04b12f2de4b40e3bef5e082a7c18d9c8ae281/contourpy-1.3.3-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:348ac1f5d4f1d66d3322420f01d42e43122f43616e0f194fc1c9f5d830c5b286", size = 334672 }, - { url = "https://files.pythonhosted.org/packages/ed/93/b43d8acbe67392e659e1d984700e79eb67e2acb2bd7f62012b583a7f1b55/contourpy-1.3.3-cp313-cp313-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:655456777ff65c2c548b7c454af9c6f33f16c8884f11083244b5819cc214f1b5", size = 381234 }, - { url = "https://files.pythonhosted.org/packages/46/3b/bec82a3ea06f66711520f75a40c8fc0b113b2a75edb36aa633eb11c4f50f/contourpy-1.3.3-cp313-cp313-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:644a6853d15b2512d67881586bd03f462c7ab755db95f16f14d7e238f2852c67", size = 385169 }, - { url = "https://files.pythonhosted.org/packages/4b/32/e0f13a1c5b0f8572d0ec6ae2f6c677b7991fafd95da523159c19eff0696a/contourpy-1.3.3-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4debd64f124ca62069f313a9cb86656ff087786016d76927ae2cf37846b006c9", size = 362859 }, - { url = "https://files.pythonhosted.org/packages/33/71/e2a7945b7de4e58af42d708a219f3b2f4cff7386e6b6ab0a0fa0033c49a9/contourpy-1.3.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a15459b0f4615b00bbd1e91f1b9e19b7e63aea7483d03d804186f278c0af2659", size = 1332062 }, - { url = "https://files.pythonhosted.org/packages/12/fc/4e87ac754220ccc0e807284f88e943d6d43b43843614f0a8afa469801db0/contourpy-1.3.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ca0fdcd73925568ca027e0b17ab07aad764be4706d0a925b89227e447d9737b7", size = 1403932 }, - { url = "https://files.pythonhosted.org/packages/a6/2e/adc197a37443f934594112222ac1aa7dc9a98faf9c3842884df9a9d8751d/contourpy-1.3.3-cp313-cp313-win32.whl", hash = "sha256:b20c7c9a3bf701366556e1b1984ed2d0cedf999903c51311417cf5f591d8c78d", size = 185024 }, - { url = "https://files.pythonhosted.org/packages/18/0b/0098c214843213759692cc638fce7de5c289200a830e5035d1791d7a2338/contourpy-1.3.3-cp313-cp313-win_amd64.whl", hash = "sha256:1cadd8b8969f060ba45ed7c1b714fe69185812ab43bd6b86a9123fe8f99c3263", size = 226578 }, - { url = "https://files.pythonhosted.org/packages/8a/9a/2f6024a0c5995243cd63afdeb3651c984f0d2bc727fd98066d40e141ad73/contourpy-1.3.3-cp313-cp313-win_arm64.whl", hash = "sha256:fd914713266421b7536de2bfa8181aa8c699432b6763a0ea64195ebe28bff6a9", size = 193524 }, - { url = "https://files.pythonhosted.org/packages/c0/b3/f8a1a86bd3298513f500e5b1f5fd92b69896449f6cab6a146a5d52715479/contourpy-1.3.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:88df9880d507169449d434c293467418b9f6cbe82edd19284aa0409e7fdb933d", size = 306730 }, - { url = "https://files.pythonhosted.org/packages/3f/11/4780db94ae62fc0c2053909b65dc3246bd7cecfc4f8a20d957ad43aa4ad8/contourpy-1.3.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:d06bb1f751ba5d417047db62bca3c8fde202b8c11fb50742ab3ab962c81e8216", size = 287897 }, - { url = "https://files.pythonhosted.org/packages/ae/15/e59f5f3ffdd6f3d4daa3e47114c53daabcb18574a26c21f03dc9e4e42ff0/contourpy-1.3.3-cp313-cp313t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e4e6b05a45525357e382909a4c1600444e2a45b4795163d3b22669285591c1ae", size = 326751 }, - { url = "https://files.pythonhosted.org/packages/0f/81/03b45cfad088e4770b1dcf72ea78d3802d04200009fb364d18a493857210/contourpy-1.3.3-cp313-cp313t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ab3074b48c4e2cf1a960e6bbeb7f04566bf36b1861d5c9d4d8ac04b82e38ba20", size = 375486 }, - { url = "https://files.pythonhosted.org/packages/0c/ba/49923366492ffbdd4486e970d421b289a670ae8cf539c1ea9a09822b371a/contourpy-1.3.3-cp313-cp313t-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:6c3d53c796f8647d6deb1abe867daeb66dcc8a97e8455efa729516b997b8ed99", size = 388106 }, - { url = "https://files.pythonhosted.org/packages/9f/52/5b00ea89525f8f143651f9f03a0df371d3cbd2fccd21ca9b768c7a6500c2/contourpy-1.3.3-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:50ed930df7289ff2a8d7afeb9603f8289e5704755c7e5c3bbd929c90c817164b", size = 352548 }, - { url = "https://files.pythonhosted.org/packages/32/1d/a209ec1a3a3452d490f6b14dd92e72280c99ae3d1e73da74f8277d4ee08f/contourpy-1.3.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:4feffb6537d64b84877da813a5c30f1422ea5739566abf0bd18065ac040e120a", size = 1322297 }, - { url = "https://files.pythonhosted.org/packages/bc/9e/46f0e8ebdd884ca0e8877e46a3f4e633f6c9c8c4f3f6e72be3fe075994aa/contourpy-1.3.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:2b7e9480ffe2b0cd2e787e4df64270e3a0440d9db8dc823312e2c940c167df7e", size = 1391023 }, - { url = "https://files.pythonhosted.org/packages/b9/70/f308384a3ae9cd2209e0849f33c913f658d3326900d0ff5d378d6a1422d2/contourpy-1.3.3-cp313-cp313t-win32.whl", hash = "sha256:283edd842a01e3dcd435b1c5116798d661378d83d36d337b8dde1d16a5fc9ba3", size = 196157 }, - { url = "https://files.pythonhosted.org/packages/b2/dd/880f890a6663b84d9e34a6f88cded89d78f0091e0045a284427cb6b18521/contourpy-1.3.3-cp313-cp313t-win_amd64.whl", hash = "sha256:87acf5963fc2b34825e5b6b048f40e3635dd547f590b04d2ab317c2619ef7ae8", size = 240570 }, - { url = "https://files.pythonhosted.org/packages/80/99/2adc7d8ffead633234817ef8e9a87115c8a11927a94478f6bb3d3f4d4f7d/contourpy-1.3.3-cp313-cp313t-win_arm64.whl", hash = "sha256:3c30273eb2a55024ff31ba7d052dde990d7d8e5450f4bbb6e913558b3d6c2301", size = 199713 }, - { url = "https://files.pythonhosted.org/packages/72/8b/4546f3ab60f78c514ffb7d01a0bd743f90de36f0019d1be84d0a708a580a/contourpy-1.3.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:fde6c716d51c04b1c25d0b90364d0be954624a0ee9d60e23e850e8d48353d07a", size = 292189 }, - { url = "https://files.pythonhosted.org/packages/fd/e1/3542a9cb596cadd76fcef413f19c79216e002623158befe6daa03dbfa88c/contourpy-1.3.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:cbedb772ed74ff5be440fa8eee9bd49f64f6e3fc09436d9c7d8f1c287b121d77", size = 273251 }, - { url = "https://files.pythonhosted.org/packages/b1/71/f93e1e9471d189f79d0ce2497007731c1e6bf9ef6d1d61b911430c3db4e5/contourpy-1.3.3-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:22e9b1bd7a9b1d652cd77388465dc358dafcd2e217d35552424aa4f996f524f5", size = 335810 }, - { url = "https://files.pythonhosted.org/packages/91/f9/e35f4c1c93f9275d4e38681a80506b5510e9327350c51f8d4a5a724d178c/contourpy-1.3.3-cp314-cp314-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a22738912262aa3e254e4f3cb079a95a67132fc5a063890e224393596902f5a4", size = 382871 }, - { url = "https://files.pythonhosted.org/packages/b5/71/47b512f936f66a0a900d81c396a7e60d73419868fba959c61efed7a8ab46/contourpy-1.3.3-cp314-cp314-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:afe5a512f31ee6bd7d0dda52ec9864c984ca3d66664444f2d72e0dc4eb832e36", size = 386264 }, - { url = "https://files.pythonhosted.org/packages/04/5f/9ff93450ba96b09c7c2b3f81c94de31c89f92292f1380261bd7195bea4ea/contourpy-1.3.3-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f64836de09927cba6f79dcd00fdd7d5329f3fccc633468507079c829ca4db4e3", size = 363819 }, - { url = "https://files.pythonhosted.org/packages/3e/a6/0b185d4cc480ee494945cde102cb0149ae830b5fa17bf855b95f2e70ad13/contourpy-1.3.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:1fd43c3be4c8e5fd6e4f2baeae35ae18176cf2e5cced681cca908addf1cdd53b", size = 1333650 }, - { url = "https://files.pythonhosted.org/packages/43/d7/afdc95580ca56f30fbcd3060250f66cedbde69b4547028863abd8aa3b47e/contourpy-1.3.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:6afc576f7b33cf00996e5c1102dc2a8f7cc89e39c0b55df93a0b78c1bd992b36", size = 1404833 }, - { url = "https://files.pythonhosted.org/packages/e2/e2/366af18a6d386f41132a48f033cbd2102e9b0cf6345d35ff0826cd984566/contourpy-1.3.3-cp314-cp314-win32.whl", hash = "sha256:66c8a43a4f7b8df8b71ee1840e4211a3c8d93b214b213f590e18a1beca458f7d", size = 189692 }, - { url = "https://files.pythonhosted.org/packages/7d/c2/57f54b03d0f22d4044b8afb9ca0e184f8b1afd57b4f735c2fa70883dc601/contourpy-1.3.3-cp314-cp314-win_amd64.whl", hash = "sha256:cf9022ef053f2694e31d630feaacb21ea24224be1c3ad0520b13d844274614fd", size = 232424 }, - { url = "https://files.pythonhosted.org/packages/18/79/a9416650df9b525737ab521aa181ccc42d56016d2123ddcb7b58e926a42c/contourpy-1.3.3-cp314-cp314-win_arm64.whl", hash = "sha256:95b181891b4c71de4bb404c6621e7e2390745f887f2a026b2d99e92c17892339", size = 198300 }, - { url = "https://files.pythonhosted.org/packages/1f/42/38c159a7d0f2b7b9c04c64ab317042bb6952b713ba875c1681529a2932fe/contourpy-1.3.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:33c82d0138c0a062380332c861387650c82e4cf1747aaa6938b9b6516762e772", size = 306769 }, - { url = "https://files.pythonhosted.org/packages/c3/6c/26a8205f24bca10974e77460de68d3d7c63e282e23782f1239f226fcae6f/contourpy-1.3.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:ea37e7b45949df430fe649e5de8351c423430046a2af20b1c1961cae3afcda77", size = 287892 }, - { url = "https://files.pythonhosted.org/packages/66/06/8a475c8ab718ebfd7925661747dbb3c3ee9c82ac834ccb3570be49d129f4/contourpy-1.3.3-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d304906ecc71672e9c89e87c4675dc5c2645e1f4269a5063b99b0bb29f232d13", size = 326748 }, - { url = "https://files.pythonhosted.org/packages/b4/a3/c5ca9f010a44c223f098fccd8b158bb1cb287378a31ac141f04730dc49be/contourpy-1.3.3-cp314-cp314t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ca658cd1a680a5c9ea96dc61cdbae1e85c8f25849843aa799dfd3cb370ad4fbe", size = 375554 }, - { url = "https://files.pythonhosted.org/packages/80/5b/68bd33ae63fac658a4145088c1e894405e07584a316738710b636c6d0333/contourpy-1.3.3-cp314-cp314t-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ab2fd90904c503739a75b7c8c5c01160130ba67944a7b77bbf36ef8054576e7f", size = 388118 }, - { url = "https://files.pythonhosted.org/packages/40/52/4c285a6435940ae25d7410a6c36bda5145839bc3f0beb20c707cda18b9d2/contourpy-1.3.3-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b7301b89040075c30e5768810bc96a8e8d78085b47d8be6e4c3f5a0b4ed478a0", size = 352555 }, - { url = "https://files.pythonhosted.org/packages/24/ee/3e81e1dd174f5c7fefe50e85d0892de05ca4e26ef1c9a59c2a57e43b865a/contourpy-1.3.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:2a2a8b627d5cc6b7c41a4beff6c5ad5eb848c88255fda4a8745f7e901b32d8e4", size = 1322295 }, - { url = "https://files.pythonhosted.org/packages/3c/b2/6d913d4d04e14379de429057cd169e5e00f6c2af3bb13e1710bcbdb5da12/contourpy-1.3.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:fd6ec6be509c787f1caf6b247f0b1ca598bef13f4ddeaa126b7658215529ba0f", size = 1391027 }, - { url = "https://files.pythonhosted.org/packages/93/8a/68a4ec5c55a2971213d29a9374913f7e9f18581945a7a31d1a39b5d2dfe5/contourpy-1.3.3-cp314-cp314t-win32.whl", hash = "sha256:e74a9a0f5e3fff48fb5a7f2fd2b9b70a3fe014a67522f79b7cca4c0c7e43c9ae", size = 202428 }, - { url = "https://files.pythonhosted.org/packages/fa/96/fd9f641ffedc4fa3ace923af73b9d07e869496c9cc7a459103e6e978992f/contourpy-1.3.3-cp314-cp314t-win_amd64.whl", hash = "sha256:13b68d6a62db8eafaebb8039218921399baf6e47bf85006fd8529f2a08ef33fc", size = 250331 }, - { url = "https://files.pythonhosted.org/packages/ae/8c/469afb6465b853afff216f9528ffda78a915ff880ed58813ba4faf4ba0b6/contourpy-1.3.3-cp314-cp314t-win_arm64.whl", hash = "sha256:b7448cb5a725bb1e35ce88771b86fba35ef418952474492cf7c764059933ff8b", size = 203831 }, - { url = "https://files.pythonhosted.org/packages/a5/29/8dcfe16f0107943fa92388c23f6e05cff0ba58058c4c95b00280d4c75a14/contourpy-1.3.3-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:cd5dfcaeb10f7b7f9dc8941717c6c2ade08f587be2226222c12b25f0483ed497", size = 278809 }, - { url = "https://files.pythonhosted.org/packages/85/a9/8b37ef4f7dafeb335daee3c8254645ef5725be4d9c6aa70b50ec46ef2f7e/contourpy-1.3.3-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:0c1fc238306b35f246d61a1d416a627348b5cf0648648a031e14bb8705fcdfe8", size = 261593 }, - { url = "https://files.pythonhosted.org/packages/0a/59/ebfb8c677c75605cc27f7122c90313fd2f375ff3c8d19a1694bda74aaa63/contourpy-1.3.3-pp311-pypy311_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:70f9aad7de812d6541d29d2bbf8feb22ff7e1c299523db288004e3157ff4674e", size = 302202 }, - { url = "https://files.pythonhosted.org/packages/3c/37/21972a15834d90bfbfb009b9d004779bd5a07a0ec0234e5ba8f64d5736f4/contourpy-1.3.3-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5ed3657edf08512fc3fe81b510e35c2012fbd3081d2e26160f27ca28affec989", size = 329207 }, - { url = "https://files.pythonhosted.org/packages/0c/58/bd257695f39d05594ca4ad60df5bcb7e32247f9951fd09a9b8edb82d1daa/contourpy-1.3.3-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:3d1a3799d62d45c18bafd41c5fa05120b96a28079f2393af559b843d1a966a77", size = 225315 }, -] - [[package]] name = "cryptography" version = "46.0.5" @@ -721,15 +632,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/bc/58/6b3d24e6b9bc474a2dcdee65dfd1f008867015408a271562e4b690561a4d/cryptography-46.0.5-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:8456928655f856c6e1533ff59d5be76578a7157224dbd9ce6872f25055ab9ab7", size = 3407605 }, ] -[[package]] -name = "cycler" -version = "0.12.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a9/95/a3dbbb5028f35eafb79008e7522a75244477d2838f38cbb722248dabc2a8/cycler-0.12.1.tar.gz", hash = "sha256:88bb128f02ba341da8ef447245a9e138fae777f6a23943da4540077d3601eb1c", size = 7615 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30", size = 8321 }, -] - [[package]] name = "datasets" version = "3.2.0" @@ -807,58 +709,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/55/e2/2537ebcff11c1ee1ff17d8d0b6f4db75873e3b0fb32c2d4a2ee31ecb310a/docstring_parser-0.17.0-py3-none-any.whl", hash = "sha256:cf2569abd23dce8099b300f9b4fa8191e9582dda731fd533daf54c4551658708", size = 36896 }, ] -[[package]] -name = "fastuuid" -version = "0.14.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/c3/7d/d9daedf0f2ebcacd20d599928f8913e9d2aea1d56d2d355a93bfa2b611d7/fastuuid-0.14.0.tar.gz", hash = "sha256:178947fc2f995b38497a74172adee64fdeb8b7ec18f2a5934d037641ba265d26", size = 18232 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/98/f3/12481bda4e5b6d3e698fbf525df4443cc7dce746f246b86b6fcb2fba1844/fastuuid-0.14.0-cp311-cp311-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:73946cb950c8caf65127d4e9a325e2b6be0442a224fd51ba3b6ac44e1912ce34", size = 516386 }, - { url = "https://files.pythonhosted.org/packages/59/19/2fc58a1446e4d72b655648eb0879b04e88ed6fa70d474efcf550f640f6ec/fastuuid-0.14.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:12ac85024637586a5b69645e7ed986f7535106ed3013640a393a03e461740cb7", size = 264569 }, - { url = "https://files.pythonhosted.org/packages/78/29/3c74756e5b02c40cfcc8b1d8b5bac4edbd532b55917a6bcc9113550e99d1/fastuuid-0.14.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:05a8dde1f395e0c9b4be515b7a521403d1e8349443e7641761af07c7ad1624b1", size = 254366 }, - { url = "https://files.pythonhosted.org/packages/52/96/d761da3fccfa84f0f353ce6e3eb8b7f76b3aa21fd25e1b00a19f9c80a063/fastuuid-0.14.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:09378a05020e3e4883dfdab438926f31fea15fd17604908f3d39cbeb22a0b4dc", size = 278978 }, - { url = "https://files.pythonhosted.org/packages/fc/c2/f84c90167cc7765cb82b3ff7808057608b21c14a38531845d933a4637307/fastuuid-0.14.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bbb0c4b15d66b435d2538f3827f05e44e2baafcc003dd7d8472dc67807ab8fd8", size = 279692 }, - { url = "https://files.pythonhosted.org/packages/af/7b/4bacd03897b88c12348e7bd77943bac32ccf80ff98100598fcff74f75f2e/fastuuid-0.14.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:cd5a7f648d4365b41dbf0e38fe8da4884e57bed4e77c83598e076ac0c93995e7", size = 303384 }, - { url = "https://files.pythonhosted.org/packages/c0/a2/584f2c29641df8bd810d00c1f21d408c12e9ad0c0dafdb8b7b29e5ddf787/fastuuid-0.14.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c0a94245afae4d7af8c43b3159d5e3934c53f47140be0be624b96acd672ceb73", size = 460921 }, - { url = "https://files.pythonhosted.org/packages/24/68/c6b77443bb7764c760e211002c8638c0c7cce11cb584927e723215ba1398/fastuuid-0.14.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:2b29e23c97e77c3a9514d70ce343571e469098ac7f5a269320a0f0b3e193ab36", size = 480575 }, - { url = "https://files.pythonhosted.org/packages/5a/87/93f553111b33f9bb83145be12868c3c475bf8ea87c107063d01377cc0e8e/fastuuid-0.14.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:1e690d48f923c253f28151b3a6b4e335f2b06bf669c68a02665bc150b7839e94", size = 452317 }, - { url = "https://files.pythonhosted.org/packages/9e/8c/a04d486ca55b5abb7eaa65b39df8d891b7b1635b22db2163734dc273579a/fastuuid-0.14.0-cp311-cp311-win32.whl", hash = "sha256:a6f46790d59ab38c6aa0e35c681c0484b50dc0acf9e2679c005d61e019313c24", size = 154804 }, - { url = "https://files.pythonhosted.org/packages/9c/b2/2d40bf00820de94b9280366a122cbaa60090c8cf59e89ac3938cf5d75895/fastuuid-0.14.0-cp311-cp311-win_amd64.whl", hash = "sha256:e150eab56c95dc9e3fefc234a0eedb342fac433dacc273cd4d150a5b0871e1fa", size = 156099 }, - { url = "https://files.pythonhosted.org/packages/02/a2/e78fcc5df65467f0d207661b7ef86c5b7ac62eea337c0c0fcedbeee6fb13/fastuuid-0.14.0-cp312-cp312-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:77e94728324b63660ebf8adb27055e92d2e4611645bf12ed9d88d30486471d0a", size = 510164 }, - { url = "https://files.pythonhosted.org/packages/2b/b3/c846f933f22f581f558ee63f81f29fa924acd971ce903dab1a9b6701816e/fastuuid-0.14.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:caa1f14d2102cb8d353096bc6ef6c13b2c81f347e6ab9d6fbd48b9dea41c153d", size = 261837 }, - { url = "https://files.pythonhosted.org/packages/54/ea/682551030f8c4fa9a769d9825570ad28c0c71e30cf34020b85c1f7ee7382/fastuuid-0.14.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d23ef06f9e67163be38cece704170486715b177f6baae338110983f99a72c070", size = 251370 }, - { url = "https://files.pythonhosted.org/packages/14/dd/5927f0a523d8e6a76b70968e6004966ee7df30322f5fc9b6cdfb0276646a/fastuuid-0.14.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0c9ec605ace243b6dbe3bd27ebdd5d33b00d8d1d3f580b39fdd15cd96fd71796", size = 277766 }, - { url = "https://files.pythonhosted.org/packages/16/6e/c0fb547eef61293153348f12e0f75a06abb322664b34a1573a7760501336/fastuuid-0.14.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:808527f2407f58a76c916d6aa15d58692a4a019fdf8d4c32ac7ff303b7d7af09", size = 278105 }, - { url = "https://files.pythonhosted.org/packages/2d/b1/b9c75e03b768f61cf2e84ee193dc18601aeaf89a4684b20f2f0e9f52b62c/fastuuid-0.14.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2fb3c0d7fef6674bbeacdd6dbd386924a7b60b26de849266d1ff6602937675c8", size = 301564 }, - { url = "https://files.pythonhosted.org/packages/fc/fa/f7395fdac07c7a54f18f801744573707321ca0cee082e638e36452355a9d/fastuuid-0.14.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ab3f5d36e4393e628a4df337c2c039069344db5f4b9d2a3c9cea48284f1dd741", size = 459659 }, - { url = "https://files.pythonhosted.org/packages/66/49/c9fd06a4a0b1f0f048aacb6599e7d96e5d6bc6fa680ed0d46bf111929d1b/fastuuid-0.14.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:b9a0ca4f03b7e0b01425281ffd44e99d360e15c895f1907ca105854ed85e2057", size = 478430 }, - { url = "https://files.pythonhosted.org/packages/be/9c/909e8c95b494e8e140e8be6165d5fc3f61fdc46198c1554df7b3e1764471/fastuuid-0.14.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:3acdf655684cc09e60fb7e4cf524e8f42ea760031945aa8086c7eae2eeeabeb8", size = 450894 }, - { url = "https://files.pythonhosted.org/packages/90/eb/d29d17521976e673c55ef7f210d4cdd72091a9ec6755d0fd4710d9b3c871/fastuuid-0.14.0-cp312-cp312-win32.whl", hash = "sha256:9579618be6280700ae36ac42c3efd157049fe4dd40ca49b021280481c78c3176", size = 154374 }, - { url = "https://files.pythonhosted.org/packages/cc/fc/f5c799a6ea6d877faec0472d0b27c079b47c86b1cdc577720a5386483b36/fastuuid-0.14.0-cp312-cp312-win_amd64.whl", hash = "sha256:d9e4332dc4ba054434a9594cbfaf7823b57993d7d8e7267831c3e059857cf397", size = 156550 }, - { url = "https://files.pythonhosted.org/packages/a5/83/ae12dd39b9a39b55d7f90abb8971f1a5f3c321fd72d5aa83f90dc67fe9ed/fastuuid-0.14.0-cp313-cp313-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:77a09cb7427e7af74c594e409f7731a0cf887221de2f698e1ca0ebf0f3139021", size = 510720 }, - { url = "https://files.pythonhosted.org/packages/53/b0/a4b03ff5d00f563cc7546b933c28cb3f2a07344b2aec5834e874f7d44143/fastuuid-0.14.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:9bd57289daf7b153bfa3e8013446aa144ce5e8c825e9e366d455155ede5ea2dc", size = 262024 }, - { url = "https://files.pythonhosted.org/packages/9c/6d/64aee0a0f6a58eeabadd582e55d0d7d70258ffdd01d093b30c53d668303b/fastuuid-0.14.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:ac60fc860cdf3c3f327374db87ab8e064c86566ca8c49d2e30df15eda1b0c2d5", size = 251679 }, - { url = "https://files.pythonhosted.org/packages/60/f5/a7e9cda8369e4f7919d36552db9b2ae21db7915083bc6336f1b0082c8b2e/fastuuid-0.14.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ab32f74bd56565b186f036e33129da77db8be09178cd2f5206a5d4035fb2a23f", size = 277862 }, - { url = "https://files.pythonhosted.org/packages/f0/d3/8ce11827c783affffd5bd4d6378b28eb6cc6d2ddf41474006b8d62e7448e/fastuuid-0.14.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:33e678459cf4addaedd9936bbb038e35b3f6b2061330fd8f2f6a1d80414c0f87", size = 278278 }, - { url = "https://files.pythonhosted.org/packages/a2/51/680fb6352d0bbade04036da46264a8001f74b7484e2fd1f4da9e3db1c666/fastuuid-0.14.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1e3cc56742f76cd25ecb98e4b82a25f978ccffba02e4bdce8aba857b6d85d87b", size = 301788 }, - { url = "https://files.pythonhosted.org/packages/fa/7c/2014b5785bd8ebdab04ec857635ebd84d5ee4950186a577db9eff0fb8ff6/fastuuid-0.14.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:cb9a030f609194b679e1660f7e32733b7a0f332d519c5d5a6a0a580991290022", size = 459819 }, - { url = "https://files.pythonhosted.org/packages/01/d2/524d4ceeba9160e7a9bc2ea3e8f4ccf1ad78f3bde34090ca0c51f09a5e91/fastuuid-0.14.0-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:09098762aad4f8da3a888eb9ae01c84430c907a297b97166b8abc07b640f2995", size = 478546 }, - { url = "https://files.pythonhosted.org/packages/bc/17/354d04951ce114bf4afc78e27a18cfbd6ee319ab1829c2d5fb5e94063ac6/fastuuid-0.14.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:1383fff584fa249b16329a059c68ad45d030d5a4b70fb7c73a08d98fd53bcdab", size = 450921 }, - { url = "https://files.pythonhosted.org/packages/fb/be/d7be8670151d16d88f15bb121c5b66cdb5ea6a0c2a362d0dcf30276ade53/fastuuid-0.14.0-cp313-cp313-win32.whl", hash = "sha256:a0809f8cc5731c066c909047f9a314d5f536c871a7a22e815cc4967c110ac9ad", size = 154559 }, - { url = "https://files.pythonhosted.org/packages/22/1d/5573ef3624ceb7abf4a46073d3554e37191c868abc3aecd5289a72f9810a/fastuuid-0.14.0-cp313-cp313-win_amd64.whl", hash = "sha256:0df14e92e7ad3276327631c9e7cec09e32572ce82089c55cb1bb8df71cf394ed", size = 156539 }, - { url = "https://files.pythonhosted.org/packages/16/c9/8c7660d1fe3862e3f8acabd9be7fc9ad71eb270f1c65cce9a2b7a31329ab/fastuuid-0.14.0-cp314-cp314-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:b852a870a61cfc26c884af205d502881a2e59cc07076b60ab4a951cc0c94d1ad", size = 510600 }, - { url = "https://files.pythonhosted.org/packages/4c/f4/a989c82f9a90d0ad995aa957b3e572ebef163c5299823b4027986f133dfb/fastuuid-0.14.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:c7502d6f54cd08024c3ea9b3514e2d6f190feb2f46e6dbcd3747882264bb5f7b", size = 262069 }, - { url = "https://files.pythonhosted.org/packages/da/6c/a1a24f73574ac995482b1326cf7ab41301af0fabaa3e37eeb6b3df00e6e2/fastuuid-0.14.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1ca61b592120cf314cfd66e662a5b54a578c5a15b26305e1b8b618a6f22df714", size = 251543 }, - { url = "https://files.pythonhosted.org/packages/1a/20/2a9b59185ba7a6c7b37808431477c2d739fcbdabbf63e00243e37bd6bf49/fastuuid-0.14.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa75b6657ec129d0abded3bec745e6f7ab642e6dba3a5272a68247e85f5f316f", size = 277798 }, - { url = "https://files.pythonhosted.org/packages/ef/33/4105ca574f6ded0af6a797d39add041bcfb468a1255fbbe82fcb6f592da2/fastuuid-0.14.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8a0dfea3972200f72d4c7df02c8ac70bad1bb4c58d7e0ec1e6f341679073a7f", size = 278283 }, - { url = "https://files.pythonhosted.org/packages/fe/8c/fca59f8e21c4deb013f574eae05723737ddb1d2937ce87cb2a5d20992dc3/fastuuid-0.14.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1bf539a7a95f35b419f9ad105d5a8a35036df35fdafae48fb2fd2e5f318f0d75", size = 301627 }, - { url = "https://files.pythonhosted.org/packages/cb/e2/f78c271b909c034d429218f2798ca4e89eeda7983f4257d7865976ddbb6c/fastuuid-0.14.0-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:9a133bf9cc78fdbd1179cb58a59ad0100aa32d8675508150f3658814aeefeaa4", size = 459778 }, - { url = "https://files.pythonhosted.org/packages/1e/f0/5ff209d865897667a2ff3e7a572267a9ced8f7313919f6d6043aed8b1caa/fastuuid-0.14.0-cp314-cp314-musllinux_1_1_i686.whl", hash = "sha256:f54d5b36c56a2d5e1a31e73b950b28a0d83eb0c37b91d10408875a5a29494bad", size = 478605 }, - { url = "https://files.pythonhosted.org/packages/e0/c8/2ce1c78f983a2c4987ea865d9516dbdfb141a120fd3abb977ae6f02ba7ca/fastuuid-0.14.0-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:ec27778c6ca3393ef662e2762dba8af13f4ec1aaa32d08d77f71f2a70ae9feb8", size = 450837 }, - { url = "https://files.pythonhosted.org/packages/df/60/dad662ec9a33b4a5fe44f60699258da64172c39bd041da2994422cdc40fe/fastuuid-0.14.0-cp314-cp314-win32.whl", hash = "sha256:e23fc6a83f112de4be0cc1990e5b127c27663ae43f866353166f87df58e73d06", size = 154532 }, - { url = "https://files.pythonhosted.org/packages/1f/f6/da4db31001e854025ffd26bc9ba0740a9cbba2c3259695f7c5834908b336/fastuuid-0.14.0-cp314-cp314-win_amd64.whl", hash = "sha256:df61342889d0f5e7a32f7284e55ef95103f2110fee433c2ae7c2c0956d76ac8a", size = 156457 }, -] - [[package]] name = "filelock" version = "3.24.3" @@ -868,55 +718,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9c/0f/5d0c71a1aefeb08efff26272149e07ab922b64f46c63363756224bd6872e/filelock-3.24.3-py3-none-any.whl", hash = "sha256:426e9a4660391f7f8a810d71b0555bce9008b0a1cc342ab1f6947d37639e002d", size = 24331 }, ] -[[package]] -name = "fonttools" -version = "4.62.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/9a/08/7012b00a9a5874311b639c3920270c36ee0c445b69d9989a85e5c92ebcb0/fonttools-4.62.1.tar.gz", hash = "sha256:e54c75fd6041f1122476776880f7c3c3295ffa31962dc6ebe2543c00dca58b5d", size = 3580737 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/88/39/23ff32561ec8d45a4d48578b4d241369d9270dc50926c017570e60893701/fonttools-4.62.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:40975849bac44fb0b9253d77420c6d8b523ac4dcdcefeff6e4d706838a5b80f7", size = 2871039 }, - { url = "https://files.pythonhosted.org/packages/24/7f/66d3f8a9338a9b67fe6e1739f47e1cd5cee78bd3bc1206ef9b0b982289a5/fonttools-4.62.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:9dde91633f77fa576879a0c76b1d89de373cae751a98ddf0109d54e173b40f14", size = 2416346 }, - { url = "https://files.pythonhosted.org/packages/aa/53/5276ceba7bff95da7793a07c5284e1da901cf00341ce5e2f3273056c0cca/fonttools-4.62.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6acb4109f8bee00fec985c8c7afb02299e35e9c94b57287f3ea542f28bd0b0a7", size = 5100897 }, - { url = "https://files.pythonhosted.org/packages/cc/a1/40a5c4d8e28b0851d53a8eeeb46fbd73c325a2a9a165f290a5ed90e6c597/fonttools-4.62.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1c5c25671ce8805e0d080e2ffdeca7f1e86778c5cbfbeae86d7f866d8830517b", size = 5071078 }, - { url = "https://files.pythonhosted.org/packages/e3/be/d378fca4c65ea1956fee6d90ace6e861776809cbbc5af22388a090c3c092/fonttools-4.62.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a5d8825e1140f04e6c99bb7d37a9e31c172f3bc208afbe02175339e699c710e1", size = 5076908 }, - { url = "https://files.pythonhosted.org/packages/f8/d9/ae6a1d0693a4185a84605679c8a1f719a55df87b9c6e8e817bfdd9ef5936/fonttools-4.62.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:268abb1cb221e66c014acc234e872b7870d8b5d4657a83a8f4205094c32d2416", size = 5202275 }, - { url = "https://files.pythonhosted.org/packages/54/6c/af95d9c4efb15cabff22642b608342f2bd67137eea6107202d91b5b03184/fonttools-4.62.1-cp311-cp311-win32.whl", hash = "sha256:942b03094d7edbb99bdf1ae7e9090898cad7bf9030b3d21f33d7072dbcb51a53", size = 2293075 }, - { url = "https://files.pythonhosted.org/packages/d3/97/bf54c5b3f2be34e1f143e6db838dfdc54f2ffa3e68c738934c82f3b2a08d/fonttools-4.62.1-cp311-cp311-win_amd64.whl", hash = "sha256:e8514f4924375f77084e81467e63238b095abda5107620f49421c368a6017ed2", size = 2344593 }, - { url = "https://files.pythonhosted.org/packages/47/d4/dbacced3953544b9a93088cc10ef2b596d348c983d5c67a404fa41ec51ba/fonttools-4.62.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:90365821debbd7db678809c7491ca4acd1e0779b9624cdc6ddaf1f31992bf974", size = 2870219 }, - { url = "https://files.pythonhosted.org/packages/66/9e/a769c8e99b81e5a87ab7e5e7236684de4e96246aae17274e5347d11ebd78/fonttools-4.62.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:12859ff0b47dd20f110804c3e0d0970f7b832f561630cd879969011541a464a9", size = 2414891 }, - { url = "https://files.pythonhosted.org/packages/69/64/f19a9e3911968c37e1e620e14dfc5778299e1474f72f4e57c5ec771d9489/fonttools-4.62.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c125ffa00c3d9003cdaaf7f2c79e6e535628093e14b5de1dccb08859b680936", size = 5033197 }, - { url = "https://files.pythonhosted.org/packages/9b/8a/99c8b3c3888c5c474c08dbfd7c8899786de9604b727fcefb055b42c84bba/fonttools-4.62.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:149f7d84afca659d1a97e39a4778794a2f83bf344c5ee5134e09995086cc2392", size = 4988768 }, - { url = "https://files.pythonhosted.org/packages/d1/c6/0f904540d3e6ab463c1243a0d803504826a11604c72dd58c2949796a1762/fonttools-4.62.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0aa72c43a601cfa9273bb1ae0518f1acadc01ee181a6fc60cd758d7fdadffc04", size = 4971512 }, - { url = "https://files.pythonhosted.org/packages/29/0b/5cbef6588dc9bd6b5c9ad6a4d5a8ca384d0cea089da31711bbeb4f9654a6/fonttools-4.62.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:19177c8d96c7c36359266e571c5173bcee9157b59cfc8cb0153c5673dc5a3a7d", size = 5122723 }, - { url = "https://files.pythonhosted.org/packages/4a/47/b3a5342d381595ef439adec67848bed561ab7fdb1019fa522e82101b7d9c/fonttools-4.62.1-cp312-cp312-win32.whl", hash = "sha256:a24decd24d60744ee8b4679d38e88b8303d86772053afc29b19d23bb8207803c", size = 2281278 }, - { url = "https://files.pythonhosted.org/packages/28/b1/0c2ab56a16f409c6c8a68816e6af707827ad5d629634691ff60a52879792/fonttools-4.62.1-cp312-cp312-win_amd64.whl", hash = "sha256:9e7863e10b3de72376280b515d35b14f5eeed639d1aa7824f4cf06779ec65e42", size = 2331414 }, - { url = "https://files.pythonhosted.org/packages/3b/56/6f389de21c49555553d6a5aeed5ac9767631497ac836c4f076273d15bd72/fonttools-4.62.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:c22b1014017111c401469e3acc5433e6acf6ebcc6aa9efb538a533c800971c79", size = 2865155 }, - { url = "https://files.pythonhosted.org/packages/03/c5/0e3966edd5ec668d41dfe418787726752bc07e2f5fd8c8f208615e61fa89/fonttools-4.62.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:68959f5fc58ed4599b44aad161c2837477d7f35f5f79402d97439974faebfebe", size = 2412802 }, - { url = "https://files.pythonhosted.org/packages/52/94/e6ac4b44026de7786fe46e3bfa0c87e51d5d70a841054065d49cd62bb909/fonttools-4.62.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ef46db46c9447103b8f3ff91e8ba009d5fe181b1920a83757a5762551e32bb68", size = 5013926 }, - { url = "https://files.pythonhosted.org/packages/e2/98/8b1e801939839d405f1f122e7d175cebe9aeb4e114f95bfc45e3152af9a7/fonttools-4.62.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6706d1cb1d5e6251a97ad3c1b9347505c5615c112e66047abbef0f8545fa30d1", size = 4964575 }, - { url = "https://files.pythonhosted.org/packages/46/76/7d051671e938b1881670528fec69cc4044315edd71a229c7fd712eaa5119/fonttools-4.62.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:2e7abd2b1e11736f58c1de27819e1955a53267c21732e78243fa2fa2e5c1e069", size = 4953693 }, - { url = "https://files.pythonhosted.org/packages/1f/ae/b41f8628ec0be3c1b934fc12b84f4576a5c646119db4d3bdd76a217c90b5/fonttools-4.62.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:403d28ce06ebfc547fbcb0cb8b7f7cc2f7a2d3e1a67ba9a34b14632df9e080f9", size = 5094920 }, - { url = "https://files.pythonhosted.org/packages/f2/f6/53a1e9469331a23dcc400970a27a4caa3d9f6edbf5baab0260285238b884/fonttools-4.62.1-cp313-cp313-win32.whl", hash = "sha256:93c316e0f5301b2adbe6a5f658634307c096fd5aae60a5b3412e4f3e1728ab24", size = 2279928 }, - { url = "https://files.pythonhosted.org/packages/38/60/35186529de1db3c01f5ad625bde07c1f576305eab6d86bbda4c58445f721/fonttools-4.62.1-cp313-cp313-win_amd64.whl", hash = "sha256:7aa21ff53e28a9c2157acbc44e5b401149d3c9178107130e82d74ceb500e5056", size = 2330514 }, - { url = "https://files.pythonhosted.org/packages/36/f0/2888cdac391807d68d90dcb16ef858ddc1b5309bfc6966195a459dd326e2/fonttools-4.62.1-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:fa1d16210b6b10a826d71bed68dd9ec24a9e218d5a5e2797f37c573e7ec215ca", size = 2864442 }, - { url = "https://files.pythonhosted.org/packages/4b/b2/e521803081f8dc35990816b82da6360fa668a21b44da4b53fc9e77efcd62/fonttools-4.62.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:aa69d10ed420d8121118e628ad47d86e4caa79ba37f968597b958f6cceab7eca", size = 2410901 }, - { url = "https://files.pythonhosted.org/packages/00/a4/8c3511ff06e53110039358dbbdc1a65d72157a054638387aa2ada300a8b8/fonttools-4.62.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bd13b7999d59c5eb1c2b442eb2d0c427cb517a0b7a1f5798fc5c9e003f5ff782", size = 4999608 }, - { url = "https://files.pythonhosted.org/packages/28/63/cd0c3b26afe60995a5295f37c246a93d454023726c3261cfbb3559969bb9/fonttools-4.62.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:8d337fdd49a79b0d51c4da87bc38169d21c3abbf0c1aa9367eff5c6656fb6dae", size = 4912726 }, - { url = "https://files.pythonhosted.org/packages/70/b9/ac677cb07c24c685cf34f64e140617d58789d67a3dd524164b63648c6114/fonttools-4.62.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:d241cdc4a67b5431c6d7f115fdf63335222414995e3a1df1a41e1182acd4bcc7", size = 4951422 }, - { url = "https://files.pythonhosted.org/packages/e6/10/11c08419a14b85b7ca9a9faca321accccc8842dd9e0b1c8a72908de05945/fonttools-4.62.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:c05557a78f8fa514da0f869556eeda40887a8abc77c76ee3f74cf241778afd5a", size = 5060979 }, - { url = "https://files.pythonhosted.org/packages/4e/3c/12eea4a4cf054e7ab058ed5ceada43b46809fce2bf319017c4d63ae55bb4/fonttools-4.62.1-cp314-cp314-win32.whl", hash = "sha256:49a445d2f544ce4a69338694cad575ba97b9a75fff02720da0882d1a73f12800", size = 2283733 }, - { url = "https://files.pythonhosted.org/packages/6b/67/74b070029043186b5dd13462c958cb7c7f811be0d2e634309d9a1ffb1505/fonttools-4.62.1-cp314-cp314-win_amd64.whl", hash = "sha256:1eecc128c86c552fb963fe846ca4e011b1be053728f798185a1687502f6d398e", size = 2335663 }, - { url = "https://files.pythonhosted.org/packages/42/c5/4d2ed3ca6e33617fc5624467da353337f06e7f637707478903c785bd8e20/fonttools-4.62.1-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:1596aeaddf7f78e21e68293c011316a25267b3effdaccaf4d59bc9159d681b82", size = 2947288 }, - { url = "https://files.pythonhosted.org/packages/1f/e9/7ab11ddfda48ed0f89b13380e5595ba572619c27077be0b2c447a63ff351/fonttools-4.62.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:8f8fca95d3bb3208f59626a4b0ea6e526ee51f5a8ad5d91821c165903e8d9260", size = 2449023 }, - { url = "https://files.pythonhosted.org/packages/b2/10/a800fa090b5e8819942e54e19b55fc7c21fe14a08757c3aa3ca8db358939/fonttools-4.62.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ee91628c08e76f77b533d65feb3fbe6d9dad699f95be51cf0d022db94089cdc4", size = 5137599 }, - { url = "https://files.pythonhosted.org/packages/37/dc/8ccd45033fffd74deb6912fa1ca524643f584b94c87a16036855b498a1ed/fonttools-4.62.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5f37df1cac61d906e7b836abe356bc2f34c99d4477467755c216b72aa3dc748b", size = 4920933 }, - { url = "https://files.pythonhosted.org/packages/99/eb/e618adefb839598d25ac8136cd577925d6c513dc0d931d93b8af956210f0/fonttools-4.62.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:92bb00a947e666169c99b43753c4305fc95a890a60ef3aeb2a6963e07902cc87", size = 5016232 }, - { url = "https://files.pythonhosted.org/packages/d9/5f/9b5c9bfaa8ec82def8d8168c4f13615990d6ce5996fe52bd49bfb5e05134/fonttools-4.62.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:bdfe592802ef939a0e33106ea4a318eeb17822c7ee168c290273cbd5fabd746c", size = 5042987 }, - { url = "https://files.pythonhosted.org/packages/90/aa/dfbbe24c6a6afc5c203d90cc0343e24bcbb09e76d67c4d6eef8c2558d7ba/fonttools-4.62.1-cp314-cp314t-win32.whl", hash = "sha256:b820fcb92d4655513d8402d5b219f94481c4443d825b4372c75a2072aa4b357a", size = 2348021 }, - { url = "https://files.pythonhosted.org/packages/13/6f/ae9c4e4dd417948407b680855c2c7790efb52add6009aaecff1e3bc50e8e/fonttools-4.62.1-cp314-cp314t-win_amd64.whl", hash = "sha256:59b372b4f0e113d3746b88985f1c796e7bf830dd54b28374cd85c2b8acd7583e", size = 2414147 }, - { url = "https://files.pythonhosted.org/packages/fd/ba/56147c165442cc5ba7e82ecf301c9a68353cede498185869e6e02b4c264f/fonttools-4.62.1-py3-none-any.whl", hash = "sha256:7487782e2113861f4ddcc07c3436450659e3caa5e470b27dc2177cade2d8e7fd", size = 1152647 }, -] - [[package]] name = "frozendict" version = "2.4.7" @@ -1559,112 +1360,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/41/45/1a4ed80516f02155c51f51e8cedb3c1902296743db0bbc66608a0db2814f/jsonschema_specifications-2025.9.1-py3-none-any.whl", hash = "sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe", size = 18437 }, ] -[[package]] -name = "kiwisolver" -version = "1.5.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d0/67/9c61eccb13f0bdca9307614e782fec49ffdde0f7a2314935d489fa93cd9c/kiwisolver-1.5.0.tar.gz", hash = "sha256:d4193f3d9dc3f6f79aaed0e5637f45d98850ebf01f7ca20e69457f3e8946b66a", size = 103482 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/12/dd/a495a9c104be1c476f0386e714252caf2b7eca883915422a64c50b88c6f5/kiwisolver-1.5.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9eed0f7edbb274413b6ee781cca50541c8c0facd3d6fd289779e494340a2b85c", size = 122798 }, - { url = "https://files.pythonhosted.org/packages/11/60/37b4047a2af0cf5ef6d8b4b26e91829ae6fc6a2d1f74524bcb0e7cd28a32/kiwisolver-1.5.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3c4923e404d6bcd91b6779c009542e5647fef32e4a5d75e115e3bbac6f2335eb", size = 66216 }, - { url = "https://files.pythonhosted.org/packages/0a/aa/510dc933d87767584abfe03efa445889996c70c2990f6f87c3ebaa0a18c5/kiwisolver-1.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:0df54df7e686afa55e6f21fb86195224a6d9beb71d637e8d7920c95cf0f89aac", size = 63911 }, - { url = "https://files.pythonhosted.org/packages/80/46/bddc13df6c2a40741e0cc7865bb1c9ed4796b6760bd04ce5fae3928ef917/kiwisolver-1.5.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2517e24d7315eb51c10664cdb865195df38ab74456c677df67bb47f12d088a27", size = 1438209 }, - { url = "https://files.pythonhosted.org/packages/fd/d6/76621246f5165e5372f02f5e6f3f48ea336a8f9e96e43997d45b240ed8cd/kiwisolver-1.5.0-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ff710414307fefa903e0d9bdf300972f892c23477829f49504e59834f4195398", size = 1248888 }, - { url = "https://files.pythonhosted.org/packages/b2/c1/31559ec6fb39a5b48035ce29bb63ade628f321785f38c384dee3e2c08bc1/kiwisolver-1.5.0-cp311-cp311-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:6176c1811d9d5a04fa391c490cc44f451e240697a16977f11c6f722efb9041db", size = 1266304 }, - { url = "https://files.pythonhosted.org/packages/5e/ef/1cb8276f2d29cc6a41e0a042f27946ca347d3a4a75acf85d0a16aa6dcc82/kiwisolver-1.5.0-cp311-cp311-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:50847dca5d197fcbd389c805aa1a1cf32f25d2e7273dc47ab181a517666b68cc", size = 1319650 }, - { url = "https://files.pythonhosted.org/packages/4c/e4/5ba3cecd7ce6236ae4a80f67e5d5531287337d0e1f076ca87a5abe4cd5d0/kiwisolver-1.5.0-cp311-cp311-manylinux_2_39_riscv64.whl", hash = "sha256:01808c6d15f4c3e8559595d6d1fe6411c68e4a3822b4b9972b44473b24f4e679", size = 970949 }, - { url = "https://files.pythonhosted.org/packages/5a/69/dc61f7ae9a2f071f26004ced87f078235b5507ab6e5acd78f40365655034/kiwisolver-1.5.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:f1f9f4121ec58628c96baa3de1a55a4e3a333c5102c8e94b64e23bf7b2083309", size = 2199125 }, - { url = "https://files.pythonhosted.org/packages/e5/7b/abbe0f1b5afa85f8d084b73e90e5f801c0939eba16ac2e49af7c61a6c28d/kiwisolver-1.5.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:b7d335370ae48a780c6e6a6bbfa97342f563744c39c35562f3f367665f5c1de2", size = 2293783 }, - { url = "https://files.pythonhosted.org/packages/8a/80/5908ae149d96d81580d604c7f8aefd0e98f4fd728cf172f477e9f2a81744/kiwisolver-1.5.0-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:800ee55980c18545af444d93fdd60c56b580db5cc54867d8cbf8a1dc0829938c", size = 1960726 }, - { url = "https://files.pythonhosted.org/packages/84/08/a78cb776f8c085b7143142ce479859cfec086bd09ee638a317040b6ef420/kiwisolver-1.5.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:c438f6ca858697c9ab67eb28246c92508af972e114cac34e57a6d4ba17a3ac08", size = 2464738 }, - { url = "https://files.pythonhosted.org/packages/b1/e1/65584da5356ed6cb12c63791a10b208860ac40a83de165cb6a6751a686e3/kiwisolver-1.5.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:8c63c91f95173f9c2a67c7c526b2cea976828a0e7fced9cdcead2802dc10f8a4", size = 2270718 }, - { url = "https://files.pythonhosted.org/packages/be/6c/28f17390b62b8f2f520e2915095b3c94d88681ecf0041e75389d9667f202/kiwisolver-1.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:beb7f344487cdcb9e1efe4b7a29681b74d34c08f0043a327a74da852a6749e7b", size = 73480 }, - { url = "https://files.pythonhosted.org/packages/d8/0e/2ee5debc4f77a625778fec5501ff3e8036fe361b7ee28ae402a485bb9694/kiwisolver-1.5.0-cp311-cp311-win_arm64.whl", hash = "sha256:ad4ae4ffd1ee9cd11357b4c66b612da9888f4f4daf2f36995eda64bd45370cac", size = 64930 }, - { url = "https://files.pythonhosted.org/packages/4d/b2/818b74ebea34dabe6d0c51cb1c572e046730e64844da6ed646d5298c40ce/kiwisolver-1.5.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:4e9750bc21b886308024f8a54ccb9a2cc38ac9fa813bf4348434e3d54f337ff9", size = 123158 }, - { url = "https://files.pythonhosted.org/packages/bf/d9/405320f8077e8e1c5c4bd6adc45e1e6edf6d727b6da7f2e2533cf58bff71/kiwisolver-1.5.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:72ec46b7eba5b395e0a7b63025490d3214c11013f4aacb4f5e8d6c3041829588", size = 66388 }, - { url = "https://files.pythonhosted.org/packages/99/9f/795fedf35634f746151ca8839d05681ceb6287fbed6cc1c9bf235f7887c2/kiwisolver-1.5.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ed3a984b31da7481b103f68776f7128a89ef26ed40f4dc41a2223cda7fb24819", size = 64068 }, - { url = "https://files.pythonhosted.org/packages/c4/13/680c54afe3e65767bed7ec1a15571e1a2f1257128733851ade24abcefbcc/kiwisolver-1.5.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bb5136fb5352d3f422df33f0c879a1b0c204004324150cc3b5e3c4f310c9049f", size = 1477934 }, - { url = "https://files.pythonhosted.org/packages/c8/2f/cebfcdb60fd6a9b0f6b47a9337198bcbad6fbe15e68189b7011fd914911f/kiwisolver-1.5.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b2af221f268f5af85e776a73d62b0845fc8baf8ef0abfae79d29c77d0e776aaf", size = 1278537 }, - { url = "https://files.pythonhosted.org/packages/f2/0d/9b782923aada3fafb1d6b84e13121954515c669b18af0c26e7d21f579855/kiwisolver-1.5.0-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b0f172dc8ffaccb8522d7c5d899de00133f2f1ca7b0a49b7da98e901de87bf2d", size = 1296685 }, - { url = "https://files.pythonhosted.org/packages/27/70/83241b6634b04fe44e892688d5208332bde130f38e610c0418f9ede47ded/kiwisolver-1.5.0-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:6ab8ba9152203feec73758dad83af9a0bbe05001eb4639e547207c40cfb52083", size = 1346024 }, - { url = "https://files.pythonhosted.org/packages/e4/db/30ed226fb271ae1a6431fc0fe0edffb2efe23cadb01e798caeb9f2ceae8f/kiwisolver-1.5.0-cp312-cp312-manylinux_2_39_riscv64.whl", hash = "sha256:cdee07c4d7f6d72008d3f73b9bf027f4e11550224c7c50d8df1ae4a37c1402a6", size = 987241 }, - { url = "https://files.pythonhosted.org/packages/ec/bd/c314595208e4c9587652d50959ead9e461995389664e490f4dce7ff0f782/kiwisolver-1.5.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:7c60d3c9b06fb23bd9c6139281ccbdc384297579ae037f08ae90c69f6845c0b1", size = 2227742 }, - { url = "https://files.pythonhosted.org/packages/c1/43/0499cec932d935229b5543d073c2b87c9c22846aab48881e9d8d6e742a2d/kiwisolver-1.5.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:e315e5ec90d88e140f57696ff85b484ff68bb311e36f2c414aa4286293e6dee0", size = 2323966 }, - { url = "https://files.pythonhosted.org/packages/3d/6f/79b0d760907965acfd9d61826a3d41f8f093c538f55cd2633d3f0db269f6/kiwisolver-1.5.0-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:1465387ac63576c3e125e5337a6892b9e99e0627d52317f3ca79e6930d889d15", size = 1977417 }, - { url = "https://files.pythonhosted.org/packages/ab/31/01d0537c41cb75a551a438c3c7a80d0c60d60b81f694dac83dd436aec0d0/kiwisolver-1.5.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:530a3fd64c87cffa844d4b6b9768774763d9caa299e9b75d8eca6a4423b31314", size = 2491238 }, - { url = "https://files.pythonhosted.org/packages/e4/34/8aefdd0be9cfd00a44509251ba864f5caf2991e36772e61c408007e7f417/kiwisolver-1.5.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:1d9daea4ea6b9be74fe2f01f7fbade8d6ffab263e781274cffca0dba9be9eec9", size = 2294947 }, - { url = "https://files.pythonhosted.org/packages/ad/cf/0348374369ca588f8fe9c338fae49fa4e16eeb10ffb3d012f23a54578a9e/kiwisolver-1.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:f18c2d9782259a6dc132fdc7a63c168cbc74b35284b6d75c673958982a378384", size = 73569 }, - { url = "https://files.pythonhosted.org/packages/28/26/192b26196e2316e2bd29deef67e37cdf9870d9af8e085e521afff0fed526/kiwisolver-1.5.0-cp312-cp312-win_arm64.whl", hash = "sha256:f7c7553b13f69c1b29a5bde08ddc6d9d0c8bfb84f9ed01c30db25944aeb852a7", size = 64997 }, - { url = "https://files.pythonhosted.org/packages/9d/69/024d6711d5ba575aa65d5538042e99964104e97fa153a9f10bc369182bc2/kiwisolver-1.5.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:fd40bb9cd0891c4c3cb1ddf83f8bbfa15731a248fdc8162669405451e2724b09", size = 123166 }, - { url = "https://files.pythonhosted.org/packages/ce/48/adbb40df306f587054a348831220812b9b1d787aff714cfbc8556e38fccd/kiwisolver-1.5.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:c0e1403fd7c26d77c1f03e096dc58a5c726503fa0db0456678b8668f76f521e3", size = 66395 }, - { url = "https://files.pythonhosted.org/packages/a8/3a/d0a972b34e1c63e2409413104216cd1caa02c5a37cb668d1687d466c1c45/kiwisolver-1.5.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:dda366d548e89a90d88a86c692377d18d8bd64b39c1fb2b92cb31370e2896bbd", size = 64065 }, - { url = "https://files.pythonhosted.org/packages/2b/0a/7b98e1e119878a27ba8618ca1e18b14f992ff1eda40f47bccccf4de44121/kiwisolver-1.5.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:332b4f0145c30b5f5ad9374881133e5aa64320428a57c2c2b61e9d891a51c2f3", size = 1477903 }, - { url = "https://files.pythonhosted.org/packages/18/d8/55638d89ffd27799d5cc3d8aa28e12f4ce7a64d67b285114dbedc8ea4136/kiwisolver-1.5.0-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0c50b89ffd3e1a911c69a1dd3de7173c0cd10b130f56222e57898683841e4f96", size = 1278751 }, - { url = "https://files.pythonhosted.org/packages/b8/97/b4c8d0d18421ecceba20ad8701358453b88e32414e6f6950b5a4bad54e65/kiwisolver-1.5.0-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:4db576bb8c3ef9365f8b40fe0f671644de6736ae2c27a2c62d7d8a1b4329f099", size = 1296793 }, - { url = "https://files.pythonhosted.org/packages/c4/10/f862f94b6389d8957448ec9df59450b81bec4abb318805375c401a1e6892/kiwisolver-1.5.0-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0b85aad90cea8ac6797a53b5d5f2e967334fa4d1149f031c4537569972596cb8", size = 1346041 }, - { url = "https://files.pythonhosted.org/packages/a3/6a/f1650af35821eaf09de398ec0bc2aefc8f211f0cda50204c9f1673741ba9/kiwisolver-1.5.0-cp313-cp313-manylinux_2_39_riscv64.whl", hash = "sha256:d36ca54cb4c6c4686f7cbb7b817f66f5911c12ddb519450bbe86707155028f87", size = 987292 }, - { url = "https://files.pythonhosted.org/packages/de/19/d7fb82984b9238115fe629c915007be608ebd23dc8629703d917dbfaffd4/kiwisolver-1.5.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:38f4a703656f493b0ad185211ccfca7f0386120f022066b018eb5296d8613e23", size = 2227865 }, - { url = "https://files.pythonhosted.org/packages/7f/b9/46b7f386589fd222dac9e9de9c956ce5bcefe2ee73b4e79891381dda8654/kiwisolver-1.5.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:3ac2360e93cb41be81121755c6462cff3beaa9967188c866e5fce5cf13170859", size = 2324369 }, - { url = "https://files.pythonhosted.org/packages/92/8b/95e237cf3d9c642960153c769ddcbe278f182c8affb20cecc1cc983e7cc5/kiwisolver-1.5.0-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:c95cab08d1965db3d84a121f1c7ce7479bdd4072c9b3dafd8fecce48a2e6b902", size = 1977989 }, - { url = "https://files.pythonhosted.org/packages/1b/95/980c9df53501892784997820136c01f62bc1865e31b82b9560f980c0e649/kiwisolver-1.5.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:fc20894c3d21194d8041a28b65622d5b86db786da6e3cfe73f0c762951a61167", size = 2491645 }, - { url = "https://files.pythonhosted.org/packages/cb/32/900647fd0840abebe1561792c6b31e6a7c0e278fc3973d30572a965ca14c/kiwisolver-1.5.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7a32f72973f0f950c1920475d5c5ea3d971b81b6f0ec53b8d0a956cc965f22e0", size = 2295237 }, - { url = "https://files.pythonhosted.org/packages/be/8a/be60e3bbcf513cc5a50f4a3e88e1dcecebb79c1ad607a7222877becaa101/kiwisolver-1.5.0-cp313-cp313-win_amd64.whl", hash = "sha256:0bf3acf1419fa93064a4c2189ac0b58e3be7872bf6ee6177b0d4c63dc4cea276", size = 73573 }, - { url = "https://files.pythonhosted.org/packages/4d/d2/64be2e429eb4fca7f7e1c52a91b12663aeaf25de3895e5cca0f47ef2a8d0/kiwisolver-1.5.0-cp313-cp313-win_arm64.whl", hash = "sha256:fa8eb9ecdb7efb0b226acec134e0d709e87a909fa4971a54c0c4f6e88635484c", size = 64998 }, - { url = "https://files.pythonhosted.org/packages/b0/69/ce68dd0c85755ae2de490bf015b62f2cea5f6b14ff00a463f9d0774449ff/kiwisolver-1.5.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:db485b3847d182b908b483b2ed133c66d88d49cacf98fd278fadafe11b4478d1", size = 125700 }, - { url = "https://files.pythonhosted.org/packages/74/aa/937aac021cf9d4349990d47eb319309a51355ed1dbdc9c077cdc9224cb11/kiwisolver-1.5.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:be12f931839a3bdfe28b584db0e640a65a8bcbc24560ae3fdb025a449b3d754e", size = 67537 }, - { url = "https://files.pythonhosted.org/packages/ee/20/3a87fbece2c40ad0f6f0aefa93542559159c5f99831d596050e8afae7a9f/kiwisolver-1.5.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:16b85d37c2cbb3253226d26e64663f755d88a03439a9c47df6246b35defbdfb7", size = 65514 }, - { url = "https://files.pythonhosted.org/packages/f0/7f/f943879cda9007c45e1f7dba216d705c3a18d6b35830e488b6c6a4e7cdf0/kiwisolver-1.5.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4432b835675f0ea7414aab3d37d119f7226d24869b7a829caeab49ebda407b0c", size = 1584848 }, - { url = "https://files.pythonhosted.org/packages/37/f8/4d4f85cc1870c127c88d950913370dd76138482161cd07eabbc450deff01/kiwisolver-1.5.0-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1b0feb50971481a2cc44d94e88bdb02cdd497618252ae226b8eb1201b957e368", size = 1391542 }, - { url = "https://files.pythonhosted.org/packages/04/0b/65dd2916c84d252b244bd405303220f729e7c17c9d7d33dca6feeff9ffc4/kiwisolver-1.5.0-cp313-cp313t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:56fa888f10d0f367155e76ce849fa1166fc9730d13bd2d65a2aa13b6f5424489", size = 1404447 }, - { url = "https://files.pythonhosted.org/packages/39/5c/2606a373247babce9b1d056c03a04b65f3cf5290a8eac5d7bdead0a17e21/kiwisolver-1.5.0-cp313-cp313t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:940dda65d5e764406b9fb92761cbf462e4e63f712ab60ed98f70552e496f3bf1", size = 1455918 }, - { url = "https://files.pythonhosted.org/packages/d5/d1/c6078b5756670658e9192a2ef11e939c92918833d2745f85cd14a6004bdf/kiwisolver-1.5.0-cp313-cp313t-manylinux_2_39_riscv64.whl", hash = "sha256:89fc958c702ee9a745e4700378f5d23fddbc46ff89e8fdbf5395c24d5c1452a3", size = 1072856 }, - { url = "https://files.pythonhosted.org/packages/cb/c8/7def6ddf16eb2b3741d8b172bdaa9af882b03c78e9b0772975408801fa63/kiwisolver-1.5.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9027d773c4ff81487181a925945743413f6069634d0b122d0b37684ccf4f1e18", size = 2333580 }, - { url = "https://files.pythonhosted.org/packages/9e/87/2ac1fce0eb1e616fcd3c35caa23e665e9b1948bb984f4764790924594128/kiwisolver-1.5.0-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:5b233ea3e165e43e35dba1d2b8ecc21cf070b45b65ae17dd2747d2713d942021", size = 2423018 }, - { url = "https://files.pythonhosted.org/packages/67/13/c6700ccc6cc218716bfcda4935e4b2997039869b4ad8a94f364c5a3b8e63/kiwisolver-1.5.0-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:ce9bf03dad3b46408c08649c6fbd6ca28a9fce0eb32fdfffa6775a13103b5310", size = 2062804 }, - { url = "https://files.pythonhosted.org/packages/1b/bd/877056304626943ff0f1f44c08f584300c199b887cb3176cd7e34f1515f1/kiwisolver-1.5.0-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:fc4d3f1fb9ca0ae9f97b095963bc6326f1dbfd3779d6679a1e016b9baaa153d3", size = 2597482 }, - { url = "https://files.pythonhosted.org/packages/75/19/c60626c47bf0f8ac5dcf72c6c98e266d714f2fbbfd50cf6dab5ede3aaa50/kiwisolver-1.5.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:f443b4825c50a51ee68585522ab4a1d1257fac65896f282b4c6763337ac9f5d2", size = 2394328 }, - { url = "https://files.pythonhosted.org/packages/47/84/6a6d5e5bb8273756c27b7d810d47f7ef2f1f9b9fd23c9ee9a3f8c75c9cef/kiwisolver-1.5.0-cp313-cp313t-win_arm64.whl", hash = "sha256:893ff3a711d1b515ba9da14ee090519bad4610ed1962fbe298a434e8c5f8db53", size = 68410 }, - { url = "https://files.pythonhosted.org/packages/e4/d7/060f45052f2a01ad5762c8fdecd6d7a752b43400dc29ff75cd47225a40fd/kiwisolver-1.5.0-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:8df31fe574b8b3993cc61764f40941111b25c2d9fea13d3ce24a49907cd2d615", size = 123231 }, - { url = "https://files.pythonhosted.org/packages/c2/a7/78da680eadd06ff35edef6ef68a1ad273bad3e2a0936c9a885103230aece/kiwisolver-1.5.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:1d49a49ac4cbfb7c1375301cd1ec90169dfeae55ff84710d782260ce77a75a02", size = 66489 }, - { url = "https://files.pythonhosted.org/packages/49/b2/97980f3ad4fae37dd7fe31626e2bf75fbf8bdf5d303950ec1fab39a12da8/kiwisolver-1.5.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:0cbe94b69b819209a62cb27bdfa5dc2a8977d8de2f89dfd97ba4f53ed3af754e", size = 64063 }, - { url = "https://files.pythonhosted.org/packages/e7/f9/b06c934a6aa8bc91f566bd2a214fd04c30506c2d9e2b6b171953216a65b6/kiwisolver-1.5.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:80aa065ffd378ff784822a6d7c3212f2d5f5e9c3589614b5c228b311fd3063ac", size = 1475913 }, - { url = "https://files.pythonhosted.org/packages/6b/f0/f768ae564a710135630672981231320bc403cf9152b5596ec5289de0f106/kiwisolver-1.5.0-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4e7f886f47ab881692f278ae901039a234e4025a68e6dfab514263a0b1c4ae05", size = 1282782 }, - { url = "https://files.pythonhosted.org/packages/e2/9f/1de7aad00697325f05238a5f2eafbd487fb637cc27a558b5367a5f37fb7f/kiwisolver-1.5.0-cp314-cp314-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:5060731cc3ed12ca3a8b57acd4aeca5bbc2f49216dd0bec1650a1acd89486bcd", size = 1300815 }, - { url = "https://files.pythonhosted.org/packages/5a/c2/297f25141d2e468e0ce7f7a7b92e0cf8918143a0cbd3422c1ad627e85a06/kiwisolver-1.5.0-cp314-cp314-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:7a4aa69609f40fce3cbc3f87b2061f042eee32f94b8f11db707b66a26461591a", size = 1347925 }, - { url = "https://files.pythonhosted.org/packages/b9/d3/f4c73a02eb41520c47610207b21afa8cdd18fdbf64ffd94674ae21c4812d/kiwisolver-1.5.0-cp314-cp314-manylinux_2_39_riscv64.whl", hash = "sha256:d168fda2dbff7b9b5f38e693182d792a938c31db4dac3a80a4888de603c99554", size = 991322 }, - { url = "https://files.pythonhosted.org/packages/7b/46/d3f2efef7732fcda98d22bf4ad5d3d71d545167a852ca710a494f4c15343/kiwisolver-1.5.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:413b820229730d358efd838ecbab79902fe97094565fdc80ddb6b0a18c18a581", size = 2232857 }, - { url = "https://files.pythonhosted.org/packages/3f/ec/2d9756bf2b6d26ae4349b8d3662fb3993f16d80c1f971c179ce862b9dbae/kiwisolver-1.5.0-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:5124d1ea754509b09e53738ec185584cc609aae4a3b510aaf4ed6aa047ef9303", size = 2329376 }, - { url = "https://files.pythonhosted.org/packages/8f/9f/876a0a0f2260f1bde92e002b3019a5fabc35e0939c7d945e0fa66185eb20/kiwisolver-1.5.0-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:e4415a8db000bf49a6dd1c478bf70062eaacff0f462b92b0ba68791a905861f9", size = 1982549 }, - { url = "https://files.pythonhosted.org/packages/6c/4f/ba3624dfac23a64d54ac4179832860cb537c1b0af06024936e82ca4154a0/kiwisolver-1.5.0-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:d618fd27420381a4f6044faa71f46d8bfd911bd077c555f7138ed88729bfbe79", size = 2494680 }, - { url = "https://files.pythonhosted.org/packages/39/b7/97716b190ab98911b20d10bf92eca469121ec483b8ce0edd314f51bc85af/kiwisolver-1.5.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5092eb5b1172947f57d6ea7d89b2f29650414e4293c47707eb499ec07a0ac796", size = 2297905 }, - { url = "https://files.pythonhosted.org/packages/a3/36/4e551e8aa55c9188bca9abb5096805edbf7431072b76e2298e34fd3a3008/kiwisolver-1.5.0-cp314-cp314-win_amd64.whl", hash = "sha256:d76e2d8c75051d58177e762164d2e9ab92886534e3a12e795f103524f221dd8e", size = 75086 }, - { url = "https://files.pythonhosted.org/packages/70/15/9b90f7df0e31a003c71649cf66ef61c3c1b862f48c81007fa2383c8bd8d7/kiwisolver-1.5.0-cp314-cp314-win_arm64.whl", hash = "sha256:fa6248cd194edff41d7ea9425ced8ca3a6f838bfb295f6f1d6e6bb694a8518df", size = 66577 }, - { url = "https://files.pythonhosted.org/packages/17/01/7dc8c5443ff42b38e72731643ed7cf1ed9bf01691ae5cdca98501999ed83/kiwisolver-1.5.0-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:d1ffeb80b5676463d7a7d56acbe8e37a20ce725570e09549fe738e02ca6b7e1e", size = 125794 }, - { url = "https://files.pythonhosted.org/packages/46/8a/b4ebe46ebaac6a303417fab10c2e165c557ddaff558f9699d302b256bc53/kiwisolver-1.5.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:bc4d8e252f532ab46a1de9349e2d27b91fce46736a9eedaa37beaca66f574ed4", size = 67646 }, - { url = "https://files.pythonhosted.org/packages/60/35/10a844afc5f19d6f567359bf4789e26661755a2f36200d5d1ed8ad0126e5/kiwisolver-1.5.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:6783e069732715ad0c3ce96dbf21dbc2235ab0593f2baf6338101f70371f4028", size = 65511 }, - { url = "https://files.pythonhosted.org/packages/f8/8a/685b297052dd041dcebce8e8787b58923b6e78acc6115a0dc9189011c44b/kiwisolver-1.5.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e7c4c09a490dc4d4a7f8cbee56c606a320f9dc28cf92a7157a39d1ce7676a657", size = 1584858 }, - { url = "https://files.pythonhosted.org/packages/9e/80/04865e3d4638ac5bddec28908916df4a3075b8c6cc101786a96803188b96/kiwisolver-1.5.0-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2a075bd7bd19c70cf67c8badfa36cf7c5d8de3c9ddb8420c51e10d9c50e94920", size = 1392539 }, - { url = "https://files.pythonhosted.org/packages/ba/01/77a19cacc0893fa13fafa46d1bba06fb4dc2360b3292baf4b56d8e067b24/kiwisolver-1.5.0-cp314-cp314t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:bdd3e53429ff02aa319ba59dfe4ceeec345bf46cf180ec2cf6fd5b942e7975e9", size = 1405310 }, - { url = "https://files.pythonhosted.org/packages/53/39/bcaf5d0cca50e604cfa9b4e3ae1d64b50ca1ae5b754122396084599ef903/kiwisolver-1.5.0-cp314-cp314t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:3cdcb35dc9d807259c981a85531048ede628eabcffb3239adf3d17463518992d", size = 1456244 }, - { url = "https://files.pythonhosted.org/packages/d0/7a/72c187abc6975f6978c3e39b7cf67aeb8b3c0a8f9790aa7fd412855e9e1f/kiwisolver-1.5.0-cp314-cp314t-manylinux_2_39_riscv64.whl", hash = "sha256:70d593af6a6ca332d1df73d519fddb5148edb15cd90d5f0155e3746a6d4fcc65", size = 1073154 }, - { url = "https://files.pythonhosted.org/packages/c7/ca/cf5b25783ebbd59143b4371ed0c8428a278abe68d6d0104b01865b1bbd0f/kiwisolver-1.5.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:377815a8616074cabbf3f53354e1d040c35815a134e01d7614b7692e4bf8acfa", size = 2334377 }, - { url = "https://files.pythonhosted.org/packages/4a/e5/b1f492adc516796e88751282276745340e2a72dcd0d36cf7173e0daf3210/kiwisolver-1.5.0-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:0255a027391d52944eae1dbb5d4cc5903f57092f3674e8e544cdd2622826b3f0", size = 2425288 }, - { url = "https://files.pythonhosted.org/packages/e6/e5/9b21fbe91a61b8f409d74a26498706e97a48008bfcd1864373d32a6ba31c/kiwisolver-1.5.0-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:012b1eb16e28718fa782b5e61dc6f2da1f0792ca73bd05d54de6cb9561665fc9", size = 2063158 }, - { url = "https://files.pythonhosted.org/packages/b1/02/83f47986138310f95ea95531f851b2a62227c11cbc3e690ae1374fe49f0f/kiwisolver-1.5.0-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:0e3aafb33aed7479377e5e9a82e9d4bf87063741fc99fc7ae48b0f16e32bdd6f", size = 2597260 }, - { url = "https://files.pythonhosted.org/packages/07/18/43a5f24608d8c313dd189cf838c8e68d75b115567c6279de7796197cfb6a/kiwisolver-1.5.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:e7a116ae737f0000343218c4edf5bd45893bfeaff0993c0b215d7124c9f77646", size = 2394403 }, - { url = "https://files.pythonhosted.org/packages/3b/b5/98222136d839b8afabcaa943b09bd05888c2d36355b7e448550211d1fca4/kiwisolver-1.5.0-cp314-cp314t-win_amd64.whl", hash = "sha256:1dd9b0b119a350976a6d781e7278ec7aca0b201e1a9e2d23d9804afecb6ca681", size = 79687 }, - { url = "https://files.pythonhosted.org/packages/99/a2/ca7dc962848040befed12732dff6acae7fb3c4f6fc4272b3f6c9a30b8713/kiwisolver-1.5.0-cp314-cp314t-win_arm64.whl", hash = "sha256:58f812017cd2985c21fbffb4864d59174d4903dd66fa23815e74bbc7a0e2dd57", size = 70032 }, - { url = "https://files.pythonhosted.org/packages/1c/fa/2910df836372d8761bb6eff7d8bdcb1613b5c2e03f260efe7abe34d388a7/kiwisolver-1.5.0-graalpy312-graalpy250_312_native-macosx_10_13_x86_64.whl", hash = "sha256:5ae8e62c147495b01a0f4765c878e9bfdf843412446a247e28df59936e99e797", size = 130262 }, - { url = "https://files.pythonhosted.org/packages/0f/41/c5f71f9f00aabcc71fee8b7475e3f64747282580c2fe748961ba29b18385/kiwisolver-1.5.0-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:f6764a4ccab3078db14a632420930f6186058750df066b8ea2a7106df91d3203", size = 138036 }, - { url = "https://files.pythonhosted.org/packages/fa/06/7399a607f434119c6e1fdc8ec89a8d51ccccadf3341dee4ead6bd14caaf5/kiwisolver-1.5.0-graalpy312-graalpy250_312_native-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c31c13da98624f957b0fb1b5bae5383b2333c2c3f6793d9825dd5ce79b525cb7", size = 194295 }, - { url = "https://files.pythonhosted.org/packages/b5/91/53255615acd2a1eaca307ede3c90eb550bae9c94581f8c00081b6b1c8f44/kiwisolver-1.5.0-graalpy312-graalpy250_312_native-win_amd64.whl", hash = "sha256:1f1489f769582498610e015a8ef2d36f28f505ab3096d0e16b4858a9ec214f57", size = 75987 }, - { url = "https://files.pythonhosted.org/packages/e9/eb/5fcbbbf9a0e2c3a35effb88831a483345326bbc3a030a3b5b69aee647f84/kiwisolver-1.5.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:ec4c85dc4b687c7f7f15f553ff26a98bfe8c58f5f7f0ac8905f0ba4c7be60232", size = 59532 }, - { url = "https://files.pythonhosted.org/packages/c3/9b/e17104555bb4db148fd52327feea1e96be4b88e8e008b029002c281a21ab/kiwisolver-1.5.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:12e91c215a96e39f57989c8912ae761286ac5a9584d04030ceb3368a357f017a", size = 57420 }, - { url = "https://files.pythonhosted.org/packages/48/44/2b5b95b7aa39fb2d8d9d956e0f3d5d45aef2ae1d942d4c3ffac2f9cfed1a/kiwisolver-1.5.0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:be4a51a55833dc29ab5d7503e7bcb3b3af3402d266018137127450005cdfe737", size = 79892 }, - { url = "https://files.pythonhosted.org/packages/52/7d/7157f9bba6b455cfb4632ed411e199fc8b8977642c2b12082e1bd9e6d173/kiwisolver-1.5.0-pp311-pypy311_pp73-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:daae526907e262de627d8f70058a0f64acc9e2641c164c99c8f594b34a799a16", size = 77603 }, - { url = "https://files.pythonhosted.org/packages/0a/dd/8050c947d435c8d4bc94e3252f4d8bb8a76cfb424f043a8680be637a57f1/kiwisolver-1.5.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:59cd8683f575d96df5bb48f6add94afc055012c29e28124fcae2b63661b9efb1", size = 73558 }, -] - [[package]] name = "linkify-it-py" version = "2.0.3" @@ -1679,12 +1374,11 @@ wheels = [ [[package]] name = "litellm" -version = "1.82.3" +version = "1.75.8" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohttp" }, { name = "click" }, - { name = "fastuuid" }, { name = "httpx" }, { name = "importlib-metadata" }, { name = "jinja2" }, @@ -1695,9 +1389,9 @@ dependencies = [ { name = "tiktoken" }, { name = "tokenizers" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/53/00/13993312e6d2fb29cd6d5ffceb293455ef747fe5675eaa9aa49b09184656/litellm-1.82.3.tar.gz", hash = "sha256:7215b95e7cc38a52b5ae778d67e8829dec86594c8b05d8431294e95c7d59937c", size = 17368754 } +sdist = { url = "https://files.pythonhosted.org/packages/8d/4e/48e3d6de19afe713223e3bc7009a2003501420de2a5d823c569cefbd9731/litellm-1.75.8.tar.gz", hash = "sha256:92061bd263ff8c33c8fff70ba92cd046adb7ea041a605826a915d108742fe59e", size = 10140384 } wheels = [ - { url = "https://files.pythonhosted.org/packages/5a/3a/590d58dee65a238f7f3d5c37f8f9f9021ecaf27fe379a393b4259324b56e/litellm-1.82.3-py3-none-any.whl", hash = "sha256:609901f6c5a5cf8c24386e4e3f50738bb8a9db719709fd76b208c8ee6d00f7a7", size = 15551034 }, + { url = "https://files.pythonhosted.org/packages/5e/82/c4d00fbeafd93c00dab6ea03f33cadd6a97adeb720ba1d89fc319e5cb10b/litellm-1.75.8-py3-none-any.whl", hash = "sha256:0bf004488df8506381ec6e35e1486e2870e8d578a7c3f2427cd497558ce07a2e", size = 8916305 }, ] [[package]] @@ -1791,70 +1485,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/70/bc/6f1c2f612465f5fa89b95bead1f44dcb607670fd42891d8fdcd5d039f4f4/markupsafe-3.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:32001d6a8fc98c8cb5c947787c5d08b0a50663d139f1305bac5885d98d9b40fa", size = 14146 }, ] -[[package]] -name = "matplotlib" -version = "3.10.8" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "contourpy" }, - { name = "cycler" }, - { name = "fonttools" }, - { name = "kiwisolver" }, - { name = "numpy" }, - { name = "packaging" }, - { name = "pillow" }, - { name = "pyparsing" }, - { name = "python-dateutil" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/8a/76/d3c6e3a13fe484ebe7718d14e269c9569c4eb0020a968a327acb3b9a8fe6/matplotlib-3.10.8.tar.gz", hash = "sha256:2299372c19d56bcd35cf05a2738308758d32b9eaed2371898d8f5bd33f084aa3", size = 34806269 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f8/86/de7e3a1cdcfc941483af70609edc06b83e7c8a0e0dc9ac325200a3f4d220/matplotlib-3.10.8-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6be43b667360fef5c754dda5d25a32e6307a03c204f3c0fc5468b78fa87b4160", size = 8251215 }, - { url = "https://files.pythonhosted.org/packages/fd/14/baad3222f424b19ce6ad243c71de1ad9ec6b2e4eb1e458a48fdc6d120401/matplotlib-3.10.8-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a2b336e2d91a3d7006864e0990c83b216fcdca64b5a6484912902cef87313d78", size = 8139625 }, - { url = "https://files.pythonhosted.org/packages/8f/a0/7024215e95d456de5883e6732e708d8187d9753a21d32f8ddb3befc0c445/matplotlib-3.10.8-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:efb30e3baaea72ce5928e32bab719ab4770099079d66726a62b11b1ef7273be4", size = 8712614 }, - { url = "https://files.pythonhosted.org/packages/5a/f4/b8347351da9a5b3f41e26cf547252d861f685c6867d179a7c9d60ad50189/matplotlib-3.10.8-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d56a1efd5bfd61486c8bc968fa18734464556f0fb8e51690f4ac25d85cbbbbc2", size = 9540997 }, - { url = "https://files.pythonhosted.org/packages/9e/c0/c7b914e297efe0bc36917bf216b2acb91044b91e930e878ae12981e461e5/matplotlib-3.10.8-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:238b7ce5717600615c895050239ec955d91f321c209dd110db988500558e70d6", size = 9596825 }, - { url = "https://files.pythonhosted.org/packages/6f/d3/a4bbc01c237ab710a1f22b4da72f4ff6d77eb4c7735ea9811a94ae239067/matplotlib-3.10.8-cp311-cp311-win_amd64.whl", hash = "sha256:18821ace09c763ec93aef5eeff087ee493a24051936d7b9ebcad9662f66501f9", size = 8135090 }, - { url = "https://files.pythonhosted.org/packages/89/dd/a0b6588f102beab33ca6f5218b31725216577b2a24172f327eaf6417d5c9/matplotlib-3.10.8-cp311-cp311-win_arm64.whl", hash = "sha256:bab485bcf8b1c7d2060b4fcb6fc368a9e6f4cd754c9c2fea281f4be21df394a2", size = 8012377 }, - { url = "https://files.pythonhosted.org/packages/9e/67/f997cdcbb514012eb0d10cd2b4b332667997fb5ebe26b8d41d04962fa0e6/matplotlib-3.10.8-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:64fcc24778ca0404ce0cb7b6b77ae1f4c7231cdd60e6778f999ee05cbd581b9a", size = 8260453 }, - { url = "https://files.pythonhosted.org/packages/7e/65/07d5f5c7f7c994f12c768708bd2e17a4f01a2b0f44a1c9eccad872433e2e/matplotlib-3.10.8-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b9a5ca4ac220a0cdd1ba6bcba3608547117d30468fefce49bb26f55c1a3d5c58", size = 8148321 }, - { url = "https://files.pythonhosted.org/packages/3e/f3/c5195b1ae57ef85339fd7285dfb603b22c8b4e79114bae5f4f0fcf688677/matplotlib-3.10.8-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3ab4aabc72de4ff77b3ec33a6d78a68227bf1123465887f9905ba79184a1cc04", size = 8716944 }, - { url = "https://files.pythonhosted.org/packages/00/f9/7638f5cc82ec8a7aa005de48622eecc3ed7c9854b96ba15bd76b7fd27574/matplotlib-3.10.8-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:24d50994d8c5816ddc35411e50a86ab05f575e2530c02752e02538122613371f", size = 9550099 }, - { url = "https://files.pythonhosted.org/packages/57/61/78cd5920d35b29fd2a0fe894de8adf672ff52939d2e9b43cb83cd5ce1bc7/matplotlib-3.10.8-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:99eefd13c0dc3b3c1b4d561c1169e65fe47aab7b8158754d7c084088e2329466", size = 9613040 }, - { url = "https://files.pythonhosted.org/packages/30/4e/c10f171b6e2f44d9e3a2b96efa38b1677439d79c99357600a62cc1e9594e/matplotlib-3.10.8-cp312-cp312-win_amd64.whl", hash = "sha256:dd80ecb295460a5d9d260df63c43f4afbdd832d725a531f008dad1664f458adf", size = 8142717 }, - { url = "https://files.pythonhosted.org/packages/f1/76/934db220026b5fef85f45d51a738b91dea7d70207581063cd9bd8fafcf74/matplotlib-3.10.8-cp312-cp312-win_arm64.whl", hash = "sha256:3c624e43ed56313651bc18a47f838b60d7b8032ed348911c54906b130b20071b", size = 8012751 }, - { url = "https://files.pythonhosted.org/packages/3d/b9/15fd5541ef4f5b9a17eefd379356cf12175fe577424e7b1d80676516031a/matplotlib-3.10.8-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3f2e409836d7f5ac2f1c013110a4d50b9f7edc26328c108915f9075d7d7a91b6", size = 8261076 }, - { url = "https://files.pythonhosted.org/packages/8d/a0/2ba3473c1b66b9c74dc7107c67e9008cb1782edbe896d4c899d39ae9cf78/matplotlib-3.10.8-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:56271f3dac49a88d7fca5060f004d9d22b865f743a12a23b1e937a0be4818ee1", size = 8148794 }, - { url = "https://files.pythonhosted.org/packages/75/97/a471f1c3eb1fd6f6c24a31a5858f443891d5127e63a7788678d14e249aea/matplotlib-3.10.8-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a0a7f52498f72f13d4a25ea70f35f4cb60642b466cbb0a9be951b5bc3f45a486", size = 8718474 }, - { url = "https://files.pythonhosted.org/packages/01/be/cd478f4b66f48256f42927d0acbcd63a26a893136456cd079c0cc24fbabf/matplotlib-3.10.8-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:646d95230efb9ca614a7a594d4fcacde0ac61d25e37dd51710b36477594963ce", size = 9549637 }, - { url = "https://files.pythonhosted.org/packages/5d/7c/8dc289776eae5109e268c4fb92baf870678dc048a25d4ac903683b86d5bf/matplotlib-3.10.8-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f89c151aab2e2e23cb3fe0acad1e8b82841fd265379c4cecd0f3fcb34c15e0f6", size = 9613678 }, - { url = "https://files.pythonhosted.org/packages/64/40/37612487cc8a437d4dd261b32ca21fe2d79510fe74af74e1f42becb1bdb8/matplotlib-3.10.8-cp313-cp313-win_amd64.whl", hash = "sha256:e8ea3e2d4066083e264e75c829078f9e149fa119d27e19acd503de65e0b13149", size = 8142686 }, - { url = "https://files.pythonhosted.org/packages/66/52/8d8a8730e968185514680c2a6625943f70269509c3dcfc0dcf7d75928cb8/matplotlib-3.10.8-cp313-cp313-win_arm64.whl", hash = "sha256:c108a1d6fa78a50646029cb6d49808ff0fc1330fda87fa6f6250c6b5369b6645", size = 8012917 }, - { url = "https://files.pythonhosted.org/packages/b5/27/51fe26e1062f298af5ef66343d8ef460e090a27fea73036c76c35821df04/matplotlib-3.10.8-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:ad3d9833a64cf48cc4300f2b406c3d0f4f4724a91c0bd5640678a6ba7c102077", size = 8305679 }, - { url = "https://files.pythonhosted.org/packages/2c/1e/4de865bc591ac8e3062e835f42dd7fe7a93168d519557837f0e37513f629/matplotlib-3.10.8-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:eb3823f11823deade26ce3b9f40dcb4a213da7a670013929f31d5f5ed1055b22", size = 8198336 }, - { url = "https://files.pythonhosted.org/packages/c6/cb/2f7b6e75fb4dce87ef91f60cac4f6e34f4c145ab036a22318ec837971300/matplotlib-3.10.8-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d9050fee89a89ed57b4fb2c1bfac9a3d0c57a0d55aed95949eedbc42070fea39", size = 8731653 }, - { url = "https://files.pythonhosted.org/packages/46/b3/bd9c57d6ba670a37ab31fb87ec3e8691b947134b201f881665b28cc039ff/matplotlib-3.10.8-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b44d07310e404ba95f8c25aa5536f154c0a8ec473303535949e52eb71d0a1565", size = 9561356 }, - { url = "https://files.pythonhosted.org/packages/c0/3d/8b94a481456dfc9dfe6e39e93b5ab376e50998cddfd23f4ae3b431708f16/matplotlib-3.10.8-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:0a33deb84c15ede243aead39f77e990469fff93ad1521163305095b77b72ce4a", size = 9614000 }, - { url = "https://files.pythonhosted.org/packages/bd/cd/bc06149fe5585ba800b189a6a654a75f1f127e8aab02fd2be10df7fa500c/matplotlib-3.10.8-cp313-cp313t-win_amd64.whl", hash = "sha256:3a48a78d2786784cc2413e57397981fb45c79e968d99656706018d6e62e57958", size = 8220043 }, - { url = "https://files.pythonhosted.org/packages/e3/de/b22cf255abec916562cc04eef457c13e58a1990048de0c0c3604d082355e/matplotlib-3.10.8-cp313-cp313t-win_arm64.whl", hash = "sha256:15d30132718972c2c074cd14638c7f4592bd98719e2308bccea40e0538bc0cb5", size = 8062075 }, - { url = "https://files.pythonhosted.org/packages/3c/43/9c0ff7a2f11615e516c3b058e1e6e8f9614ddeca53faca06da267c48345d/matplotlib-3.10.8-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:b53285e65d4fa4c86399979e956235deb900be5baa7fc1218ea67fbfaeaadd6f", size = 8262481 }, - { url = "https://files.pythonhosted.org/packages/6f/ca/e8ae28649fcdf039fda5ef554b40a95f50592a3c47e6f7270c9561c12b07/matplotlib-3.10.8-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:32f8dce744be5569bebe789e46727946041199030db8aeb2954d26013a0eb26b", size = 8151473 }, - { url = "https://files.pythonhosted.org/packages/f1/6f/009d129ae70b75e88cbe7e503a12a4c0670e08ed748a902c2568909e9eb5/matplotlib-3.10.8-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4cf267add95b1c88300d96ca837833d4112756045364f5c734a2276038dae27d", size = 9553896 }, - { url = "https://files.pythonhosted.org/packages/f5/26/4221a741eb97967bc1fd5e4c52b9aa5a91b2f4ec05b59f6def4d820f9df9/matplotlib-3.10.8-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2cf5bd12cecf46908f286d7838b2abc6c91cda506c0445b8223a7c19a00df008", size = 9824193 }, - { url = "https://files.pythonhosted.org/packages/1f/f3/3abf75f38605772cf48a9daf5821cd4f563472f38b4b828c6fba6fa6d06e/matplotlib-3.10.8-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:41703cc95688f2516b480f7f339d8851a6035f18e100ee6a32bc0b8536a12a9c", size = 9615444 }, - { url = "https://files.pythonhosted.org/packages/93/a5/de89ac80f10b8dc615807ee1133cd99ac74082581196d4d9590bea10690d/matplotlib-3.10.8-cp314-cp314-win_amd64.whl", hash = "sha256:83d282364ea9f3e52363da262ce32a09dfe241e4080dcedda3c0db059d3c1f11", size = 8272719 }, - { url = "https://files.pythonhosted.org/packages/69/ce/b006495c19ccc0a137b48083168a37bd056392dee02f87dba0472f2797fe/matplotlib-3.10.8-cp314-cp314-win_arm64.whl", hash = "sha256:2c1998e92cd5999e295a731bcb2911c75f597d937341f3030cc24ef2733d78a8", size = 8144205 }, - { url = "https://files.pythonhosted.org/packages/68/d9/b31116a3a855bd313c6fcdb7226926d59b041f26061c6c5b1be66a08c826/matplotlib-3.10.8-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:b5a2b97dbdc7d4f353ebf343744f1d1f1cca8aa8bfddb4262fcf4306c3761d50", size = 8305785 }, - { url = "https://files.pythonhosted.org/packages/1e/90/6effe8103f0272685767ba5f094f453784057072f49b393e3ea178fe70a5/matplotlib-3.10.8-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:3f5c3e4da343bba819f0234186b9004faba952cc420fbc522dc4e103c1985908", size = 8198361 }, - { url = "https://files.pythonhosted.org/packages/d7/65/a73188711bea603615fc0baecca1061429ac16940e2385433cc778a9d8e7/matplotlib-3.10.8-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5f62550b9a30afde8c1c3ae450e5eb547d579dd69b25c2fc7a1c67f934c1717a", size = 9561357 }, - { url = "https://files.pythonhosted.org/packages/f4/3d/b5c5d5d5be8ce63292567f0e2c43dde9953d3ed86ac2de0a72e93c8f07a1/matplotlib-3.10.8-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:495672de149445ec1b772ff2c9ede9b769e3cb4f0d0aa7fa730d7f59e2d4e1c1", size = 9823610 }, - { url = "https://files.pythonhosted.org/packages/4d/4b/e7beb6bbd49f6bae727a12b270a2654d13c397576d25bd6786e47033300f/matplotlib-3.10.8-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:595ba4d8fe983b88f0eec8c26a241e16d6376fe1979086232f481f8f3f67494c", size = 9614011 }, - { url = "https://files.pythonhosted.org/packages/7c/e6/76f2813d31f032e65f6f797e3f2f6e4aab95b65015924b1c51370395c28a/matplotlib-3.10.8-cp314-cp314t-win_amd64.whl", hash = "sha256:25d380fe8b1dc32cf8f0b1b448470a77afb195438bafdf1d858bfb876f3edf7b", size = 8362801 }, - { url = "https://files.pythonhosted.org/packages/5d/49/d651878698a0b67f23aa28e17f45a6d6dd3d3f933fa29087fa4ce5947b5a/matplotlib-3.10.8-cp314-cp314t-win_arm64.whl", hash = "sha256:113bb52413ea508ce954a02c10ffd0d565f9c3bc7f2eddc27dfe1731e71c7b5f", size = 8192560 }, - { url = "https://files.pythonhosted.org/packages/04/30/3afaa31c757f34b7725ab9d2ba8b48b5e89c2019c003e7d0ead143aabc5a/matplotlib-3.10.8-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:6da7c2ce169267d0d066adcf63758f0604aa6c3eebf67458930f9d9b79ad1db1", size = 8249198 }, - { url = "https://files.pythonhosted.org/packages/48/2f/6334aec331f57485a642a7c8be03cb286f29111ae71c46c38b363230063c/matplotlib-3.10.8-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:9153c3292705be9f9c64498a8872118540c3f4123d1a1c840172edf262c8be4a", size = 8136817 }, - { url = "https://files.pythonhosted.org/packages/73/e4/6d6f14b2a759c622f191b2d67e9075a3f56aaccb3be4bb9bb6890030d0a0/matplotlib-3.10.8-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ae029229a57cd1e8fe542485f27e7ca7b23aa9e8944ddb4985d0bc444f1eca2", size = 8713867 }, -] - [[package]] name = "mcp" version = "1.26.0" @@ -2345,93 +1975,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b1/29/c028a0731e202035f0e2e0bfbf1a3e46ad6c628cbb17f6f1cc9eea5d9ff1/pathlib_abc-0.5.2-py3-none-any.whl", hash = "sha256:4c9d94cf1b23af417ce7c0417b43333b06a106c01000b286c99de230d95eefbb", size = 19070 }, ] -[[package]] -name = "pillow" -version = "12.1.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/1f/42/5c74462b4fd957fcd7b13b04fb3205ff8349236ea74c7c375766d6c82288/pillow-12.1.1.tar.gz", hash = "sha256:9ad8fa5937ab05218e2b6a4cff30295ad35afd2f83ac592e68c0d871bb0fdbc4", size = 46980264 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2b/46/5da1ec4a5171ee7bf1a0efa064aba70ba3d6e0788ce3f5acd1375d23c8c0/pillow-12.1.1-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:e879bb6cd5c73848ef3b2b48b8af9ff08c5b71ecda8048b7dd22d8a33f60be32", size = 5304084 }, - { url = "https://files.pythonhosted.org/packages/78/93/a29e9bc02d1cf557a834da780ceccd54e02421627200696fcf805ebdc3fb/pillow-12.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:365b10bb9417dd4498c0e3b128018c4a624dc11c7b97d8cc54effe3b096f4c38", size = 4657866 }, - { url = "https://files.pythonhosted.org/packages/13/84/583a4558d492a179d31e4aae32eadce94b9acf49c0337c4ce0b70e0a01f2/pillow-12.1.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d4ce8e329c93845720cd2014659ca67eac35f6433fd3050393d85f3ecef0dad5", size = 6232148 }, - { url = "https://files.pythonhosted.org/packages/d5/e2/53c43334bbbb2d3b938978532fbda8e62bb6e0b23a26ce8592f36bcc4987/pillow-12.1.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fc354a04072b765eccf2204f588a7a532c9511e8b9c7f900e1b64e3e33487090", size = 8038007 }, - { url = "https://files.pythonhosted.org/packages/b8/a6/3d0e79c8a9d58150dd98e199d7c1c56861027f3829a3a60b3c2784190180/pillow-12.1.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7e7976bf1910a8116b523b9f9f58bf410f3e8aa330cd9a2bb2953f9266ab49af", size = 6345418 }, - { url = "https://files.pythonhosted.org/packages/a2/c8/46dfeac5825e600579157eea177be43e2f7ff4a99da9d0d0a49533509ac5/pillow-12.1.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:597bd9c8419bc7c6af5604e55847789b69123bbe25d65cc6ad3012b4f3c98d8b", size = 7034590 }, - { url = "https://files.pythonhosted.org/packages/af/bf/e6f65d3db8a8bbfeaf9e13cc0417813f6319863a73de934f14b2229ada18/pillow-12.1.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:2c1fc0f2ca5f96a3c8407e41cca26a16e46b21060fe6d5b099d2cb01412222f5", size = 6458655 }, - { url = "https://files.pythonhosted.org/packages/f9/c2/66091f3f34a25894ca129362e510b956ef26f8fb67a0e6417bc5744e56f1/pillow-12.1.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:578510d88c6229d735855e1f278aa305270438d36a05031dfaae5067cc8eb04d", size = 7159286 }, - { url = "https://files.pythonhosted.org/packages/7b/5a/24bc8eb526a22f957d0cec6243146744966d40857e3d8deb68f7902ca6c1/pillow-12.1.1-cp311-cp311-win32.whl", hash = "sha256:7311c0a0dcadb89b36b7025dfd8326ecfa36964e29913074d47382706e516a7c", size = 6328663 }, - { url = "https://files.pythonhosted.org/packages/31/03/bef822e4f2d8f9d7448c133d0a18185d3cce3e70472774fffefe8b0ed562/pillow-12.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:fbfa2a7c10cc2623f412753cddf391c7f971c52ca40a3f65dc5039b2939e8563", size = 7031448 }, - { url = "https://files.pythonhosted.org/packages/49/70/f76296f53610bd17b2e7d31728b8b7825e3ac3b5b3688b51f52eab7c0818/pillow-12.1.1-cp311-cp311-win_arm64.whl", hash = "sha256:b81b5e3511211631b3f672a595e3221252c90af017e399056d0faabb9538aa80", size = 2453651 }, - { url = "https://files.pythonhosted.org/packages/07/d3/8df65da0d4df36b094351dce696f2989bec731d4f10e743b1c5f4da4d3bf/pillow-12.1.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ab323b787d6e18b3d91a72fc99b1a2c28651e4358749842b8f8dfacd28ef2052", size = 5262803 }, - { url = "https://files.pythonhosted.org/packages/d6/71/5026395b290ff404b836e636f51d7297e6c83beceaa87c592718747e670f/pillow-12.1.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:adebb5bee0f0af4909c30db0d890c773d1a92ffe83da908e2e9e720f8edf3984", size = 4657601 }, - { url = "https://files.pythonhosted.org/packages/b1/2e/1001613d941c67442f745aff0f7cc66dd8df9a9c084eb497e6a543ee6f7e/pillow-12.1.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:bb66b7cc26f50977108790e2456b7921e773f23db5630261102233eb355a3b79", size = 6234995 }, - { url = "https://files.pythonhosted.org/packages/07/26/246ab11455b2549b9233dbd44d358d033a2f780fa9007b61a913c5b2d24e/pillow-12.1.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:aee2810642b2898bb187ced9b349e95d2a7272930796e022efaf12e99dccd293", size = 8045012 }, - { url = "https://files.pythonhosted.org/packages/b2/8b/07587069c27be7535ac1fe33874e32de118fbd34e2a73b7f83436a88368c/pillow-12.1.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a0b1cd6232e2b618adcc54d9882e4e662a089d5768cd188f7c245b4c8c44a397", size = 6349638 }, - { url = "https://files.pythonhosted.org/packages/ff/79/6df7b2ee763d619cda2fb4fea498e5f79d984dae304d45a8999b80d6cf5c/pillow-12.1.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7aac39bcf8d4770d089588a2e1dd111cbaa42df5a94be3114222057d68336bd0", size = 7041540 }, - { url = "https://files.pythonhosted.org/packages/2c/5e/2ba19e7e7236d7529f4d873bdaf317a318896bac289abebd4bb00ef247f0/pillow-12.1.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ab174cd7d29a62dd139c44bf74b698039328f45cb03b4596c43473a46656b2f3", size = 6462613 }, - { url = "https://files.pythonhosted.org/packages/03/03/31216ec124bb5c3dacd74ce8efff4cc7f52643653bad4825f8f08c697743/pillow-12.1.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:339ffdcb7cbeaa08221cd401d517d4b1fe7a9ed5d400e4a8039719238620ca35", size = 7166745 }, - { url = "https://files.pythonhosted.org/packages/1f/e7/7c4552d80052337eb28653b617eafdef39adfb137c49dd7e831b8dc13bc5/pillow-12.1.1-cp312-cp312-win32.whl", hash = "sha256:5d1f9575a12bed9e9eedd9a4972834b08c97a352bd17955ccdebfeca5913fa0a", size = 6328823 }, - { url = "https://files.pythonhosted.org/packages/3d/17/688626d192d7261bbbf98846fc98995726bddc2c945344b65bec3a29d731/pillow-12.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:21329ec8c96c6e979cd0dfd29406c40c1d52521a90544463057d2aaa937d66a6", size = 7033367 }, - { url = "https://files.pythonhosted.org/packages/ed/fe/a0ef1f73f939b0eca03ee2c108d0043a87468664770612602c63266a43c4/pillow-12.1.1-cp312-cp312-win_arm64.whl", hash = "sha256:af9a332e572978f0218686636610555ae3defd1633597be015ed50289a03c523", size = 2453811 }, - { url = "https://files.pythonhosted.org/packages/d5/11/6db24d4bd7685583caeae54b7009584e38da3c3d4488ed4cd25b439de486/pillow-12.1.1-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:d242e8ac078781f1de88bf823d70c1a9b3c7950a44cdf4b7c012e22ccbcd8e4e", size = 4062689 }, - { url = "https://files.pythonhosted.org/packages/33/c0/ce6d3b1fe190f0021203e0d9b5b99e57843e345f15f9ef22fcd43842fd21/pillow-12.1.1-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:02f84dfad02693676692746df05b89cf25597560db2857363a208e393429f5e9", size = 4138535 }, - { url = "https://files.pythonhosted.org/packages/a0/c6/d5eb6a4fb32a3f9c21a8c7613ec706534ea1cf9f4b3663e99f0d83f6fca8/pillow-12.1.1-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:e65498daf4b583091ccbb2556c7000abf0f3349fcd57ef7adc9a84a394ed29f6", size = 3601364 }, - { url = "https://files.pythonhosted.org/packages/14/a1/16c4b823838ba4c9c52c0e6bbda903a3fe5a1bdbf1b8eb4fff7156f3e318/pillow-12.1.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:6c6db3b84c87d48d0088943bf33440e0c42370b99b1c2a7989216f7b42eede60", size = 5262561 }, - { url = "https://files.pythonhosted.org/packages/bb/ad/ad9dc98ff24f485008aa5cdedaf1a219876f6f6c42a4626c08bc4e80b120/pillow-12.1.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:8b7e5304e34942bf62e15184219a7b5ad4ff7f3bb5cca4d984f37df1a0e1aee2", size = 4657460 }, - { url = "https://files.pythonhosted.org/packages/9e/1b/f1a4ea9a895b5732152789326202a82464d5254759fbacae4deea3069334/pillow-12.1.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:18e5bddd742a44b7e6b1e773ab5db102bd7a94c32555ba656e76d319d19c3850", size = 6232698 }, - { url = "https://files.pythonhosted.org/packages/95/f4/86f51b8745070daf21fd2e5b1fe0eb35d4db9ca26e6d58366562fb56a743/pillow-12.1.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fc44ef1f3de4f45b50ccf9136999d71abb99dca7706bc75d222ed350b9fd2289", size = 8041706 }, - { url = "https://files.pythonhosted.org/packages/29/9b/d6ecd956bb1266dd1045e995cce9b8d77759e740953a1c9aad9502a0461e/pillow-12.1.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5a8eb7ed8d4198bccbd07058416eeec51686b498e784eda166395a23eb99138e", size = 6346621 }, - { url = "https://files.pythonhosted.org/packages/71/24/538bff45bde96535d7d998c6fed1a751c75ac7c53c37c90dc2601b243893/pillow-12.1.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:47b94983da0c642de92ced1702c5b6c292a84bd3a8e1d1702ff923f183594717", size = 7038069 }, - { url = "https://files.pythonhosted.org/packages/94/0e/58cb1a6bc48f746bc4cb3adb8cabff73e2742c92b3bf7a220b7cf69b9177/pillow-12.1.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:518a48c2aab7ce596d3bf79d0e275661b846e86e4d0e7dec34712c30fe07f02a", size = 6460040 }, - { url = "https://files.pythonhosted.org/packages/6c/57/9045cb3ff11eeb6c1adce3b2d60d7d299d7b273a2e6c8381a524abfdc474/pillow-12.1.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a550ae29b95c6dc13cf69e2c9dc5747f814c54eeb2e32d683e5e93af56caa029", size = 7164523 }, - { url = "https://files.pythonhosted.org/packages/73/f2/9be9cb99f2175f0d4dbadd6616ce1bf068ee54a28277ea1bf1fbf729c250/pillow-12.1.1-cp313-cp313-win32.whl", hash = "sha256:a003d7422449f6d1e3a34e3dd4110c22148336918ddbfc6a32581cd54b2e0b2b", size = 6332552 }, - { url = "https://files.pythonhosted.org/packages/3f/eb/b0834ad8b583d7d9d42b80becff092082a1c3c156bb582590fcc973f1c7c/pillow-12.1.1-cp313-cp313-win_amd64.whl", hash = "sha256:344cf1e3dab3be4b1fa08e449323d98a2a3f819ad20f4b22e77a0ede31f0faa1", size = 7040108 }, - { url = "https://files.pythonhosted.org/packages/d5/7d/fc09634e2aabdd0feabaff4a32f4a7d97789223e7c2042fd805ea4b4d2c2/pillow-12.1.1-cp313-cp313-win_arm64.whl", hash = "sha256:5c0dd1636633e7e6a0afe7bf6a51a14992b7f8e60de5789018ebbdfae55b040a", size = 2453712 }, - { url = "https://files.pythonhosted.org/packages/19/2a/b9d62794fc8a0dd14c1943df68347badbd5511103e0d04c035ffe5cf2255/pillow-12.1.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0330d233c1a0ead844fc097a7d16c0abff4c12e856c0b325f231820fee1f39da", size = 5264880 }, - { url = "https://files.pythonhosted.org/packages/26/9d/e03d857d1347fa5ed9247e123fcd2a97b6220e15e9cb73ca0a8d91702c6e/pillow-12.1.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5dae5f21afb91322f2ff791895ddd8889e5e947ff59f71b46041c8ce6db790bc", size = 4660616 }, - { url = "https://files.pythonhosted.org/packages/f7/ec/8a6d22afd02570d30954e043f09c32772bfe143ba9285e2fdb11284952cd/pillow-12.1.1-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:2e0c664be47252947d870ac0d327fea7e63985a08794758aa8af5b6cb6ec0c9c", size = 6269008 }, - { url = "https://files.pythonhosted.org/packages/3d/1d/6d875422c9f28a4a361f495a5f68d9de4a66941dc2c619103ca335fa6446/pillow-12.1.1-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:691ab2ac363b8217f7d31b3497108fb1f50faab2f75dfb03284ec2f217e87bf8", size = 8073226 }, - { url = "https://files.pythonhosted.org/packages/a1/cd/134b0b6ee5eda6dc09e25e24b40fdafe11a520bc725c1d0bbaa5e00bf95b/pillow-12.1.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e9e8064fb1cc019296958595f6db671fba95209e3ceb0c4734c9baf97de04b20", size = 6380136 }, - { url = "https://files.pythonhosted.org/packages/7a/a9/7628f013f18f001c1b98d8fffe3452f306a70dc6aba7d931019e0492f45e/pillow-12.1.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:472a8d7ded663e6162dafdf20015c486a7009483ca671cece7a9279b512fcb13", size = 7067129 }, - { url = "https://files.pythonhosted.org/packages/1e/f8/66ab30a2193b277785601e82ee2d49f68ea575d9637e5e234faaa98efa4c/pillow-12.1.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:89b54027a766529136a06cfebeecb3a04900397a3590fd252160b888479517bf", size = 6491807 }, - { url = "https://files.pythonhosted.org/packages/da/0b/a877a6627dc8318fdb84e357c5e1a758c0941ab1ddffdafd231983788579/pillow-12.1.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:86172b0831b82ce4f7877f280055892b31179e1576aa00d0df3bb1bbf8c3e524", size = 7190954 }, - { url = "https://files.pythonhosted.org/packages/83/43/6f732ff85743cf746b1361b91665d9f5155e1483817f693f8d57ea93147f/pillow-12.1.1-cp313-cp313t-win32.whl", hash = "sha256:44ce27545b6efcf0fdbdceb31c9a5bdea9333e664cda58a7e674bb74608b3986", size = 6336441 }, - { url = "https://files.pythonhosted.org/packages/3b/44/e865ef3986611bb75bfabdf94a590016ea327833f434558801122979cd0e/pillow-12.1.1-cp313-cp313t-win_amd64.whl", hash = "sha256:a285e3eb7a5a45a2ff504e31f4a8d1b12ef62e84e5411c6804a42197c1cf586c", size = 7045383 }, - { url = "https://files.pythonhosted.org/packages/a8/c6/f4fb24268d0c6908b9f04143697ea18b0379490cb74ba9e8d41b898bd005/pillow-12.1.1-cp313-cp313t-win_arm64.whl", hash = "sha256:cc7d296b5ea4d29e6570dabeaed58d31c3fea35a633a69679fb03d7664f43fb3", size = 2456104 }, - { url = "https://files.pythonhosted.org/packages/03/d0/bebb3ffbf31c5a8e97241476c4cf8b9828954693ce6744b4a2326af3e16b/pillow-12.1.1-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:417423db963cb4be8bac3fc1204fe61610f6abeed1580a7a2cbb2fbda20f12af", size = 4062652 }, - { url = "https://files.pythonhosted.org/packages/2d/c0/0e16fb0addda4851445c28f8350d8c512f09de27bbb0d6d0bbf8b6709605/pillow-12.1.1-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:b957b71c6b2387610f556a7eb0828afbe40b4a98036fc0d2acfa5a44a0c2036f", size = 4138823 }, - { url = "https://files.pythonhosted.org/packages/6b/fb/6170ec655d6f6bb6630a013dd7cf7bc218423d7b5fa9071bf63dc32175ae/pillow-12.1.1-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:097690ba1f2efdeb165a20469d59d8bb03c55fb6621eb2041a060ae8ea3e9642", size = 3601143 }, - { url = "https://files.pythonhosted.org/packages/59/04/dc5c3f297510ba9a6837cbb318b87dd2b8f73eb41a43cc63767f65cb599c/pillow-12.1.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:2815a87ab27848db0321fb78c7f0b2c8649dee134b7f2b80c6a45c6831d75ccd", size = 5266254 }, - { url = "https://files.pythonhosted.org/packages/05/30/5db1236b0d6313f03ebf97f5e17cda9ca060f524b2fcc875149a8360b21c/pillow-12.1.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:f7ed2c6543bad5a7d5530eb9e78c53132f93dfa44a28492db88b41cdab885202", size = 4657499 }, - { url = "https://files.pythonhosted.org/packages/6f/18/008d2ca0eb612e81968e8be0bbae5051efba24d52debf930126d7eaacbba/pillow-12.1.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:652a2c9ccfb556235b2b501a3a7cf3742148cd22e04b5625c5fe057ea3e3191f", size = 6232137 }, - { url = "https://files.pythonhosted.org/packages/70/f1/f14d5b8eeb4b2cd62b9f9f847eb6605f103df89ef619ac68f92f748614ea/pillow-12.1.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d6e4571eedf43af33d0fc233a382a76e849badbccdf1ac438841308652a08e1f", size = 8042721 }, - { url = "https://files.pythonhosted.org/packages/5a/d6/17824509146e4babbdabf04d8171491fa9d776f7061ff6e727522df9bd03/pillow-12.1.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b574c51cf7d5d62e9be37ba446224b59a2da26dc4c1bb2ecbe936a4fb1a7cb7f", size = 6347798 }, - { url = "https://files.pythonhosted.org/packages/d1/ee/c85a38a9ab92037a75615aba572c85ea51e605265036e00c5b67dfafbfe2/pillow-12.1.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a37691702ed687799de29a518d63d4682d9016932db66d4e90c345831b02fb4e", size = 7039315 }, - { url = "https://files.pythonhosted.org/packages/ec/f3/bc8ccc6e08a148290d7523bde4d9a0d6c981db34631390dc6e6ec34cacf6/pillow-12.1.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:f95c00d5d6700b2b890479664a06e754974848afaae5e21beb4d83c106923fd0", size = 6462360 }, - { url = "https://files.pythonhosted.org/packages/f6/ab/69a42656adb1d0665ab051eec58a41f169ad295cf81ad45406963105408f/pillow-12.1.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:559b38da23606e68681337ad74622c4dbba02254fc9cb4488a305dd5975c7eeb", size = 7165438 }, - { url = "https://files.pythonhosted.org/packages/02/46/81f7aa8941873f0f01d4b55cc543b0a3d03ec2ee30d617a0448bf6bd6dec/pillow-12.1.1-cp314-cp314-win32.whl", hash = "sha256:03edcc34d688572014ff223c125a3f77fb08091e4607e7745002fc214070b35f", size = 6431503 }, - { url = "https://files.pythonhosted.org/packages/40/72/4c245f7d1044b67affc7f134a09ea619d4895333d35322b775b928180044/pillow-12.1.1-cp314-cp314-win_amd64.whl", hash = "sha256:50480dcd74fa63b8e78235957d302d98d98d82ccbfac4c7e12108ba9ecbdba15", size = 7176748 }, - { url = "https://files.pythonhosted.org/packages/e4/ad/8a87bdbe038c5c698736e3348af5c2194ffb872ea52f11894c95f9305435/pillow-12.1.1-cp314-cp314-win_arm64.whl", hash = "sha256:5cb1785d97b0c3d1d1a16bc1d710c4a0049daefc4935f3a8f31f827f4d3d2e7f", size = 2544314 }, - { url = "https://files.pythonhosted.org/packages/6c/9d/efd18493f9de13b87ede7c47e69184b9e859e4427225ea962e32e56a49bc/pillow-12.1.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:1f90cff8aa76835cba5769f0b3121a22bd4eb9e6884cfe338216e557a9a548b8", size = 5268612 }, - { url = "https://files.pythonhosted.org/packages/f8/f1/4f42eb2b388eb2ffc660dcb7f7b556c1015c53ebd5f7f754965ef997585b/pillow-12.1.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1f1be78ce9466a7ee64bfda57bdba0f7cc499d9794d518b854816c41bf0aa4e9", size = 4660567 }, - { url = "https://files.pythonhosted.org/packages/01/54/df6ef130fa43e4b82e32624a7b821a2be1c5653a5fdad8469687a7db4e00/pillow-12.1.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:42fc1f4677106188ad9a55562bbade416f8b55456f522430fadab3cef7cd4e60", size = 6269951 }, - { url = "https://files.pythonhosted.org/packages/a9/48/618752d06cc44bb4aae8ce0cd4e6426871929ed7b46215638088270d9b34/pillow-12.1.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:98edb152429ab62a1818039744d8fbb3ccab98a7c29fc3d5fcef158f3f1f68b7", size = 8074769 }, - { url = "https://files.pythonhosted.org/packages/c3/bd/f1d71eb39a72fa088d938655afba3e00b38018d052752f435838961127d8/pillow-12.1.1-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d470ab1178551dd17fdba0fef463359c41aaa613cdcd7ff8373f54be629f9f8f", size = 6381358 }, - { url = "https://files.pythonhosted.org/packages/64/ef/c784e20b96674ed36a5af839305f55616f8b4f8aa8eeccf8531a6e312243/pillow-12.1.1-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6408a7b064595afcab0a49393a413732a35788f2a5092fdc6266952ed67de586", size = 7068558 }, - { url = "https://files.pythonhosted.org/packages/73/cb/8059688b74422ae61278202c4e1ad992e8a2e7375227be0a21c6b87ca8d5/pillow-12.1.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:5d8c41325b382c07799a3682c1c258469ea2ff97103c53717b7893862d0c98ce", size = 6493028 }, - { url = "https://files.pythonhosted.org/packages/c6/da/e3c008ed7d2dd1f905b15949325934510b9d1931e5df999bb15972756818/pillow-12.1.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:c7697918b5be27424e9ce568193efd13d925c4481dd364e43f5dff72d33e10f8", size = 7191940 }, - { url = "https://files.pythonhosted.org/packages/01/4a/9202e8d11714c1fc5951f2e1ef362f2d7fbc595e1f6717971d5dd750e969/pillow-12.1.1-cp314-cp314t-win32.whl", hash = "sha256:d2912fd8114fc5545aa3a4b5576512f64c55a03f3ebcca4c10194d593d43ea36", size = 6438736 }, - { url = "https://files.pythonhosted.org/packages/f3/ca/cbce2327eb9885476b3957b2e82eb12c866a8b16ad77392864ad601022ce/pillow-12.1.1-cp314-cp314t-win_amd64.whl", hash = "sha256:4ceb838d4bd9dab43e06c363cab2eebf63846d6a4aeaea283bbdfd8f1a8ed58b", size = 7182894 }, - { url = "https://files.pythonhosted.org/packages/ec/d2/de599c95ba0a973b94410477f8bf0b6f0b5e67360eb89bcb1ad365258beb/pillow-12.1.1-cp314-cp314t-win_arm64.whl", hash = "sha256:7b03048319bfc6170e93bd60728a1af51d3dd7704935feb228c4d4faab35d334", size = 2546446 }, - { url = "https://files.pythonhosted.org/packages/56/11/5d43209aa4cb58e0cc80127956ff1796a68b928e6324bbf06ef4db34367b/pillow-12.1.1-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:600fd103672b925fe62ed08e0d874ea34d692474df6f4bf7ebe148b30f89f39f", size = 5228606 }, - { url = "https://files.pythonhosted.org/packages/5f/d5/3b005b4e4fda6698b371fa6c21b097d4707585d7db99e98d9b0b87ac612a/pillow-12.1.1-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:665e1b916b043cef294bc54d47bf02d87e13f769bc4bc5fa225a24b3a6c5aca9", size = 4622321 }, - { url = "https://files.pythonhosted.org/packages/df/36/ed3ea2d594356fd8037e5a01f6156c74bc8d92dbb0fa60746cc96cabb6e8/pillow-12.1.1-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:495c302af3aad1ca67420ddd5c7bd480c8867ad173528767d906428057a11f0e", size = 5247579 }, - { url = "https://files.pythonhosted.org/packages/54/9a/9cc3e029683cf6d20ae5085da0dafc63148e3252c2f13328e553aaa13cfb/pillow-12.1.1-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:8fd420ef0c52c88b5a035a0886f367748c72147b2b8f384c9d12656678dfdfa9", size = 6989094 }, - { url = "https://files.pythonhosted.org/packages/00/98/fc53ab36da80b88df0967896b6c4b4cd948a0dc5aa40a754266aa3ae48b3/pillow-12.1.1-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f975aa7ef9684ce7e2c18a3aa8f8e2106ce1e46b94ab713d156b2898811651d3", size = 5313850 }, - { url = "https://files.pythonhosted.org/packages/30/02/00fa585abfd9fe9d73e5f6e554dc36cc2b842898cbfc46d70353dae227f8/pillow-12.1.1-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8089c852a56c2966cf18835db62d9b34fef7ba74c726ad943928d494fa7f4735", size = 5963343 }, - { url = "https://files.pythonhosted.org/packages/f2/26/c56ce33ca856e358d27fda9676c055395abddb82c35ac0f593877ed4562e/pillow-12.1.1-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:cb9bb857b2d057c6dfc72ac5f3b44836924ba15721882ef103cecb40d002d80e", size = 7029880 }, -] - [[package]] name = "platformdirs" version = "4.9.2" @@ -2806,15 +2349,6 @@ crypto = [ { name = "cryptography" }, ] -[[package]] -name = "pyparsing" -version = "3.3.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f3/91/9c6ee907786a473bf81c5f53cf703ba0957b23ab84c264080fb5a450416f/pyparsing-3.3.2.tar.gz", hash = "sha256:c777f4d763f140633dcb6d8a3eda953bf7a214dc4eff598413c070bcdc117cbc", size = 6851574 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/10/bd/c038d7cc38edc1aa5bf91ab8068b63d4308c66c4c8bb3cbba7dfbc049f9c/pyparsing-3.3.2-py3-none-any.whl", hash = "sha256:850ba148bd908d7e2411587e247a1e4f0327839c40e2e5e6d05a007ecc69911d", size = 122781 }, -] - [[package]] name = "python-dateutil" version = "2.9.0.post0" @@ -3282,20 +2816,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/07/39/338d9219c4e87f3e708f18857ecd24d22a0c3094752393319553096b98af/scipy-1.17.1-cp314-cp314t-win_arm64.whl", hash = "sha256:200e1050faffacc162be6a486a984a0497866ec54149a01270adc8a59b7c7d21", size = 25489165 }, ] -[[package]] -name = "seaborn" -version = "0.13.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "matplotlib" }, - { name = "numpy" }, - { name = "pandas" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/86/59/a451d7420a77ab0b98f7affa3a1d78a313d2f7281a57afb1a34bae8ab412/seaborn-0.13.2.tar.gz", hash = "sha256:93e60a40988f4d65e9f4885df477e2fdaff6b73a9ded434c1ab356dd57eefff7", size = 1457696 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/83/11/00d3c3dfc25ad54e731d91449895a79e4bf2384dc3ac01809010ba88f6d5/seaborn-0.13.2-py3-none-any.whl", hash = "sha256:636f8336facf092165e27924f223d3c62ca560b1f2bb5dff7ab7fad265361987", size = 294914 }, -] - [[package]] name = "semver" version = "3.0.4" @@ -3596,8 +3116,8 @@ name = "uvicorn" version = "0.41.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "click", marker = "sys_platform != 'emscripten'" }, - { name = "h11", marker = "sys_platform != 'emscripten'" }, + { name = "click" }, + { name = "h11" }, ] sdist = { url = "https://files.pythonhosted.org/packages/32/ce/eeb58ae4ac36fe09e3842eb02e0eb676bf2c53ae062b98f1b2531673efdd/uvicorn-0.41.0.tar.gz", hash = "sha256:09d11cf7008da33113824ee5a1c6422d89fbc2ff476540d69a34c87fab8b571a", size = 82633 } wheels = [ @@ -3923,7 +3443,7 @@ name = "zipfile-zstd" version = "0.0.4" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "zstandard", marker = "python_full_version < '3.14'" }, + { name = "zstandard" }, ] sdist = { url = "https://files.pythonhosted.org/packages/f7/2a/2e0941bc0058d10ab37d8c578b94a19f611f6ae54f124140f2fb451f0932/zipfile-zstd-0.0.4.tar.gz", hash = "sha256:c1498e15b7922a3d1af0ea55df8b11b2af4e8f7e0e80e414e25d66899f7def89", size = 4603 } wheels = [ From 6cad3976203a6f85cdeecf3751c9259b39e09e3a Mon Sep 17 00:00:00 2001 From: Dany Haddad Date: Wed, 1 Apr 2026 16:12:04 +0000 Subject: [PATCH 13/13] Update provider deps for new model support --- pyproject.toml | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 01d37c2f..e1c6c1f9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,12 +10,12 @@ requires-python = ">=3.11" dependencies = [ "inspect_ai==0.3.114", "agent-eval[leaderboard]==0.1.46", - "openai>=1.78.0", # required by inspect + "openai>=2.30.0", # provider SDK version validated with the current model set "pydantic>=2.11.4", # required by inspect "litellm==1.82.3", "datasets~=3.2.0", "huggingface_hub", - "google-genai>=1.16.1", + "google-genai>=1.70.0", "nltk", "tabulate", "click==8.1.8", @@ -26,7 +26,7 @@ dependencies = [ "h2~=4.2.0", "pandas", "scipy", - "anthropic>=0.52.0", + "anthropic>=0.87.0", "platformdirs", "numpy", ] @@ -64,14 +64,6 @@ conflicts = [ [{extra = "sqa"}, {extra = "futurehouse"}], [{extra = "storm"}, {extra = "smolagents"}], ] -override-dependencies = [ - # sqa pins openai to a lower version than inspect requires - "openai==1.78.0", - - # STORM pretends to require a lower version, but doesn't actually need it: - # https://github.com/allenai/asta-bench/issues/31#issuecomment-3045978008 - "datasets~=3.2.0", -] [tool.flake8] select = [