diff --git a/astabench/evals/sqa/retry_utils.py b/astabench/evals/sqa/retry_utils.py index c00ee24a..1ad355f4 100644 --- a/astabench/evals/sqa/retry_utils.py +++ b/astabench/evals/sqa/retry_utils.py @@ -4,7 +4,7 @@ import logging import sqlite3 from datetime import datetime, timedelta -from typing import Any, Dict, Optional, Union, List, Tuple +from typing import Any, Callable, Dict, Optional, Union, List, Tuple from inspect_ai.model import Model, GenerateConfig @@ -19,6 +19,7 @@ async def generate_with_retry( max_retries: int = 20, base_delay: float = 2.0, desired_schema=None, + parsed_validator: Optional[Callable[[Dict[str, Any]], None]] = None, ) -> Any: """ Generate response with retry logic and optional JSON parsing. @@ -57,6 +58,8 @@ async def generate_with_retry( if desired_schema: parsed = desired_schema(**parsed) parsed = parsed.model_dump(mode="json") + if parsed_validator: + parsed_validator(parsed) return result, parsed, attempt else: diff --git a/astabench/evals/sqa/rubric.py b/astabench/evals/sqa/rubric.py index 6e03d04b..9e2bd37e 100644 --- a/astabench/evals/sqa/rubric.py +++ b/astabench/evals/sqa/rubric.py @@ -368,6 +368,47 @@ async def _assess_properties_independently(self, response, properties): score_components[x.name] = assessment return score_components, prompt_logs + @staticmethod + def _validate_joint_assessment_payload( + parsed: Dict[str, Any], expected_criteria_count: int + ) -> None: + scores = parsed.get("scores") + if not isinstance(scores, list): + raise ValueError("Joint rubric scorer output is missing a 'scores' list") + + seen_indices = [] + for score in scores: + if not isinstance(score, dict): + raise ValueError( + "Joint rubric scorer output contains a non-object score" + ) + criteria_idx = score.get("criteria_idx") + if not isinstance(criteria_idx, int): + raise ValueError( + "Joint rubric scorer output contains a non-integer criteria_idx" + ) + if criteria_idx < 1 or criteria_idx > expected_criteria_count: + raise ValueError( + f"Joint rubric scorer output contains out-of-range criteria_idx={criteria_idx}" + ) + seen_indices.append(criteria_idx) + + expected_indices = set(range(1, expected_criteria_count + 1)) + actual_indices = set(seen_indices) + if ( + actual_indices != expected_indices + or len(seen_indices) != expected_criteria_count + ): + missing = sorted(expected_indices - actual_indices) + duplicate = sorted( + {idx for idx in seen_indices if seen_indices.count(idx) > 1} + ) + raise ValueError( + "Joint rubric scorer output did not cover every criterion exactly once. " + f"expected={expected_criteria_count} actual={len(seen_indices)} " + f"missing={missing} duplicate={duplicate}" + ) + async def _assess_properties_jointly(self, response, properties): info = { "step_name": "score_property", @@ -461,6 +502,10 @@ async def _assess_properties_jointly(self, response, properties): temperature=self.temperature, top_p=self.top_p, ), + desired_schema=ResponseCriteriaScores, + parsed_validator=lambda parsed: self._validate_joint_assessment_payload( + parsed, len(has_criterion) + ), ) info["system_prompt"] = system_prompt info["user_prompt"] = user_prompt @@ -508,7 +553,15 @@ async def score_output_simplified( ) score_components.update(assessments) - assert set(score_components.keys()) == set(score_weights.keys()) + if set(score_components.keys()) != set(score_weights.keys()): + missing = sorted(set(score_weights.keys()) - set(score_components.keys())) + unexpected = sorted( + set(score_components.keys()) - set(score_weights.keys()) + ) + raise ValueError( + "Simplified rubric scoring produced mismatched components. " + f"missing={missing} unexpected={unexpected}" + ) ann_score = sum( score_weights[key] * score_components[key] for key in score_weights ) diff --git a/dev_dvc_logs/debug_logs/.gitignore b/dev_dvc_logs/debug_logs/.gitignore index d035a97e..5707ecfd 100644 --- a/dev_dvc_logs/debug_logs/.gitignore +++ b/dev_dvc_logs/debug_logs/.gitignore @@ -16,3 +16,12 @@ /task_sqa_solver_openscholar_rubric_eval.csv /task_sqa_solver_openscholar_citation_eval.csv /task_sqa_solver_openscholar_answer_precision_eval.csv +/task_sqa_solver_sqa_claude-4.6_rubric_eval.csv +/task_sqa_solver_sqa_claude-4.6_citation_eval.csv +/task_sqa_solver_sqa_claude-4.6_answer_precision_eval.csv +/task_sqa_solver_sqa_o3_high_rubric_eval.csv +/task_sqa_solver_sqa_o3_high_citation_eval.csv +/task_sqa_solver_sqa_o3_high_answer_precision_eval.csv +/task_sqa_solver_sqa_gemini-3.1-pro-preview_rubric_eval.csv +/task_sqa_solver_sqa_gemini-3.1-pro-preview_citation_eval.csv +/task_sqa_solver_sqa_gemini-3.1-pro-preview_answer_precision_eval.csv diff --git a/dvc.lock b/dvc.lock index 53110fd4..ba3ee635 100644 --- a/dvc.lock +++ b/dvc.lock @@ -594,13 +594,13 @@ stages: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_storm uv run --extra storm --python 3.11 inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir dev_dvc_logs/solver_outputs/ --solver astabench/solvers/sqa/storm_solver.py@storm_solver - -T split=dev -T scorer_model=google/gemini-2.5-pro --limit=1000 --log-shared + -T split=dev -T scorer_model=google/gemini-3-flash-preview --limit=1000 --log-shared --no-score; mv "$(ls -t dev_dvc_logs/solver_outputs/*task_sqa_solver_storm.eval 2>/dev/null | head -n1)" "dev_dvc_logs/solver_outputs/task_sqa_solver_storm.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-pro + scorer_model: google/gemini-3-flash-preview outs: - path: dev_dvc_logs/solver_outputs/task_sqa_solver_storm.eval hash: md5 @@ -644,18 +644,18 @@ stages: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_elicit uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir dev_dvc_logs/solver_outputs/ --solver astabench/solvers/sqa/elicit/memorized_solver.py@elicit_solver -T split=dev - -T scorer_model=google/gemini-2.5-flash --limit=1000 --log-shared --no-score; + -T scorer_model=google/gemini-3-flash-preview --limit=1000 --log-shared --no-score; mv "$(ls -t dev_dvc_logs/solver_outputs/*task_sqa_solver_elicit.eval 2>/dev/null | head -n1)" "dev_dvc_logs/solver_outputs/task_sqa_solver_elicit.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview outs: - path: dev_dvc_logs/solver_outputs/task_sqa_solver_elicit.eval hash: md5 - md5: 8d08ed8249687b3d41a8e06d7533c852 - size: 4949142 + md5: 336b302907ace455f3b7457eaaee68b2 + size: 4445686 score_all_solvers@anthropic/claude-3-5-sonnet-20240620-test: cmd: echo "Scoring";[[ "anthropic/claude-3-5-sonnet-20240620" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_anthropic/claude-3-5-sonnet-20240620; @@ -816,88 +816,85 @@ stages: md5: 10496d398144a7a41e639823823d007f size: 1068696 solve_llm@model4-test: - cmd: - INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_anthropic/claude-3-7-sonnet-20250219 - uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain - --log-dir test_dvc_logs/solver_outputs/ --solver astabench/solvers/sqa/formatted_llm.py@formatted_solver - --model anthropic/claude-3-7-sonnet-20250219 -T split=test -T scorer_model=google/gemini-2.5-pro + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_openai/o3 uv run --extra sqa + inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ + --solver agent-baselines/agent_baselines/solvers/sqa/formatted_llm.py@formatted_solver + --model openai/o3 --reasoning-effort high --reasoning-tokens 8192 -M responses_store=false + --reasoning-history none -T split=test -T scorer_model=google/gemini-3-flash-preview -T excerpt_prompt=False --limit=1000 --retry-on-error=10 --log-shared --no-score; - [[ "anthropic/claude-3-7-sonnet-20250219" == */* ]] && mkdir -p test_dvc_logs/solver_outputs/task_sqa_solver_$(dirname - "anthropic/claude-3-7-sonnet-20250219"); mv "$(ls -t test_dvc_logs/solver_outputs/*anthropic/claude-3-7-sonnet-20250219.eval - 2>/dev/null | head -n1)" - "test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-3-7-sonnet-20250219.eval" + [[ "openai/o3" == */* ]] && mkdir -p test_dvc_logs/solver_outputs/task_sqa_solver_$(dirname + "openai/o3"); mv "$(ls -t test_dvc_logs/solver_outputs/*openai/o3.eval 2>/dev/null + | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_openai/o3.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-pro + scorer_model: google/gemini-3-flash-preview sqa_solver_version: may-23-2025 outs: - - path: - test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-3-7-sonnet-20250219.eval + - path: test_dvc_logs/solver_outputs/task_sqa_solver_openai/o3.eval hash: md5 - md5: a8fbef7fa3689f6859d79611e64981ed - size: 1850390 + md5: 5857c63cdab2b62a6f907c9a27a811ec + size: 17943556 solve_llm@model1-test: cmd: - INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_anthropic/claude-sonnet-4-20250514-thinking + INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_anthropic/claude-sonnet-4-6-thinking uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain - --log-dir test_dvc_logs/solver_outputs/ --solver astabench/solvers/sqa/formatted_llm.py@formatted_solver - --model anthropic/claude-sonnet-4-20250514 --reasoning-tokens 8192 -T split=test - -T scorer_model=google/gemini-2.5-pro -T excerpt_prompt=False --limit=1000 --retry-on-error=10 - --log-shared --no-score; [[ "anthropic/claude-sonnet-4-20250514" == */* ]] && - mkdir -p test_dvc_logs/solver_outputs/task_sqa_solver_$(dirname "anthropic/claude-sonnet-4-20250514"); - mv "$(ls -t test_dvc_logs/solver_outputs/*anthropic/claude-sonnet-4-20250514-thinking.eval + --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/formatted_llm.py@formatted_solver + --model anthropic/claude-sonnet-4-6 --reasoning-tokens 8192 -T split=test -T + scorer_model=google/gemini-3-flash-preview -T excerpt_prompt=False --limit=1000 + --retry-on-error=10 --log-shared --no-score; [[ "anthropic/claude-sonnet-4-6" + == */* ]] && mkdir -p test_dvc_logs/solver_outputs/task_sqa_solver_$(dirname + "anthropic/claude-sonnet-4-6"); mv "$(ls -t test_dvc_logs/solver_outputs/*anthropic/claude-sonnet-4-6-thinking.eval 2>/dev/null | head -n1)" - "test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-sonnet-4-20250514-thinking.eval" + "test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-pro + scorer_model: google/gemini-3-flash-preview sqa_solver_version: may-23-2025 outs: - path: - test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-sonnet-4-20250514-thinking.eval + test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking.eval hash: md5 - md5: 16646547ee30427da219b0983ca788de - size: 1923240 + md5: 3107b1f474692f8919a571cee6e6cb3f + size: 14844232 solve_llm@model3-test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_anthropic/claude-sonnet-4-20250514 + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_anthropic/claude-sonnet-4-6 uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain - --log-dir test_dvc_logs/solver_outputs/ --solver astabench/solvers/sqa/formatted_llm.py@formatted_solver - --model anthropic/claude-sonnet-4-20250514 -T split=test -T scorer_model=google/gemini-2.5-pro + --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/formatted_llm.py@formatted_solver + --model anthropic/claude-sonnet-4-6 -T split=test -T scorer_model=google/gemini-3-flash-preview -T excerpt_prompt=False --limit=1000 --retry-on-error=10 --log-shared --no-score; - [[ "anthropic/claude-sonnet-4-20250514" == */* ]] && mkdir -p test_dvc_logs/solver_outputs/task_sqa_solver_$(dirname - "anthropic/claude-sonnet-4-20250514"); mv "$(ls -t test_dvc_logs/solver_outputs/*anthropic/claude-sonnet-4-20250514.eval - 2>/dev/null | head -n1)" - "test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-sonnet-4-20250514.eval" + [[ "anthropic/claude-sonnet-4-6" == */* ]] && mkdir -p test_dvc_logs/solver_outputs/task_sqa_solver_$(dirname + "anthropic/claude-sonnet-4-6"); mv "$(ls -t test_dvc_logs/solver_outputs/*anthropic/claude-sonnet-4-6.eval + 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-sonnet-4-6.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-pro + scorer_model: google/gemini-3-flash-preview sqa_solver_version: may-23-2025 outs: - - path: - test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-sonnet-4-20250514.eval + - path: test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-sonnet-4-6.eval hash: md5 - md5: c509c6bcdd7f0692029d00e3f6187139 - size: 8161774 + md5: e521c24cb487f904c7d9aa56e8ab9ff9 + size: 13576196 solve_memorized@model0-test: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_openai_deep_research uv run - --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir - test_dvc_logs/solver_outputs/ --solver astabench/solvers/sqa/general_memorized/memorized_solver.py@formatted_solver - -T scorer_model=google/gemini-2.5-flash -T split=test -S sys_name_or_path=openai_deep_research --limit=1000 + --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa + --display plain --log-dir test_dvc_logs/solver_outputs/ --solver + agent-baselines/agent_baselines/solvers/sqa/general_memorized/memorized_solver.py@formatted_solver + -T scorer_model=google/gemini-3-flash-preview -T split=test -S sys_name_or_path=openai_deep_research --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*openai_deep_research.eval 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_openai_deep_research.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview sqa_solver_version: may-23-2025 outs: - path: test_dvc_logs/solver_outputs/task_sqa_solver_openai_deep_research.eval hash: md5 - md5: 0ca0403d04913c8d05c012bb013ed19e - size: 23480196 + md5: e64efa83d59f1e39f5f44f7472b7f92d + size: 20770084 solve_memorized@model1-test: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_fhouse_falcon uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ @@ -917,41 +914,39 @@ stages: md5: 1444e580204b243a0ca753f3c13bb97e size: 8435874 solve_llm@model2-test: - cmd: - INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_google/gemini-2.5-pro-preview-03-25 + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_google/gemini-3.1-pro-preview uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain - --log-dir test_dvc_logs/solver_outputs/ --solver astabench/solvers/sqa/formatted_llm.py@formatted_solver - --model google/gemini-2.5-pro-preview-03-25 -T split=test -T scorer_model=google/gemini-2.5-pro + --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/formatted_llm.py@formatted_solver + --model google/gemini-3.1-pro-preview -T split=test -T scorer_model=google/gemini-3-flash-preview -T excerpt_prompt=False --limit=1000 --retry-on-error=10 --log-shared --no-score; - [[ "google/gemini-2.5-pro-preview-03-25" == */* ]] && mkdir -p test_dvc_logs/solver_outputs/task_sqa_solver_$(dirname - "google/gemini-2.5-pro-preview-03-25"); mv "$(ls -t test_dvc_logs/solver_outputs/*google/gemini-2.5-pro-preview-03-25.eval + [[ "google/gemini-3.1-pro-preview" == */* ]] && mkdir -p test_dvc_logs/solver_outputs/task_sqa_solver_$(dirname + "google/gemini-3.1-pro-preview"); mv "$(ls -t test_dvc_logs/solver_outputs/*google/gemini-3.1-pro-preview.eval 2>/dev/null | head -n1)" - "test_dvc_logs/solver_outputs/task_sqa_solver_google/gemini-2.5-pro-preview-03-25.eval" + "test_dvc_logs/solver_outputs/task_sqa_solver_google/gemini-3.1-pro-preview.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-pro + scorer_model: google/gemini-3-flash-preview sqa_solver_version: may-23-2025 outs: - - path: - test_dvc_logs/solver_outputs/task_sqa_solver_google/gemini-2.5-pro-preview-03-25.eval + - path: test_dvc_logs/solver_outputs/task_sqa_solver_google/gemini-3.1-pro-preview.eval hash: md5 - md5: 8ab9ea1ffa6fc82f71e82a796fae1ab1 - size: 2603396 + md5: 9270f1cbd5f606bd646579fe524ae0f4 + size: 22505579 solve_llm@model0-test: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_openai/o4-mini uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ - --solver astabench/solvers/sqa/formatted_llm.py@formatted_solver --model openai/o4-mini - --reasoning-effort high --reasoning-tokens 8192 -M responses_store=false --reasoning-history - none -T split=test -T scorer_model=google/gemini-2.5-pro -T excerpt_prompt=False - --limit=1000 --retry-on-error=10 --log-shared --no-score; [[ "openai/o4-mini" - == */* ]] && mkdir -p test_dvc_logs/solver_outputs/task_sqa_solver_$(dirname + --solver agent-baselines/agent_baselines/solvers/sqa/formatted_llm.py@formatted_solver + --model openai/o4-mini --reasoning-effort high --reasoning-tokens 8192 -M responses_store=false + --reasoning-history none -T split=test -T scorer_model=google/gemini-3-flash-preview + -T excerpt_prompt=False --limit=1000 --retry-on-error=10 --log-shared --no-score; + [[ "openai/o4-mini" == */* ]] && mkdir -p test_dvc_logs/solver_outputs/task_sqa_solver_$(dirname "openai/o4-mini"); mv "$(ls -t test_dvc_logs/solver_outputs/*openai/o4-mini.eval 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_openai/o4-mini.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-pro + scorer_model: google/gemini-3-flash-preview sqa_solver_version: may-23-2025 outs: - path: test_dvc_logs/solver_outputs/task_sqa_solver_openai/o4-mini.eval @@ -1014,21 +1009,22 @@ stages: md5: 8e144b969d037a4509f5a8321d5f4092 size: 13573997 solve_elicit@test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_elicit uv run --extra sqa inspect - eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ - --solver astabench/solvers/sqa/elicit/memorized_solver.py@elicit_solver -T split=test - -T scorer_model=google/gemini-2.5-flash --limit=1000 --log-shared --no-score; - mv "$(ls -t test_dvc_logs/solver_outputs/*task_sqa_solver_elicit.eval 2>/dev/null - | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_elicit.eval" + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_elicit uv run --project agent-baselines + --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir + test_dvc_logs/solver_outputs/ --solver + agent-baselines/agent_baselines/solvers/sqa/elicit/memorized_solver.py@elicit_solver + -T split=test -T scorer_model=google/gemini-3-flash-preview --limit=1000 --log-shared + --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*task_sqa_solver_elicit.eval + 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_elicit.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview outs: - path: test_dvc_logs/solver_outputs/task_sqa_solver_elicit.eval hash: md5 - md5: 1fc7402e0fd42a53863c6a6f859b9fa8 - size: 4895087 + md5: eaf8f155d177afe6c3fb8b5913324458 + size: 15975606 score_all_solvers@elicit-test: cmd: echo "Scoring";[[ "elicit" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_elicit; cp test_dvc_logs/solver_outputs/task_sqa_solver_elicit.eval test_dvc_logs/scored/task_sqa_solver_elicit.eval; @@ -1208,57 +1204,57 @@ stages: md5: 26bbc824e8055613e796a159b16932af size: 1171464 solve_memorized@model0-dev: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_fhouse_crow uv run --extra - sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir dev_dvc_logs/solver_outputs/ - --solver astabench/solvers/sqa/general_memorized/memorized_solver.py@formatted_solver - -T scorer_model=google/gemini-2.5-pro -T split=dev -S sys_name_or_path=fhouse_crow - -S require_snippets=false --limit=1000 --retry-on-error=10 --log-shared --no-score; - mv "$(ls -t dev_dvc_logs/solver_outputs/*fhouse_crow.eval 2>/dev/null | head - -n1)" "dev_dvc_logs/solver_outputs/task_sqa_solver_fhouse_crow.eval" + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_openai_deep_research uv run + --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir + dev_dvc_logs/solver_outputs/ --solver astabench/solvers/sqa/general_memorized/memorized_solver.py@formatted_solver + -T scorer_model=google/gemini-3-flash-preview -T split=dev -S sys_name_or_path=openai_deep_research --limit=1000 + --retry-on-error=10 --log-shared --no-score; mv "$(ls -t dev_dvc_logs/solver_outputs/*openai_deep_research.eval + 2>/dev/null | head -n1)" "dev_dvc_logs/solver_outputs/task_sqa_solver_openai_deep_research.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-pro + scorer_model: google/gemini-3-flash-preview sqa_solver_version: may-23-2025 outs: - - path: dev_dvc_logs/solver_outputs/task_sqa_solver_fhouse_crow.eval + - path: dev_dvc_logs/solver_outputs/task_sqa_solver_openai_deep_research.eval hash: md5 - md5: 4ff53a9992d562521979130e3bb36fcf - size: 1195651 + md5: b201f2f7bd9c83f6e3c901805f385d44 + size: 1195688 solve_perplexity_dr@test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_perplexity_dr uv run --extra - sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ - -T with_search_tools=False --model 'perplexity/sonar-deep-research' --solver - astabench/solvers/sqa/formatted_perplexity.py@formatted_solver -T split=test - -T scorer_model=google/gemini-2.5-flash --limit=1000 --log-shared --no-score; - mv "$(ls -t test_dvc_logs/solver_outputs/*task_sqa_solver_perplexity_dr.eval + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_perplexity_dr uv run --project + agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display + plain --log-dir test_dvc_logs/solver_outputs/ -T with_search_tools=False --model + 'perplexity/sonar-deep-research' --solver + agent-baselines/agent_baselines/solvers/sqa/formatted_perplexity.py@formatted_solver + -T split=test -T scorer_model=google/gemini-3-flash-preview --limit=1000 --log-shared + --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*task_sqa_solver_perplexity_dr.eval 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_perplexity_dr.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview outs: - path: test_dvc_logs/solver_outputs/task_sqa_solver_perplexity_dr.eval hash: md5 - md5: 253e478e5891269aaf0b598a9320fff9 - size: 1784027 + md5: 4b19505fd7cab3fd73e60f065c57ac7c + size: 52745393 solve_perplexity_dr@dev: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_perplexity_dr uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir dev_dvc_logs/solver_outputs/ -T with_search_tools=False --model 'perplexity/sonar-deep-research' --solver astabench/solvers/sqa/formatted_perplexity.py@formatted_solver -T split=dev - -T scorer_model=google/gemini-2.5-flash --limit=1000 --log-shared --no-score; + -T scorer_model=google/gemini-3-flash-preview --limit=1000 --log-shared --no-score; mv "$(ls -t dev_dvc_logs/solver_outputs/*task_sqa_solver_perplexity_dr.eval 2>/dev/null | head -n1)" "dev_dvc_logs/solver_outputs/task_sqa_solver_perplexity_dr.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview outs: - path: dev_dvc_logs/solver_outputs/task_sqa_solver_perplexity_dr.eval hash: md5 - md5: 13761e458ca6d9fd68b66975a2b1c3be - size: 1187381 + md5: 8965cf43f6dc2e9a401b4d286637445d + size: 845996 solve_memorized@model2-dev: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_openai_deep_research uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir @@ -1366,8 +1362,8 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_openai/o4-mini.eval hash: md5 - md5: 707491ba3f460b10b740acc4f18fc009 - size: 2620767 + md5: 3528e6d68199489705ef51e91170387c + size: 26134288 outs: - path: test_dvc_logs/errors/task_sqa_solver_openai/o4-mini.md hash: md5 @@ -1375,7 +1371,7 @@ stages: size: 63 - path: test_dvc_logs/scores/task_sqa_solver_openai/o4-mini.md hash: md5 - md5: 51a6ebb6302c43db2ee01e0271b39377 + md5: cb895dc77c974183e26442188fc736b6 size: 254 log_any_remaining_errors_and_record_scores@google/gemini-2.5-pro-preview-03-25-test: cmd: echo "Collecting errors";[[ "google/gemini-2.5-pro-preview-03-25" == */* @@ -1544,8 +1540,8 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_elicit.eval hash: md5 - md5: eaf8f155d177afe6c3fb8b5913324458 - size: 15975606 + md5: d671bcb2fd90de540c69ef5bcccfb45a + size: 83436121 outs: - path: test_dvc_logs/errors/task_sqa_solver_elicit.md hash: md5 @@ -1553,7 +1549,7 @@ stages: size: 63 - path: test_dvc_logs/scores/task_sqa_solver_elicit.md hash: md5 - md5: 049dad5aaf612704ca160fdd49710a3e + md5: d8b5a93675f21c1eb409178be0c641c7 size: 259 log_any_remaining_errors_and_record_scores@storm-dev: cmd: echo "Collecting errors";[[ "storm" == */* ]] && mkdir -p dev_dvc_logs/errors/task_sqa_solver_$(dirname @@ -1582,8 +1578,8 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_scispace.eval hash: md5 - md5: b797a0b8f9b0d80449cd01a3e7c8771f - size: 33056563 + md5: 4ddb3772fac4198d986a45acab9ef587 + size: 188249490 outs: - path: test_dvc_logs/errors/task_sqa_solver_scispace.md hash: md5 @@ -1591,7 +1587,7 @@ stages: size: 63 - path: test_dvc_logs/scores/task_sqa_solver_scispace.md hash: md5 - md5: 3047259800df7c377569f4f7e125ae12 + md5: bd720bd2a3d2187b39848bcf6581ab4f size: 255 log_any_remaining_errors_and_record_scores@fhouse_crow-test: cmd: echo "Collecting errors";[[ "fhouse_crow" == */* ]] && mkdir -p test_dvc_logs/errors/task_sqa_solver_$(dirname @@ -1601,8 +1597,8 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_fhouse_crow.eval hash: md5 - md5: c46c37260737c0dc447ea49d194a1646 - size: 16755260 + md5: 02d170907f472a80e8fdc2a4604c57bd + size: 47671586 outs: - path: test_dvc_logs/errors/task_sqa_solver_fhouse_crow.md hash: md5 @@ -1610,8 +1606,8 @@ stages: size: 63 - path: test_dvc_logs/scores/task_sqa_solver_fhouse_crow.md hash: md5 - md5: 7d50a7a2afb1c892fd0b9a7172ba4e67 - size: 264 + md5: b9a9b82333f170a027755e52a73305f8 + size: 274 log_any_remaining_errors_and_record_scores@fhouse_falcon-dev: cmd: echo "Collecting errors";[[ "fhouse_falcon" == */* ]] && mkdir -p dev_dvc_logs/errors/task_sqa_solver_$(dirname "fhouse_falcon"); mkdir -p dev_dvc_logs/scores/task_sqa_solver_$(dirname "fhouse_falcon"); @@ -1639,8 +1635,8 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_fhouse_falcon.eval hash: md5 - md5: 973247fa294552e3807747362ef43b53 - size: 76892171 + md5: 19a1d2d9827872b4de45bd3070051135 + size: 196232443 outs: - path: test_dvc_logs/errors/task_sqa_solver_fhouse_falcon.md hash: md5 @@ -1648,8 +1644,8 @@ stages: size: 63 - path: test_dvc_logs/scores/task_sqa_solver_fhouse_falcon.md hash: md5 - md5: 24dfaa60688754273b11987ce25e8050 - size: 265 + md5: 0115f8e85682c50b1fbef8f0fa10d512 + size: 275 log_any_remaining_errors_and_record_scores@openai_deep_research-dev: cmd: echo "Collecting errors";[[ "openai_deep_research" == */* ]] && mkdir -p dev_dvc_logs/errors/task_sqa_solver_$(dirname "openai_deep_research"); mkdir @@ -1679,8 +1675,8 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_openai_deep_research.eval hash: md5 - md5: 0b9c6893461a6a71200faca09d42cf0b - size: 30114985 + md5: 522b75668b14dbaa609e5ef4cc5c8f8e + size: 115070053 outs: - path: test_dvc_logs/errors/task_sqa_solver_openai_deep_research.md hash: md5 @@ -1688,7 +1684,7 @@ stages: size: 63 - path: test_dvc_logs/scores/task_sqa_solver_openai_deep_research.md hash: md5 - md5: 7edf8195b55073641d2e3baa95574894 + md5: 09fd9c4363a1ad201d5df801e983ef4c size: 255 log_any_remaining_errors_and_record_scores@perplexity_dr-test: cmd: echo "Collecting errors";[[ "perplexity_dr" == */* ]] && mkdir -p test_dvc_logs/errors/task_sqa_solver_$(dirname @@ -1698,8 +1694,8 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_perplexity_dr.eval hash: md5 - md5: eb8e7ffc96608f726ce82345c217179f - size: 3783641 + md5: cb364e482a7e30f59965106a1ee417db + size: 143267910 outs: - path: test_dvc_logs/errors/task_sqa_solver_perplexity_dr.md hash: md5 @@ -1707,8 +1703,8 @@ stages: size: 63 - path: test_dvc_logs/scores/task_sqa_solver_perplexity_dr.md hash: md5 - md5: 3341f9837634fd03ab2bb537b2b6800b - size: 273 + md5: 5bd74f0abd0a5426f922b5474003d183 + size: 262 extract_model_responses@sqa_claude-3.7-dev: cmd: echo "Extracting responses"; [[ "sqa_claude-3.7" == */* ]] && mkdir -p dev_dvc_logs/model_responses/task_sqa_solver_$(dirname "sqa_claude-3.7"); uv run scripts/extract_model_responses.py dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-3.7.eval @@ -2032,24 +2028,24 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_elicit.eval hash: md5 - md5: eaf8f155d177afe6c3fb8b5913324458 - size: 15975606 + md5: d671bcb2fd90de540c69ef5bcccfb45a + size: 83436121 params: params.yaml: - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview outs: - path: test_dvc_logs/debug_logs/task_sqa_solver_elicit_answer_precision_eval.csv hash: md5 - md5: c8f550d2e3813a2d4bd73428531249cd - size: 27561214 + md5: 6fe55584fb19caabeb2f83b4711affdc + size: 26795074 - path: test_dvc_logs/debug_logs/task_sqa_solver_elicit_citation_eval.csv hash: md5 - md5: d76ef432235de34e9508d1327f5604dd - size: 100932205 + md5: 0c5f13a89d433dea784d304ddf5ac621 + size: 101106612 - path: test_dvc_logs/debug_logs/task_sqa_solver_elicit_rubric_eval.csv hash: md5 - md5: 9dd49e51140b72b4868d9ef01b4d7eeb - size: 22861958 + md5: b2dfcdb37c2e5795ac0b6c1aee816c22 + size: 22438331 create_nice_logs@storm-dev: cmd: echo "Creating logs"; [[ "storm" == */* ]] && mkdir -p dev_dvc_logs/debug_logs/task_sqa_solver_$(dirname "storm"); uv run scripts/create_debug_logs.py dev_dvc_logs/scored/ task_sqa_solver_storm.eval @@ -2082,24 +2078,24 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_fhouse_crow.eval hash: md5 - md5: 05ff5ed44795c23f8d31b83a559e3ac0 - size: 2964980 + md5: 02d170907f472a80e8fdc2a4604c57bd + size: 47671586 params: params.yaml: - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview outs: - path: test_dvc_logs/debug_logs/task_sqa_solver_fhouse_crow_answer_precision_eval.csv hash: md5 - md5: 989c4fc1aa6b57445b33547bd6bc36a6 - size: 3132753 + md5: 62c13d95dc2f072ce3bd7e1d8444de21 + size: 7565596 - path: test_dvc_logs/debug_logs/task_sqa_solver_fhouse_crow_citation_eval.csv hash: md5 - md5: 6bbd8e26a754b4950a5fdaded0e6fb51 - size: 10221124 + md5: 72ab5aa46f1b01186cb2b1f4b9847daf + size: 20315492 - path: test_dvc_logs/debug_logs/task_sqa_solver_fhouse_crow_rubric_eval.csv hash: md5 - md5: 7d6bd35aa25ce84835aafdf813790d17 - size: 3795058 + md5: f9302743cdca7f064e531d9ae4fed237 + size: 9200221 create_nice_logs@fhouse_falcon-test: cmd: echo "Creating logs"; [[ "fhouse_falcon" == */* ]] && mkdir -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname "fhouse_falcon"); uv run scripts/create_debug_logs.py test_dvc_logs/scored/ @@ -2107,24 +2103,24 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_fhouse_falcon.eval hash: md5 - md5: 0aad0b8d622b09e4ab9812b941beefbc - size: 14628117 + md5: 19a1d2d9827872b4de45bd3070051135 + size: 196232443 params: params.yaml: - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview outs: - path: test_dvc_logs/debug_logs/task_sqa_solver_fhouse_falcon_answer_precision_eval.csv hash: md5 - md5: e64bb7613ce2606912102a937ef42fe7 - size: 13252236 + md5: 37db319201463d9c0d95944f3c36bb10 + size: 33876044 - path: test_dvc_logs/debug_logs/task_sqa_solver_fhouse_falcon_citation_eval.csv hash: md5 - md5: 5b15b9771795e7c973a38a2ae3057477 - size: 108591713 + md5: cb0517c0a218b96bacaa16e96a864b4b + size: 347133727 - path: test_dvc_logs/debug_logs/task_sqa_solver_fhouse_falcon_rubric_eval.csv hash: md5 - md5: 7a2a72041e33fd16700d4c1d4b731ea0 - size: 12309141 + md5: 174bc8c7cf47e1bc369220fbdede4d24 + size: 33308962 create_nice_logs@openai_deep_research-test: cmd: echo "Creating logs"; [[ "openai_deep_research" == */* ]] && mkdir -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname "openai_deep_research"); uv run scripts/create_debug_logs.py test_dvc_logs/scored/ @@ -2132,25 +2128,25 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_openai_deep_research.eval hash: md5 - md5: 0b9c6893461a6a71200faca09d42cf0b - size: 30114985 + md5: 522b75668b14dbaa609e5ef4cc5c8f8e + size: 115070053 params: params.yaml: - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview outs: - path: test_dvc_logs/debug_logs/task_sqa_solver_openai_deep_research_answer_precision_eval.csv hash: md5 - md5: ed42a270972b76b6892fadd423a4d146 - size: 23972174 + md5: a2344ebb69bb62fcb3631e9ec7ebee05 + size: 13769615 - path: test_dvc_logs/debug_logs/task_sqa_solver_openai_deep_research_citation_eval.csv hash: md5 - md5: 152e59ee8bfda0ccc4fd2bb35dbd44a6 - size: 128934273 + md5: cc030bf5e8d82ec17a14fb3f13dd7e6a + size: 74932006 - path: test_dvc_logs/debug_logs/task_sqa_solver_openai_deep_research_rubric_eval.csv hash: md5 - md5: 72484abe43058939f9003f8df89322c0 - size: 22888136 + md5: 6ab5c8a01c4fa86ff90f47051d3cc531 + size: 13566123 create_nice_logs@perplexity_dr-test: cmd: echo "Creating logs"; [[ "perplexity_dr" == */* ]] && mkdir -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname "perplexity_dr"); uv run scripts/create_debug_logs.py test_dvc_logs/scored/ @@ -2158,40 +2154,40 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_perplexity_dr.eval hash: md5 - md5: eb8e7ffc96608f726ce82345c217179f - size: 3783641 + md5: cb364e482a7e30f59965106a1ee417db + size: 143267910 params: params.yaml: - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview outs: - path: test_dvc_logs/debug_logs/task_sqa_solver_perplexity_dr_answer_precision_eval.csv hash: md5 - md5: f8504e5f0c7aef86b9f785c44f1385f2 - size: 2380106 + md5: d9525de6997367f68baf41a9a319e6a9 + size: 21877548 - path: test_dvc_logs/debug_logs/task_sqa_solver_perplexity_dr_citation_eval.csv hash: md5 - md5: b80d4fa20bac3e0d5dc9e2ba8fe50588 - size: 6825224 + md5: 855e1fbb1fe3b7143bb80b03ebf877b9 + size: 138212606 - path: test_dvc_logs/debug_logs/task_sqa_solver_perplexity_dr_rubric_eval.csv hash: md5 - md5: b45f5856e3195800b6a4f18ab0fa687e - size: 4371131 + md5: c5eaf89b3a46a1082be673efa7e46acf + size: 19231579 solve_you@test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_you uv run --extra sqa inspect - eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ - --solver astabench/solvers/sqa/formatted_youcom.py@formatted_solver -S api_type='research' - -T split=test -T scorer_model=google/gemini-2.5-flash --limit=1000 --log-shared - --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*task_sqa_solver_you.eval + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_you uv run --project agent-baselines + --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir + test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/formatted_youcom.py@formatted_solver + -S api_type='research' -T split=test -T scorer_model=google/gemini-3-flash-preview + --limit=1000 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*task_sqa_solver_you.eval 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_you.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview outs: - path: test_dvc_logs/solver_outputs/task_sqa_solver_you.eval hash: md5 - md5: e7be9b73b33e99893b170afb853a21ae - size: 1731090 + md5: dffe67db7f27087923ec47f447e5006f + size: 3981342 create_nice_logs@google/gemini-2.5-pro-preview-03-25-test: cmd: echo "Creating logs"; [[ "google/gemini-2.5-pro-preview-03-25" == */* ]] && mkdir -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname "google/gemini-2.5-pro-preview-03-25"); @@ -2247,35 +2243,35 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_you.eval hash: md5 - md5: 97f9ff45622f40bec85ecdf7de74a68f - size: 3947510 + md5: 9e838e47e328b92f7c0e50e39e77acb5 + size: 46907880 params: params.yaml: - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview outs: - path: test_dvc_logs/debug_logs/task_sqa_solver_you_answer_precision_eval.csv hash: md5 - md5: 3cfa316dbb65a767f448672b4f21f903 - size: 3599826 + md5: 20398962c534bdf847f44d45d5c0c739 + size: 3642395 - path: test_dvc_logs/debug_logs/task_sqa_solver_you_citation_eval.csv hash: md5 - md5: e349b629bc72a358f5518366d3d1ea84 - size: 11149105 + md5: ac233a5d818e4b248bcbf800dcdd0ad4 + size: 11270935 - path: test_dvc_logs/debug_logs/task_sqa_solver_you_rubric_eval.csv hash: md5 - md5: 34c9628df1883bcacca0536d907d162c - size: 5212658 + md5: 577c2c58b07bb00653f527ad115b3f40 + size: 5000608 solve_you@dev: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_you uv run --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir dev_dvc_logs/solver_outputs/ --solver astabench/solvers/sqa/formatted_youcom.py@formatted_solver -S api_type='research' - -T split=dev -T scorer_model=google/gemini-2.5-pro --limit=1000 --log-shared + -T split=dev -T scorer_model=google/gemini-3-flash-preview --limit=1000 --log-shared --no-score; mv "$(ls -t dev_dvc_logs/solver_outputs/*task_sqa_solver_you.eval 2>/dev/null | head -n1)" "dev_dvc_logs/solver_outputs/task_sqa_solver_you.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-pro + scorer_model: google/gemini-3-flash-preview outs: - path: dev_dvc_logs/solver_outputs/task_sqa_solver_you.eval hash: md5 @@ -2361,8 +2357,8 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_you.eval hash: md5 - md5: 97f9ff45622f40bec85ecdf7de74a68f - size: 3947510 + md5: 9e838e47e328b92f7c0e50e39e77acb5 + size: 46907880 outs: - path: test_dvc_logs/errors/task_sqa_solver_you.md hash: md5 @@ -2370,7 +2366,7 @@ stages: size: 63 - path: test_dvc_logs/scores/task_sqa_solver_you.md hash: md5 - md5: 92b92fe4376c3460057d3b906a5bc6ec + md5: f9b4f385282e3fb74b73d5711b26c068 size: 255 log_any_remaining_errors_and_record_scores@you-dev: cmd: echo "Collecting errors";[[ "you" == */* ]] && mkdir -p dev_dvc_logs/errors/task_sqa_solver_$(dirname @@ -2426,21 +2422,21 @@ stages: md5: c527f47dec43ca1a0bec00cb16249e22 size: 7676995 solve_storm@test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_storm uv run --extra storm - --python 3.11 inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir - test_dvc_logs/solver_outputs/ --solver astabench/solvers/sqa/storm_solver.py@storm_solver - -T split=test -T scorer_model=google/gemini-2.5-flash --limit=1000 --log-shared + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_storm uv run --project agent-baselines/solvers/storm + --python 3.11 --frozen -- inspect eval astabench/evals/sqa/task.py@sqa --display + plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/storm_solver.py@storm_solver + -T split=test -T scorer_model=google/gemini-3-flash-preview --limit=1000 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*task_sqa_solver_storm.eval 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_storm.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview outs: - path: test_dvc_logs/solver_outputs/task_sqa_solver_storm.eval hash: md5 - md5: 21685170723948d1f8d89cf0ed71666d - size: 5802900 + md5: 2a3a5781d41b7d77ade04fe0cc6e8347 + size: 15065365 create_nice_logs@anthropic/claude-sonnet-4-20250514-test: cmd: echo "Creating logs"; [[ "anthropic/claude-sonnet-4-20250514" == */* ]] && mkdir -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname "anthropic/claude-sonnet-4-20250514"); @@ -2535,25 +2531,25 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_openai/o4-mini.eval hash: md5 - md5: 707491ba3f460b10b740acc4f18fc009 - size: 2620767 + md5: 3528e6d68199489705ef51e91170387c + size: 26134288 params: params.yaml: - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview outs: - path: test_dvc_logs/debug_logs/task_sqa_solver_openai/o4-mini_answer_precision_eval.csv hash: md5 - md5: 0967bccd7b4718d2163cfd617c487d18 - size: 2355952 + md5: a564e7ffb71f64a6b574a89dfd6e7025 + size: 2405714 - path: test_dvc_logs/debug_logs/task_sqa_solver_openai/o4-mini_citation_eval.csv hash: md5 - md5: 4419d87bb49fd33f4900f16b40c1401b - size: 9399236 + md5: db82fa2dda4031932aa79c4ce837fbe9 + size: 9411475 - path: test_dvc_logs/debug_logs/task_sqa_solver_openai/o4-mini_rubric_eval.csv hash: md5 - md5: 536389aee717408941eb0ecc69556cb7 - size: 3063547 + md5: f237d1b0e8447f38e7502ca5c6b423f9 + size: 2876345 create_nice_logs@anthropic/claude-sonnet-4-20250514-thinking-test: cmd: echo "Creating logs"; [[ "anthropic/claude-sonnet-4-20250514-thinking" == */* ]] && mkdir -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname "anthropic/claude-sonnet-4-20250514-thinking"); @@ -2776,8 +2772,8 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_storm.eval hash: md5 - md5: e70ded32703f2c4f02e6d9e6aca2daf6 - size: 15020405 + md5: 3be48b34599f244761b63ab3887a0bac + size: 114089951 outs: - path: test_dvc_logs/errors/task_sqa_solver_storm.md hash: md5 @@ -2785,7 +2781,7 @@ stages: size: 63 - path: test_dvc_logs/scores/task_sqa_solver_storm.md hash: md5 - md5: ad271c9a6a27255e437aa8f25b128437 + md5: 270eba7766f31f99547266811d9729d4 size: 255 create_nice_logs@storm-test: cmd: echo "Creating logs"; [[ "storm" == */* ]] && mkdir -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname @@ -2794,237 +2790,244 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_storm.eval hash: md5 - md5: e70ded32703f2c4f02e6d9e6aca2daf6 - size: 15020405 + md5: 3be48b34599f244761b63ab3887a0bac + size: 114089951 params: params.yaml: - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview outs: - path: test_dvc_logs/debug_logs/task_sqa_solver_storm_answer_precision_eval.csv hash: md5 - md5: b88cd9438a76b2dc64efeec6841a7ab8 - size: 16010341 + md5: a9b310a74ddb497809adef3b347f5758 + size: 15887439 - path: test_dvc_logs/debug_logs/task_sqa_solver_storm_citation_eval.csv hash: md5 - md5: 11476181ed7d144a78881722df1417eb - size: 114417509 + md5: 80d8e2a50b39399da1f213c53a33e7ec + size: 114538286 - path: test_dvc_logs/debug_logs/task_sqa_solver_storm_rubric_eval.csv hash: md5 - md5: 5982324c37031fd9317260f7b361b476 - size: 16622239 + md5: 8700cb63d34eb03a9cca1d20a815aac9 + size: 16106436 score_all_solvers@model12-test: - cmd: echo "Scoring";[[ "storm" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_storm; - cp test_dvc_logs/solver_outputs/task_sqa_solver_storm.eval test_dvc_logs/scored/task_sqa_solver_storm.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-flash -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_storm.eval + cmd: echo "Scoring";[[ "fhouse_falcon" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_fhouse_falcon; + cp test_dvc_logs/solver_outputs/task_sqa_solver_fhouse_falcon.eval test_dvc_logs/scored/task_sqa_solver_fhouse_falcon.eval; + uv run inspect score --action overwrite --overwrite --display plain --scorer + astabench/evals/sqa/task.py@score_all -S scorer_model=google/gemini-3-flash-preview + -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_fhouse_falcon.eval deps: - - path: test_dvc_logs/solver_outputs/task_sqa_solver_storm.eval + - path: test_dvc_logs/solver_outputs/task_sqa_solver_fhouse_falcon.eval hash: md5 - md5: 8726b423e371a6f02ebb8edf044da131 - size: 13192395 + md5: 27f869df37299d6895881c95fb5a38d9 + size: 78478412 params: params.yaml: - scorer_model: google/gemini-2.5-flash - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - - path: test_dvc_logs/scored/task_sqa_solver_storm.eval + - path: test_dvc_logs/scored/task_sqa_solver_fhouse_falcon.eval hash: md5 - md5: b797a0b8f9b0d80449cd01a3e7c8771f - size: 33056563 + md5: 19a1d2d9827872b4de45bd3070051135 + size: 196232443 score_all_solvers@model8-test: - cmd: echo "Scoring";[[ "sqa_claude-4.0" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_sqa_claude-4.0; - cp test_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-4.0.eval test_dvc_logs/scored/task_sqa_solver_sqa_claude-4.0.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-flash -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_sqa_claude-4.0.eval + cmd: echo "Scoring";[[ "elicit" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_elicit; + cp test_dvc_logs/solver_outputs/task_sqa_solver_elicit.eval test_dvc_logs/scored/task_sqa_solver_elicit.eval; + uv run inspect score --action overwrite --overwrite --display plain --scorer + astabench/evals/sqa/task.py@score_all -S scorer_model=google/gemini-3-flash-preview + -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_elicit.eval deps: - - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-4.0.eval + - path: test_dvc_logs/solver_outputs/task_sqa_solver_elicit.eval hash: md5 - md5: 3a3856bb7ba4c7fd8a6a4503ad86d95e - size: 7334514 + md5: eaf8f155d177afe6c3fb8b5913324458 + size: 15975606 params: params.yaml: - scorer_model: google/gemini-2.5-flash - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - - path: test_dvc_logs/scored/task_sqa_solver_sqa_claude-4.0.eval + - path: test_dvc_logs/scored/task_sqa_solver_elicit.eval hash: md5 - md5: f370d1cf06ce6a67539e9958993c426d - size: 33357446 + md5: d671bcb2fd90de540c69ef5bcccfb45a + size: 83436121 score_all_solvers@model11-test: - cmd: echo "Scoring";[[ "elicit" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_elicit; - cp test_dvc_logs/solver_outputs/task_sqa_solver_elicit.eval test_dvc_logs/scored/task_sqa_solver_elicit.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-flash -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_elicit.eval + cmd: echo "Scoring";[[ "fhouse_crow" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_fhouse_crow; + cp test_dvc_logs/solver_outputs/task_sqa_solver_fhouse_crow.eval test_dvc_logs/scored/task_sqa_solver_fhouse_crow.eval; + uv run inspect score --action overwrite --overwrite --display plain --scorer + astabench/evals/sqa/task.py@score_all -S scorer_model=google/gemini-3-flash-preview + -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_fhouse_crow.eval deps: - - path: test_dvc_logs/solver_outputs/task_sqa_solver_elicit.eval + - path: test_dvc_logs/solver_outputs/task_sqa_solver_fhouse_crow.eval hash: md5 - md5: 1fc7402e0fd42a53863c6a6f859b9fa8 - size: 4895087 + md5: d3c2190ee23a647aed113be022cf072a + size: 17219690 params: params.yaml: - scorer_model: google/gemini-2.5-flash - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - - path: test_dvc_logs/scored/task_sqa_solver_elicit.eval + - path: test_dvc_logs/scored/task_sqa_solver_fhouse_crow.eval hash: md5 - md5: eaf8f155d177afe6c3fb8b5913324458 - size: 15975606 + md5: 02d170907f472a80e8fdc2a4604c57bd + size: 47671586 score_all_solvers@model13-test: - cmd: echo "Scoring";[[ "scispace" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_scispace; - cp test_dvc_logs/solver_outputs/task_sqa_solver_scispace.eval test_dvc_logs/scored/task_sqa_solver_scispace.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-flash -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_scispace.eval + cmd: echo "Scoring";[[ "openai_deep_research" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_openai_deep_research; + cp test_dvc_logs/solver_outputs/task_sqa_solver_openai_deep_research.eval test_dvc_logs/scored/task_sqa_solver_openai_deep_research.eval; + uv run inspect score --action overwrite --overwrite --display plain --scorer + astabench/evals/sqa/task.py@score_all -S scorer_model=google/gemini-3-flash-preview + -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_openai_deep_research.eval deps: - - path: test_dvc_logs/solver_outputs/task_sqa_solver_scispace.eval + - path: test_dvc_logs/solver_outputs/task_sqa_solver_openai_deep_research.eval hash: md5 - md5: e517fa1055570efb347dd6a924e478ae - size: 46039197 + md5: e64efa83d59f1e39f5f44f7472b7f92d + size: 20770084 params: params.yaml: - scorer_model: google/gemini-2.5-flash - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - - path: test_dvc_logs/scored/task_sqa_solver_scispace.eval + - path: test_dvc_logs/scored/task_sqa_solver_openai_deep_research.eval hash: md5 - md5: e517fa1055570efb347dd6a924e478ae - size: 46039197 + md5: 522b75668b14dbaa609e5ef4cc5c8f8e + size: 115070053 score_all_solvers@model14-test: - cmd: echo "Scoring";[[ "fhouse_crow" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_fhouse_crow; - cp test_dvc_logs/solver_outputs/task_sqa_solver_fhouse_crow.eval test_dvc_logs/scored/task_sqa_solver_fhouse_crow.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-flash -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_fhouse_crow.eval + cmd: echo "Scoring";[[ "you" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_you; + cp test_dvc_logs/solver_outputs/task_sqa_solver_you.eval test_dvc_logs/scored/task_sqa_solver_you.eval; + uv run inspect score --action overwrite --overwrite --display plain --scorer + astabench/evals/sqa/task.py@score_all -S scorer_model=google/gemini-3-flash-preview + -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_you.eval deps: - - path: test_dvc_logs/solver_outputs/task_sqa_solver_fhouse_crow.eval + - path: test_dvc_logs/solver_outputs/task_sqa_solver_you.eval hash: md5 - md5: 7ce6ee5a36b336b15929a5e35fc3e795 - size: 1443466 + md5: dffe67db7f27087923ec47f447e5006f + size: 3981342 params: params.yaml: - scorer_model: google/gemini-2.5-flash - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - - path: test_dvc_logs/scored/task_sqa_solver_fhouse_crow.eval + - path: test_dvc_logs/scored/task_sqa_solver_you.eval hash: md5 - md5: d3c2190ee23a647aed113be022cf072a - size: 17219690 + md5: 9e838e47e328b92f7c0e50e39e77acb5 + size: 46907880 score_all_solvers@model6-test: - cmd: echo "Scoring";[[ "anthropic/claude-3-5-sonnet-20240620" == */* ]] && mkdir - -p test_dvc_logs/scored/task_sqa_solver_anthropic/claude-3-5-sonnet-20240620; - cp - test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-3-5-sonnet-20240620.eval - test_dvc_logs/scored/task_sqa_solver_anthropic/claude-3-5-sonnet-20240620.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-pro -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_anthropic/claude-3-5-sonnet-20240620.eval + cmd: echo "Scoring";[[ "sqa_gemini-3.1-pro-preview" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_sqa_gemini-3.1-pro-preview; + cp test_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval + test_dvc_logs/scored/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval; uv run + inspect score --action overwrite --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all + -S scorer_model=google/gemini-3-flash-preview -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval deps: - - path: - test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-3-5-sonnet-20240620.eval + - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval hash: md5 - md5: 10496d398144a7a41e639823823d007f - size: 1068696 + md5: f0704bb08be81e64f61de8bad8869c57 + size: 6218859 params: params.yaml: - scorer_model: google/gemini-2.5-pro - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - - path: test_dvc_logs/scored/task_sqa_solver_anthropic/claude-3-5-sonnet-20240620.eval + - path: test_dvc_logs/scored/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval hash: md5 - md5: 5cfa5f8d8daa3baf20773547034ea86f - size: 2103591 + md5: 509c201daa64d147a1b28a9a8096e968 + size: 78679963 score_all_solvers@model10-test: - cmd: echo "Scoring";[[ "sqa_o3_high" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_sqa_o3_high; - cp test_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval test_dvc_logs/scored/task_sqa_solver_sqa_o3_high.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-flash -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_sqa_o3_high.eval + cmd: echo "Scoring";[[ "scispace" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_scispace; + cp test_dvc_logs/solver_outputs/task_sqa_solver_scispace.eval test_dvc_logs/scored/task_sqa_solver_scispace.eval; + uv run inspect score --action overwrite --overwrite --display plain --scorer + astabench/evals/sqa/task.py@score_all -S scorer_model=google/gemini-3-flash-preview + -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_scispace.eval deps: - - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval + - path: test_dvc_logs/solver_outputs/task_sqa_solver_scispace.eval hash: md5 - md5: 711b54ac018c9dc4f2cc190c3029c42e - size: 56666173 + md5: 31ee4f9ea8f0301cb6bd65cdc5cc0d23 + size: 33097436 params: params.yaml: - scorer_model: google/gemini-2.5-flash - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - - path: test_dvc_logs/scored/task_sqa_solver_sqa_o3_high.eval + - path: test_dvc_logs/scored/task_sqa_solver_scispace.eval hash: md5 - md5: 711b54ac018c9dc4f2cc190c3029c42e - size: 56666173 + md5: 4ddb3772fac4198d986a45acab9ef587 + size: 188249490 score_all_solvers@model15-test: - cmd: echo "Scoring";[[ "fhouse_falcon" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_fhouse_falcon; - cp test_dvc_logs/solver_outputs/task_sqa_solver_fhouse_falcon.eval test_dvc_logs/scored/task_sqa_solver_fhouse_falcon.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-flash -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_fhouse_falcon.eval + cmd: echo "Scoring";[[ "perplexity_dr" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_perplexity_dr; + cp test_dvc_logs/solver_outputs/task_sqa_solver_perplexity_dr.eval test_dvc_logs/scored/task_sqa_solver_perplexity_dr.eval; + uv run inspect score --action overwrite --overwrite --display plain --scorer + astabench/evals/sqa/task.py@score_all -S scorer_model=google/gemini-3-flash-preview + -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_perplexity_dr.eval deps: - - path: test_dvc_logs/solver_outputs/task_sqa_solver_fhouse_falcon.eval + - path: test_dvc_logs/solver_outputs/task_sqa_solver_perplexity_dr.eval hash: md5 - md5: 973247fa294552e3807747362ef43b53 - size: 76892171 + md5: 4b19505fd7cab3fd73e60f065c57ac7c + size: 52745393 params: params.yaml: - scorer_model: google/gemini-2.5-flash - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - - path: test_dvc_logs/scored/task_sqa_solver_fhouse_falcon.eval + - path: test_dvc_logs/scored/task_sqa_solver_perplexity_dr.eval hash: md5 - md5: 27f869df37299d6895881c95fb5a38d9 - size: 78478412 + md5: cb364e482a7e30f59965106a1ee417db + size: 143267910 score_all_solvers@model9-test: - cmd: echo "Scoring";[[ "sqa_gemini-2.5-pro" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_sqa_gemini-2.5-pro; - cp test_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-2.5-pro.eval test_dvc_logs/scored/task_sqa_solver_sqa_gemini-2.5-pro.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-flash -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_sqa_gemini-2.5-pro.eval + cmd: echo "Scoring";[[ "storm" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_storm; + cp test_dvc_logs/solver_outputs/task_sqa_solver_storm.eval test_dvc_logs/scored/task_sqa_solver_storm.eval; + uv run inspect score --action overwrite --overwrite --display plain --scorer + astabench/evals/sqa/task.py@score_all -S scorer_model=google/gemini-3-flash-preview + -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_storm.eval deps: - - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-2.5-pro.eval + - path: test_dvc_logs/solver_outputs/task_sqa_solver_storm.eval hash: md5 - md5: ab04845b349e04dae643d06d08ab208f - size: 7031013 + md5: 2a3a5781d41b7d77ade04fe0cc6e8347 + size: 15065365 params: params.yaml: - scorer_model: google/gemini-2.5-flash - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - - path: test_dvc_logs/scored/task_sqa_solver_sqa_gemini-2.5-pro.eval + - path: test_dvc_logs/scored/task_sqa_solver_storm.eval hash: md5 - md5: 533fd1f0d777e38f3fe4bf5088bd1b0f - size: 19418222 + md5: 3be48b34599f244761b63ab3887a0bac + size: 114089951 score_all_solvers@model7-test: - cmd: echo "Scoring";[[ "sqa_claude-3.7" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_sqa_claude-3.7; - cp test_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-3.7.eval test_dvc_logs/scored/task_sqa_solver_sqa_claude-3.7.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-flash -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_sqa_claude-3.7.eval + cmd: echo "Scoring";[[ "sqa_o3_high" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_sqa_o3_high; + cp test_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval test_dvc_logs/scored/task_sqa_solver_sqa_o3_high.eval; + uv run inspect score --action overwrite --overwrite --display plain --scorer + astabench/evals/sqa/task.py@score_all -S scorer_model=google/gemini-3-flash-preview + -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_sqa_o3_high.eval deps: - - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-3.7.eval + - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval hash: md5 - md5: 123be282c9eafe56c554aa9b5b1e995c - size: 10829064 + md5: 711b54ac018c9dc4f2cc190c3029c42e + size: 56666173 params: params.yaml: - scorer_model: google/gemini-2.5-flash - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - - path: test_dvc_logs/scored/task_sqa_solver_sqa_claude-3.7.eval + - path: test_dvc_logs/scored/task_sqa_solver_sqa_o3_high.eval hash: md5 - md5: c8acd76fb628d608bb72a1f9a6a57572 - size: 29443230 + md5: de7965340653bbdfbac162c88a77d62a + size: 148894425 score_all_solvers@model16-test: - cmd: echo "Scoring";[[ "openai_deep_research" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_openai_deep_research; - cp test_dvc_logs/solver_outputs/task_sqa_solver_openai_deep_research.eval test_dvc_logs/scored/task_sqa_solver_openai_deep_research.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-flash -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_openai_deep_research.eval + cmd: echo "Scoring";[[ "openscholar" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_openscholar; + cp test_dvc_logs/solver_outputs/task_sqa_solver_openscholar.eval test_dvc_logs/scored/task_sqa_solver_openscholar.eval; + uv run inspect score --action overwrite --overwrite --display plain --scorer + astabench/evals/sqa/task.py@score_all -S scorer_model=google/gemini-3-flash-preview + -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_openscholar.eval deps: - - path: test_dvc_logs/solver_outputs/task_sqa_solver_openai_deep_research.eval + - path: test_dvc_logs/solver_outputs/task_sqa_solver_openscholar.eval hash: md5 - md5: e7be9b73b33e99893b170afb853a21ae - size: 1731090 + md5: 61ff1d386b502863144b85cc37a0b0a4 + size: 8231693 params: params.yaml: - scorer_model: google/gemini-2.5-flash - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - - path: test_dvc_logs/scored/task_sqa_solver_openai_deep_research.eval + - path: test_dvc_logs/scored/task_sqa_solver_openscholar.eval hash: md5 - md5: 97f9ff45622f40bec85ecdf7de74a68f - size: 3947510 + md5: 2df72bb22d5ec61bedef231f58658fcc + size: 31959611 score_all_solvers@model9-dev: cmd: echo "Scoring";[[ "sqa_gemini-2.5-pro" == */* ]] && mkdir -p dev_dvc_logs/scored/task_sqa_solver_sqa_gemini-2.5-pro; cp dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-2.5-pro.eval dev_dvc_logs/scored/task_sqa_solver_sqa_gemini-2.5-pro.eval; @@ -3064,24 +3067,25 @@ stages: md5: 9f23b21841edbbdbaa9e4a67d4404377 size: 15423266 score_all_solvers@model7-dev: - cmd: echo "Scoring";[[ "sqa_claude-3.7" == */* ]] && mkdir -p dev_dvc_logs/scored/task_sqa_solver_sqa_claude-3.7; - cp dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-3.7.eval dev_dvc_logs/scored/task_sqa_solver_sqa_claude-3.7.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-pro -S is_retrieverless=false dev_dvc_logs/scored/task_sqa_solver_sqa_claude-3.7.eval + cmd: echo "Scoring";[[ "sqa_o3_high" == */* ]] && mkdir -p dev_dvc_logs/scored/task_sqa_solver_sqa_o3_high; + cp dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval dev_dvc_logs/scored/task_sqa_solver_sqa_o3_high.eval; + uv run inspect score --action overwrite --overwrite --display plain --scorer + astabench/evals/sqa/task.py@score_all -S scorer_model=google/gemini-3-flash-preview + -S is_retrieverless=false dev_dvc_logs/scored/task_sqa_solver_sqa_o3_high.eval deps: - - path: dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-3.7.eval + - path: dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval hash: md5 - md5: f7d2bdc765e3dfdac4098687a48acdb2 - size: 6977126 + md5: af8988dccb45671721dead59433ea82d + size: 10237840 params: params.yaml: - scorer_model: google/gemini-2.5-pro - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - - path: dev_dvc_logs/scored/task_sqa_solver_sqa_claude-3.7.eval + - path: dev_dvc_logs/scored/task_sqa_solver_sqa_o3_high.eval hash: md5 - md5: ae4fbca2e9345d6573b8dad158a696bc - size: 17947538 + md5: a8a477a26e82358281bd4264e2c66d3a + size: 129493763 score_all_solvers@model16-dev: cmd: echo "Scoring";[[ "you" == */* ]] && mkdir -p dev_dvc_logs/scored/task_sqa_solver_you; cp dev_dvc_logs/solver_outputs/task_sqa_solver_you.eval dev_dvc_logs/scored/task_sqa_solver_you.eval; @@ -3178,24 +3182,25 @@ stages: md5: d555c72e642f5446c964ceec9228a4ce size: 7663908 score_all_solvers@model6-dev: - cmd: echo "Scoring";[[ "sqa_claude-3.7" == */* ]] && mkdir -p dev_dvc_logs/scored/task_sqa_solver_sqa_claude-3.7; - cp dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-3.7.eval dev_dvc_logs/scored/task_sqa_solver_sqa_claude-3.7.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-pro -S is_retrieverless=false dev_dvc_logs/scored/task_sqa_solver_sqa_claude-3.7.eval + cmd: echo "Scoring";[[ "sqa_gemini-3.1-pro-preview" == */* ]] && mkdir -p dev_dvc_logs/scored/task_sqa_solver_sqa_gemini-3.1-pro-preview; + cp dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval + dev_dvc_logs/scored/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval; uv run + inspect score --action overwrite --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all + -S scorer_model=google/gemini-3-flash-preview -S is_retrieverless=false dev_dvc_logs/scored/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval deps: - - path: dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-3.7.eval + - path: dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval hash: md5 - md5: f7d2bdc765e3dfdac4098687a48acdb2 - size: 6977126 + md5: cbed474ff0e51bccf71f70d8d9d5d0c8 + size: 6467019 params: params.yaml: - scorer_model: google/gemini-2.5-pro - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - - path: dev_dvc_logs/scored/task_sqa_solver_sqa_claude-3.7.eval + - path: dev_dvc_logs/scored/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval hash: md5 - md5: ae4fbca2e9345d6573b8dad158a696bc - size: 17947538 + md5: 4fdbc777c1b48ce65cf7039f2b280cdd + size: 88626955 score_all_solvers@model14-dev: cmd: echo "Scoring";[[ "fhouse_falcon" == */* ]] && mkdir -p dev_dvc_logs/scored/task_sqa_solver_fhouse_falcon; cp dev_dvc_logs/solver_outputs/task_sqa_solver_fhouse_falcon.eval dev_dvc_logs/scored/task_sqa_solver_fhouse_falcon.eval; @@ -3218,8 +3223,9 @@ stages: score_all_solvers@model0-test: cmd: echo "Scoring";[[ "openai/o4-mini" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_openai/o4-mini; cp test_dvc_logs/solver_outputs/task_sqa_solver_openai/o4-mini.eval test_dvc_logs/scored/task_sqa_solver_openai/o4-mini.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-flash -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_openai/o4-mini.eval + uv run inspect score --action overwrite --overwrite --display plain --scorer + astabench/evals/sqa/task.py@score_all -S scorer_model=google/gemini-3-flash-preview + -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_openai/o4-mini.eval deps: - path: test_dvc_logs/solver_outputs/task_sqa_solver_openai/o4-mini.eval hash: md5 @@ -3227,127 +3233,117 @@ stages: size: 1088436 params: params.yaml: - scorer_model: google/gemini-2.5-flash - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - path: test_dvc_logs/scored/task_sqa_solver_openai/o4-mini.eval hash: md5 - md5: 707491ba3f460b10b740acc4f18fc009 - size: 2620767 + md5: 3528e6d68199489705ef51e91170387c + size: 26134288 score_all_solvers@model5-test: - cmd: echo "Scoring";[[ "anthropic/claude-3-7-sonnet-20250219" == */* ]] && mkdir - -p test_dvc_logs/scored/task_sqa_solver_anthropic/claude-3-7-sonnet-20250219; - cp - test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-3-7-sonnet-20250219.eval - test_dvc_logs/scored/task_sqa_solver_anthropic/claude-3-7-sonnet-20250219.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-pro -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_anthropic/claude-3-7-sonnet-20250219.eval + cmd: echo "Scoring";[[ "sqa_claude-4.6" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_sqa_claude-4.6; + cp test_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-4.6.eval test_dvc_logs/scored/task_sqa_solver_sqa_claude-4.6.eval; + uv run inspect score --action overwrite --overwrite --display plain --scorer + astabench/evals/sqa/task.py@score_all -S scorer_model=google/gemini-3-flash-preview + -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_sqa_claude-4.6.eval deps: - - path: - test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-3-7-sonnet-20250219.eval + - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-4.6.eval hash: md5 - md5: a8fbef7fa3689f6859d79611e64981ed - size: 1850390 + md5: ef3ac8ff8f95d0fc5028e29002f8d46c + size: 7482873 params: params.yaml: - scorer_model: google/gemini-2.5-pro - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - - path: test_dvc_logs/scored/task_sqa_solver_anthropic/claude-3-7-sonnet-20250219.eval + - path: test_dvc_logs/scored/task_sqa_solver_sqa_claude-4.6.eval hash: md5 - md5: c284c5abc8f064b1a43eadfb1d743639 - size: 3734749 + md5: 696310ea4b179f4511f0cffb02a86af4 + size: 103868942 score_all_solvers@model3-test: - cmd: echo "Scoring";[[ "google/gemini-2.5-pro-preview-03-25" == */* ]] && mkdir - -p test_dvc_logs/scored/task_sqa_solver_google/gemini-2.5-pro-preview-03-25; - cp - test_dvc_logs/solver_outputs/task_sqa_solver_google/gemini-2.5-pro-preview-03-25.eval - test_dvc_logs/scored/task_sqa_solver_google/gemini-2.5-pro-preview-03-25.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-flash -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_google/gemini-2.5-pro-preview-03-25.eval + cmd: echo "Scoring";[[ "google/gemini-3.1-pro-preview" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_google/gemini-3.1-pro-preview; + cp test_dvc_logs/solver_outputs/task_sqa_solver_google/gemini-3.1-pro-preview.eval + test_dvc_logs/scored/task_sqa_solver_google/gemini-3.1-pro-preview.eval; uv + run inspect score --action overwrite --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all + -S scorer_model=google/gemini-3-flash-preview -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_google/gemini-3.1-pro-preview.eval deps: - - path: - test_dvc_logs/solver_outputs/task_sqa_solver_google/gemini-2.5-pro-preview-03-25.eval + - path: test_dvc_logs/solver_outputs/task_sqa_solver_google/gemini-3.1-pro-preview.eval hash: md5 - md5: 8ab9ea1ffa6fc82f71e82a796fae1ab1 - size: 2603396 + md5: 9270f1cbd5f606bd646579fe524ae0f4 + size: 22505579 params: params.yaml: - scorer_model: google/gemini-2.5-flash - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - - path: test_dvc_logs/scored/task_sqa_solver_google/gemini-2.5-pro-preview-03-25.eval + - path: test_dvc_logs/scored/task_sqa_solver_google/gemini-3.1-pro-preview.eval hash: md5 - md5: 0b1ca179eb7130488957688c0f1d45b3 - size: 3855948 + md5: 7c3c78380c1953a6ce60ca8a81e75bdb + size: 53929324 score_all_solvers@model1-test: cmd: echo "Scoring";[[ "openai/o3" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_openai/o3; cp test_dvc_logs/solver_outputs/task_sqa_solver_openai/o3.eval test_dvc_logs/scored/task_sqa_solver_openai/o3.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-flash -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_openai/o3.eval + uv run inspect score --action overwrite --overwrite --display plain --scorer + astabench/evals/sqa/task.py@score_all -S scorer_model=google/gemini-3-flash-preview + -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_openai/o3.eval deps: - path: test_dvc_logs/solver_outputs/task_sqa_solver_openai/o3.eval hash: md5 - md5: 9b53b134e0185678f65552a598ed13b9 - size: 1558577 + md5: 5857c63cdab2b62a6f907c9a27a811ec + size: 17943556 params: params.yaml: - scorer_model: google/gemini-2.5-flash - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - path: test_dvc_logs/scored/task_sqa_solver_openai/o3.eval hash: md5 - md5: 335d405c1d2a62ae5a02be07dd9b27aa - size: 3231962 + md5: 0c6131316b8cc2004b48de09885b1ff8 + size: 64063525 score_all_solvers@model4-test: - cmd: echo "Scoring";[[ "anthropic/claude-sonnet-4-20250514" == */* ]] && mkdir - -p test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-20250514; - cp - test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-sonnet-4-20250514.eval - test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-20250514.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-flash -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-20250514.eval + cmd: echo "Scoring";[[ "anthropic/claude-sonnet-4-6" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-6; + cp test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-sonnet-4-6.eval + test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-6.eval; uv run + inspect score --action overwrite --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all + -S scorer_model=google/gemini-3-flash-preview -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-6.eval deps: - - path: - test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-sonnet-4-20250514.eval + - path: test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-sonnet-4-6.eval hash: md5 - md5: c509c6bcdd7f0692029d00e3f6187139 - size: 8161774 + md5: e521c24cb487f904c7d9aa56e8ab9ff9 + size: 13576196 params: params.yaml: - scorer_model: google/gemini-2.5-flash - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - - path: test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-20250514.eval + - path: test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-6.eval hash: md5 - md5: 504299ea10b129ab40af8d9842f6f38d - size: 10008164 + md5: 28ffc1ba7ce9a349ec95ca854b5f1b3d + size: 51159281 score_all_solvers@model2-test: - cmd: echo "Scoring";[[ "anthropic/claude-sonnet-4-20250514-thinking" == */* ]] - && mkdir -p - test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-20250514-thinking; + cmd: echo "Scoring";[[ "anthropic/claude-sonnet-4-6-thinking" == */* ]] && mkdir + -p test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking; cp - test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-sonnet-4-20250514-thinking.eval - test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-20250514-thinking.eval; - uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all - -S scorer_model=google/gemini-2.5-flash -S is_retrieverless=true - test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-20250514-thinking.eval + test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking.eval + test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking.eval; + uv run inspect score --action overwrite --overwrite --display plain --scorer + astabench/evals/sqa/task.py@score_all -S scorer_model=google/gemini-3-flash-preview + -S is_retrieverless=true test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking.eval deps: - path: - test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-sonnet-4-20250514-thinking.eval + test_dvc_logs/solver_outputs/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking.eval hash: md5 - md5: 16646547ee30427da219b0983ca788de - size: 1923240 + md5: 3107b1f474692f8919a571cee6e6cb3f + size: 14844232 params: params.yaml: - scorer_model: google/gemini-2.5-flash - sqa_scorer_version: may-23-2025 + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 outs: - - path: - test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-20250514-thinking.eval + - path: test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking.eval hash: md5 - md5: 976cfd4e91bc6d97827ed4c6264f146c - size: 3797794 + md5: 89dff466d5b840c822f1730509db6c7e + size: 61631233 log_any_remaining_errors_and_record_scores@sqa_claude-4.0-dev: cmd: echo "Collecting errors";[[ "sqa_claude-4.0" == */* ]] && mkdir -p dev_dvc_logs/errors/task_sqa_solver_$(dirname "sqa_claude-4.0"); mkdir -p dev_dvc_logs/scores/task_sqa_solver_$(dirname "sqa_claude-4.0"); @@ -3393,24 +3389,24 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_scispace.eval hash: md5 - md5: b797a0b8f9b0d80449cd01a3e7c8771f - size: 33056563 + md5: 4ddb3772fac4198d986a45acab9ef587 + size: 188249490 params: params.yaml: - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview outs: - path: test_dvc_logs/debug_logs/task_sqa_solver_scispace_answer_precision_eval.csv hash: md5 - md5: ae2535020abf63dff6dbe2f7519bba65 - size: 24127154 + md5: 2da6012bc261ba26a6f22e70451fef2a + size: 24025759 - path: test_dvc_logs/debug_logs/task_sqa_solver_scispace_citation_eval.csv hash: md5 - md5: cc449723abafe0752eaf4bc20ec113f4 - size: 629170068 + md5: 6e89b26a8211c3c26efe1573131470f9 + size: 629204683 - path: test_dvc_logs/debug_logs/task_sqa_solver_scispace_rubric_eval.csv hash: md5 - md5: dab63ec769768be0927c4bd6a4c8b6e5 - size: 25049300 + md5: 88497a000bf73df14edbb1a397d9617e + size: 24536842 create_nice_logs@sqa_gemini-2.5-pro-test: cmd: echo "Creating logs"; [[ "sqa_gemini-2.5-pro" == */* ]] && mkdir -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname "sqa_gemini-2.5-pro"); uv run scripts/create_debug_logs.py test_dvc_logs/scored/ @@ -3479,8 +3475,8 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_openai/o3.eval hash: md5 - md5: 335d405c1d2a62ae5a02be07dd9b27aa - size: 3231962 + md5: 0c6131316b8cc2004b48de09885b1ff8 + size: 64063525 outs: - path: test_dvc_logs/errors/task_sqa_solver_openai/o3.md hash: md5 @@ -3488,8 +3484,8 @@ stages: size: 63 - path: test_dvc_logs/scores/task_sqa_solver_openai/o3.md hash: md5 - md5: 57a8d34df60af9a3aedf83c87fdf31ae - size: 248 + md5: dec5c11490332da95212a50822ffedcd + size: 255 create_nice_logs@openai/o3-test: cmd: echo "Creating logs"; [[ "openai/o3" == */* ]] && mkdir -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname "openai/o3"); uv run scripts/create_debug_logs.py test_dvc_logs/scored/ task_sqa_solver_openai/o3.eval @@ -3497,24 +3493,24 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_openai/o3.eval hash: md5 - md5: 335d405c1d2a62ae5a02be07dd9b27aa - size: 3231962 + md5: 0c6131316b8cc2004b48de09885b1ff8 + size: 64063525 params: params.yaml: - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview outs: - path: test_dvc_logs/debug_logs/task_sqa_solver_openai/o3_answer_precision_eval.csv hash: md5 - md5: d08cc53367e91e9d19af2aa3a739116f - size: 3357861 + md5: 0aedf1e52fdede7fcf728bacc7d3d979 + size: 3070960 - path: test_dvc_logs/debug_logs/task_sqa_solver_openai/o3_citation_eval.csv hash: md5 - md5: 2d41fbaa6791b963d1d52a6618071b45 - size: 15846570 + md5: 0eeabb899a7e0bf467de2c1380d4be07 + size: 12740876 - path: test_dvc_logs/debug_logs/task_sqa_solver_openai/o3_rubric_eval.csv hash: md5 - md5: 32209a17d1f07ba85c4dd76b2662cf0b - size: 3989258 + md5: c1c777c964929e2aba2d034a56629762 + size: 4696231 score_all_solvers@model17-test: cmd: echo "Scoring";[[ "you" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_you; cp test_dvc_logs/solver_outputs/task_sqa_solver_you.eval test_dvc_logs/scored/task_sqa_solver_you.eval; @@ -3637,21 +3633,23 @@ stages: md5: 071ed9683a227563dfd4630061740e98 size: 258 solve_scispace@test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_scispace uv run inspect eval - astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ - --solver astabench/solvers/sqa/scispace/scispace.py@formatted_solver -T scorer_model=google/gemini-2.5-flash - -T split=test --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls - -t test_dvc_logs/solver_outputs/*scispace.eval 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_scispace.eval" + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_scispace uv run --project agent-baselines + inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ + --solver + agent-baselines/agent_baselines/solvers/sqa/scispace/scispace.py@formatted_solver + -T scorer_model=google/gemini-3-flash-preview -T split=test --limit=1000 --retry-on-error=10 + --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*scispace.eval + 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_scispace.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview sqa_solver_version: may-23-2025 outs: - path: test_dvc_logs/solver_outputs/task_sqa_solver_scispace.eval hash: md5 - md5: 8726b423e371a6f02ebb8edf044da131 - size: 13192395 + md5: 31ee4f9ea8f0301cb6bd65cdc5cc0d23 + size: 33097436 log_any_remaining_errors_and_record_scores@sqa_o3_high-test: cmd: echo "Collecting errors";[[ "sqa_o3_high" == */* ]] && mkdir -p test_dvc_logs/errors/task_sqa_solver_$(dirname "sqa_o3_high"); mkdir -p test_dvc_logs/scores/task_sqa_solver_$(dirname "sqa_o3_high"); @@ -3660,8 +3658,8 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_sqa_o3_high.eval hash: md5 - md5: 711b54ac018c9dc4f2cc190c3029c42e - size: 56666173 + md5: de7965340653bbdfbac162c88a77d62a + size: 148894425 outs: - path: test_dvc_logs/errors/task_sqa_solver_sqa_o3_high.md hash: md5 @@ -3669,7 +3667,7 @@ stages: size: 63 - path: test_dvc_logs/scores/task_sqa_solver_sqa_o3_high.md hash: md5 - md5: 8fd06b654672a9a890c1949d71418507 + md5: 68db553d9f2385a47104ce032b7d4ba9 size: 255 extract_model_responses@sqa_o3_high-test: cmd: echo "Extracting responses"; [[ "sqa_o3_high" == */* ]] && mkdir -p test_dvc_logs/model_responses/task_sqa_solver_$(dirname @@ -3678,13 +3676,13 @@ stages: deps: - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval hash: md5 - md5: 711b54ac018c9dc4f2cc190c3029c42e - size: 56666173 + md5: b84fd12987ed62bdff79a241f2c180d5 + size: 11657084 outs: - path: test_dvc_logs/model_responses/task_sqa_solver_sqa_o3_high_responses.csv hash: md5 - md5: 3b46e7a92b471a879d146238971303c3 - size: 17090764 + md5: c9744f72c4f17b5fdab08c385566bb9c + size: 16625320 score_all_solvers@model18-test: cmd: echo "Scoring";[[ "perplexity_dr" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_perplexity_dr; cp test_dvc_logs/solver_outputs/task_sqa_solver_perplexity_dr.eval test_dvc_logs/scored/task_sqa_solver_perplexity_dr.eval; @@ -3705,51 +3703,60 @@ stages: md5: eb8e7ffc96608f726ce82345c217179f size: 3783641 solve_futurehouse@model0-test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_fhouse_crow uv run --extra - futurehouse inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir - test_dvc_logs/solver_outputs/ --solver astabench/solvers/futurehouse/futurehouse_solver.py - -T scorer_model=google/gemini-2.5-flash -T split=test -S max_wait_time=900 -S - agent=CROW --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_fhouse_crow uv run --project + agent-baselines --extra futurehouse inspect eval astabench/evals/sqa/task.py@sqa + --display plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/futurehouse/futurehouse_solver.py + -T scorer_model=google/gemini-3-flash-preview -T split=test -S max_wait_time=900 + -S agent=CROW --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*fhouse_crow.eval 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_fhouse_crow.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview sqa_solver_version: may-23-2025 outs: - path: test_dvc_logs/solver_outputs/task_sqa_solver_fhouse_crow.eval hash: md5 - md5: 7ce6ee5a36b336b15929a5e35fc3e795 - size: 1443466 + md5: d3c2190ee23a647aed113be022cf072a + size: 17219690 solve_futurehouse@model1-test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_fhouse_falcon uv run --extra - futurehouse inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir - test_dvc_logs/solver_outputs/ --solver astabench/solvers/futurehouse/futurehouse_solver.py - -T scorer_model=google/gemini-2.5-flash -T split=test -S max_wait_time=900 -S - agent=FALCON --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls - -t test_dvc_logs/solver_outputs/*fhouse_falcon.eval 2>/dev/null | head -n1)" - "test_dvc_logs/solver_outputs/task_sqa_solver_fhouse_falcon.eval" + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_fhouse_falcon uv run --project + agent-baselines --extra futurehouse inspect eval astabench/evals/sqa/task.py@sqa + --display plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/futurehouse/futurehouse_solver.py + -T scorer_model=google/gemini-3-flash-preview -T split=test -S max_wait_time=900 + -S agent=FALCON --limit=1000 --retry-on-error=10 --log-shared --no-score; mv + "$(ls -t test_dvc_logs/solver_outputs/*fhouse_falcon.eval 2>/dev/null | head + -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_fhouse_falcon.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview sqa_solver_version: may-23-2025 outs: - path: test_dvc_logs/solver_outputs/task_sqa_solver_fhouse_falcon.eval hash: md5 - md5: 973247fa294552e3807747362ef43b53 - size: 76892171 + md5: 27f869df37299d6895881c95fb5a38d9 + size: 78478412 solve_sqa@o3_high-test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_sqa_o3_high uv run --extra - sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ - --solver astabench/solvers/sqa/sqa.py@sqa_solver -T split=test -T scorer_model=google/gemini-2.5-flash - -S completion_model=o3_high --limit=1000 --retry-on-error=10 --log-shared --no-score; - mv "$(ls -t test_dvc_logs/solver_outputs/*o3_high.eval 2>/dev/null | head -n1)" - "test_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval" + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_sqa_o3_high uv run --project + agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display + plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/sqa.py@sqa_solver + -T split=test -T scorer_model=google/gemini-3-flash-preview -S completion_model=o3_high + --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*o3_high.eval + 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval" + deps: + - path: agent-baselines/agent_baselines/solvers/sqa/sqa.py + hash: md5 + md5: f4da14bad73326e7cddbef9b1a243cbd + size: 9065 + - path: agent-baselines/agent_baselines/solvers/sqa/sqa_subprocess.py + hash: md5 + md5: a127d3e76be6bb990bf79d0a610c9266 + size: 712 params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview sqa_solver_version: may-23-2025 outs: - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval @@ -3757,21 +3764,23 @@ stages: md5: 711b54ac018c9dc4f2cc190c3029c42e size: 56666173 solve_openscholar@test: - cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_openscholar uv run --extra - sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir test_dvc_logs/solver_outputs/ - --solver astabench/solvers/sqa/openscholar/memorized_solver.py -S path=astabench/solvers/sqa/openscholar/openscholar_cache_test.json - -T scorer_model=google/gemini-2.5-flash -T split=test --limit=1000 --retry-on-error=10 + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_openscholar uv run --project + agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display + plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/openscholar/memorized_solver.py + -S + path=agent-baselines/agent_baselines/solvers/sqa/openscholar/openscholar_cache_test.json + -T scorer_model=google/gemini-3-flash-preview -T split=test --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*openscholar.eval 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_openscholar.eval" params: params.yaml: limit: 1000 - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview outs: - path: test_dvc_logs/solver_outputs/task_sqa_solver_openscholar.eval hash: md5 - md5: 2c5b13dc69496592aa3990bbd92cdece - size: 1807380 + md5: 61ff1d386b502863144b85cc37a0b0a4 + size: 8231693 score_all_solvers@model19-test: cmd: echo "Scoring";[[ "openscholar" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_openscholar; cp test_dvc_logs/solver_outputs/task_sqa_solver_openscholar.eval test_dvc_logs/scored/task_sqa_solver_openscholar.eval; @@ -3799,8 +3808,8 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_openscholar.eval hash: md5 - md5: 8b382c2ac78c6d0d4a986ce29d91f81a - size: 4169841 + md5: 2df72bb22d5ec61bedef231f58658fcc + size: 31959611 outs: - path: test_dvc_logs/errors/task_sqa_solver_openscholar.md hash: md5 @@ -3808,8 +3817,8 @@ stages: size: 63 - path: test_dvc_logs/scores/task_sqa_solver_openscholar.md hash: md5 - md5: 261a9ce3237be6729dbe06cd225a38ee - size: 255 + md5: 751b4ae33c6fbf2ae2c69285488911ce + size: 264 create_nice_logs@openscholar-test: cmd: echo "Creating logs"; [[ "openscholar" == */* ]] && mkdir -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname "openscholar"); uv run scripts/create_debug_logs.py test_dvc_logs/scored/ task_sqa_solver_openscholar.eval @@ -3817,24 +3826,24 @@ stages: deps: - path: test_dvc_logs/scored/task_sqa_solver_openscholar.eval hash: md5 - md5: 8b382c2ac78c6d0d4a986ce29d91f81a - size: 4169841 + md5: 2df72bb22d5ec61bedef231f58658fcc + size: 31959611 params: params.yaml: - scorer_model: google/gemini-2.5-flash + scorer_model: google/gemini-3-flash-preview outs: - path: test_dvc_logs/debug_logs/task_sqa_solver_openscholar_answer_precision_eval.csv hash: md5 - md5: fdb84ee559cb4efd25b1eaf1f280b2c8 - size: 2134042 + md5: b265659404c464356fbce005e362e2c6 + size: 3520781 - path: test_dvc_logs/debug_logs/task_sqa_solver_openscholar_citation_eval.csv hash: md5 - md5: cfecf88227640aeb201c402168f56569 - size: 3689118 + md5: 3af39a804f1471261586bd673b9d41e0 + size: 5538888 - path: test_dvc_logs/debug_logs/task_sqa_solver_openscholar_rubric_eval.csv hash: md5 - md5: 0f65b999cfd54471555b33f002f8c0c8 - size: 4089765 + md5: 40352702bf118fa9cf475de442681e9b + size: 4863839 extract_model_responses@openscholar-test: cmd: echo "Extracting responses"; [[ "openscholar" == */* ]] && mkdir -p test_dvc_logs/model_responses/task_sqa_solver_$(dirname "openscholar"); uv run scripts/extract_model_responses.py test_dvc_logs/solver_outputs/task_sqa_solver_openscholar.eval @@ -3849,3 +3858,666 @@ stages: hash: md5 md5: 86bd8ce6f2e7ceb394d8430ff43a6e37 size: 1372454 + solve_sqa@claude-4.6-test: + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_sqa_claude-4.6 uv run --project + agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display + plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/sqa.py@sqa_solver + -T split=test -T scorer_model=google/gemini-3-flash-preview -S completion_model=claude-4.6 + --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*claude-4.6.eval + 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-4.6.eval" + deps: + - path: agent-baselines/agent_baselines/solvers/sqa/sqa.py + hash: md5 + md5: f4da14bad73326e7cddbef9b1a243cbd + size: 9065 + - path: agent-baselines/agent_baselines/solvers/sqa/sqa_subprocess.py + hash: md5 + md5: a127d3e76be6bb990bf79d0a610c9266 + size: 712 + params: + params.yaml: + limit: 1000 + scorer_model: google/gemini-3-flash-preview + sqa_solver_version: may-23-2025 + outs: + - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-4.6.eval + hash: md5 + md5: ef3ac8ff8f95d0fc5028e29002f8d46c + size: 7482873 + solve_sqa@gemini-3.1-pro-preview-test: + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_sqa_gemini-3.1-pro-preview + uv run --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa + --display plain --log-dir test_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/sqa.py@sqa_solver + -T split=test -T scorer_model=google/gemini-3-flash-preview -S completion_model=gemini-3.1-pro-preview + --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls -t test_dvc_logs/solver_outputs/*gemini-3.1-pro-preview.eval + 2>/dev/null | head -n1)" "test_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval" + deps: + - path: agent-baselines/agent_baselines/solvers/sqa/sqa.py + hash: md5 + md5: f4da14bad73326e7cddbef9b1a243cbd + size: 9065 + - path: agent-baselines/agent_baselines/solvers/sqa/sqa_subprocess.py + hash: md5 + md5: a127d3e76be6bb990bf79d0a610c9266 + size: 712 + params: + params.yaml: + limit: 1000 + scorer_model: google/gemini-3-flash-preview + sqa_solver_version: may-23-2025 + outs: + - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval + hash: md5 + md5: f0704bb08be81e64f61de8bad8869c57 + size: 6218859 + solve_futurehouse@model0-dev: + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_fhouse_crow uv run --project + agent-baselines --extra futurehouse inspect eval astabench/evals/sqa/task.py@sqa + --display plain --log-dir dev_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/futurehouse/futurehouse_solver.py + -T scorer_model=google/gemini-3-flash-preview -T split=dev -S max_wait_time=900 + -S agent=CROW --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls + -t dev_dvc_logs/solver_outputs/*fhouse_crow.eval 2>/dev/null | head -n1)" "dev_dvc_logs/solver_outputs/task_sqa_solver_fhouse_crow.eval" + params: + params.yaml: + limit: 1000 + scorer_model: google/gemini-3-flash-preview + sqa_solver_version: may-23-2025 + outs: + - path: dev_dvc_logs/solver_outputs/task_sqa_solver_fhouse_crow.eval + hash: md5 + md5: 4ff53a9992d562521979130e3bb36fcf + size: 1195651 + solve_futurehouse@model1-dev: + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_fhouse_falcon uv run --project + agent-baselines --extra futurehouse inspect eval astabench/evals/sqa/task.py@sqa + --display plain --log-dir dev_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/futurehouse/futurehouse_solver.py + -T scorer_model=google/gemini-3-flash-preview -T split=dev -S max_wait_time=900 + -S agent=FALCON --limit=1000 --retry-on-error=10 --log-shared --no-score; mv + "$(ls -t dev_dvc_logs/solver_outputs/*fhouse_falcon.eval 2>/dev/null | head + -n1)" "dev_dvc_logs/solver_outputs/task_sqa_solver_fhouse_falcon.eval" + params: + params.yaml: + limit: 1000 + scorer_model: google/gemini-3-flash-preview + sqa_solver_version: may-23-2025 + outs: + - path: dev_dvc_logs/solver_outputs/task_sqa_solver_fhouse_falcon.eval + hash: md5 + md5: 2e13a36648af258fdb5bc75d71b00742 + size: 1195707 + solve_sqa@o3_high-dev: + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_sqa_o3_high uv run --project + agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display + plain --log-dir dev_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/sqa.py@sqa_solver + -T split=dev -T scorer_model=google/gemini-3-flash-preview -S completion_model=o3_high + --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls -t dev_dvc_logs/solver_outputs/*o3_high.eval + 2>/dev/null | head -n1)" "dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval" + deps: + - path: agent-baselines/agent_baselines/solvers/sqa/sqa.py + hash: md5 + md5: 45aeed65c5f153adcacc3d447ea98238 + size: 9957 + - path: agent-baselines/agent_baselines/solvers/sqa/sqa_subprocess.py + hash: md5 + md5: a127d3e76be6bb990bf79d0a610c9266 + size: 712 + params: + params.yaml: + limit: 1000 + scorer_model: google/gemini-3-flash-preview + sqa_solver_version: may-23-2025 + outs: + - path: dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval + hash: md5 + md5: af8988dccb45671721dead59433ea82d + size: 10237840 + solve_sqa@claude-4.6-dev: + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_sqa_claude-4.6 uv run --project + agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display + plain --log-dir dev_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/sqa.py@sqa_solver + -T split=dev -T scorer_model=google/gemini-3-flash-preview -S completion_model=claude-4.6 + --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls -t dev_dvc_logs/solver_outputs/*claude-4.6.eval + 2>/dev/null | head -n1)" "dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-4.6.eval" + deps: + - path: agent-baselines/agent_baselines/solvers/sqa/sqa.py + hash: md5 + md5: 45aeed65c5f153adcacc3d447ea98238 + size: 9957 + - path: agent-baselines/agent_baselines/solvers/sqa/sqa_subprocess.py + hash: md5 + md5: a127d3e76be6bb990bf79d0a610c9266 + size: 712 + params: + params.yaml: + limit: 1000 + scorer_model: google/gemini-3-flash-preview + sqa_solver_version: may-23-2025 + outs: + - path: dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-4.6.eval + hash: md5 + md5: 1e9a28fec72a6077240104f2ce1cc1cf + size: 7465958 + score_all_solvers@model5-dev: + cmd: echo "Scoring";[[ "sqa_claude-4.6" == */* ]] && mkdir -p dev_dvc_logs/scored/task_sqa_solver_sqa_claude-4.6; + cp dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-4.6.eval dev_dvc_logs/scored/task_sqa_solver_sqa_claude-4.6.eval; + uv run inspect score --action overwrite --overwrite --display plain --scorer + astabench/evals/sqa/task.py@score_all -S scorer_model=google/gemini-3-flash-preview + -S is_retrieverless=false dev_dvc_logs/scored/task_sqa_solver_sqa_claude-4.6.eval + deps: + - path: dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-4.6.eval + hash: md5 + md5: 1e9a28fec72a6077240104f2ce1cc1cf + size: 7465958 + params: + params.yaml: + scorer_model: google/gemini-3-flash-preview + sqa_scorer_version: mar-09-2026 + outs: + - path: dev_dvc_logs/scored/task_sqa_solver_sqa_claude-4.6.eval + hash: md5 + md5: 56b55f652bf08d6a927f1d5b5740b255 + size: 106144422 + create_nice_logs@sqa_claude-4.6-dev: + cmd: echo "Creating logs"; [[ "sqa_claude-4.6" == */* ]] && mkdir -p dev_dvc_logs/debug_logs/task_sqa_solver_$(dirname + "sqa_claude-4.6"); uv run scripts/create_debug_logs.py dev_dvc_logs/scored/ + task_sqa_solver_sqa_claude-4.6.eval dev_dvc_logs/debug_logs/ + deps: + - path: dev_dvc_logs/scored/task_sqa_solver_sqa_claude-4.6.eval + hash: md5 + md5: 56b55f652bf08d6a927f1d5b5740b255 + size: 106144422 + params: + params.yaml: + scorer_model: google/gemini-3-flash-preview + outs: + - path: dev_dvc_logs/debug_logs/task_sqa_solver_sqa_claude-4.6_answer_precision_eval.csv + hash: md5 + md5: 8476eab180a144e5d8b9e243b5b6e4f5 + size: 15194608 + - path: dev_dvc_logs/debug_logs/task_sqa_solver_sqa_claude-4.6_citation_eval.csv + hash: md5 + md5: 0066c1d4998f7e6c2e511bca70887b9d + size: 91757785 + - path: dev_dvc_logs/debug_logs/task_sqa_solver_sqa_claude-4.6_rubric_eval.csv + hash: md5 + md5: a870b384bcf04f29fdf3fbb2f080d48f + size: 14844789 + create_nice_logs@sqa_o3_high-dev: + cmd: echo "Creating logs"; [[ "sqa_o3_high" == */* ]] && mkdir -p dev_dvc_logs/debug_logs/task_sqa_solver_$(dirname + "sqa_o3_high"); uv run scripts/create_debug_logs.py dev_dvc_logs/scored/ task_sqa_solver_sqa_o3_high.eval + dev_dvc_logs/debug_logs/ + deps: + - path: dev_dvc_logs/scored/task_sqa_solver_sqa_o3_high.eval + hash: md5 + md5: a8a477a26e82358281bd4264e2c66d3a + size: 129493763 + params: + params.yaml: + scorer_model: google/gemini-3-flash-preview + outs: + - path: dev_dvc_logs/debug_logs/task_sqa_solver_sqa_o3_high_answer_precision_eval.csv + hash: md5 + md5: ac887eb880a3273d2e5380304046736b + size: 20150178 + - path: dev_dvc_logs/debug_logs/task_sqa_solver_sqa_o3_high_citation_eval.csv + hash: md5 + md5: f530b493d460dc9a11be8abdf6356024 + size: 143239090 + - path: dev_dvc_logs/debug_logs/task_sqa_solver_sqa_o3_high_rubric_eval.csv + hash: md5 + md5: 8be632f12a365cf6840a935576b6577b + size: 19154600 + log_any_remaining_errors_and_record_scores@sqa_o3_high-dev: + cmd: echo "Collecting errors";[[ "sqa_o3_high" == */* ]] && mkdir -p dev_dvc_logs/errors/task_sqa_solver_$(dirname + "sqa_o3_high"); mkdir -p dev_dvc_logs/scores/task_sqa_solver_$(dirname "sqa_o3_high"); + uv run scripts/log_errors.py dev_dvc_logs/scored/task_sqa_solver_sqa_o3_high.eval + dev_dvc_logs/errors/task_sqa_solver_sqa_o3_high.md dev_dvc_logs/scores/task_sqa_solver_sqa_o3_high.md + deps: + - path: dev_dvc_logs/scored/task_sqa_solver_sqa_o3_high.eval + hash: md5 + md5: a8a477a26e82358281bd4264e2c66d3a + size: 129493763 + outs: + - path: dev_dvc_logs/errors/task_sqa_solver_sqa_o3_high.md + hash: md5 + md5: d51d4f783de486edccf22aea1a28d40d + size: 63 + - path: dev_dvc_logs/scores/task_sqa_solver_sqa_o3_high.md + hash: md5 + md5: 361a16c88aa64c1da7c769091d008b04 + size: 256 + solve_sqa@gemini-3.1-pro-preview-dev: + cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_sqa_gemini-3.1-pro-preview + uv run --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa + --display plain --log-dir dev_dvc_logs/solver_outputs/ --solver agent-baselines/agent_baselines/solvers/sqa/sqa.py@sqa_solver + -T split=dev -T scorer_model=google/gemini-3-flash-preview -S completion_model=gemini-3.1-pro-preview + --limit=1000 --retry-on-error=10 --log-shared --no-score; mv "$(ls -t dev_dvc_logs/solver_outputs/*gemini-3.1-pro-preview.eval + 2>/dev/null | head -n1)" "dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval" + deps: + - path: agent-baselines/agent_baselines/solvers/sqa/sqa.py + hash: md5 + md5: 45aeed65c5f153adcacc3d447ea98238 + size: 9957 + - path: agent-baselines/agent_baselines/solvers/sqa/sqa_subprocess.py + hash: md5 + md5: a127d3e76be6bb990bf79d0a610c9266 + size: 712 + params: + params.yaml: + limit: 1000 + scorer_model: google/gemini-3-flash-preview + sqa_solver_version: may-23-2025 + outs: + - path: dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval + hash: md5 + md5: cbed474ff0e51bccf71f70d8d9d5d0c8 + size: 6467019 + log_any_remaining_errors_and_record_scores@sqa_gemini-3.1-pro-preview-test: + cmd: echo "Collecting errors";[[ "sqa_gemini-3.1-pro-preview" == */* ]] && mkdir + -p test_dvc_logs/errors/task_sqa_solver_$(dirname "sqa_gemini-3.1-pro-preview"); + mkdir -p test_dvc_logs/scores/task_sqa_solver_$(dirname "sqa_gemini-3.1-pro-preview"); + uv run scripts/log_errors.py test_dvc_logs/scored/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval + test_dvc_logs/errors/task_sqa_solver_sqa_gemini-3.1-pro-preview.md test_dvc_logs/scores/task_sqa_solver_sqa_gemini-3.1-pro-preview.md + deps: + - path: test_dvc_logs/scored/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval + hash: md5 + md5: 509c201daa64d147a1b28a9a8096e968 + size: 78679963 + outs: + - path: test_dvc_logs/errors/task_sqa_solver_sqa_gemini-3.1-pro-preview.md + hash: md5 + md5: a3bc09c12b07468c5f0d55dfa997996c + size: 30307 + - path: test_dvc_logs/scores/task_sqa_solver_sqa_gemini-3.1-pro-preview.md + hash: md5 + md5: 63bd872fbfe5144f5f03028c2e38f622 + size: 255 + extract_model_responses@sqa_gemini-3.1-pro-preview-test: + cmd: echo "Extracting responses"; [[ "sqa_gemini-3.1-pro-preview" == */* ]] && + mkdir -p test_dvc_logs/model_responses/task_sqa_solver_$(dirname "sqa_gemini-3.1-pro-preview"); + uv run scripts/extract_model_responses.py test_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval + test_dvc_logs/model_responses/task_sqa_solver_sqa_gemini-3.1-pro-preview_responses.csv + deps: + - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval + hash: md5 + md5: f0704bb08be81e64f61de8bad8869c57 + size: 6218859 + outs: + - path: + test_dvc_logs/model_responses/task_sqa_solver_sqa_gemini-3.1-pro-preview_responses.csv + hash: md5 + md5: 340b4f59929ea22254ac228b59163219 + size: 8492028 + log_any_remaining_errors_and_record_scores@sqa_claude-4.6-dev: + cmd: echo "Collecting errors";[[ "sqa_claude-4.6" == */* ]] && mkdir -p dev_dvc_logs/errors/task_sqa_solver_$(dirname + "sqa_claude-4.6"); mkdir -p dev_dvc_logs/scores/task_sqa_solver_$(dirname "sqa_claude-4.6"); + uv run scripts/log_errors.py dev_dvc_logs/scored/task_sqa_solver_sqa_claude-4.6.eval + dev_dvc_logs/errors/task_sqa_solver_sqa_claude-4.6.md dev_dvc_logs/scores/task_sqa_solver_sqa_claude-4.6.md + deps: + - path: dev_dvc_logs/scored/task_sqa_solver_sqa_claude-4.6.eval + hash: md5 + md5: 56b55f652bf08d6a927f1d5b5740b255 + size: 106144422 + outs: + - path: dev_dvc_logs/errors/task_sqa_solver_sqa_claude-4.6.md + hash: md5 + md5: d51d4f783de486edccf22aea1a28d40d + size: 63 + - path: dev_dvc_logs/scores/task_sqa_solver_sqa_claude-4.6.md + hash: md5 + md5: 1325c56a874ce8e4daeaa5e81269b1e4 + size: 256 + create_nice_logs@sqa_o3_high-test: + cmd: echo "Creating logs"; [[ "sqa_o3_high" == */* ]] && mkdir -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname + "sqa_o3_high"); uv run scripts/create_debug_logs.py test_dvc_logs/scored/ task_sqa_solver_sqa_o3_high.eval + test_dvc_logs/debug_logs/ + deps: + - path: test_dvc_logs/scored/task_sqa_solver_sqa_o3_high.eval + hash: md5 + md5: de7965340653bbdfbac162c88a77d62a + size: 148894425 + params: + params.yaml: + scorer_model: google/gemini-3-flash-preview + outs: + - path: test_dvc_logs/debug_logs/task_sqa_solver_sqa_o3_high_answer_precision_eval.csv + hash: md5 + md5: 38c29755259e07ad568d0b20d8c87957 + size: 24595597 + - path: test_dvc_logs/debug_logs/task_sqa_solver_sqa_o3_high_citation_eval.csv + hash: md5 + md5: b5a199e69a9416dd2c5dee352a9e080a + size: 199803730 + - path: test_dvc_logs/debug_logs/task_sqa_solver_sqa_o3_high_rubric_eval.csv + hash: md5 + md5: b54f40c36bc2c406f937860fb41e0d6f + size: 22754983 + extract_model_responses@sqa_claude-4.6-dev: + cmd: echo "Extracting responses"; [[ "sqa_claude-4.6" == */* ]] && mkdir -p dev_dvc_logs/model_responses/task_sqa_solver_$(dirname + "sqa_claude-4.6"); uv run scripts/extract_model_responses.py dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-4.6.eval + dev_dvc_logs/model_responses/task_sqa_solver_sqa_claude-4.6_responses.csv + deps: + - path: dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-4.6.eval + hash: md5 + md5: 1e9a28fec72a6077240104f2ce1cc1cf + size: 7465958 + outs: + - path: dev_dvc_logs/model_responses/task_sqa_solver_sqa_claude-4.6_responses.csv + hash: md5 + md5: 215866cacf7a1a8d777d33e83cdc0d42 + size: 10190157 + extract_model_responses@sqa_claude-4.6-test: + cmd: echo "Extracting responses"; [[ "sqa_claude-4.6" == */* ]] && mkdir -p test_dvc_logs/model_responses/task_sqa_solver_$(dirname + "sqa_claude-4.6"); uv run scripts/extract_model_responses.py test_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-4.6.eval + test_dvc_logs/model_responses/task_sqa_solver_sqa_claude-4.6_responses.csv + deps: + - path: test_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-4.6.eval + hash: md5 + md5: ef3ac8ff8f95d0fc5028e29002f8d46c + size: 7482873 + outs: + - path: test_dvc_logs/model_responses/task_sqa_solver_sqa_claude-4.6_responses.csv + hash: md5 + md5: 11056e2841066f910c9a20cdbe57c2d6 + size: 10108274 + create_nice_logs@sqa_claude-4.6-test: + cmd: echo "Creating logs"; [[ "sqa_claude-4.6" == */* ]] && mkdir -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname + "sqa_claude-4.6"); uv run scripts/create_debug_logs.py test_dvc_logs/scored/ + task_sqa_solver_sqa_claude-4.6.eval test_dvc_logs/debug_logs/ + deps: + - path: test_dvc_logs/scored/task_sqa_solver_sqa_claude-4.6.eval + hash: md5 + md5: 696310ea4b179f4511f0cffb02a86af4 + size: 103868942 + params: + params.yaml: + scorer_model: google/gemini-3-flash-preview + outs: + - path: + test_dvc_logs/debug_logs/task_sqa_solver_sqa_claude-4.6_answer_precision_eval.csv + hash: md5 + md5: a7c482283024eac90b33975e5e833c2f + size: 15052564 + - path: test_dvc_logs/debug_logs/task_sqa_solver_sqa_claude-4.6_citation_eval.csv + hash: md5 + md5: 53d094e655f6e31c4f0e4e1949a2cc1f + size: 90802762 + - path: test_dvc_logs/debug_logs/task_sqa_solver_sqa_claude-4.6_rubric_eval.csv + hash: md5 + md5: 9fea6589050fd031ea658b5f57ab9f4f + size: 14660996 + create_nice_logs@sqa_gemini-3.1-pro-preview-test: + cmd: echo "Creating logs"; [[ "sqa_gemini-3.1-pro-preview" == */* ]] && mkdir + -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname "sqa_gemini-3.1-pro-preview"); + uv run scripts/create_debug_logs.py test_dvc_logs/scored/ task_sqa_solver_sqa_gemini-3.1-pro-preview.eval + test_dvc_logs/debug_logs/ + deps: + - path: test_dvc_logs/scored/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval + hash: md5 + md5: 509c201daa64d147a1b28a9a8096e968 + size: 78679963 + params: + params.yaml: + scorer_model: google/gemini-3-flash-preview + outs: + - path: + test_dvc_logs/debug_logs/task_sqa_solver_sqa_gemini-3.1-pro-preview_answer_precision_eval.csv + hash: md5 + md5: 202667a438fa88c60c851f1435d6cc74 + size: 12993016 + - path: + test_dvc_logs/debug_logs/task_sqa_solver_sqa_gemini-3.1-pro-preview_citation_eval.csv + hash: md5 + md5: d80c9bde6669317c8e90377d2c9e39bf + size: 70733966 + - path: + test_dvc_logs/debug_logs/task_sqa_solver_sqa_gemini-3.1-pro-preview_rubric_eval.csv + hash: md5 + md5: a323039768948c6620b3bc5c0ddf1562 + size: 12781983 + log_any_remaining_errors_and_record_scores@sqa_gemini-3.1-pro-preview-dev: + cmd: echo "Collecting errors";[[ "sqa_gemini-3.1-pro-preview" == */* ]] && mkdir + -p dev_dvc_logs/errors/task_sqa_solver_$(dirname "sqa_gemini-3.1-pro-preview"); + mkdir -p dev_dvc_logs/scores/task_sqa_solver_$(dirname "sqa_gemini-3.1-pro-preview"); + uv run scripts/log_errors.py dev_dvc_logs/scored/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval + dev_dvc_logs/errors/task_sqa_solver_sqa_gemini-3.1-pro-preview.md dev_dvc_logs/scores/task_sqa_solver_sqa_gemini-3.1-pro-preview.md + deps: + - path: dev_dvc_logs/scored/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval + hash: md5 + md5: 4fdbc777c1b48ce65cf7039f2b280cdd + size: 88626955 + outs: + - path: dev_dvc_logs/errors/task_sqa_solver_sqa_gemini-3.1-pro-preview.md + hash: md5 + md5: 8724eaf86f64b1311df733a31b2a23f2 + size: 45811 + - path: dev_dvc_logs/scores/task_sqa_solver_sqa_gemini-3.1-pro-preview.md + hash: md5 + md5: d813e4e2728ca1c9c63e46ad793fc82b + size: 255 + extract_model_responses@sqa_o3_high-dev: + cmd: echo "Extracting responses"; [[ "sqa_o3_high" == */* ]] && mkdir -p dev_dvc_logs/model_responses/task_sqa_solver_$(dirname + "sqa_o3_high"); uv run scripts/extract_model_responses.py dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval + dev_dvc_logs/model_responses/task_sqa_solver_sqa_o3_high_responses.csv + deps: + - path: dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_o3_high.eval + hash: md5 + md5: af8988dccb45671721dead59433ea82d + size: 10237840 + outs: + - path: dev_dvc_logs/model_responses/task_sqa_solver_sqa_o3_high_responses.csv + hash: md5 + md5: c789e14f0b3ec9e09b189723a27c4203 + size: 13934544 + create_nice_logs@sqa_gemini-3.1-pro-preview-dev: + cmd: echo "Creating logs"; [[ "sqa_gemini-3.1-pro-preview" == */* ]] && mkdir + -p dev_dvc_logs/debug_logs/task_sqa_solver_$(dirname "sqa_gemini-3.1-pro-preview"); + uv run scripts/create_debug_logs.py dev_dvc_logs/scored/ task_sqa_solver_sqa_gemini-3.1-pro-preview.eval + dev_dvc_logs/debug_logs/ + deps: + - path: dev_dvc_logs/scored/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval + hash: md5 + md5: 4fdbc777c1b48ce65cf7039f2b280cdd + size: 88626955 + params: + params.yaml: + scorer_model: google/gemini-3-flash-preview + outs: + - path: + dev_dvc_logs/debug_logs/task_sqa_solver_sqa_gemini-3.1-pro-preview_answer_precision_eval.csv + hash: md5 + md5: 41a0931d5caba776709fa2a8e139999f + size: 14292940 + - path: + dev_dvc_logs/debug_logs/task_sqa_solver_sqa_gemini-3.1-pro-preview_citation_eval.csv + hash: md5 + md5: 2aea34650fcf7c3888592a6b4608484c + size: 79977272 + - path: + dev_dvc_logs/debug_logs/task_sqa_solver_sqa_gemini-3.1-pro-preview_rubric_eval.csv + hash: md5 + md5: 7067909219048d538c53daada5a0c6e3 + size: 13789837 + extract_model_responses@sqa_gemini-3.1-pro-preview-dev: + cmd: echo "Extracting responses"; [[ "sqa_gemini-3.1-pro-preview" == */* ]] && + mkdir -p dev_dvc_logs/model_responses/task_sqa_solver_$(dirname "sqa_gemini-3.1-pro-preview"); + uv run scripts/extract_model_responses.py dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval + dev_dvc_logs/model_responses/task_sqa_solver_sqa_gemini-3.1-pro-preview_responses.csv + deps: + - path: dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_gemini-3.1-pro-preview.eval + hash: md5 + md5: cbed474ff0e51bccf71f70d8d9d5d0c8 + size: 6467019 + outs: + - path: + dev_dvc_logs/model_responses/task_sqa_solver_sqa_gemini-3.1-pro-preview_responses.csv + hash: md5 + md5: 9a275b862ae580682e776e3765906097 + size: 9171862 + log_any_remaining_errors_and_record_scores@sqa_claude-4.6-test: + cmd: echo "Collecting errors";[[ "sqa_claude-4.6" == */* ]] && mkdir -p test_dvc_logs/errors/task_sqa_solver_$(dirname + "sqa_claude-4.6"); mkdir -p test_dvc_logs/scores/task_sqa_solver_$(dirname "sqa_claude-4.6"); + uv run scripts/log_errors.py test_dvc_logs/scored/task_sqa_solver_sqa_claude-4.6.eval + test_dvc_logs/errors/task_sqa_solver_sqa_claude-4.6.md test_dvc_logs/scores/task_sqa_solver_sqa_claude-4.6.md + deps: + - path: test_dvc_logs/scored/task_sqa_solver_sqa_claude-4.6.eval + hash: md5 + md5: 696310ea4b179f4511f0cffb02a86af4 + size: 103868942 + outs: + - path: test_dvc_logs/errors/task_sqa_solver_sqa_claude-4.6.md + hash: md5 + md5: 94a1c1680717a158e7f16dfc642bc55c + size: 15167 + - path: test_dvc_logs/scores/task_sqa_solver_sqa_claude-4.6.md + hash: md5 + md5: e597800f5c4eac984aad9a6f16bef594 + size: 255 + create_nice_logs@anthropic/claude-sonnet-4-6-thinking-test: + cmd: echo "Creating logs"; [[ "anthropic/claude-sonnet-4-6-thinking" == */* ]] + && mkdir -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname "anthropic/claude-sonnet-4-6-thinking"); + uv run scripts/create_debug_logs.py test_dvc_logs/scored/ task_sqa_solver_anthropic/claude-sonnet-4-6-thinking.eval + test_dvc_logs/debug_logs/ + deps: + - path: test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking.eval + hash: md5 + md5: 89dff466d5b840c822f1730509db6c7e + size: 61631233 + params: + params.yaml: + scorer_model: google/gemini-3-flash-preview + outs: + - path: + test_dvc_logs/debug_logs/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking_answer_precision_eval.csv + hash: md5 + md5: a0e4fb47666252560e09588824a5e9df + size: 5049156 + - path: + test_dvc_logs/debug_logs/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking_citation_eval.csv + hash: md5 + md5: 7e2b481df9feb79f2db794ede95c5c7b + size: 21829229 + - path: + test_dvc_logs/debug_logs/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking_rubric_eval.csv + hash: md5 + md5: e381ec30fc4da34f111af2cfaf415c70 + size: 6143276 + create_nice_logs@anthropic/claude-sonnet-4-6-test: + cmd: echo "Creating logs"; [[ "anthropic/claude-sonnet-4-6" == */* ]] && mkdir + -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname "anthropic/claude-sonnet-4-6"); + uv run scripts/create_debug_logs.py test_dvc_logs/scored/ task_sqa_solver_anthropic/claude-sonnet-4-6.eval + test_dvc_logs/debug_logs/ + deps: + - path: test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-6.eval + hash: md5 + md5: 28ffc1ba7ce9a349ec95ca854b5f1b3d + size: 51159281 + params: + params.yaml: + scorer_model: google/gemini-3-flash-preview + outs: + - path: + test_dvc_logs/debug_logs/task_sqa_solver_anthropic/claude-sonnet-4-6_answer_precision_eval.csv + hash: md5 + md5: 9a3e7d57814ac1dfa181843fe90579da + size: 4118799 + - path: + test_dvc_logs/debug_logs/task_sqa_solver_anthropic/claude-sonnet-4-6_citation_eval.csv + hash: md5 + md5: 231e97ff7e987f086b670826c981f249 + size: 18710123 + - path: + test_dvc_logs/debug_logs/task_sqa_solver_anthropic/claude-sonnet-4-6_rubric_eval.csv + hash: md5 + md5: d66802329bfa67070d48a3ef983baf6a + size: 5279645 + log_any_remaining_errors_and_record_scores@google/gemini-3.1-pro-preview-test: + cmd: echo "Collecting errors";[[ "google/gemini-3.1-pro-preview" == */* ]] && + mkdir -p test_dvc_logs/errors/task_sqa_solver_$(dirname "google/gemini-3.1-pro-preview"); + mkdir -p test_dvc_logs/scores/task_sqa_solver_$(dirname "google/gemini-3.1-pro-preview"); + uv run scripts/log_errors.py test_dvc_logs/scored/task_sqa_solver_google/gemini-3.1-pro-preview.eval + test_dvc_logs/errors/task_sqa_solver_google/gemini-3.1-pro-preview.md test_dvc_logs/scores/task_sqa_solver_google/gemini-3.1-pro-preview.md + deps: + - path: test_dvc_logs/scored/task_sqa_solver_google/gemini-3.1-pro-preview.eval + hash: md5 + md5: 7c3c78380c1953a6ce60ca8a81e75bdb + size: 53929324 + outs: + - path: test_dvc_logs/errors/task_sqa_solver_google/gemini-3.1-pro-preview.md + hash: md5 + md5: d51d4f783de486edccf22aea1a28d40d + size: 63 + - path: test_dvc_logs/scores/task_sqa_solver_google/gemini-3.1-pro-preview.md + hash: md5 + md5: 1b722f0d3c75681b9268257c29e6c599 + size: 255 + create_nice_logs@google/gemini-3.1-pro-preview-test: + cmd: echo "Creating logs"; [[ "google/gemini-3.1-pro-preview" == */* ]] && mkdir + -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname "google/gemini-3.1-pro-preview"); + uv run scripts/create_debug_logs.py test_dvc_logs/scored/ task_sqa_solver_google/gemini-3.1-pro-preview.eval + test_dvc_logs/debug_logs/ + deps: + - path: test_dvc_logs/scored/task_sqa_solver_google/gemini-3.1-pro-preview.eval + hash: md5 + md5: 7c3c78380c1953a6ce60ca8a81e75bdb + size: 53929324 + params: + params.yaml: + scorer_model: google/gemini-3-flash-preview + outs: + - path: + test_dvc_logs/debug_logs/task_sqa_solver_google/gemini-3.1-pro-preview_answer_precision_eval.csv + hash: md5 + md5: 9be4c3338c3597fb4edd8acc1c65ede9 + size: 2289533 + - path: + test_dvc_logs/debug_logs/task_sqa_solver_google/gemini-3.1-pro-preview_citation_eval.csv + hash: md5 + md5: 2da75d6105f760753671ff82b5164610 + size: 6702396 + - path: + test_dvc_logs/debug_logs/task_sqa_solver_google/gemini-3.1-pro-preview_rubric_eval.csv + hash: md5 + md5: b1e5a66e405419e80d56d6fb83d3d9f0 + size: 4027808 + log_any_remaining_errors_and_record_scores@anthropic/claude-sonnet-4-6-thinking-test: + cmd: echo "Collecting errors";[[ "anthropic/claude-sonnet-4-6-thinking" == */* + ]] && mkdir -p test_dvc_logs/errors/task_sqa_solver_$(dirname "anthropic/claude-sonnet-4-6-thinking"); + mkdir -p test_dvc_logs/scores/task_sqa_solver_$(dirname "anthropic/claude-sonnet-4-6-thinking"); + uv run scripts/log_errors.py test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking.eval + test_dvc_logs/errors/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking.md + test_dvc_logs/scores/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking.md + deps: + - path: test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking.eval + hash: md5 + md5: 89dff466d5b840c822f1730509db6c7e + size: 61631233 + outs: + - path: test_dvc_logs/errors/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking.md + hash: md5 + md5: d51d4f783de486edccf22aea1a28d40d + size: 63 + - path: test_dvc_logs/scores/task_sqa_solver_anthropic/claude-sonnet-4-6-thinking.md + hash: md5 + md5: dabd9b8e8fa8e96191e68cf6b8226f22 + size: 255 + log_any_remaining_errors_and_record_scores@anthropic/claude-sonnet-4-6-test: + cmd: echo "Collecting errors";[[ "anthropic/claude-sonnet-4-6" == */* ]] && mkdir + -p test_dvc_logs/errors/task_sqa_solver_$(dirname "anthropic/claude-sonnet-4-6"); + mkdir -p test_dvc_logs/scores/task_sqa_solver_$(dirname "anthropic/claude-sonnet-4-6"); + uv run scripts/log_errors.py test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-6.eval + test_dvc_logs/errors/task_sqa_solver_anthropic/claude-sonnet-4-6.md test_dvc_logs/scores/task_sqa_solver_anthropic/claude-sonnet-4-6.md + deps: + - path: test_dvc_logs/scored/task_sqa_solver_anthropic/claude-sonnet-4-6.eval + hash: md5 + md5: 28ffc1ba7ce9a349ec95ca854b5f1b3d + size: 51159281 + outs: + - path: test_dvc_logs/errors/task_sqa_solver_anthropic/claude-sonnet-4-6.md + hash: md5 + md5: d51d4f783de486edccf22aea1a28d40d + size: 63 + - path: test_dvc_logs/scores/task_sqa_solver_anthropic/claude-sonnet-4-6.md + hash: md5 + md5: f276952fdb1cba923fd12296aa7e5649 + size: 255 diff --git a/dvc.yaml b/dvc.yaml index e0afd4e5..2b874461 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -23,10 +23,10 @@ stages: # run the solver then rename the resulting file to have a nice name: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_sqa_${item.model} - uv run --extra sqa + uv run --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item.split}_dvc_logs/solver_outputs/ - --solver astabench/solvers/sqa/sqa.py@sqa_solver + --solver agent-baselines/agent_baselines/solvers/sqa/sqa.py@sqa_solver -T split=${item.split} -T scorer_model=${scorer_model} -S completion_model=${item.model} @@ -39,6 +39,9 @@ stages: - limit - scorer_model - sqa_solver_version + deps: + - agent-baselines/agent_baselines/solvers/sqa/sqa.py + - agent-baselines/agent_baselines/solvers/sqa/sqa_subprocess.py outs: - ${item.split}_dvc_logs/solver_outputs/task_sqa_solver_sqa_${item.model}.eval @@ -50,10 +53,10 @@ stages: do: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_elicit - uv run --extra sqa + uv run --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item}_dvc_logs/solver_outputs/ - --solver astabench/solvers/sqa/elicit/memorized_solver.py@elicit_solver + --solver agent-baselines/agent_baselines/solvers/sqa/elicit/memorized_solver.py@elicit_solver -T split=${item} -T scorer_model=${scorer_model} --limit=${limit} @@ -74,10 +77,10 @@ stages: do: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_you - uv run --extra sqa + uv run --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item}_dvc_logs/solver_outputs/ - --solver astabench/solvers/sqa/formatted_youcom.py@formatted_solver -S api_type='research' + --solver agent-baselines/agent_baselines/solvers/sqa/formatted_youcom.py@formatted_solver -S api_type='research' -T split=${item} -T scorer_model=${scorer_model} --limit=${limit} @@ -98,12 +101,12 @@ stages: do: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_perplexity_dr - uv run --extra sqa + uv run --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item}_dvc_logs/solver_outputs/ -T with_search_tools=False --model 'perplexity/sonar-deep-research' - --solver astabench/solvers/sqa/formatted_perplexity.py@formatted_solver + --solver agent-baselines/agent_baselines/solvers/sqa/formatted_perplexity.py@formatted_solver -T split=${item} -T scorer_model=${scorer_model} --limit=${limit} @@ -124,10 +127,10 @@ stages: do: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_storm - uv run --extra storm --python 3.11 + uv run --project agent-baselines/solvers/storm --python 3.11 --frozen -- inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item}_dvc_logs/solver_outputs/ - --solver astabench/solvers/sqa/storm_solver.py@storm_solver + --solver agent-baselines/agent_baselines/solvers/sqa/storm_solver.py@storm_solver -T split=${item} -T scorer_model=${scorer_model} --limit=${limit} @@ -148,10 +151,10 @@ stages: # run the solver then rename the resulting file to have a nice name: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_scispace - uv run + uv run --project agent-baselines inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item.split}_dvc_logs/solver_outputs/ - --solver astabench/solvers/sqa/scispace/scispace.py@formatted_solver + --solver agent-baselines/agent_baselines/solvers/sqa/scispace/scispace.py@formatted_solver -T scorer_model=${scorer_model} -T split=${item.split} --limit=${limit} @@ -174,11 +177,11 @@ stages: do: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_openscholar - uv run --extra sqa + uv run --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item}_dvc_logs/solver_outputs/ - --solver astabench/solvers/sqa/openscholar/memorized_solver.py - -S path=astabench/solvers/sqa/openscholar/openscholar_cache_${item}.json + --solver agent-baselines/agent_baselines/solvers/sqa/openscholar/memorized_solver.py + -S path=agent-baselines/agent_baselines/solvers/sqa/openscholar/openscholar_cache_${item}.json -T scorer_model=${scorer_model} -T split=${item} --limit=${limit} @@ -217,10 +220,10 @@ stages: # run the solver then rename the resulting file to have a nice name: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_${item.model.llm_name}${item.model.suffix} - uv run --extra sqa + uv run --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item.split}_dvc_logs/solver_outputs/ - --solver astabench/solvers/sqa/formatted_llm.py@formatted_solver + --solver agent-baselines/agent_baselines/solvers/sqa/formatted_llm.py@formatted_solver --model ${item.model.llm_name} ${item.model.llm_args} -T split=${item.split} -T scorer_model=${scorer_model} @@ -251,10 +254,10 @@ stages: # run the solver then rename the resulting file to have a nice name: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_${item.model.name} - uv run --extra futurehouse + uv run --project agent-baselines --extra futurehouse inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item.split}_dvc_logs/solver_outputs/ - --solver astabench/solvers/futurehouse/futurehouse_solver.py + --solver agent-baselines/agent_baselines/solvers/futurehouse/futurehouse_solver.py -T scorer_model=${scorer_model} -T split=${item.split} -S max_wait_time=900 ${item.model.solver_args} @@ -281,10 +284,10 @@ stages: # run the solver then rename the resulting file to have a nice name: cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_sqa_solver_${item.model.name} - uv run --extra sqa + uv run --project agent-baselines --extra sqa inspect eval astabench/evals/sqa/task.py@sqa --display plain --log-dir ${item.split}_dvc_logs/solver_outputs/ - --solver astabench/solvers/sqa/general_memorized/memorized_solver.py@formatted_solver + --solver agent-baselines/agent_baselines/solvers/sqa/general_memorized/memorized_solver.py@formatted_solver -T scorer_model=${scorer_model} -T split=${item.split} -S sys_name_or_path=${item.model.name} ${item.model.solver_args} @@ -317,7 +320,7 @@ stages: # sqa solvers: - name: sqa_claude-4.6 is_retrieverless: false - - name: sqa_gemini-3.1 + - name: sqa_gemini-3.1-pro-preview is_retrieverless: false - name: sqa_o3_high is_retrieverless: false @@ -349,6 +352,7 @@ stages: cp ${item.split}_dvc_logs/solver_outputs/task_sqa_solver_${item.model.name}.eval ${item.split}_dvc_logs/scored/task_sqa_solver_${item.model.name}.eval; uv run inspect score + --action overwrite --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all @@ -373,7 +377,7 @@ stages: - google/gemini-3.1-pro-preview - anthropic/claude-sonnet-4-6 - sqa_claude-4.6 - - sqa_gemini-3.1 + - sqa_gemini-3.1-pro-preview - sqa_o3_high - elicit - storm @@ -407,7 +411,7 @@ stages: - google/gemini-3.1-pro-preview - anthropic/claude-sonnet-4-6 - sqa_claude-4.6 - - sqa_gemini-3.1 + - sqa_gemini-3.1-pro-preview - sqa_o3_high - elicit - storm @@ -439,7 +443,7 @@ stages: - google/gemini-3.1-pro-preview - anthropic/claude-sonnet-4-6 - sqa_claude-4.6 - - sqa_gemini-3.1 + - sqa_gemini-3.1-pro-preview - sqa_o3_high - elicit - storm diff --git a/pyproject.toml b/pyproject.toml index 01d37c2f..e1c6c1f9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,12 +10,12 @@ requires-python = ">=3.11" dependencies = [ "inspect_ai==0.3.114", "agent-eval[leaderboard]==0.1.46", - "openai>=1.78.0", # required by inspect + "openai>=2.30.0", # provider SDK version validated with the current model set "pydantic>=2.11.4", # required by inspect "litellm==1.82.3", "datasets~=3.2.0", "huggingface_hub", - "google-genai>=1.16.1", + "google-genai>=1.70.0", "nltk", "tabulate", "click==8.1.8", @@ -26,7 +26,7 @@ dependencies = [ "h2~=4.2.0", "pandas", "scipy", - "anthropic>=0.52.0", + "anthropic>=0.87.0", "platformdirs", "numpy", ] @@ -64,14 +64,6 @@ conflicts = [ [{extra = "sqa"}, {extra = "futurehouse"}], [{extra = "storm"}, {extra = "smolagents"}], ] -override-dependencies = [ - # sqa pins openai to a lower version than inspect requires - "openai==1.78.0", - - # STORM pretends to require a lower version, but doesn't actually need it: - # https://github.com/allenai/asta-bench/issues/31#issuecomment-3045978008 - "datasets~=3.2.0", -] [tool.flake8] select = [ diff --git a/tests/test_sqa_retry_utils.py b/tests/test_sqa_retry_utils.py new file mode 100644 index 00000000..24069979 --- /dev/null +++ b/tests/test_sqa_retry_utils.py @@ -0,0 +1,105 @@ +from types import SimpleNamespace + +import pytest + +from astabench.evals.sqa.retry_utils import generate_with_retry +from astabench.evals.sqa.rubric import RubricCorpusQaGenericMetric + + +class FakeModel: + def __init__(self, completions): + self._completions = iter(completions) + + async def generate(self, prompt_or_messages, config): + return SimpleNamespace(completion=next(self._completions)) + + +@pytest.mark.asyncio +async def test_generate_with_retry_retries_on_parsed_validator_failure(): + model = FakeModel( + [ + '{"scores": [{"criteria_idx": 1}]}', + '{"scores": [{"criteria_idx": 1}, {"criteria_idx": 2}]}', + ] + ) + + def validate(parsed): + indices = [score["criteria_idx"] for score in parsed["scores"]] + if indices != [1, 2]: + raise ValueError(f"incomplete criteria indices: {indices}") + + _, parsed, num_retries = await generate_with_retry( + model=model, + prompt_or_messages=[], + config=SimpleNamespace(), + max_retries=1, + base_delay=0, + parsed_validator=validate, + ) + + assert [score["criteria_idx"] for score in parsed["scores"]] == [1, 2] + assert num_retries == 1 + + +@pytest.mark.asyncio +async def test_joint_rubric_assessment_retries_until_all_criteria_are_scored(): + model = FakeModel( + [ + """{ + "scores": [ + { + "criteria": "criterion 1", + "criteria_idx": 1, + "reasoning": "partial response", + "score": 2, + "evidence": "evidence 1" + } + ] + }""", + """{ + "scores": [ + { + "criteria": "criterion 1", + "criteria_idx": 1, + "reasoning": "covers criterion 1", + "score": 2, + "evidence": "evidence 1" + }, + { + "criteria": "criterion 2", + "criteria_idx": 2, + "reasoning": "covers criterion 2", + "score": 1, + "evidence": "evidence 2" + } + ] + }""", + ] + ) + metric = RubricCorpusQaGenericMetric( + config={ + "question": "Test question", + "ingredients": [ + { + "name": "criterion_a", + "criterion": "Assess criterion A", + "weight": 0.5, + "examples": ["example A"], + }, + { + "name": "criterion_b", + "criterion": "Assess criterion B", + "weight": 0.5, + "examples": ["example B"], + }, + ], + }, + model=model, + ) + + score_components, prompt_logs = await metric._assess_properties_jointly( + "candidate response", metric.config.ingredients + ) + + assert score_components == {"criterion_a": 1.0, "criterion_b": 0.5} + assert prompt_logs["num_retries"][0]["data"]["num_retries"] == 1