Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -196,3 +196,6 @@ uv.lock

# AI docs
CLAUDE.md
/mylitqaruns
/perplitqa
/MYSTORM
6 changes: 6 additions & 0 deletions MYSTORM.dvc
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
outs:
- md5: 3b9f3dfce362059f815d0f797a1d4ed1.dir
size: 15575711
nfiles: 2
hash: md5
path: MYSTORM
14 changes: 12 additions & 2 deletions astabench/evals/labbench/litqa2/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@
import random
import string
from typing import Literal, NamedTuple, cast
import os

import pandas as pd
from datasets import Dataset, DatasetDict, IterableDatasetDict, load_dataset
from huggingface_hub import hf_hub_download
from inspect_ai import Task, task, task_with
from inspect_ai.dataset import MemoryDataset, Sample
from inspect_ai.scorer import (
Expand Down Expand Up @@ -201,11 +203,17 @@ def load_litqa2(split: Literal["dev", "test", "all"] = "all") -> list[dict]:
if split == "all":
return list(map(dict, dataset["train"]))

litqa_mappings = load_dataset(
p = hf_hub_download(
ASTA_BENCH_DATASET_REPO,
data_files="tasks/labbench/litqa2_mapping.json",
"tasks/labbench/litqa2_mapping.json",
token=os.getenv("HF_TOKEN"),
repo_type="dataset",
revision=ASTA_BENCH_DATASET_REVISION,
)
litqa_mappings = load_dataset(
"json",
data_files=p,
)

df = pd.merge(
dataset["train"].to_pandas(),
Expand Down Expand Up @@ -282,6 +290,8 @@ def litqa2(*litqa_args, **litqa_kwargs):
textin/textout format instead of Inspect's `state.choices`. See litqa2()
for more task details."""

litqa_kwargs["with_search_tools"] = False
litqa_kwargs["split"] = "test"
base_task = litqa2_inspect(*litqa_args, **litqa_kwargs)

dataset = []
Expand Down
111 changes: 72 additions & 39 deletions dvc.lock
Original file line number Diff line number Diff line change
Expand Up @@ -1563,17 +1563,17 @@ stages:
deps:
- path: dev_dvc_logs/scored/task_sqa_solver_storm.eval
hash: md5
md5: e7d45e7b258d42bf6ff558cd006704c7
size: 11864550
md5: 97f48e62c18b183287cdc4c6865e58cd
size: 12376461
outs:
- path: dev_dvc_logs/errors/task_sqa_solver_storm.md
hash: md5
md5: d51d4f783de486edccf22aea1a28d40d
size: 63
- path: dev_dvc_logs/scores/task_sqa_solver_storm.md
hash: md5
md5: 0f45e2a63de4bb733c9ccaa0096d2cfd
size: 258
md5: 75fd2c3700e0909c5f0b6dfe9b0b761b
size: 255
log_any_remaining_errors_and_record_scores@scispace-test:
cmd: echo "Collecting errors";[[ "scispace" == */* ]] && mkdir -p test_dvc_logs/errors/task_sqa_solver_$(dirname
"scispace"); mkdir -p test_dvc_logs/scores/task_sqa_solver_$(dirname "scispace");
Expand Down Expand Up @@ -2042,21 +2042,21 @@ stages:
deps:
- path: dev_dvc_logs/scored/task_sqa_solver_storm.eval
hash: md5
md5: e7d45e7b258d42bf6ff558cd006704c7
size: 11864550
md5: 97f48e62c18b183287cdc4c6865e58cd
size: 12376461
outs:
- path: dev_dvc_logs/debug_logs/task_sqa_solver_storm_answer_precision_eval.csv
hash: md5
md5: c0c459c473d06d6ea34639f96c7cbd33
size: 16555337
md5: 47b464a14a21d661fd653783bad2660a
size: 16670397
- path: dev_dvc_logs/debug_logs/task_sqa_solver_storm_citation_eval.csv
hash: md5
md5: 330415d5d61ee3a7be3e34715d7b35f5
size: 111187433
md5: 429e713b73be994d1800ae31aaca19ce
size: 113172536
- path: dev_dvc_logs/debug_logs/task_sqa_solver_storm_rubric_eval.csv
hash: md5
md5: f44e2ca8fc7ef0f4a0852239526907e7
size: 14538513
md5: c88aa94a89a82f4417129e15c1984c9c
size: 15433813
create_nice_logs@fhouse_crow-test:
cmd: echo "Creating logs"; [[ "fhouse_crow" == */* ]] && mkdir -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname
"fhouse_crow"); uv run scripts/create_debug_logs.py test_dvc_logs/scored/ task_sqa_solver_fhouse_crow.eval
Expand Down Expand Up @@ -2400,8 +2400,8 @@ stages:
outs:
- path: test_dvc_logs/solver_outputs/task_sqa_solver_storm.eval
hash: md5
md5: e9f0f203bcfba80faf017d7545d35ed6
size: 4914841
md5: b5dd1f633d5c33253de4969301614a19
size: 5972462
create_nice_logs@anthropic/claude-sonnet-4-20250514-test:
cmd: echo "Creating logs"; [[ "anthropic/claude-sonnet-4-20250514" == */* ]] &&
mkdir -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname "anthropic/claude-sonnet-4-20250514");
Expand Down Expand Up @@ -2551,13 +2551,13 @@ stages:
deps:
- path: test_dvc_logs/solver_outputs/task_sqa_solver_storm.eval
hash: md5
md5: e9f0f203bcfba80faf017d7545d35ed6
size: 4914841
md5: b5dd1f633d5c33253de4969301614a19
size: 5972462
outs:
- path: test_dvc_logs/model_responses/task_sqa_solver_storm_responses.csv
hash: md5
md5: 36f7918f26cea5397c9ef0ba7d183022
size: 12043391
md5: f654209834c01edafe19a8508f651ba2
size: 12383893
extract_model_responses@sqa_claude-4.0-dev:
cmd: echo "Extracting responses"; [[ "sqa_claude-4.0" == */* ]] && mkdir -p dev_dvc_logs/model_responses/task_sqa_solver_$(dirname
"sqa_claude-4.0"); uv run scripts/extract_model_responses.py dev_dvc_logs/solver_outputs/task_sqa_solver_sqa_claude-4.0.eval
Expand Down Expand Up @@ -2722,16 +2722,16 @@ stages:
deps:
- path: test_dvc_logs/scored/task_sqa_solver_storm.eval
hash: md5
md5: 623b39e2eefa43e4d80431478bb203d2
size: 11677399
md5: e3ff9965c9b0b5b16aecd9562322b064
size: 15494227
outs:
- path: test_dvc_logs/errors/task_sqa_solver_storm.md
hash: md5
md5: d51d4f783de486edccf22aea1a28d40d
size: 63
- path: test_dvc_logs/scores/task_sqa_solver_storm.md
hash: md5
md5: 694b9ca1ddf5101c219761bf3d462fdf
md5: 6639f2d3170b15cb08a9d49998b8d4d3
size: 255
create_nice_logs@storm-test:
cmd: echo "Creating logs"; [[ "storm" == */* ]] && mkdir -p test_dvc_logs/debug_logs/task_sqa_solver_$(dirname
Expand All @@ -2740,21 +2740,21 @@ stages:
deps:
- path: test_dvc_logs/scored/task_sqa_solver_storm.eval
hash: md5
md5: 623b39e2eefa43e4d80431478bb203d2
size: 11677399
md5: e3ff9965c9b0b5b16aecd9562322b064
size: 15494227
outs:
- path: test_dvc_logs/debug_logs/task_sqa_solver_storm_answer_precision_eval.csv
hash: md5
md5: ab52f71f30a9d2a597ebb34f88262d79
size: 16080979
md5: 9bfe7de194fed56642a0a642e0b26b8f
size: 16456085
- path: test_dvc_logs/debug_logs/task_sqa_solver_storm_citation_eval.csv
hash: md5
md5: d11a191fbad1750210a95913616e52b0
size: 114307815
md5: 23ace6f751c85838448516a1aa4b5c9a
size: 118676192
- path: test_dvc_logs/debug_logs/task_sqa_solver_storm_rubric_eval.csv
hash: md5
md5: 7f5ce9d87cebc8b11ed539f793ee9ea8
size: 14245561
md5: d17a5e5dd8873bb5c82b982a6e9940b9
size: 17032728
score_all_solvers@model12-test:
cmd: echo "Scoring";[[ "scispace" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_scispace;
cp test_dvc_logs/solver_outputs/task_sqa_solver_scispace.eval test_dvc_logs/scored/task_sqa_solver_scispace.eval;
Expand Down Expand Up @@ -2797,21 +2797,21 @@ stages:
cmd: echo "Scoring";[[ "storm" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_storm;
cp test_dvc_logs/solver_outputs/task_sqa_solver_storm.eval test_dvc_logs/scored/task_sqa_solver_storm.eval;
uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all
-S scorer_model=google/gemini-2.5-pro -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_storm.eval
-S scorer_model=google/gemini-2.5-flash -S is_retrieverless=false test_dvc_logs/scored/task_sqa_solver_storm.eval
deps:
- path: test_dvc_logs/solver_outputs/task_sqa_solver_storm.eval
hash: md5
md5: e9f0f203bcfba80faf017d7545d35ed6
size: 4914841
md5: b5dd1f633d5c33253de4969301614a19
size: 5972462
params:
params.yaml:
scorer_model: google/gemini-2.5-pro
scorer_model: google/gemini-2.5-flash
sqa_scorer_version: may-23-2025
outs:
- path: test_dvc_logs/scored/task_sqa_solver_storm.eval
hash: md5
md5: 623b39e2eefa43e4d80431478bb203d2
size: 11677399
md5: e3ff9965c9b0b5b16aecd9562322b064
size: 15494227
score_all_solvers@model13-test:
cmd: echo "Scoring";[[ "fhouse_crow" == */* ]] && mkdir -p test_dvc_logs/scored/task_sqa_solver_fhouse_crow;
cp test_dvc_logs/solver_outputs/task_sqa_solver_fhouse_crow.eval test_dvc_logs/scored/task_sqa_solver_fhouse_crow.eval;
Expand Down Expand Up @@ -3472,21 +3472,21 @@ stages:
cmd: echo "Scoring";[[ "storm" == */* ]] && mkdir -p dev_dvc_logs/scored/task_sqa_solver_storm;
cp dev_dvc_logs/solver_outputs/task_sqa_solver_storm.eval dev_dvc_logs/scored/task_sqa_solver_storm.eval;
uv run inspect score --overwrite --display plain --scorer astabench/evals/sqa/task.py@score_all
-S scorer_model=google/gemini-2.5-pro -S is_retrieverless=false dev_dvc_logs/scored/task_sqa_solver_storm.eval
-S scorer_model=google/gemini-2.5-flash -S is_retrieverless=false dev_dvc_logs/scored/task_sqa_solver_storm.eval
deps:
- path: dev_dvc_logs/solver_outputs/task_sqa_solver_storm.eval
hash: md5
md5: f3a50567cc194c50f8c4f7e19991b08c
size: 4996230
params:
params.yaml:
scorer_model: google/gemini-2.5-pro
scorer_model: google/gemini-2.5-flash
sqa_scorer_version: may-23-2025
outs:
- path: dev_dvc_logs/scored/task_sqa_solver_storm.eval
hash: md5
md5: e7d45e7b258d42bf6ff558cd006704c7
size: 11864550
md5: 97f48e62c18b183287cdc4c6865e58cd
size: 12376461
score_all_solvers@model17-dev:
cmd: echo "Scoring";[[ "perplexity_dr" == */* ]] && mkdir -p dev_dvc_logs/scored/task_sqa_solver_perplexity_dr;
cp dev_dvc_logs/solver_outputs/task_sqa_solver_perplexity_dr.eval dev_dvc_logs/scored/task_sqa_solver_perplexity_dr.eval;
Expand Down Expand Up @@ -3686,3 +3686,36 @@ stages:
hash: md5
md5: 415d07f1989b99afd7298c761f9438fa
size: 255
run_litqa2_you:
cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_litqa2_solver_you uv run inspect eval
astabench/evals/labbench/litqa2/task.py@litqa2 --display plain --log-dir litqa2_dvc_logs/
--solver astabench/solvers/youcom.py@youcom_solver -S api_type='research' --no-fail-on-error
--limit=1; mv "$(ls -t litqa2_dvc_logs/*task_litqa2_solver_you.eval 2>/dev/null
| head -n1)" "litqa2_dvc_logs/task_litqa2_solver_you.eval"
outs:
- path: litqa2_dvc_logs/task_litqa2_solver_you.eval
hash: md5
md5: a5a1bebd61bce9b35751dccfa2c52a3c
size: 5358
run_litqa2_perplexity:
cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_litqa2_solver_perplexity uv run inspect
eval astabench/evals/labbench/litqa2/task.py@litqa2 --display plain --log-dir
litqa2_dvc_logs/ --model 'perplexity/sonar-deep-research' --solver astabench/solvers/sqa/perplexity_base.py@perplexity_solver
--no-fail-on-error -T split=test --limit=1; mv "$(ls -t litqa2_dvc_logs/*task_litqa2_solver_perplexity.eval
2>/dev/null | head -n1)" "litqa2_dvc_logs/task_litqa2_solver_perplexity.eval"
outs:
- path: litqa2_dvc_logs/task_litqa2_solver_perplexity.eval
hash: md5
md5: 33ecb5e0bc723947ab12c0f92adfb9f9
size: 9180
run_litqa2_storm:
cmd: INSPECT_EVAL_LOG_FILE_PATTERN=task_litqa2_solver_storm uv run --extra storm
--python 3.11 inspect eval astabench/evals/labbench/litqa2/task.py@litqa2 --display
plain --log-dir litqa2_dvc_logs/ --solver astabench/solvers/sqa/storm_solver.py@storm_solver
--no-fail-on-error --limit=1; mv "$(ls -t litqa2_dvc_logs/*task_litqa2_solver_storm.eval
2>/dev/null | head -n1)" "litqa2_dvc_logs/task_litqa2_solver_storm.eval"
outs:
- path: litqa2_dvc_logs/task_litqa2_solver_storm.eval
hash: md5
md5: 2f0a063d8d4dffc382221751aa3be554
size: 43306
29 changes: 29 additions & 0 deletions dvc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,35 @@ vars:
suffix: ""

stages:
# - perplexity
# - you
run_litqa2_you:
cmd:
INSPECT_EVAL_LOG_FILE_PATTERN=task_litqa2_solver_you
uv run --extra litqa2 --extra inspect_evals
inspect eval astabench/evals/labbench/litqa2/task.py@litqa2
--display plain
--log-dir litqa2_dvc_logs/
--solver astabench/solvers/youcom.py@youcom_solver -S api_type='research'
--no-fail-on-error
mv "$(ls -t litqa2_dvc_logs/*task_litqa2_solver_you.eval 2>/dev/null | head -n1)" "litqa2_dvc_logs/task_litqa2_solver_you.eval"
outs:
- litqa2_dvc_logs/task_litqa2_solver_you.eval

run_litqa2_perplexity:
cmd:
INSPECT_EVAL_LOG_FILE_PATTERN=task_litqa2_solver_perplexity
uv run --extra litqa2 --extra inspect_evals
inspect eval astabench/evals/labbench/litqa2/task.py@litqa2
--display plain
--log-dir litqa2_dvc_logs/
--model 'perplexity/sonar-deep-research'
--solver astabench/solvers/sqa/perplexity_base.py@perplexity_solver
--no-fail-on-error
mv "$(ls -t litqa2_dvc_logs/*task_litqa2_solver_perplexity.eval 2>/dev/null | head -n1)" "litqa2_dvc_logs/task_litqa2_solver_perplexity.eval"
outs:
- litqa2_dvc_logs/task_litqa2_solver_perplexity.eval

solve_sqa:
matrix:
model:
Expand Down
11 changes: 11 additions & 0 deletions litqa2_dvc_logs/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
/task_litqa2_solver_you.eval
/task_litqa2_solver_perplexity.eval
/task_litqa2_solver_storm.eval
/2025-07-10T23-29-38+00-00_task_litqa2_solver_you.eval
/2025-07-10T23-31-08+00-00_task_litqa2_solver_you.eval
/2025-07-10T23-44-44+00-00_task_litqa2_solver_you.eval
/2025-07-10T23-57-48+00-00_task_litqa2_solver_you.eval
/2025-07-11T00-08-04+00-00_task_litqa2_solver_perplexity.eval
/2025-07-11T00-12-06+00-00_task_litqa2_solver_perplexity.eval
/2025-07-11T00-32-41+00-00_task_litqa2_solver_perplexity.eval
/2025-07-11T00-50-41+00-00_task_litqa2_solver_perplexity.eval
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
outs:
- md5: 5421087894711b618a9ecd7e47e51220
size: 178009
hash: md5
path: 2025-07-10T23-29-38+00-00_task_litqa2_solver_you.eval
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
outs:
- md5: 561e2784b97229f9fb6776037f36e677
size: 3905491
hash: md5
path: 2025-07-10T23-31-08+00-00_task_litqa2_solver_you.eval
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
outs:
- md5: fa218239ebec00a7f13d1f04c4c7a0ea
size: 3715517
hash: md5
path: 2025-07-10T23-44-44+00-00_task_litqa2_solver_you.eval
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
outs:
- md5: b7c2dbdca4b47416656609ef1cb402c5
size: 26888
hash: md5
path: 2025-07-10T23-57-48+00-00_task_litqa2_solver_you.eval
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
outs:
- md5: e22344cc100746418bf07dc3ba7d3850
size: 492140
hash: md5
path: 2025-07-11T00-08-04+00-00_task_litqa2_solver_perplexity.eval
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
outs:
- md5: 1f345a57a81e815d4acde69804c08299
size: 5516493
hash: md5
path: 2025-07-11T00-12-06+00-00_task_litqa2_solver_perplexity.eval
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
outs:
- md5: c7584f690f8f226b48c2335c22dab510
size: 5554749
hash: md5
path: 2025-07-11T00-32-41+00-00_task_litqa2_solver_perplexity.eval
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
outs:
- md5: b7c8aae966d8d66f95f147e1650fa441
size: 3420916
hash: md5
path: 2025-07-11T00-50-41+00-00_task_litqa2_solver_perplexity.eval
5 changes: 5 additions & 0 deletions litqa2_dvc_logs/task_litqa2_solver_storm.eval.dvc
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
outs:
- md5: 2f0a063d8d4dffc382221751aa3be554
size: 43306
hash: md5
path: task_litqa2_solver_storm.eval
6 changes: 6 additions & 0 deletions mylitqaruns.dvc
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
outs:
- md5: 01bc3be3ff112a6925df3e783eaa82f3.dir
size: 185154
nfiles: 2
hash: md5
path: mylitqaruns
2 changes: 1 addition & 1 deletion params.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
scorer_model: "google/gemini-2.5-pro"
scorer_model: "google/gemini-2.5-flash"
sqa_scorer_version: 'may-23-2025'
sqa_solver_version: 'may-23-2025'
limit: 1000
6 changes: 6 additions & 0 deletions perplitqa.dvc
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
outs:
- md5: 60e514efd32c3c49b582d67fd727e193.dir
size: 527892
nfiles: 2
hash: md5
path: perplitqa
Loading