Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions agent_baselines/solvers/sqa/format_solver.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
"openai/gpt-4.1": 32_000,
"anthropic/claude-sonnet-3.7-20250219": 16_000,
"google/gemini-2.5-flash": 64_000,
"google/gemini-3-flash-preview": 64_000,
"default": 32_000,
}

Expand Down
2 changes: 1 addition & 1 deletion agent_baselines/solvers/sqa/formatted_fhouse.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,6 @@ def formatted_solver(
polling_interval=polling_interval,
extract_snippets=True,
),
format_solver("google/gemini-2.5-flash-preview-05-20"),
format_solver("google/gemini-3-flash-preview"),
]
return chain(chainlist)
2 changes: 1 addition & 1 deletion agent_baselines/solvers/sqa/formatted_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@ def formatted_solver(
) -> Solver:
chainlist = [
llm_with_prompt(system_prompt),
format_solver("google/gemini-2.5-flash-preview-05-20", require_snippets=False),
format_solver("google/gemini-3-flash-preview", require_snippets=False),
]
return chain(chainlist)
2 changes: 1 addition & 1 deletion agent_baselines/solvers/sqa/formatted_perplexity.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def formatted_solver(
reasoning_effort: str = "",
search_mode: str = "",
require_snippets: bool = True,
scorer_model: str = "google/gemini-2.5-flash-preview-05-20",
scorer_model: str = "google/gemini-3-flash-preview",
) -> Solver:
chainlist = [
perplexity_solver(
Expand Down
2 changes: 1 addition & 1 deletion agent_baselines/solvers/sqa/formatted_youcom.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,6 @@ def formatted_solver(
) -> Solver:
chainlist = [
youcom_solver(api_type=api_type, instructions=system_prompt),
format_solver("google/gemini-2.5-flash-preview-05-20"),
format_solver("google/gemini-3-flash-preview"),
]
return chain(chainlist)
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def formatted_solver(
chainlist = [
memorized_solver(),
format_solver(
"google/gemini-2.5-flash-preview-05-20", require_snippets=require_snippets
"google/gemini-3-flash-preview", require_snippets=require_snippets
),
]
return chain(chainlist)
2 changes: 1 addition & 1 deletion agent_baselines/solvers/sqa/scispace/scispace.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,6 @@ async def solve(state, generate):
def formatted_solver() -> Solver:
chainlist = [
memorized_solver(),
format_solver("google/gemini-2.5-flash-preview-05-20"),
format_solver("google/gemini-3-flash-preview"),
]
return chain(chainlist)
5 changes: 5 additions & 0 deletions agent_baselines/solvers/sqa/sqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,17 @@
# Model identifier strings, grouped by family (oldest first).
# NOTE(review): these look like "<provider>/<model>" routing names; confirm
# against whatever client consumes completion_model_map.
claude_3_5 = "anthropic/claude-3-5-sonnet-20240620"
claude_3_7 = "anthropic/claude-3-7-sonnet-20250219"
claude_4_0 = "anthropic/claude-sonnet-4-20250514"
claude_4_6 = "anthropic/claude-sonnet-4-6"

gemini_2_5_pro = "gemini/gemini-2.5-pro-preview-03-25"
gemini_3_1_pro = "gemini/gemini-3.1-pro-preview"

# Short alias -> full model identifier. Insertion order is kept exactly as
# before so that iterating the mapping yields the aliases in the same order.
completion_model_map = {
    "claude-3.7": claude_3_7,
    "claude-3.5": claude_3_5,
    "claude-4.0": claude_4_0,
    "claude-4.6": claude_4_6,
    "gemini-2.5-pro": gemini_2_5_pro,
    "gemini-3.1-pro-preview": gemini_3_1_pro,
    # The only alias without a named constant; it maps straight to the literal.
    "o3_high": "openai/o3",
}

RERANKER_TYPES = list(RERANKER_MAPPING.keys())
Expand Down
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ version = "0.1.1"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
"astabench==0.3.1",
"astabench>=0.3.1",
]

[project.optional-dependencies]
Expand Down Expand Up @@ -76,7 +76,8 @@ conflicts = [
]
override-dependencies = [
# sqa pins openai to a lower version than inspect requires
"openai==1.78.0",
"openai==2.28.0",
"inspect_ai==0.3.143",

# STORM pretends to require a lower version, but doesn't actually need it:
# https://github.com/allenai/asta-bench/issues/31#issuecomment-3045978008
Expand Down
2 changes: 1 addition & 1 deletion tests/test_basic_mockllm.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ def test_smoke_mockllm_arithmetic():
# Just smoke-test that a basic mocked eval can run without crashing; it's
# mainly a test of whether dependencies and imports all work
inspect_ai.eval(
"astabench/evals/demo/arithmetic/task.py",
"astabench/arithmetic_demo",
model="mockllm/model",
solver=SolverSpec(
"agent_baselines/solvers/llm.py@llm_with_prompt",
Expand Down
Loading