Commit f47a276

Browse files
[PB] Use chz for configuring Monitor and Judge scripts (#92)
1 parent 785c7e8 commit f47a276

14 files changed

Lines changed: 330 additions & 284 deletions

project/paperbench/experiments/judge_eval/judge_eval_perf_cost.py

Lines changed: 19 additions & 4 deletions
```diff
@@ -5,6 +5,8 @@
 from matplotlib import container
 from matplotlib.ticker import MultipleLocator
 
+from paperbench.utils import normalize_not_given
+
 MODEL_COST_PER_TOKEN = {
     "gpt-4o-2024-08-06": {"in": 0.0000025, "out": 0.00001},
     "gpt-4o": {"in": 0.0000025, "out": 0.00001},
@@ -72,10 +74,23 @@ def read_model_results() -> dict:
     with open(path, "r") as f:
         data = json.load(f)
 
-    model_name = data["model_name"]
-    reasoning_effort = (
-        data.get("judge_kwargs", {}).get("completion_kwargs", {}).get("reasoning_effort", None)
-    )
+    judge_kwargs = data.get("judge_kwargs", {})
+    completer_config = judge_kwargs.get("completer_config")
+
+    if completer_config is not None:
+        model_name = completer_config.get("model")
+        reasoning_effort = normalize_not_given(completer_config.get("reasoning_effort"))
+    else:
+        # support older format
+        model_name = data.get("model_name")
+        reasoning_effort = (
+            data.get("judge_kwargs", {})
+            .get("completion_kwargs", {})
+            .get("reasoning_effort", None)
+        )
+        reasoning_effort = normalize_not_given(reasoning_effort)
+    model_name = model_name or "unknown_model"
+
     key = f"{model_name}_{reasoning_effort}" if reasoning_effort else model_name
     model_results[key] = {
         "model_name": model_name,
```

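Note: the two branches above exist because this commit changes the shape of the judge result JSON. A sketch of the two formats the reader must handle, as Python literals (field values are illustrative, not taken from the commit):

```python
# Illustrative result-JSON shapes handled by read_model_results above;
# values are made up. New format: the model lives under completer_config.
new_format = {
    "judge_kwargs": {
        "completer_config": {
            "model": "o3-mini-2025-01-31",
            "reasoning_effort": "high",
        }
    }
}

# Older format: model_name at the top level, reasoning_effort nested
# under completion_kwargs.
old_format = {
    "model_name": "gpt-4o-2024-08-06",
    "judge_kwargs": {"completion_kwargs": {"reasoning_effort": None}},
}
```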
project/paperbench/experiments/judge_eval/judge_eval_perf_tables.py

Lines changed: 19 additions & 4 deletions
```diff
@@ -2,6 +2,8 @@
 
 import pandas as pd
 
+from paperbench.utils import normalize_not_given
+
 
 def read_model_results() -> dict:
     """
@@ -22,10 +24,23 @@ def read_model_results() -> dict:
     with open(path, "r") as f:
         data = json.load(f)
 
-    model_name = data["model_name"]
-    reasoning_effort = (
-        data.get("judge_kwargs", {}).get("completion_kwargs", {}).get("reasoning_effort", None)
-    )
+    judge_kwargs = data.get("judge_kwargs", {})
+    completer_config = judge_kwargs.get("completer_config")
+
+    if completer_config is not None:
+        model_name = completer_config.get("model")
+        reasoning_effort = normalize_not_given(completer_config.get("reasoning_effort"))
+    else:
+        # support older format
+        model_name = data.get("model_name")
+        reasoning_effort = (
+            data.get("judge_kwargs", {})
+            .get("completion_kwargs", {})
+            .get("reasoning_effort", None)
+        )
+        reasoning_effort = normalize_not_given(reasoning_effort)
+    model_name = model_name or "unknown_model"
+
     key = f"{model_name}_{reasoning_effort}" if reasoning_effort else model_name
     model_results[key] = {
         "model_name": model_name,
```

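Note: `normalize_not_given` is imported in both scripts but its source is not part of this commit. A minimal sketch of what it presumably does, assuming it collapses the OpenAI SDK's NOT_GIVEN sentinel (or its serialized string form) to None:

```python
# Assumed behavior of paperbench.utils.normalize_not_given (not shown in
# this diff): treat the OpenAI SDK's NOT_GIVEN sentinel, or the string
# "NOT_GIVEN" found in serialized configs, as None.
from typing import Any


def normalize_not_given(value: Any) -> Any:
    if isinstance(value, str) and value == "NOT_GIVEN":
        return None
    if type(value).__name__ == "NotGiven":  # the live sentinel object
        return None
    return value
```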
project/paperbench/experiments/judge_eval/run_judge_eval_sweep.sh

Lines changed: 33 additions & 19 deletions
```diff
@@ -2,41 +2,55 @@
 
 export OPENAI_API_KEY="<JUDGE_EVAL_API_KEY>"
 
+COMPLETER_CONFIG="preparedness_turn_completer.oai_completions_turn_completer:OpenAICompletionsTurnCompleter.Config"
+EXAMPLE_IDS="pinn/0,rice/0,stay-on-topic-with-classifier-free-guidance/0,all-in-one/0,semantic-self-consistency/0"
+OUTPUT_DIR="experiments/judge_eval/judge_eval_results/"
+
 if [ "$OPENAI_API_KEY" = "<JUDGE_EVAL_API_KEY>" ]; then
     echo "Error: Please set a valid OpenAI API key in the script. Replace <JUDGE_EVAL_API_KEY> with the judge eval API key."
     exit 1
 fi
 
 for model in o3-mini-2025-01-31 o1-2024-12-17 o1-mini-2024-09-12; do
     echo "Running judge eval for $model-high"
-    python paperbench/scripts/run_judge_eval.py -j simple -m $model \
-        --reasoning-effort high \
-        --output-dir experiments/judge_eval/judge_eval_results/ \
-        --example-ids pinn/0 rice/0 stay-on-topic-with-classifier-free-guidance/0 all-in-one/0 semantic-self-consistency/0
+    python paperbench/scripts/run_judge_eval.py \
+        judge=simple \
+        completer_config="$COMPLETER_CONFIG" \
+        completer_config.model=$model \
+        completer_config.reasoning_effort=high \
+        output_dir=$OUTPUT_DIR \
+        example_ids=$EXAMPLE_IDS
    echo "-----------------------------"
 done
 
 for model in gpt-4o-mini-2024-07-18 gpt-4o-2024-08-06; do
     echo "Running judge eval for $model"
-    python paperbench/scripts/run_judge_eval.py -j simple -m $model \
-        --reasoning-effort none \
-        --output-dir experiments/judge_eval/judge_eval_results/ \
-        --example-ids pinn/0 rice/0 stay-on-topic-with-classifier-free-guidance/0 all-in-one/0 semantic-self-consistency/0
+    python paperbench/scripts/run_judge_eval.py \
+        judge=simple \
+        completer_config="$COMPLETER_CONFIG" \
+        completer_config.model=$model \
+        output_dir=$OUTPUT_DIR \
+        example_ids=$EXAMPLE_IDS
    echo "-----------------------------"
 done
 
-python paperbench/scripts/run_judge_eval.py -j random \
-    --output-dir experiments/judge_eval/judge_eval_results/ \
-    --example-ids pinn/0 rice/0 stay-on-topic-with-classifier-free-guidance/0 all-in-one/0 semantic-self-consistency/0
+python paperbench/scripts/run_judge_eval.py \
+    judge=random \
+    output_dir=$OUTPUT_DIR \
+    example_ids=$EXAMPLE_IDS
 
-python paperbench/scripts/run_judge_eval.py -j dummy \
-    --output-dir experiments/judge_eval/judge_eval_results/ \
-    --example-ids pinn/0 rice/0 stay-on-topic-with-classifier-free-guidance/0 all-in-one/0 semantic-self-consistency/0
+python paperbench/scripts/run_judge_eval.py \
+    judge=dummy \
+    output_dir=$OUTPUT_DIR \
+    example_ids=$EXAMPLE_IDS
 
 # finally, single run of judge-eval on o3-mini-high with --code-only
 # to be able to compare token counts with default PaperBench
-python paperbench/scripts/run_judge_eval.py -j simple -m o3-mini-2025-01-31 \
-    --reasoning-effort high \
-    --output-dir experiments/judge_eval/judge_eval_results/code_only \
-    --example-ids pinn/0 rice/0 stay-on-topic-with-classifier-free-guidance/0 all-in-one/0 semantic-self-consistency/0 \
-    --code-only
+python paperbench/scripts/run_judge_eval.py \
+    judge=simple \
+    completer_config="$COMPLETER_CONFIG" \
+    completer_config.model=o3-mini-2025-01-31 \
+    completer_config.reasoning_effort=high \
+    output_dir=experiments/judge_eval/judge_eval_results/code_only \
+    example_ids=$EXAMPLE_IDS \
+    code_only=true
```
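Note: the rewritten script passes chz-style `key=value` arguments instead of argparse flags: plain keys set fields on the script's config, `completer_config=<module>:<Class>` selects a concrete config subclass, and dotted keys like `completer_config.model` set that subclass's fields. A rough sketch of the config shape these invocations imply (field names and defaults are inferred from the commands above, not taken from the actual `run_judge_eval.py` source):

```python
# Rough sketch only: the chz config shape implied by the invocations above.
# Field names and defaults are inferred, not copied from run_judge_eval.py.
import chz


@chz.chz
class RunJudgeEvalConfig:
    judge: str = chz.field(default="dummy")
    output_dir: str = chz.field(default="experiments/judge_eval/judge_eval_results/")
    example_ids: str = chz.field(default="")  # comma-separated, e.g. "pinn/0,rice/0"
    code_only: bool = chz.field(default=False)
    # completer_config would be typed as an abstract config class;
    # completer_config=<module>:<Class> on the CLI picks the subclass,
    # and completer_config.model=... sets that subclass's fields.
```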

project/paperbench/experiments/judge_max_depth/run.sh

Lines changed: 11 additions & 1 deletion
```diff
@@ -3,11 +3,21 @@ SUBMISSION_PATH="data/judge_eval/rice/0/submission"
 PAPER_ID="rice"
 JUDGE="simple"
 MODEL="o3-mini-2025-01-31"
+COMPLETER_CONFIG="preparedness_turn_completer.oai_completions_turn_completer:OpenAICompletionsTurnCompleter.Config"
 for DEPTH in 1 2 3 4 100; do
     for SEED in 1 2 3; do
         OUTPUT_PATH="experiments/judge_max_depth/results/judge_depth${DEPTH}_seed${SEED}"
         mkdir -p $OUTPUT_PATH
-        python paperbench/scripts/run_judge.py --submission-path $SUBMISSION_PATH --paper-id $PAPER_ID --judge $JUDGE --model $MODEL --out-dir $OUTPUT_PATH -d $DEPTH > "$OUTPUT_PATH/stdout.log" 2> "$OUTPUT_PATH/run_judge.log" &
+        python paperbench/scripts/run_judge.py \
+            submission_path=$SUBMISSION_PATH \
+            paper_id=$PAPER_ID \
+            judge=$JUDGE \
+            completer_config="$COMPLETER_CONFIG" \
+            completer_config.model=$MODEL \
+            completer_config.reasoning_effort="high" \
+            out_dir=$OUTPUT_PATH \
+            max_depth=$DEPTH \
+            > "$OUTPUT_PATH/stdout.log" 2> "$OUTPUT_PATH/run_judge.log" &
     done
 done
 wait
```

project/paperbench/paperbench/judge/judge_eval/README.md

Lines changed: 7 additions & 3 deletions
````diff
@@ -37,8 +37,8 @@ We curate the following 5 examples, available under `data/judge_eval/`:
    reproduce.sh and is in the state it would be after running it.
 2. `rice/0`: The authors' repository for `rice`. This submission does not come
    with a reproduce.sh script.
-3. `all-in-one/0`: The authors' repository for `all-in-one`. This submission does
-   not come with a reproduce.sh script.
+3. `all-in-one/0`: The authors' repository for `all-in-one`. This submission
+   does not come with a reproduce.sh script.
 4. `semantic-self-consistency/0`: An incomplete submission for the reproduction
    of `semantic-self-consistency` with only some parts of the paper implemented
    and reproduced. The submission includes a reproduce.sh and is in the state it
@@ -92,7 +92,11 @@ tar -cvf submission.tar submission
 ## Running the evaluation
 
 ```bash
-python paperbench/scripts/run_judge_eval.py -j <JUDGE_TYPE> -m <MODEL_NAME> -o <OUTPUT_DIR>
+python paperbench/scripts/run_judge_eval.py \
+    judge=simple \
+    completer_config=preparedness_turn_completer.oai_completions_turn_completer:OpenAICompletionsTurnCompleter.Config \
+    completer_config.model=<MODEL_NAME> \
+    output_dir=<OUTPUT_DIR>
 ```
 
 where `<OUTPUT_DIR>` is the directory to write the evaluation results to.
````

project/paperbench/paperbench/monitor/create_monitor.py

Lines changed: 0 additions & 26 deletions
This file was deleted.

project/paperbench/paperbench/monitor/monitor.py

Lines changed: 18 additions & 1 deletion
```diff
@@ -1,10 +1,13 @@
+from __future__ import annotations
+
 import re
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from typing import Any
 
 import blobfile as bf
 import structlog.stdlib
+from pydantic import BaseModel
 from unidecode import unidecode
 
 from paperbench.paper_registry import Paper
@@ -51,6 +54,14 @@ def to_dict(self) -> dict[str, Any]:
 class Monitor(ABC):
     """Base class for monitoring agent behavior through logs."""
 
+    class Config(BaseModel, ABC):
+        """Serializable configuration for a :class:`Monitor`."""
+
+        @abstractmethod
+        def build(self, paper: Paper) -> Monitor:
+            """Instantiate the monitor for the provided paper."""
+            ...
+
     def __init__(
         self,
         paper: Paper,
@@ -74,12 +85,18 @@ def __init__(
     @abstractmethod
     def check_log(self, log_file: str) -> MonitorResult:
         """Check a log file for violations of monitoring rules."""
-        raise NotImplementedError()
+        ...
 
 
 class BasicMonitor(Monitor):
     """Simple implementation that checks for occurrences of blacklisted terms with git clone, curl, or wget commands in agent logs."""
 
+    class Config(Monitor.Config):
+        """Configuration for :class:`BasicMonitor`."""
+
+        def build(self, paper: Paper) -> BasicMonitor:
+            return BasicMonitor(paper=paper)
+
     def _normalize_url(self, url: str) -> str:
         """Normalize URL by removing protocol, parameters, and anchors."""
         # Remove protocol (http:// or https://)
```
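Note: the nested `Config` pattern makes the monitor pluggable: any `Monitor` subclass ships a serializable pydantic `Config` that knows how to build it. A hedged sketch of what a custom monitor could look like under this pattern (`RegexMonitor` and its `pattern` field are illustrative, not part of the commit; `check_log` is stubbed because `MonitorResult`'s fields are not shown in this diff):

```python
# Illustrative only: a custom monitor under the new Config pattern.
# RegexMonitor and its pattern field are hypothetical; the real check_log
# would construct a MonitorResult, whose fields this diff doesn't show.
from __future__ import annotations

import re

from paperbench.monitor.monitor import Monitor, MonitorResult
from paperbench.paper_registry import Paper


class RegexMonitor(Monitor):
    """Flags log lines matching a configurable regex."""

    class Config(Monitor.Config):
        pattern: str = r"git\s+clone"

        def build(self, paper: Paper) -> RegexMonitor:
            return RegexMonitor(paper=paper, pattern=self.pattern)

    def __init__(self, paper: Paper, pattern: str):
        super().__init__(paper=paper)
        self._regex = re.compile(pattern)

    def check_log(self, log_file: str) -> MonitorResult:
        # A real implementation would scan the log for self._regex matches
        # and report them via MonitorResult.
        raise NotImplementedError
```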

project/paperbench/paperbench/nano/eval.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -18,6 +18,7 @@
 from nanoeval.solvers.computer_tasks.steps import FinalResult
 from nanoeval.solvers.computer_tasks.task import ComputerTask
 from paperbench.metrics import compute_agg_stats, per_paper_results
+from paperbench.monitor.monitor import BasicMonitor, Monitor
 from paperbench.nano.structs import (
     JudgeConfig,
     PaperBenchGrade,
@@ -48,6 +49,7 @@
 class PaperBench(PythonCodingEval):
     reproduction: ReproductionConfig = chz.field(default_factory=ReproductionConfig)
     judge: JudgeConfig = chz.field(default_factory=JudgeConfig)
+    monitor_config: Monitor.Config = chz.field(default_factory=BasicMonitor.Config)
 
     # task args
     paper_split: Literal["debug", "dev", "human", "testing", "all"] = chz.field(
@@ -153,6 +155,7 @@ async def get_instances(self) -> list[PBTask]:
             target_duration_hr=self.target_duration_hr,
             judge=self.judge,
             reproduction=self.reproduction,
+            monitor_config=self.monitor_config,
             save_cluster_output_to_host=self.save_cluster_output_to_host,
             network_mode=NetworkMode.UNPROXIED
             if self.allow_internet
```
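Note: since `monitor_config` is a chz field typed as the abstract `Monitor.Config`, the default stays `BasicMonitor.Config` but any subclass can be supplied, e.g. when constructing the eval in Python (a sketch; `PaperBench` may require further fields in practice):

```python
# Sketch: supplying the monitor config explicitly when building the eval.
# Passing BasicMonitor.Config() just restates the default; a custom
# Monitor.Config subclass would go here instead.
from paperbench.monitor.monitor import BasicMonitor
from paperbench.nano.eval import PaperBench

bench = PaperBench(monitor_config=BasicMonitor.Config())
```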

project/paperbench/paperbench/nano/task.py

Lines changed: 4 additions & 11 deletions
```diff
@@ -38,8 +38,7 @@
     WORKSPACE_BASE,
 )
 from paperbench.grade import JudgeOutput, grade_submission
-from paperbench.monitor.create_monitor import create_monitor
-from paperbench.monitor.monitor import MonitorResult
+from paperbench.monitor.monitor import Monitor, MonitorResult
 from paperbench.nano.structs import (
     JudgeConfig,
     PaperBenchGrade,
@@ -68,6 +67,7 @@ class PBTask(ComputerTask):
     target_duration_hr: int | None
     reproduction: ReproductionConfig
     judge: JudgeConfig
+    monitor_config: Monitor.Config
     skipped_rollout: bool = False  # whether rollouts were skipped (e.g. if we're resuming)
 
     save_cluster_output_to_host: bool
@@ -334,10 +334,7 @@ def _should_monitor(self, log_file_path: str) -> bool:
         return False
 
     def _run_monitor(self, log_file_path: str) -> MonitorResult:
-        """
-        Runs the monitor on an given log file
-        TODO: make this configurable through chz in `PaperBenchEval`
-        """
+        """Run the configured monitor on the given log file."""
         ctx_logger = logger.bind(
             run_group_id=self.run_group_id, run_id=self.run_id, runs_dir=self.runs_dir
         )
@@ -346,11 +343,7 @@ def _run_monitor(self, log_file_path: str) -> MonitorResult:
             f"Running monitor on {self.run_id} agent.log", destinations=["run"], _print=True
         )
         paper = paper_registry.get_paper(self.paper_id)
-        monitor = create_monitor(
-            monitor_type="basic",
-            paper=paper,
-            monitor_kwargs={},
-        )
+        monitor = self.monitor_config.build(paper=paper)
         monitor_result = monitor.check_log(log_file_path)
         return monitor_result
```
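Note: taken together with the monitor.py change, the runtime flow in `_run_monitor` is now config-driven: build the monitor from the task's `Monitor.Config`, then check the log. In isolation (the paper id and log path are illustrative, and `paper_registry` is assumed to be the module-level registry task.py uses):

```python
# The monitor lifecycle after this change; "rice" and the log path are
# illustrative values, not taken from the commit.
from paperbench.monitor.monitor import BasicMonitor
from paperbench.paper_registry import paper_registry

paper = paper_registry.get_paper("rice")
monitor = BasicMonitor.Config().build(paper=paper)  # Monitor.Config -> Monitor
result = monitor.check_log("runs/rice/agent.log")   # -> MonitorResult
```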
