Commit ad9d1bc

use CLI/chz/TurnCompleter in run judge scripts
1 parent 8503053 commit ad9d1bc

8 files changed

Lines changed: 219 additions & 188 deletions
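As a quick orientation, the interface change looks like this (both commands are assembled from the diffs below; the model and output path are the ones used in the sweep script):

# Before: argparse-style flags
python paperbench/scripts/run_judge_eval.py -j simple -m o3-mini-2025-01-31 \
    --reasoning-effort high \
    --output-dir experiments/judge_eval/judge_eval_results/

# After: chz-style key=value arguments; nested completer fields use dotted keys
python paperbench/scripts/run_judge_eval.py \
    judge=simple \
    completer_config=preparedness_turn_completer.oai_completions_turn_completer:OpenAICompletionsTurnCompleter.Config \
    completer_config.model=o3-mini-2025-01-31 \
    completer_config.reasoning_effort=high \
    output_dir=experiments/judge_eval/judge_eval_results/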

project/paperbench/experiments/judge_eval/judge_eval_perf_cost.py

Lines changed: 19 additions & 4 deletions
@@ -5,6 +5,8 @@
 from matplotlib import container
 from matplotlib.ticker import MultipleLocator
 
+from paperbench.utils import normalize_not_given
+
 MODEL_COST_PER_TOKEN = {
     "gpt-4o-2024-08-06": {"in": 0.0000025, "out": 0.00001},
     "gpt-4o": {"in": 0.0000025, "out": 0.00001},
@@ -72,10 +74,23 @@ def read_model_results() -> dict:
         with open(path, "r") as f:
             data = json.load(f)
 
-        model_name = data["model_name"]
-        reasoning_effort = (
-            data.get("judge_kwargs", {}).get("completion_kwargs", {}).get("reasoning_effort", None)
-        )
+        judge_kwargs = data.get("judge_kwargs", {})
+        completer_config = judge_kwargs.get("completer_config")
+
+        if completer_config is not None:
+            model_name = completer_config.get("model")
+            reasoning_effort = normalize_not_given(completer_config.get("reasoning_effort"))
+        else:
+            # support older format
+            model_name = data.get("model_name")
+            reasoning_effort = (
+                data.get("judge_kwargs", {})
+                .get("completion_kwargs", {})
+                .get("reasoning_effort", None)
+            )
+            reasoning_effort = normalize_not_given(reasoning_effort)
+        model_name = model_name or "unknown_model"
+
         key = f"{model_name}_{reasoning_effort}" if reasoning_effort else model_name
         model_results[key] = {
             "model_name": model_name,

project/paperbench/experiments/judge_eval/judge_eval_perf_tables.py

Lines changed: 19 additions & 4 deletions
@@ -2,6 +2,8 @@
 
 import pandas as pd
 
+from paperbench.utils import normalize_not_given
+
 
 def read_model_results() -> dict:
     """
@@ -22,10 +24,23 @@ def read_model_results() -> dict:
         with open(path, "r") as f:
             data = json.load(f)
 
-        model_name = data["model_name"]
-        reasoning_effort = (
-            data.get("judge_kwargs", {}).get("completion_kwargs", {}).get("reasoning_effort", None)
-        )
+        judge_kwargs = data.get("judge_kwargs", {})
+        completer_config = judge_kwargs.get("completer_config")
+
+        if completer_config is not None:
+            model_name = completer_config.get("model")
+            reasoning_effort = normalize_not_given(completer_config.get("reasoning_effort"))
+        else:
+            # support older format
+            model_name = data.get("model_name")
+            reasoning_effort = (
+                data.get("judge_kwargs", {})
+                .get("completion_kwargs", {})
+                .get("reasoning_effort", None)
+            )
+            reasoning_effort = normalize_not_given(reasoning_effort)
+        model_name = model_name or "unknown_model"
+
         key = f"{model_name}_{reasoning_effort}" if reasoning_effort else model_name
         model_results[key] = {
             "model_name": model_name,

project/paperbench/experiments/judge_eval/run_judge_eval_sweep.sh

Lines changed: 33 additions & 19 deletions
@@ -2,41 +2,55 @@
 
 export OPENAI_API_KEY="<JUDGE_EVAL_API_KEY>"
 
+COMPLETER_CONFIG="preparedness_turn_completer.oai_completions_turn_completer:OpenAICompletionsTurnCompleter.Config"
+EXAMPLE_IDS="pinn/0,rice/0,stay-on-topic-with-classifier-free-guidance/0,all-in-one/0,semantic-self-consistency/0"
+OUTPUT_DIR="experiments/judge_eval/judge_eval_results/"
+
 if [ "$OPENAI_API_KEY" = "<JUDGE_EVAL_API_KEY>" ]; then
     echo "Error: Please set a valid OpenAI API key in the script. Replace <JUDGE_EVAL_API_KEY> with the judge eval API key."
     exit 1
 fi
 
 for model in o3-mini-2025-01-31 o1-2024-12-17 o1-mini-2024-09-12; do
     echo "Running judge eval for $model-high"
-    python paperbench/scripts/run_judge_eval.py -j simple -m $model \
-        --reasoning-effort high \
-        --output-dir experiments/judge_eval/judge_eval_results/ \
-        --example-ids pinn/0 rice/0 stay-on-topic-with-classifier-free-guidance/0 all-in-one/0 semantic-self-consistency/0
+    python paperbench/scripts/run_judge_eval.py \
+        judge=simple \
+        completer_config="$COMPLETER_CONFIG" \
+        completer_config.model=$model \
+        completer_config.reasoning_effort=high \
+        output_dir=$OUTPUT_DIR \
+        example_ids=$EXAMPLE_IDS
    echo "-----------------------------"
 done
 
 for model in gpt-4o-mini-2024-07-18 gpt-4o-2024-08-06; do
     echo "Running judge eval for $model"
-    python paperbench/scripts/run_judge_eval.py -j simple -m $model \
-        --reasoning-effort none \
-        --output-dir experiments/judge_eval/judge_eval_results/ \
-        --example-ids pinn/0 rice/0 stay-on-topic-with-classifier-free-guidance/0 all-in-one/0 semantic-self-consistency/0
+    python paperbench/scripts/run_judge_eval.py \
+        judge=simple \
+        completer_config="$COMPLETER_CONFIG" \
+        completer_config.model=$model \
+        output_dir=$OUTPUT_DIR \
+        example_ids=$EXAMPLE_IDS
    echo "-----------------------------"
 done
 
-python paperbench/scripts/run_judge_eval.py -j random \
-    --output-dir experiments/judge_eval/judge_eval_results/ \
-    --example-ids pinn/0 rice/0 stay-on-topic-with-classifier-free-guidance/0 all-in-one/0 semantic-self-consistency/0
+python paperbench/scripts/run_judge_eval.py \
+    judge=random \
+    output_dir=$OUTPUT_DIR \
+    example_ids=$EXAMPLE_IDS
 
-python paperbench/scripts/run_judge_eval.py -j dummy \
-    --output-dir experiments/judge_eval/judge_eval_results/ \
-    --example-ids pinn/0 rice/0 stay-on-topic-with-classifier-free-guidance/0 all-in-one/0 semantic-self-consistency/0
+python paperbench/scripts/run_judge_eval.py \
+    judge=dummy \
+    output_dir=$OUTPUT_DIR \
+    example_ids=$EXAMPLE_IDS
 
 # finally, single run of judge-eval on o3-mini-high with --code-only
 # to be able to compare token counts with default PaperBench
-python paperbench/scripts/run_judge_eval.py -j simple -m o3-mini-2025-01-31 \
-    --reasoning-effort high \
-    --output-dir experiments/judge_eval/judge_eval_results/code_only \
-    --example-ids pinn/0 rice/0 stay-on-topic-with-classifier-free-guidance/0 all-in-one/0 semantic-self-consistency/0 \
-    --code-only
+python paperbench/scripts/run_judge_eval.py \
+    judge=simple \
+    completer_config="$COMPLETER_CONFIG" \
+    completer_config.model=o3-mini-2025-01-31 \
+    completer_config.reasoning_effort=high \
+    output_dir=experiments/judge_eval/judge_eval_results/code_only \
+    example_ids=$EXAMPLE_IDS \
+    code_only=true
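Note that the example IDs are now passed as one comma-separated value rather than as space-separated flag arguments. A minimal sketch of splitting such a value back into individual IDs (how run_judge_eval.py actually consumes it is not shown in this commit):

example_ids = "pinn/0,rice/0,all-in-one/0"
ids = [eid.strip() for eid in example_ids.split(",") if eid.strip()]
print(ids)  # ['pinn/0', 'rice/0', 'all-in-one/0']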

project/paperbench/experiments/judge_max_depth/run.sh

Lines changed: 11 additions & 1 deletion
@@ -3,11 +3,21 @@ SUBMISSION_PATH="data/judge_eval/rice/0/submission"
 PAPER_ID="rice"
 JUDGE="simple"
 MODEL="o3-mini-2025-01-31"
+COMPLETER_CONFIG="preparedness_turn_completer.oai_completions_turn_completer:OpenAICompletionsTurnCompleter.Config"
 for DEPTH in 1 2 3 4 100; do
     for SEED in 1 2 3; do
         OUTPUT_PATH="experiments/judge_max_depth/results/judge_depth${DEPTH}_seed${SEED}"
         mkdir -p $OUTPUT_PATH
-        python paperbench/scripts/run_judge.py --submission-path $SUBMISSION_PATH --paper-id $PAPER_ID --judge $JUDGE --model $MODEL --out-dir $OUTPUT_PATH -d $DEPTH > "$OUTPUT_PATH/stdout.log" 2> "$OUTPUT_PATH/run_judge.log" &
+        python paperbench/scripts/run_judge.py \
+            submission_path=$SUBMISSION_PATH \
+            paper_id=$PAPER_ID \
+            judge=$JUDGE \
+            completer_config="$COMPLETER_CONFIG" \
+            completer_config.model=$MODEL \
+            completer_config.reasoning_effort="high" \
+            out_dir=$OUTPUT_PATH \
+            max_depth=$DEPTH \
+            > "$OUTPUT_PATH/stdout.log" 2> "$OUTPUT_PATH/run_judge.log" &
     done
 done
 wait

project/paperbench/paperbench/judge/judge_eval/README.md

Lines changed: 7 additions & 3 deletions
@@ -37,8 +37,8 @@ We curate the following 5 examples, available under `data/judge_eval/`:
    reproduce.sh and is in the state it would be after running it.
 2. `rice/0`: The authors' repository for `rice`. This submission does not come
    with a reproduce.sh script.
-3. `all-in-one/0`: The authors' repository for `all-in-one`. This submission does
-   not come with a reproduce.sh script.
+3. `all-in-one/0`: The authors' repository for `all-in-one`. This submission
+   does not come with a reproduce.sh script.
 4. `semantic-self-consistency/0`: An incomplete submission for the reproduction
    of `semantic-self-consistency` with only some parts of the paper implemented
    and reproduced. The submission includes a reproduce.sh and is in the state it
@@ -92,7 +92,11 @@ tar -cvf submission.tar submission
 ## Running the evaluation
 
 ```bash
-python paperbench/scripts/run_judge_eval.py -j <JUDGE_TYPE> -m <MODEL_NAME> -o <OUTPUT_DIR>
+python paperbench/scripts/run_judge_eval.py \
+    judge=simple \
+    completer_config=preparedness_turn_completer.oai_completions_turn_completer:OpenAICompletionsTurnCompleter.Config \
+    completer_config.model=<MODEL_NAME> \
+    output_dir=<OUTPUT_DIR>
 ```
 
 where `<OUTPUT_DIR>` is the directory to write the evaluation results to.
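For example, a concrete invocation using a model that appears elsewhere in this commit (the output directory here is illustrative):

python paperbench/scripts/run_judge_eval.py \
    judge=simple \
    completer_config=preparedness_turn_completer.oai_completions_turn_completer:OpenAICompletionsTurnCompleter.Config \
    completer_config.model=gpt-4o-2024-08-06 \
    output_dir=my_results/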
project/paperbench/paperbench/scripts/run_judge.py

Lines changed: 54 additions & 82 deletions
@@ -1,21 +1,47 @@
-import argparse
 import asyncio
 import json
 from pathlib import Path
+from typing import Literal
 
 import structlog.stdlib
-from preparedness_turn_completer.oai_completions_turn_completer import (
-    OpenAICompletionsTurnCompleter,
-)
 from preparedness_turn_completer.turn_completer import TurnCompleter
 
+import chz
 from paperbench.grade import JudgeOutput, run_judge
 from paperbench.judge.token_usage import get_total_token_usage
 from paperbench.utils import get_timestamp
 
 logger = structlog.stdlib.get_logger(component=__name__)
 
 
+@chz.chz
+class RunJudgeCliArgs:
+    submission_path: Path = chz.field(doc="Path to the submission directory.")
+    paper_id: str = chz.field(doc="Identifier for the paper.")
+    judge: Literal["dummy", "random", "simple"] = chz.field(
+        default="dummy",
+        doc="Specify the judge to use.",
+    )
+    max_depth: int = chz.field(default=999, doc="Maximum depth to grade.")
+    out_dir: Path = chz.field(doc="Path to store the judge's outputs.")
+    code_only: bool = chz.field(
+        default=False,
+        doc="Set to True to only grade 'Code Development' nodes.",
+    )
+    resources_provided: bool = chz.field(
+        default=False,
+        doc="Set to True to weight 'Dataset and Model Acquisition' nodes to 0.",
+    )
+    completer_config: TurnCompleter.Config | None = chz.field(
+        default=None,
+        doc=(
+            "TurnCompleter configuration to use for judges that require a completer. "
+            "Example: completer_config=preparedness_turn_completer.oai_completions_turn_completer:"
+            "OpenAICompletionsTurnCompleter.Config"
+        ),
+    )
+
+
 async def main(
     submission_path: Path,
     paper_id: str,
@@ -62,85 +88,31 @@ async def main(
         json.dump(judge_output.to_dict(), f, indent=4)
 
 
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Grade a paper submission.")
-    parser.add_argument(
-        "--submission-path",
-        type=Path,
-        help="Path to the submission directory.",
-        required=True,
-    )
-    parser.add_argument(
-        "--paper-id",
-        help="Identifier for the paper.",
-        required=True,
-    )
-    parser.add_argument(
-        "-j",
-        "--judge",
-        choices=["dummy", "random", "simple"],
-        default="dummy",
-        help="Specify the judge to use (default: dummy).",
-    )
-    parser.add_argument(
-        "-m",
-        "--model",
-        help="Specify the OpenAI model to use (required if judge isn't dummy).",
-    )
-    parser.add_argument(
-        "-d",
-        "--max-depth",
-        type=int,
-        default=999,
-        help="Specify the maximum depth to grade.",
-    )
-    parser.add_argument(
-        "--out-dir",
-        type=Path,
-        help="Path to store the judge's outputs.",
-        required=True,
-    )
-    parser.add_argument(
-        "--code-only",
-        action="store_true",
-        help="Only grade 'Code Development' nodes",
-    )
-    parser.add_argument(
-        "--resources-provided",
-        action="store_true",
-        help="Weight 'Dataset and Model Acquisition' nodes to 0",
-    )
-    parser.add_argument(
-        "--reasoning-effort",
-        choices=["low", "medium", "high", "none"],
-        required=False,
-        default="high",
-        help="Reasoning effort to use for the completion, if using a model/judge that supports it.",
-    )
+def _resolve_completer_config(args: RunJudgeCliArgs) -> TurnCompleter.Config | None:
+    completer_config = args.completer_config
+    if completer_config is not None:
+        return completer_config
 
-    args = parser.parse_args()
+    if args.judge == "simple":
+        raise ValueError("When using the simple judge you must provide `completer_config`.")
 
-    if args.judge == "simple" and not args.model:
-        parser.error("--model is required when using simple judge.")
+    return None
 
-    completer_config = (
-        # TODO: allow user to specify TurnCompleter.Config class
-        OpenAICompletionsTurnCompleter.Config(
-            model=args.model,
-            reasoning_effort=None if args.reasoning_effort == "none" else args.reasoning_effort,
-        )
-        if args.judge == "simple"
-        else None
-    )
-    asyncio.run(
-        main(
-            submission_path=args.submission_path,
-            paper_id=args.paper_id,
-            judge_type=args.judge,
-            max_depth=args.max_depth,
-            out_dir=args.out_dir,
-            code_only=args.code_only,
-            completer_config=completer_config,
-            resources_provided=args.resources_provided,
-        )
+
+async def _main_from_cli(args: RunJudgeCliArgs) -> None:
+    completer_config = _resolve_completer_config(args)
+    await main(
+        submission_path=args.submission_path,
+        paper_id=args.paper_id,
+        judge_type=args.judge,
+        max_depth=args.max_depth,
+        out_dir=args.out_dir,
+        code_only=args.code_only,
+        completer_config=completer_config,
+        resources_provided=args.resources_provided,
     )
+
+
+if __name__ == "__main__":
+    cli_args = chz.entrypoint(RunJudgeCliArgs, allow_hyphens=True)
+    asyncio.run(_main_from_cli(cli_args))
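With chz.entrypoint handling argument parsing, a minimal invocation of the rewritten script looks like the following (submission path and paper id are taken from run.sh above; the dummy judge needs no completer_config, and the output directory is illustrative):

python paperbench/scripts/run_judge.py \
    submission_path=data/judge_eval/rice/0/submission \
    paper_id=rice \
    judge=dummy \
    out_dir=/tmp/judge_out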
