|
 
 export OPENAI_API_KEY="<JUDGE_EVAL_API_KEY>"
 
+COMPLETER_CONFIG="preparedness_turn_completer.oai_completions_turn_completer:OpenAICompletionsTurnCompleter.Config"
+EXAMPLE_IDS="pinn/0,rice/0,stay-on-topic-with-classifier-free-guidance/0,all-in-one/0,semantic-self-consistency/0"
+OUTPUT_DIR="experiments/judge_eval/judge_eval_results/"
+
 if [ "$OPENAI_API_KEY" = "<JUDGE_EVAL_API_KEY>" ]; then
   echo "Error: Please set a valid OpenAI API key in the script. Replace <JUDGE_EVAL_API_KEY> with the judge eval API key."
   exit 1
 fi
 
 for model in o3-mini-2025-01-31 o1-2024-12-17 o1-mini-2024-09-12; do
   echo "Running judge eval for $model-high"
-  python paperbench/scripts/run_judge_eval.py -j simple -m $model \
-    --reasoning-effort high \
-    --output-dir experiments/judge_eval/judge_eval_results/ \
-    --example-ids pinn/0 rice/0 stay-on-topic-with-classifier-free-guidance/0 all-in-one/0 semantic-self-consistency/0
+  python paperbench/scripts/run_judge_eval.py \
+    judge=simple \
+    completer_config="$COMPLETER_CONFIG" \
+    completer_config.model=$model \
+    completer_config.reasoning_effort=high \
+    output_dir=$OUTPUT_DIR \
+    example_ids=$EXAMPLE_IDS
   echo "-----------------------------"
 done
 
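If a reasoning-effort sweep is ever wanted alongside the model sweep, the dotted overrides nest naturally in a second loop; a minimal sketch reusing the variables above (the effort values here are illustrative assumptions, not taken from this script):

    for model in o3-mini-2025-01-31; do
      for effort in medium high; do  # assumed effort levels, for illustration only
        python paperbench/scripts/run_judge_eval.py \
          judge=simple \
          completer_config="$COMPLETER_CONFIG" \
          completer_config.model=$model \
          completer_config.reasoning_effort=$effort \
          output_dir=$OUTPUT_DIR \
          example_ids=$EXAMPLE_IDS
      done
    done
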
 for model in gpt-4o-mini-2024-07-18 gpt-4o-2024-08-06; do
   echo "Running judge eval for $model"
-  python paperbench/scripts/run_judge_eval.py -j simple -m $model \
-    --reasoning-effort none \
-    --output-dir experiments/judge_eval/judge_eval_results/ \
-    --example-ids pinn/0 rice/0 stay-on-topic-with-classifier-free-guidance/0 all-in-one/0 semantic-self-consistency/0
+  python paperbench/scripts/run_judge_eval.py \
+    judge=simple \
+    completer_config="$COMPLETER_CONFIG" \
+    completer_config.model=$model \
+    output_dir=$OUTPUT_DIR \
+    example_ids=$EXAMPLE_IDS
   echo "-----------------------------"
 done
 
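This second loop omits completer_config.reasoning_effort entirely, replacing the old --reasoning-effort none, since the GPT-4o models are not reasoning models. Every invocation in this script makes real API calls, so to abort the whole sweep on the first failure instead of continuing, bash strict mode near the top of the script would do it (standard bash, not part of this diff):

    set -euo pipefail  # exit on the first failed command, unset variable, or pipeline error
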
-python paperbench/scripts/run_judge_eval.py -j random \
-  --output-dir experiments/judge_eval/judge_eval_results/ \
-  --example-ids pinn/0 rice/0 stay-on-topic-with-classifier-free-guidance/0 all-in-one/0 semantic-self-consistency/0
+python paperbench/scripts/run_judge_eval.py \
+  judge=random \
+  output_dir=$OUTPUT_DIR \
+  example_ids=$EXAMPLE_IDS
 
-python paperbench/scripts/run_judge_eval.py -j dummy \
-  --output-dir experiments/judge_eval/judge_eval_results/ \
-  --example-ids pinn/0 rice/0 stay-on-topic-with-classifier-free-guidance/0 all-in-one/0 semantic-self-consistency/0
+python paperbench/scripts/run_judge_eval.py \
+  judge=dummy \
+  output_dir=$OUTPUT_DIR \
+  example_ids=$EXAMPLE_IDS
 
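The random and dummy judges take no completer overrides because they call no model; they serve as floor baselines for the model-backed runs above. All of these runs write into the same $OUTPUT_DIR, so a recursive listing afterwards shows what was produced (the layout inside is whatever run_judge_eval.py emits; it is not specified in this diff):

    ls -R "$OUTPUT_DIR"
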
-# finally, single run of judge-eval on o3-mini-high with --code-only
+# finally, a single run of judge eval on o3-mini-high with code_only=true,
 # to be able to compare token counts with default PaperBench
-python paperbench/scripts/run_judge_eval.py -j simple -m o3-mini-2025-01-31 \
-  --reasoning-effort high \
-  --output-dir experiments/judge_eval/judge_eval_results/code_only \
-  --example-ids pinn/0 rice/0 stay-on-topic-with-classifier-free-guidance/0 all-in-one/0 semantic-self-consistency/0 \
-  --code-only
+python paperbench/scripts/run_judge_eval.py \
+  judge=simple \
+  completer_config="$COMPLETER_CONFIG" \
+  completer_config.model=o3-mini-2025-01-31 \
+  completer_config.reasoning_effort=high \
+  output_dir=experiments/judge_eval/judge_eval_results/code_only \
+  example_ids=$EXAMPLE_IDS \
+  code_only=true
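For reviewers, the old argparse flags map onto the new key=value overrides as follows (inferred entirely from this diff):

    -j JUDGE                  ->  judge=JUDGE
    -m MODEL                  ->  completer_config.model=MODEL (plus completer_config="$COMPLETER_CONFIG")
    --reasoning-effort high   ->  completer_config.reasoning_effort=high
    --reasoning-effort none   ->  override omitted for non-reasoning models
    --output-dir DIR          ->  output_dir=DIR
    --example-ids a/0 b/0 ... ->  example_ids=a/0,b/0,... (one comma-separated argument)
    --code-only               ->  code_only=true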