6
6
from .gpqa_eval import GPQAEval
7
7
from .aime_eval import AIME25Eval
8
8
from .healthbench_eval import HealthBenchEval
9
- from .chat_completion_sampler import (
9
+ from .chat_completions_sampler import (
10
10
OPENAI_SYSTEM_MESSAGE_API ,
11
- ChatCompletionSampler ,
11
+ ChatCompletionsSampler ,
12
12
)
13
13
from .responses_sampler import ResponsesSampler
14
14
@@ -19,12 +19,23 @@ def main():
19
19
formatter_class = argparse .ArgumentDefaultsHelpFormatter ,
20
20
)
21
21
parser .add_argument (
22
- "--list-models" , action = "store_true" , help = "List available models"
22
+ "--model" ,
23
+ type = str ,
24
+ default = "gpt-oss-120b,gpt-oss-20b" ,
25
+ help = "Select a model by name. Accepts a comma-separated list." ,
23
26
)
24
27
parser .add_argument (
25
- "--model" ,
28
+ "--reasoning-effort" ,
29
+ type = str ,
30
+ default = "low,medium,high" ,
31
+ help = "Reasoning effort (low, medium, high). Accepts a comma-separated list." ,
32
+ )
33
+ parser .add_argument (
34
+ "--sampler" ,
26
35
type = str ,
27
- help = "Select a model by name. Also accepts a comma-separated list of models." ,
36
+ choices = ["responses" , "chat_completions" ],
37
+ default = "responses" ,
38
+ help = "Sampler backend to use for models." ,
28
39
)
29
40
parser .add_argument (
30
41
"--base-url" ,
@@ -36,7 +47,7 @@ def main():
36
47
"--eval" ,
37
48
type = str ,
38
49
default = "gpqa,healthbench,healthbench_hard,healthbench_consensus,aime25" ,
39
- help = "Select an eval by name. Also accepts a comma-separated list of evals ." ,
50
+ help = "Select an eval by name. Accepts a comma-separated list." ,
40
51
)
41
52
parser .add_argument (
42
53
"--temperature" ,
@@ -59,71 +70,26 @@ def main():
59
70
60
71
args = parser .parse_args ()
61
72
62
- models = {
63
- "120b-low" : ResponsesSampler (
64
- model = "gpt-oss-120b" ,
65
- reasoning_model = True ,
66
- reasoning_effort = "low" ,
67
- temperature = args .temperature ,
68
- base_url = args .base_url ,
69
- ),
70
- "120b" : ResponsesSampler (
71
- model = "gpt-oss-120b" ,
72
- reasoning_model = True ,
73
- reasoning_effort = "medium" ,
74
- temperature = args .temperature ,
75
- base_url = args .base_url ,
76
- ),
77
- "120b-high" : ResponsesSampler (
78
- model = "gpt-oss-120b" ,
79
- reasoning_model = True ,
80
- reasoning_effort = "high" ,
81
- temperature = args .temperature ,
82
- base_url = args .base_url ,
83
- ),
84
- "20b-low" : ResponsesSampler (
85
- model = "gpt-oss-20b" ,
86
- reasoning_model = True ,
87
- reasoning_effort = "low" ,
88
- temperature = args .temperature ,
89
- base_url = args .base_url ,
90
- ),
91
- "20b" : ResponsesSampler (
92
- model = "gpt-oss-20b" ,
93
- reasoning_model = True ,
94
- reasoning_effort = "medium" ,
95
- temperature = args .temperature ,
96
- base_url = args .base_url ,
97
- ),
98
- "20b-high" : ResponsesSampler (
99
- model = "gpt-oss-20b" ,
100
- reasoning_model = True ,
101
- reasoning_effort = "high" ,
102
- temperature = args .temperature ,
103
- base_url = args .base_url ,
104
- ),
105
- }
106
-
107
- if args .list_models :
108
- print ("Available models:" )
109
- for model_name in models .keys ():
110
- print (f" - { model_name } " )
111
- return
112
-
113
- if args .model :
114
- models_chosen = args .model .split ("," )
115
- for model_name in models_chosen :
116
- if model_name not in models :
117
- print (f"Error: Model '{ model_name } ' not found." )
118
- return
119
- models = {model_name : models [model_name ] for model_name in models_chosen }
73
+ sampler_cls = ResponsesSampler if args .sampler == "responses" else ChatCompletionsSampler
74
+
75
+ models = {}
76
+ for model_name in args .model .split ("," ):
77
+ for reasoning_effort in args .reasoning_effort .split ("," ):
78
+ models [f"{ model_name } -{ reasoning_effort } " ] = sampler_cls (
79
+ model = model_name ,
80
+ reasoning_model = True ,
81
+ reasoning_effort = reasoning_effort ,
82
+ temperature = args .temperature ,
83
+ base_url = args .base_url ,
84
+ )
120
85
121
86
print (f"Running with args { args } " )
122
87
123
- grading_sampler = ChatCompletionSampler (
88
+ grading_sampler = ChatCompletionsSampler (
124
89
model = "gpt-4.1-2025-04-14" ,
125
90
system_message = OPENAI_SYSTEM_MESSAGE_API ,
126
91
max_tokens = 2048 ,
92
+ base_url = "https://api.openai.com/v1" ,
127
93
)
128
94
129
95
def get_evals (eval_name , debug_mode ):
@@ -172,17 +138,15 @@ def get_evals(eval_name, debug_mode):
172
138
case _:
173
139
raise Exception (f"Unrecognized eval type: { eval_name } " )
174
140
175
- evals_list = args .eval .split ("," )
176
141
evals = {}
177
- for eval_name in evals_list :
142
+ for eval_name in args . eval . split ( "," ) :
178
143
evals [eval_name ] = get_evals (eval_name , args .debug )
179
144
180
- print (evals )
181
145
debug_suffix = "_DEBUG" if args .debug else ""
182
146
print (debug_suffix )
183
147
mergekey2resultpath = {}
184
- print (f"Running the following evals: { list ( evals . keys ()) } " )
185
- print (f"Running evals for the following models: { list ( models . keys ()) } " )
148
+ print (f"Running the following evals: { evals } " )
149
+ print (f"Running evals for the following models: { models } " )
186
150
187
151
now = datetime .now ()
188
152
date_str = now .strftime ("%Y%m%d_%H%M%S" )
@@ -220,6 +184,7 @@ def get_evals(eval_name, debug_mode):
220
184
print (f"Writing all results to { full_result_filename } " )
221
185
222
186
mergekey2resultpath [f"{ file_stem } " ] = result_filename
187
+
223
188
merge_metrics = []
224
189
for eval_model_name , result_filename in mergekey2resultpath .items ():
225
190
try :
0 commit comments