Skip to content

Commit 754a56b

Browse files
authored
evals: add chat completions API sampler (#59)
* evals: add --sampler chat_completions * gpt_oss.evals: allow modifying the model names
1 parent 4931694 commit 754a56b

File tree

3 files changed

+65
-88
lines changed

3 files changed

+65
-88
lines changed

gpt_oss/evals/__main__.py

Lines changed: 35 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@
66
from .gpqa_eval import GPQAEval
77
from .aime_eval import AIME25Eval
88
from .healthbench_eval import HealthBenchEval
9-
from .chat_completion_sampler import (
9+
from .chat_completions_sampler import (
1010
OPENAI_SYSTEM_MESSAGE_API,
11-
ChatCompletionSampler,
11+
ChatCompletionsSampler,
1212
)
1313
from .responses_sampler import ResponsesSampler
1414

@@ -19,12 +19,23 @@ def main():
1919
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
2020
)
2121
parser.add_argument(
22-
"--list-models", action="store_true", help="List available models"
22+
"--model",
23+
type=str,
24+
default="gpt-oss-120b,gpt-oss-20b",
25+
help="Select a model by name. Accepts a comma-separated list.",
2326
)
2427
parser.add_argument(
25-
"--model",
28+
"--reasoning-effort",
29+
type=str,
30+
default="low,medium,high",
31+
help="Reasoning effort (low, medium, high). Accepts a comma-separated list.",
32+
)
33+
parser.add_argument(
34+
"--sampler",
2635
type=str,
27-
help="Select a model by name. Also accepts a comma-separated list of models.",
36+
choices=["responses", "chat_completions"],
37+
default="responses",
38+
help="Sampler backend to use for models.",
2839
)
2940
parser.add_argument(
3041
"--base-url",
@@ -36,7 +47,7 @@ def main():
3647
"--eval",
3748
type=str,
3849
default="gpqa,healthbench,healthbench_hard,healthbench_consensus,aime25",
39-
help="Select an eval by name. Also accepts a comma-separated list of evals.",
50+
help="Select an eval by name. Accepts a comma-separated list.",
4051
)
4152
parser.add_argument(
4253
"--temperature",
@@ -59,71 +70,26 @@ def main():
5970

6071
args = parser.parse_args()
6172

62-
models = {
63-
"120b-low": ResponsesSampler(
64-
model="gpt-oss-120b",
65-
reasoning_model=True,
66-
reasoning_effort="low",
67-
temperature=args.temperature,
68-
base_url=args.base_url,
69-
),
70-
"120b": ResponsesSampler(
71-
model="gpt-oss-120b",
72-
reasoning_model=True,
73-
reasoning_effort="medium",
74-
temperature=args.temperature,
75-
base_url=args.base_url,
76-
),
77-
"120b-high": ResponsesSampler(
78-
model="gpt-oss-120b",
79-
reasoning_model=True,
80-
reasoning_effort="high",
81-
temperature=args.temperature,
82-
base_url=args.base_url,
83-
),
84-
"20b-low": ResponsesSampler(
85-
model="gpt-oss-20b",
86-
reasoning_model=True,
87-
reasoning_effort="low",
88-
temperature=args.temperature,
89-
base_url=args.base_url,
90-
),
91-
"20b": ResponsesSampler(
92-
model="gpt-oss-20b",
93-
reasoning_model=True,
94-
reasoning_effort="medium",
95-
temperature=args.temperature,
96-
base_url=args.base_url,
97-
),
98-
"20b-high": ResponsesSampler(
99-
model="gpt-oss-20b",
100-
reasoning_model=True,
101-
reasoning_effort="high",
102-
temperature=args.temperature,
103-
base_url=args.base_url,
104-
),
105-
}
106-
107-
if args.list_models:
108-
print("Available models:")
109-
for model_name in models.keys():
110-
print(f" - {model_name}")
111-
return
112-
113-
if args.model:
114-
models_chosen = args.model.split(",")
115-
for model_name in models_chosen:
116-
if model_name not in models:
117-
print(f"Error: Model '{model_name}' not found.")
118-
return
119-
models = {model_name: models[model_name] for model_name in models_chosen}
73+
sampler_cls = ResponsesSampler if args.sampler == "responses" else ChatCompletionsSampler
74+
75+
models = {}
76+
for model_name in args.model.split(","):
77+
for reasoning_effort in args.reasoning_effort.split(","):
78+
models[f"{model_name}-{reasoning_effort}"] = sampler_cls(
79+
model=model_name,
80+
reasoning_model=True,
81+
reasoning_effort=reasoning_effort,
82+
temperature=args.temperature,
83+
base_url=args.base_url,
84+
)
12085

12186
print(f"Running with args {args}")
12287

123-
grading_sampler = ChatCompletionSampler(
88+
grading_sampler = ChatCompletionsSampler(
12489
model="gpt-4.1-2025-04-14",
12590
system_message=OPENAI_SYSTEM_MESSAGE_API,
12691
max_tokens=2048,
92+
base_url="https://api.openai.com/v1",
12793
)
12894

12995
def get_evals(eval_name, debug_mode):
@@ -172,17 +138,15 @@ def get_evals(eval_name, debug_mode):
172138
case _:
173139
raise Exception(f"Unrecognized eval type: {eval_name}")
174140

175-
evals_list = args.eval.split(",")
176141
evals = {}
177-
for eval_name in evals_list:
142+
for eval_name in args.eval.split(","):
178143
evals[eval_name] = get_evals(eval_name, args.debug)
179144

180-
print(evals)
181145
debug_suffix = "_DEBUG" if args.debug else ""
182146
print(debug_suffix)
183147
mergekey2resultpath = {}
184-
print(f"Running the following evals: {list(evals.keys())}")
185-
print(f"Running evals for the following models: {list(models.keys())}")
148+
print(f"Running the following evals: {evals}")
149+
print(f"Running evals for the following models: {models}")
186150

187151
now = datetime.now()
188152
date_str = now.strftime("%Y%m%d_%H%M%S")
@@ -220,6 +184,7 @@ def get_evals(eval_name, debug_mode):
220184
print(f"Writing all results to {full_result_filename}")
221185

222186
mergekey2resultpath[f"{file_stem}"] = result_filename
187+
223188
merge_metrics = []
224189
for eval_model_name, result_filename in mergekey2resultpath.items():
225190
try:

gpt_oss/evals/chat_completion_sampler.py renamed to gpt_oss/evals/chat_completions_sampler.py

Lines changed: 26 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,35 +6,38 @@
66

77
from .types import MessageList, SamplerBase, SamplerResponse
88

9+
910
OPENAI_SYSTEM_MESSAGE_API = "You are a helpful assistant."
1011
OPENAI_SYSTEM_MESSAGE_CHATGPT = (
1112
"You are ChatGPT, a large language model trained by OpenAI, based on the GPT-4 architecture."
1213
+ "\nKnowledge cutoff: 2023-12\nCurrent date: 2024-04-01"
1314
)
1415

1516

16-
class ChatCompletionSampler(SamplerBase):
17-
"""
18-
Sample from OpenAI's chat completion API
19-
"""
17+
class ChatCompletionsSampler(SamplerBase):
18+
"""Sample from a Chat Completions compatible API."""
2019

2120
def __init__(
2221
self,
2322
model: str = "gpt-3.5-turbo",
2423
system_message: str | None = None,
2524
temperature: float = 0.5,
2625
max_tokens: int = 1024,
26+
reasoning_model: bool = False,
27+
reasoning_effort: str | None = None,
28+
base_url: str = "http://localhost:8000/v1",
2729
):
2830
self.api_key_name = "OPENAI_API_KEY"
29-
self.client = OpenAI()
30-
# using api_key=os.environ.get("OPENAI_API_KEY") # please set your API_KEY
31+
self.client = OpenAI(base_url=base_url, timeout=24 * 60 * 60)
3132
self.model = model
3233
self.system_message = system_message
3334
self.temperature = temperature
3435
self.max_tokens = max_tokens
36+
self.reasoning_model = reasoning_model
37+
self.reasoning_effort = reasoning_effort
3538
self.image_format = "url"
3639

37-
def _pack_message(self, role: str, content: Any):
40+
def _pack_message(self, role: str, content: Any) -> dict[str, Any]:
3841
return {"role": str(role), "content": content}
3942

4043
def __call__(self, message_list: MessageList) -> SamplerResponse:
@@ -45,12 +48,21 @@ def __call__(self, message_list: MessageList) -> SamplerResponse:
4548
trial = 0
4649
while True:
4750
try:
48-
response = self.client.chat.completions.create(
49-
model=self.model,
50-
messages=message_list,
51-
temperature=self.temperature,
52-
max_tokens=self.max_tokens,
53-
)
51+
if self.reasoning_model:
52+
response = self.client.chat.completions.create(
53+
model=self.model,
54+
messages=message_list,
55+
reasoning_effort=self.reasoning_effort,
56+
temperature=self.temperature,
57+
max_tokens=self.max_tokens,
58+
)
59+
else:
60+
response = self.client.chat.completions.create(
61+
model=self.model,
62+
messages=message_list,
63+
temperature=self.temperature,
64+
max_tokens=self.max_tokens,
65+
)
5466
content = response.choices[0].message.content
5567
if content is None:
5668
raise ValueError("OpenAI API returned empty response; retrying")
@@ -59,7 +71,6 @@ def __call__(self, message_list: MessageList) -> SamplerResponse:
5971
response_metadata={"usage": response.usage},
6072
actual_queried_message_list=message_list,
6173
)
62-
# NOTE: BadRequestError is triggered once for MMMU, please uncomment if you are reruning MMMU
6374
except openai.BadRequestError as e:
6475
print("Bad Request Error", e)
6576
return SamplerResponse(
@@ -68,7 +79,7 @@ def __call__(self, message_list: MessageList) -> SamplerResponse:
6879
actual_queried_message_list=message_list,
6980
)
7081
except Exception as e:
71-
exception_backoff = 2**trial # expontial back off
82+
exception_backoff = 2 ** trial # exponential back off
7283
print(
7384
f"Rate limit exception so wait and retry {trial} after {exception_backoff} sec",
7485
e,

gpt_oss/evals/healthbench_eval.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,9 @@
2626
import numpy as np
2727

2828
from . import report
29-
from .chat_completion_sampler import (
29+
from .chat_completions_sampler import (
3030
OPENAI_SYSTEM_MESSAGE_API,
31-
ChatCompletionSampler,
31+
ChatCompletionsSampler,
3232
)
3333
from .types import Eval, EvalResult, MessageList, SamplerBase, SingleEvalResult
3434

@@ -540,10 +540,11 @@ def physician_completions_main(
540540
now = datetime.now()
541541
date_str = now.strftime("%Y%m%d_%H%M")
542542

543-
grading_sampler = ChatCompletionSampler(
543+
grading_sampler = ChatCompletionsSampler(
544544
model="gpt-4.1-2025-04-14",
545545
system_message=OPENAI_SYSTEM_MESSAGE_API,
546546
max_tokens=2048,
547+
base_url="https://api.openai.com/v1",
547548
)
548549
dummy_sampler = SamplerBase()
549550

0 commit comments

Comments (0)