Skip to content

Commit fe98303

Browse files
committed
Update eval_simpleqa_benchmark.py
1 parent 1922426 commit fe98303

File tree

1 file changed

+7 -2 lines changed

1 file changed

+7 -2 lines changed

scripts/eval_simpleqa_benchmark.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -231,7 +231,8 @@ def query_optillm(self, question: str) -> Tuple[str, bool]:
231 231   response = self.optillm_client.chat.completions.create(
232 232   model=model_name,
233 233   messages=messages,
234     - extra_body=extra_body if extra_body else None
    234 + extra_body=extra_body if extra_body else None,
    235 + max_tokens=4096
235 236   )
236 237
237 238   answer = response.choices[0].message.content
@@ -258,11 +259,15 @@ def grade_response(self, question: str, gold_answer: str, response: str) -> str:
258 259   grader_response = self.grader_client.chat.completions.create(
259 260   model=self.grader_model,
260 261   messages=[{"role": "user", "content": grading_prompt}],
261     - temperature=0.0
    262 + temperature=0.0,
    263 + max_tokens=4096
262 264   )
263 265
264 266   grade_text = grader_response.choices[0].message.content.strip()
265 267
    268 + # Strip <think> tags if present
    269 + grade_text = re.sub(r'<think>.*?</think>', '', grade_text, flags=re.DOTALL).strip()
    270 +
266 271   # Extract grade (A/B/C)
267 272   if grade_text.startswith('A'):
268 273   return "CORRECT"

0 commit comments

Comments
 (0)