Skip to content

Commit 57c0024

Browse files
committed
Update eval_simpleqa_benchmark.py
1 parent 5978e8f commit 57c0024

File tree

1 file changed

+19
-1
lines changed

1 file changed

+19
-1
lines changed

scripts/eval_simpleqa_benchmark.py

Lines changed: 19 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -63,6 +63,23 @@
6363
Grade (A/B/C):"""
6464

6565

66+
def remove_thinking_blocks(text: str) -> str:
    """Strip ``<think>`` reasoning from a model response, keeping the answer.

    Behaviour by case:

    * Falsy input (``None`` or ``""``) is returned unchanged.
    * If at least one ``</think>`` is present, only the text after the
      *last* closing tag is kept. Anything before it is discarded, including
      text that precedes the first ``<think>``.
    * If a ``<think>`` opens and is never closed (e.g. a truncated response),
      only the text before that opening tag is kept. This also applies to a
      dangling ``<think>`` that appears after the last ``</think>``.
    * Text with no think tags at all is returned unchanged (not stripped).

    Args:
        text: Raw response content; may be empty or ``None``.

    Returns:
        The response with reasoning removed. Whitespace is stripped whenever
        a tag was found; otherwise ``text`` is returned as-is.
    """
    if not text:
        return text

    if '</think>' in text:
        # Keep only what follows the last closing tag.
        tail = text.rpartition('</think>')[2]
        # A further block may have been opened but cut off before closing;
        # drop it so the dangling reasoning does not leak into the answer.
        return tail.partition('<think>')[0].strip()

    if '<think>' in text:
        # Truncated response: the block was opened but never closed, so only
        # the text ahead of the opening tag (possibly none) can be the answer.
        return text.partition('<think>')[0].strip()

    return text
81+
82+
6683
class SimpleQAEvaluator:
6784
"""Main evaluator class for SimpleQA benchmark"""
6885

@@ -237,6 +254,7 @@ def query_optillm(self, question: str) -> Tuple[str, bool]:
237254
)
238255

239256
answer = response.choices[0].message.content
257+
answer = remove_thinking_blocks(answer)
240258
logger.debug(f"Response: {answer}")
241259

242260
return answer, True
@@ -360,7 +378,7 @@ def save_results(self, timestamp: str) -> Tuple[str, str, str]:
360378
"""Save evaluation results to files"""
361379
# Create output directory for this run
362380
run_dir = self.output_dir / f"simpleqa_{self.model}_{self.approach}"
363-
run_dir.mkdir(exist_ok=True)
381+
run_dir.mkdir(parents=True, exist_ok=True)
364382

365383
# File paths
366384
detailed_file = run_dir / f"{timestamp}_detailed.json"

0 commit comments

Comments
 (0)