Skip to content

Commit 031e09c

Browse files
committed
Fix GPQA and index extractive metric (#829)
* too many false positives with the current gpqa metric extraction, making it more strict * fixing whitespace and instruction in prompt * better to have a strict extraction for index extraction in general actually * added comment * fix tests, need to invert condition
1 parent a549107 commit 031e09c

File tree

3 files changed

+18
-6
lines changed

3 files changed

+18
-6
lines changed

src/lighteval/metrics/utils/extractive_match_utils.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,11 +85,12 @@ class IndicesExtractionConfig:
8585
8686
Attributes:
8787
prefix_for_extraction (ChoicePrefix): The style to use for extracting choice indices (e.g. A,B,C or 1,2,3)
88-
try_extract_without_anchor (bool): Whether to try extracting indices without requiring specific anchors like "answer:" or "final answer is"
88+
try_extract_without_anchor (bool): Whether to try extracting indices without requiring specific anchors like "answer:" or "final answer is".
89+
Recommended False for indices extraction, as some indices (for example `A` which is also a word) can lead to a lot of false positives.
8990
"""
9091

9192
prefix_for_extraction: ChoicePrefix
92-
try_extract_without_anchor: bool = True
93+
try_extract_without_anchor: bool = False
9394

9495

9596
ExtractionTarget = LatexExtractionConfig | ExprExtractionConfig | IndicesExtractionConfig

src/lighteval/tasks/default_prompts.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -899,15 +899,24 @@ def gpqa_instruct(line, task_name: str = None):
899899
gold_index = random.randint(0, 3)
900900
choices = [line["Incorrect Answer 1"], line["Incorrect Answer 2"], line["Incorrect Answer 3"]]
901901
choices.insert(gold_index, line["Correct Answer"])
902-
query_template = "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\n{Question}\n\nA) {A}\nB) {B}\nC) {C}\nD) {D}"
903-
query = query_template.format(A=choices[0], B=choices[1], C=choices[2], D=choices[3], Question=line["Question"])
902+
instruction = "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering."
903+
query_template = "{Instruction}\n\n{Question}\n\nA) {A}\nB) {B}\nC) {C}\nD) {D}"
904+
query = query_template.format(
905+
# Stripping to avoid accidental extra whitespaces, present in GPQA
906+
A=choices[0].strip(),
907+
B=choices[1].strip(),
908+
C=choices[2].strip(),
909+
D=choices[3].strip(),
910+
Question=line["Question"].strip(),
911+
Instruction=instruction,
912+
)
904913

905914
return Doc(
906915
task_name=task_name,
907916
query=query,
908917
choices=LETTER_INDICES[: len(choices)],
909918
gold_index=gold_index,
910-
instruction=query,
919+
instruction=instruction,
911920
)
912921

913922

tests/metrics/test_extractive_match.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,9 @@ def compare_strings(
5656
elif match_type == "expr":
5757
extraction_targets.append(ExprExtractionConfig())
5858
elif match_type == "NativeLetters":
59-
extraction_targets.append(IndicesExtractionConfig(prefix_for_extraction="NativeLetters"))
59+
extraction_targets.append(
60+
IndicesExtractionConfig(prefix_for_extraction="NativeLetters", try_extract_without_anchor=True)
61+
)
6062

6163
extraction_targets = tuple(extraction_targets) # Convert to tuple
6264

0 commit comments

Comments
 (0)