Fix GPQA and index extractive metric (#829)

clefourrier · clefourrier · commit 031e09ced1fc · 2025-06-26T09:23:37.000Z
* too many false positives with the current gpqa metric extraction, making it more strict

* fixing whitespace and instruction in prompt

* strict extraction is actually the better default for index extraction in general

* added comment

* fix tests, need to invert condition
diff --git a/src/lighteval/metrics/utils/extractive_match_utils.py b/src/lighteval/metrics/utils/extractive_match_utils.py
@@ -85,11 +85,12 @@ class IndicesExtractionConfig:
 
     Attributes:
         prefix_for_extraction (ChoicePrefix): The style to use for extracting choice indices (e.g. A,B,C or 1,2,3)
-        try_extract_without_anchor (bool): Whether to try extracting indices without requiring specific anchors like "answer:" or "final answer is"
+        try_extract_without_anchor (bool): Whether to try extracting indices without requiring specific anchors like "answer:" or "final answer is".
+            Recommended False for indices extraction, as some indices (for example `A` which is also a word) can lead to a lot of false positives.
     """
 
     prefix_for_extraction: ChoicePrefix
-    try_extract_without_anchor: bool = True
+    try_extract_without_anchor: bool = False
 
 
 ExtractionTarget = LatexExtractionConfig | ExprExtractionConfig | IndicesExtractionConfig
diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py
@@ -899,15 +899,24 @@ def gpqa_instruct(line, task_name: str = None):
     gold_index = random.randint(0, 3)
     choices = [line["Incorrect Answer 1"], line["Incorrect Answer 2"], line["Incorrect Answer 3"]]
     choices.insert(gold_index, line["Correct Answer"])
-    query_template = "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\n{Question}\n\nA) {A}\nB) {B}\nC) {C}\nD) {D}"
-    query = query_template.format(A=choices[0], B=choices[1], C=choices[2], D=choices[3], Question=line["Question"])
+    instruction = "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering."
+    query_template = "{Instruction}\n\n{Question}\n\nA) {A}\nB) {B}\nC) {C}\nD) {D}"
+    query = query_template.format(
+        # Stripping to avoid accidental extra whitespaces, present in GPQA
+        A=choices[0].strip(),
+        B=choices[1].strip(),
+        C=choices[2].strip(),
+        D=choices[3].strip(),
+        Question=line["Question"].strip(),
+        Instruction=instruction,
+    )
 
     return Doc(
         task_name=task_name,
         query=query,
         choices=LETTER_INDICES[: len(choices)],
         gold_index=gold_index,
-        instruction=query,
+        instruction=instruction,
     )
 
 
diff --git a/tests/metrics/test_extractive_match.py b/tests/metrics/test_extractive_match.py
@@ -56,7 +56,9 @@ def compare_strings(
         elif match_type == "expr":
             extraction_targets.append(ExprExtractionConfig())
         elif match_type == "NativeLetters":
-            extraction_targets.append(IndicesExtractionConfig(prefix_for_extraction="NativeLetters"))
+            extraction_targets.append(
+                IndicesExtractionConfig(prefix_for_extraction="NativeLetters", try_extract_without_anchor=True)
+            )
 
     extraction_targets = tuple(extraction_targets)  # Convert to tuple