Skip to content

Commit 8b47e01

Browse files
committed
Tune prompt
1 parent fa00c5f commit 8b47e01

File tree

2 files changed

+7
-9
lines changed

2 files changed

+7
-9
lines changed

src/lighteval/tasks/default_prompts.py

Lines changed: 4 additions & 6 deletions
Diff (unified format; columns were: original line number · diff line number · change):

@@ -730,21 +730,19 @@ def gpqa(line, task_name: str = None):
 
 
 def gpqa_instruct(line, task_name: str = None):
-    """Adapted from simple-eval: https://github.com/openai/simple-evals/blob/83ed7640a7d9cd26849bcb3340125002ef14abbe/common.py#L14"""
+    """Adapted from Llama 3 evals: https://huggingface.co/datasets/meta-llama/Llama-3.1-8B-Instruct-evals/viewer/Llama-3.1-8B-Instruct-evals__gpqa__details"""
     gold_index = random.randint(0, 3)
     choices = [line["Incorrect Answer 1"], line["Incorrect Answer 2"], line["Incorrect Answer 3"]]
     choices.insert(gold_index, line["Correct Answer"])
-
-    instruction = "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering."
-    query = f"{instruction}\n\n{line['Question']}\n\n" ""
-    query += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES, choices)])
+    query_template = """Given the following question and four candidate answers (A, B, C and D), choose the best answer.\n\nQuestion: {Question}\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nYour response should end with \"The best answer is [the_answer_letter]\" where the [the_answer_letter] is one of A, B, C or D. Think step by step before answering."""
+    query = query_template.format(A=choices[0], B=choices[1], C=choices[2], D=choices[3], Question=line["Question"])
 
     return Doc(
         task_name=task_name,
         query=query,
         choices=LETTER_INDICES[: len(choices)],
         gold_index=gold_index,
-        instruction=instruction,
+        instruction=query,
     )
 
 
src/lighteval/tasks/default_tasks.py

Lines changed: 3 additions & 3 deletions
Diff (unified format; columns were: original line number · diff line number · change):

@@ -7729,7 +7729,7 @@
     hf_avail_splits=["train"],
     evaluation_splits=["train"],
     few_shots_split=None,
-    few_shots_select="random_sampling",
+    few_shots_select=None,
     generation_size=32_000,  # needed for reasoning models like R1
     metric=[Metrics.gpqa_instruct_metric],
     stop_sequence=[],  # no stop sequence, will use eos token
@@ -7745,7 +7745,7 @@
     hf_avail_splits=["train"],
     evaluation_splits=["train"],
     few_shots_split=None,
-    few_shots_select="random_sampling",
+    few_shots_select=None,
     generation_size=32_000,  # needed for reasoning models like R1
     metric=[Metrics.gpqa_instruct_metric],
     stop_sequence=[],  # no stop sequence, will use eos token
@@ -7761,7 +7761,7 @@
     hf_avail_splits=["train"],
     evaluation_splits=["train"],
     few_shots_split=None,
-    few_shots_select="random_sampling",
+    few_shots_select=None,
     generation_size=32_000,  # needed for reasoning models like R1
     metric=[Metrics.gpqa_instruct_metric],
     stop_sequence=[],  # no stop sequence, will use eos token

0 commit comments

Comments (0)