Skip to content

Commit 1ce7331

Browse files
authored
Add GPQA for instruct models (#534)
* Add GPQA for instruct models
* Add ref
* Refactor
* Tune prompt
* Tune max tokens
* Use simple-eval template
1 parent cb35bea commit 1ce7331

File tree

4 files changed

+78
-1
lines changed

4 files changed

+78
-1
lines changed

src/lighteval/logging/evaluation_tracker.py

Lines changed: 2 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -325,12 +325,13 @@ def push_to_hub(
325325
# We upload it both as a json and a parquet file
326326
result_file_base_name = f"results_{date_id}"
327327
results_json = json.dumps(results_dict, cls=EnhancedJSONEncoder, indent=2, ensure_ascii=False)
328-
self.api.upload_file(
328+
url = self.api.upload_file(
329329
repo_id=repo_id,
330330
path_or_fileobj=BytesIO(results_json.encode("utf-8")),
331331
path_in_repo=f"{result_file_base_name}.json",
332332
repo_type="dataset",
333333
)
334+
logger.info(f"Uploaded evaluation details to {url}")
334335

335336
results_dataset = Dataset.from_dict(
336337
{key: [json.dumps(v, cls=EnhancedJSONEncoder, indent=2)] for key, v in results_dict.items()}

src/lighteval/metrics/metrics.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@
2424
import numpy as np
2525
from aenum import Enum
2626

27+
from lighteval.metrics.dynamic_metrics import (
28+
IndicesExtractionConfig,
29+
multilingual_extractive_match_metric,
30+
)
2731
from lighteval.metrics.harness_compatibility.drop import drop_metrics
2832
from lighteval.metrics.harness_compatibility.truthful_qa import truthfulqa_mc_metrics
2933
from lighteval.metrics.metrics_corpus import (
@@ -69,6 +73,7 @@
6973
SampleLevelMetric,
7074
SampleLevelMetricGrouping,
7175
)
76+
from lighteval.utils.language import Language
7277
from lighteval.utils.utils import as_list
7378

7479

@@ -549,6 +554,12 @@ class Metrics(Enum):
549554
corpus_level_fn=CorpusLevelPerplexityMetric("weighted_perplexity").compute,
550555
higher_is_better=False,
551556
)
557+
# Extractive-match metric for instruct-style GPQA: pulls the answer letter
# (A-D) out of free-form model output and compares it to the gold letter.
gpqa_instruct_metric = multilingual_extractive_match_metric(
    language=Language.ENGLISH,
    # Both gold and prediction are extracted as answer-choice indices/letters
    # ("NativeLetters" = the letter alphabet native to the chosen language).
    gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
    pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
    precision=6,
)
552563

553564
def __str__(self):
    """Return the metric's display name, rendering `_at_` as `@` (e.g. `acc_at_5` -> `acc@5`)."""
    display_name = self.name
    return display_name.replace("_at_", "@")

src/lighteval/tasks/default_prompts.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -729,6 +729,23 @@ def gpqa(line, task_name: str = None):
729729
)
730730

731731

732+
def gpqa_instruct(line, task_name: str = None):
    """Prompt template adapted from simple-evals: https://github.com/openai/simple-evals/blob/83ed7640a7d9cd26849bcb3340125002ef14abbe/common.py#L14"""
    # Pick which slot (0..3 -> A..D) holds the correct answer; the three
    # distractors fill the remaining slots in their original order.
    # NOTE(review): position is drawn from the global `random` state, so
    # shuffling is only reproducible if the caller seeds the RNG.
    gold_index = random.randint(0, 3)
    distractors = [line["Incorrect Answer 1"], line["Incorrect Answer 2"], line["Incorrect Answer 3"]]
    options = distractors[:gold_index] + [line["Correct Answer"]] + distractors[gold_index:]
    query_template = "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\n{Question}\n\nA) {A}\nB) {B}\nC) {C}\nD) {D}"
    query = query_template.format(Question=line["Question"], A=options[0], B=options[1], C=options[2], D=options[3])

    return Doc(
        task_name=task_name,
        query=query,
        # Choices are the answer letters themselves; the model is graded on
        # the extracted letter, not the full answer text.
        choices=LETTER_INDICES[: len(options)],
        gold_index=gold_index,
        instruction=query,
    )
747+
748+
732749
def gsm8k(line, task_name: str = None):
733750
# Has special analysis in metric for number decomposition
734751
return Doc(

src/lighteval/tasks/default_tasks.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7720,6 +7720,54 @@
77207720
trust_dataset=True,
77217721
version=0,
77227722
)
7723+
# GPQA Diamond (hardest subset) in instruct/chat format: free-form generation
# graded by letter extraction rather than loglikelihood ranking.
gpqa_diamond_instruct_lighteval = LightevalTaskConfig(
    name="gpqa:diamond",
    suite=["lighteval"],
    prompt_function=prompt.gpqa_instruct,
    hf_repo="Idavidrein/gpqa",
    hf_subset="gpqa_diamond",
    # The dataset ships a single "train" split, which doubles as the eval split.
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    few_shots_split=None,
    few_shots_select=None,
    generation_size=32768,  # needed for reasoning models like R1
    metric=[Metrics.gpqa_instruct_metric],
    stop_sequence=[],  # no stop sequence, will use eos token
    trust_dataset=True,
    version=0,
)
7739+
# GPQA Extended subset in instruct/chat format: free-form generation graded
# by letter extraction rather than loglikelihood ranking.
gpqa_extended_instruct_lighteval = LightevalTaskConfig(
    name="gpqa:extended",
    suite=["lighteval"],
    prompt_function=prompt.gpqa_instruct,
    hf_repo="Idavidrein/gpqa",
    hf_subset="gpqa_extended",
    # The dataset ships a single "train" split, which doubles as the eval split.
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    few_shots_split=None,
    few_shots_select=None,
    generation_size=32768,  # needed for reasoning models like R1
    metric=[Metrics.gpqa_instruct_metric],
    stop_sequence=[],  # no stop sequence, will use eos token
    trust_dataset=True,
    version=0,
)
7755+
# GPQA Main subset in instruct/chat format: free-form generation graded by
# letter extraction rather than loglikelihood ranking.
gpqa_main_instruct_lighteval = LightevalTaskConfig(
    name="gpqa:main",
    suite=["lighteval"],
    prompt_function=prompt.gpqa_instruct,
    hf_repo="Idavidrein/gpqa",
    hf_subset="gpqa_main",
    # The dataset ships a single "train" split, which doubles as the eval split.
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    few_shots_split=None,
    few_shots_select=None,
    generation_size=32768,  # needed for reasoning models like R1
    metric=[Metrics.gpqa_instruct_metric],
    stop_sequence=[],  # no stop sequence, will use eos token
    trust_dataset=True,
    version=0,
)
77237771
gre_reading_comprehension_bigbench = LightevalTaskConfig(
77247772
name="gre_reading_comprehension",
77257773
suite=["bigbench", "bigbench_json"],

0 commit comments

Comments
 (0)