Skip to content

Commit d77b932

Browse files
authored
[Evals] Add support for LiveAoPS dataset (#101)
* add liveaops Signed-off-by: SumanthRH <[email protected]> * fixes Signed-off-by: SumanthRH <[email protected]> * lint Signed-off-by: SumanthRH <[email protected]> --------- Signed-off-by: SumanthRH <[email protected]>
1 parent ca01546 commit d77b932

File tree

5 files changed

+61
-8
lines changed

5 files changed

+61
-8
lines changed

skythought/evals/tasks/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from .base import ConversationType, TaskConfig, TaskHandler
88
from .gpqa_diamond.gpqa_diamond_handler import GPQADiamondTaskHandler
99
from .gsm8k.gsm8k_handler import GSM8KTaskHandler
10+
from .liveaops.liveaops_handler import LiveAOPSTaskHandler
1011
from .livecodebench.livecodebench_handler import LiveCodeBenchTaskHandler
1112
from .math.math_handler import MathTaskHandler
1213
from .minervamath.minervamath_handler import MinervaMathTaskHandler
@@ -33,6 +34,7 @@
3334
"minervamath": MinervaMathTaskHandler,
3435
"olympiadbench_math": OlympiadBenchMathTaskHandler,
3536
"omni_math": OMNIMathTaskHandler,
37+
"liveaops": LiveAOPSTaskHandler,
3638
}
3739
TASK_NAMES_TO_YAML = get_tasks(os.path.dirname(__file__))
3840

skythought/evals/tasks/base.py

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from abc import ABC, abstractmethod
22
from typing import Any, Dict, List, Optional
3+
from urllib.parse import urlparse
34

45
import pandas as pd
56
import yaml
@@ -14,7 +15,7 @@ class TaskConfig(BaseModel):
1415
handler: str
1516
dataset_path: str
1617
dataset_subset: Optional[str] = None
17-
dataset_split: str
18+
dataset_split: Optional[str] = None
1819
dataset_kwargs: Dict[str, Any] = Field(default_factory=dict)
1920
question_key: str
2021
# Optional answer key for datasets with a single correct answer
@@ -82,12 +83,28 @@ def make_conversations(
8283
return conversations
8384

8485
def load_dataset(self, subset=None, split=None, **kwargs) -> HFDataset:
    """Load the task's dataset and add an index column to it.

    ``task_config.dataset_path`` is interpreted either as a Hugging Face
    dataset path or, when it carries a URL scheme, as the location of a
    JSON/JSONL file.

    Args:
        subset: Dataset subset (config name) for Hugging Face datasets.
            Falls back to ``task_config.dataset_subset`` when falsy.
        split: Dataset split for Hugging Face datasets. Falls back to
            ``task_config.dataset_split`` when falsy.
        **kwargs: Accepted for interface compatibility; not used by this
            implementation.

    Returns:
        The loaded dataset after mapping ``add_idx_map`` over it with
        row indices.

    Raises:
        ValueError: If ``split`` or ``subset`` is passed for a URL-based
            dataset, which does not support them.
    """
    dataset_path = self.task_config.dataset_path
    scheme = urlparse(dataset_path).scheme
    # A one-letter "scheme" is a Windows drive letter (e.g. "C:/data"), not a
    # URL, so such paths must stay on the Hugging Face / local loading route.
    is_url = len(scheme) > 1

    if is_url:
        # Only JSON is supported for URL-based datasets for now.
        if split is not None or subset is not None:
            raise ValueError(
                "URL-based dataset does not support loading arguments like `split`, `subset`"
            )
        # By default, Huggingface will create a DatasetDict object with a
        # "train" split when loading from raw data files.
        dataset = load_dataset("json", data_files=[dataset_path])["train"]
    else:
        # HF dataset
        dataset = load_dataset(
            path=dataset_path,
            name=subset if subset else self.task_config.dataset_subset,
            split=split if split else self.task_config.dataset_split,
            **self.task_config.dataset_kwargs,
        )

    # add an index column efficiently with map
    dataset = dataset.map(add_idx_map, with_indices=True)
    return dataset
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
handler: liveaops
2+
dataset_path: https://livemathbench.github.io/data/LiveAoPSBench-2024.jsonl
3+
dataset_subset: null # which subset on huggingface. Not applicable for a URL dataset
4+
dataset_split: null # which split on huggingface. Not applicable for a URL dataset
5+
question_key: question
6+
answer_key: answer
7+
templating_parameters:
8+
template: "Return your final response within \\boxed{{}}. {question}"
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
from skythought.evals.util.math_parsing_util import (
2+
extract_answer,
3+
math_equal,
4+
strip_answer_string,
5+
)
6+
7+
from ..math.math_handler import MathTaskHandler
8+
9+
10+
class LiveAOPSTaskHandler(MathTaskHandler):
    """Task handler for the LiveAoPS dataset, built on ``MathTaskHandler``."""

    def generate_prompt(self, problem):
        """Fill the configured prompt template with the fields of ``problem``.

        ``problem`` is unpacked as keyword arguments into ``str.format``, so it
        must provide every placeholder used by the template.
        """
        return self.task_config.templating_parameters["template"].format(**problem)

    def check_correctness(self, problem, generation):
        """Compare the answer extracted from ``generation`` with the reference.

        The reference answer is read from ``problem`` under the configured
        ``answer_key`` and used as-is (no preprocessing needed); the predicted
        answer is extracted from the generation and stripped before the
        comparison with ``math_equal``.
        """
        answer = problem[self.task_config.answer_key]
        pred = extract_answer(generation)
        pred = strip_answer_string(pred)
        return math_equal(pred, answer)

    def load_and_filter_dataset(
        self, start, end, split=None, subset=None, difficulty=None
    ):
        """Load the dataset as a DataFrame and return the requested row range.

        Args:
            start: Index of the first row to keep.
            end: Index one past the last row to keep; a value that is not
                positive means "up to the end of the dataset".
            split: Forwarded to ``load_dataset``.
            subset: Forwarded to ``load_dataset``.
            difficulty: Not supported for this task; must be ``None``.

        Raises:
            ValueError: If ``difficulty`` is provided.
        """
        # Explicit check instead of `assert`, which is stripped under `python -O`
        # and would let an unsupported argument be silently ignored.
        if difficulty is not None:
            raise ValueError("LiveAOPS does not support `difficulty` argument")
        dataset = self.load_dataset(subset=subset, split=split).to_pandas()
        return dataset.iloc[start:end] if end > 0 else dataset.iloc[start:]

skythought/evals/tasks/omni_math/omni_math.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
handler: math
1+
handler: omni_math
22
dataset_path: "KbsdJames/Omni-MATH" # repo ID in huggingface
33
dataset_subset: null # which subset on huggingface
44
dataset_split: test_rule_based # Rule based evaluation

0 commit comments

Comments
 (0)