Merged

bbeh #1124
1 change: 1 addition & 0 deletions src/lighteval/main_inspect.py
@@ -492,6 +492,7 @@ def eval( # noqa C901
internal_tools=internal_tools,
bundle_dir=bundle_dir,
bundle_overwrite=bundle_overwrite,
log_level="debug",
)

if not success:
238 changes: 238 additions & 0 deletions src/lighteval/tasks/tasks/bigbench_extra_hard.py
@@ -0,0 +1,238 @@
"""
name:
BIG-Bench Extra Hard

dataset:
jgyasu/bbeh

abstract:
BIG-Bench Extra Hard (BBEH) is a successor to BIG-Bench Hard (BBH), created to evaluate large
language models on substantially more difficult general-reasoning tasks. Each BBH task is replaced
with a new task targeting the same underlying reasoning skill but at a significantly higher difficulty.

languages:
english

tags:
reasoning

paper:
https://arxiv.org/abs/2502.19187
"""

from inspect_ai.dataset import Sample
from inspect_ai.scorer import answer
from inspect_ai.solver import generate, system_message

from lighteval.metrics.metrics import Metrics
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc


def bbeh_prompt(line, task_name: str | None = None):
    # Drop keys with missing values so the prompt never renders a None.
    line = {k: v for k, v in line.items() if v is not None}

    query = "Question: \n"
    query += line["input"]
    query += "\nAnswer:"

    return Doc(
        task_name=task_name,
        query=query,
        # Doc expects a list of choices; wrap the single gold target.
        choices=[line["target"]],
        gold_index=0,
        instruction="",
    )
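
# A minimal sketch of what the prompt function produces (the record below is
# a hypothetical example, not a real dataset row):
#
#   doc = bbeh_prompt({"input": "Sort: pear apple", "target": "apple pear"})
#   doc.query       -> "Question: \nSort: pear apple\nAnswer:"
#   doc.choices     -> ["apple pear"]
#   doc.gold_index  -> 0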


def record_to_sample(record):
    query = record["input"]
    target = record["target"]

    return Sample(input=query, target=target)
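
# The same hypothetical record, mapped to an inspect-ai Sample instead:
#
#   record_to_sample({"input": "Sort: pear apple", "target": "apple pear"})
#   -> Sample(input="Sort: pear apple", target="apple pear")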


SYSTEM_MESSAGE = """Submit your answer in the following format:
ANSWER: {your answer}
"""


COMMON_TASK_ARGS = {
"prompt_function": bbeh_prompt,
"hf_repo": "jgyasu/bbeh",
"hf_avail_splits": ["train"],
"evaluation_splits": ["train"],
"few_shots_split": None,
"few_shots_select": None,
"generation_size": -1,
"metrics": [Metrics.loglikelihood_acc],
"stop_sequence": ["</s>", "Q=", "\n\n"],
"version": 0,
"sample_fields": record_to_sample,
"solver": [system_message(SYSTEM_MESSAGE), generate(cache=True)],
"scorer": answer(pattern="line"),
}
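
# These shared args cover both execution paths: prompt_function/metrics feed
# lighteval's native pipeline, while sample_fields/solver/scorer feed the
# inspect-ai backend. Each task config below only overrides the public task
# name and the HF subset it loads.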

boardgame_qa = LightevalTaskConfig(
name="bigbench_extra_hard:boardgame_qa",
hf_subset="boardgame_qa",
**COMMON_TASK_ARGS,
)

boolean_expressions = LightevalTaskConfig(
name="bigbench_extra_hard:boolean_expressions",
hf_subset="boolean_expressions",
**COMMON_TASK_ARGS,
)

buggy_tables = LightevalTaskConfig(
name="bigbench_extra_hard:buggy_tables",
hf_subset="buggy_tables",
**COMMON_TASK_ARGS,
)

causal_understanding = LightevalTaskConfig(
name="bigbench_extra_hard:causal_understanding",
hf_subset="causal_understanding",
**COMMON_TASK_ARGS,
)

disambiguation_qa = LightevalTaskConfig(
name="bigbench_extra_hard:disambiguation_qa",
hf_subset="disambiguation_qa",
**COMMON_TASK_ARGS,
)

dyck_languages = LightevalTaskConfig(
name="bigbench_extra_hard:dyck_languages",
hf_subset="dyck_languages",
**COMMON_TASK_ARGS,
)

geometric_shapes = LightevalTaskConfig(
name="bigbench_extra_hard:geometric_shapes",
hf_subset="geometric_shapes",
**COMMON_TASK_ARGS,
)

hyperbaton = LightevalTaskConfig(
name="bigbench_extra_hard:hyperbaton",
hf_subset="hyperbaton",
**COMMON_TASK_ARGS,
)

linguini = LightevalTaskConfig(
name="bigbench_extra_hard:linguini",
hf_subset="linguini",
**COMMON_TASK_ARGS,
)

movie_recommendation = LightevalTaskConfig(
name="bigbench_extra_hard:movie_recommendation",
hf_subset="movie_recommendation",
**COMMON_TASK_ARGS,
)

multistep_arithmetic = LightevalTaskConfig(
name="bigbench_extra_hard:multistep_arithmetic",
hf_subset="multistep_arithmetic",
**COMMON_TASK_ARGS,
)

nycc = LightevalTaskConfig(
name="bigbench_extra_hard:nycc",
hf_subset="nycc",
**COMMON_TASK_ARGS,
)

object_counting = LightevalTaskConfig(
name="bigbench_extra_hard:object_counting",
hf_subset="object_counting",
**COMMON_TASK_ARGS,
)

object_properties = LightevalTaskConfig(
name="bigbench_extra_hard:object_properties",
hf_subset="object_properties",
**COMMON_TASK_ARGS,
)

sarc_triples = LightevalTaskConfig(
name="bigbench_extra_hard:sarc_triples",
hf_subset="sarc_triples",
**COMMON_TASK_ARGS,
)

shuffled_objects = LightevalTaskConfig(
name="bigbench_extra_hard:shuffled_objects",
hf_subset="shuffled_objects",
**COMMON_TASK_ARGS,
)

spatial_reasoning = LightevalTaskConfig(
name="bigbench_extra_hard:spatial_reasoning",
hf_subset="spatial_reasoning",
**COMMON_TASK_ARGS,
)

sportqa = LightevalTaskConfig(
name="bigbench_extra_hard:sportqa",
hf_subset="sportqa",
**COMMON_TASK_ARGS,
)

temporal_sequence = LightevalTaskConfig(
name="bigbench_extra_hard:temporal_sequence",
hf_subset="temporal_sequence",
**COMMON_TASK_ARGS,
)

time_arithmetic = LightevalTaskConfig(
name="bigbench_extra_hard:time_arithmetic",
hf_subset="time_arithmetic",
**COMMON_TASK_ARGS,
)

web_of_lies = LightevalTaskConfig(
name="bigbench_extra_hard:web_of_lies",
hf_subset="web_of_lies",
**COMMON_TASK_ARGS,
)

word_sorting = LightevalTaskConfig(
name="bigbench_extra_hard:word_sorting",
hf_subset="word_sorting",
**COMMON_TASK_ARGS,
)

zebra_puzzles = LightevalTaskConfig(
name="bigbench_extra_hard:zebra_puzzles",
hf_subset="zebra_puzzles",
**COMMON_TASK_ARGS,
)

TASKS_TABLE = [
boardgame_qa,
boolean_expressions,
buggy_tables,
causal_understanding,
disambiguation_qa,
dyck_languages,
geometric_shapes,
hyperbaton,
linguini,
movie_recommendation,
multistep_arithmetic,
nycc,
object_counting,
object_properties,
sarc_triples,
shuffled_objects,
spatial_reasoning,
sportqa,
temporal_sequence,
time_arithmetic,
web_of_lies,
word_sorting,
zebra_puzzles,
]
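
# Quick sanity check for the registry above (illustrative, not part of the
# task-loading path): every config should point at the shared dataset repo
# and use a distinct subset.
if __name__ == "__main__":
    assert all(task.hf_repo == "jgyasu/bbeh" for task in TASKS_TABLE)
    assert len({task.hf_subset for task in TASKS_TABLE}) == len(TASKS_TABLE)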