
Commit 3a71f68

Add IFBench (#944)
* init, wip
* unrelated but these tasks were buggy
* better suite management: we don't load all optional deps all the time
* upgrade
* singleton + transformer sampling fix in config
* incredible how much code was just pulled from ifeval
* fix test 1
* fix test 2
* fix tests part 1 - also removes fewshot truncation in the task name because it's no longer used anywhere in the code logically
* fix registry mockup
* fixed last tests
1 parent 9ba430f commit 3a71f68

File tree: 6 files changed, +2765 −1 lines


pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -106,6 +106,7 @@ extended_tasks = [
     "langdetect", # ifeval
     "openai>1.87", # llm as a judge using openai models
     "tiktoken",
+    "emoji", "spacy", "syllapy" # ifbench
 ]
 s3 = ["s3fs"]
 multilingual = [
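The IFBench dependencies ride along with the existing extended_tasks extra. As a quick sanity check (illustrative only; the package names come from the list above, the snippet itself is not part of the commit), you can confirm the new optional packages resolve in your environment:

# Illustrative snippet, not part of the commit: check that the optional
# IFBench dependencies declared above are importable in this environment.
import importlib.util

for dep in ("emoji", "spacy", "syllapy"):
    status = "ok" if importlib.util.find_spec(dep) else "missing"
    print(f"{dep}: {status}")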

src/lighteval/tasks/extended/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -25,14 +25,15 @@

 if can_load_extended_tasks():
     import lighteval.tasks.extended.hle.main as hle
+    import lighteval.tasks.extended.ifbench.main as ifbench
     import lighteval.tasks.extended.ifeval.main as ifeval
     import lighteval.tasks.extended.lcb.main as lcb
     import lighteval.tasks.extended.mix_eval.main as mix_eval
     import lighteval.tasks.extended.mt_bench.main as mt_bench
     import lighteval.tasks.extended.olympiade_bench.main as olympiad_bench
     import lighteval.tasks.extended.tiny_benchmarks.main as tiny_benchmarks

-    AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, tiny_benchmarks, mt_bench, mix_eval, olympiad_bench, hle, lcb]
+    AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, ifbench, tiny_benchmarks, mt_bench, mix_eval, olympiad_bench, hle, lcb]

 else:
     AVAILABLE_EXTENDED_TASKS_MODULES = []
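The can_load_extended_tasks() guard is what the commit message's "we don't load all optional deps all the time" refers to: the ifbench module is only imported when the optional extras are installed. The sketch below is not lighteval's actual implementation, just a minimal illustration of how such a guard can probe for the optional packages declared in pyproject.toml without importing them:

# Hypothetical sketch, assuming the guard only needs to probe for the optional
# extended-task dependencies; this is not lighteval's real can_load_extended_tasks().
import importlib.util

_OPTIONAL_EXTENDED_DEPS = ("langdetect", "openai", "tiktoken", "emoji", "spacy", "syllapy")

def can_load_extended_tasks() -> bool:
    return all(importlib.util.find_spec(dep) is not None for dep in _OPTIONAL_EXTENDED_DEPS)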
Lines changed: 216 additions & 0 deletions
@@ -0,0 +1,216 @@
# coding=utf-8
# Copyright 2025 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Binary of evaluating instruction following. See README.md."""

import collections
import dataclasses
import json
from typing import Dict, Optional, Union

import lighteval.tasks.extended.ifbench.instructions_registry as instructions_registry


@dataclasses.dataclass
class InputExample:
    key: int
    instruction_id_list: list[str]
    prompt: str
    kwargs: list[Dict[str, Optional[Union[str, int]]]]


@dataclasses.dataclass
class OutputExample:
    instruction_id_list: list[str]
    prompt: str
    response: str
    follow_all_instructions: bool
    follow_instruction_list: list[bool]


def read_prompt_list(input_jsonl_filename):
    """Reads inputs from jsonl."""
    inputs = []
    with open(input_jsonl_filename, "r") as f:
        for line in f:
            example = json.loads(line)
            inputs.append(
                InputExample(
                    key=example["key"],
                    instruction_id_list=example["instruction_id_list"],
                    prompt=example["prompt"],
                    kwargs=example["kwargs"],
                )
            )
    return inputs


def write_outputs(output_jsonl_filename, outputs):
    """Writes outputs to jsonl."""
    assert outputs
    with open(output_jsonl_filename, "w") as f:
        for o in outputs:
            f.write(
                json.dumps(
                    {
                        attr_name: o.__getattribute__(attr_name)
                        for attr_name in [name for name in dir(o) if not name.startswith("_")]
                    }
                )
            )
            f.write("\n")


def test_instruction_following_strict(
    inp,
    prompt_to_response,
):
    """Tests response to see if instructions are followed."""
    response = prompt_to_response[inp.prompt]
    instruction_list = inp.instruction_id_list
    is_following_list = []

    for index, instruction_id in enumerate(instruction_list):
        instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id]
        instruction = instruction_cls(instruction_id)
        inp.kwargs[index] = {key: value for key, value in inp.kwargs[index].items() if value is not None}
        instruction.build_description(**inp.kwargs[index])
        args = instruction.get_instruction_args()
        if args and "prompt" in args:
            instruction.build_description(prompt=inp.prompt)

        if response.strip() and instruction.check_following(response):
            is_following_list.append(True)
        else:
            is_following_list.append(False)

    return OutputExample(
        instruction_id_list=inp.instruction_id_list,
        prompt=inp.prompt,
        response=response,
        follow_all_instructions=all(is_following_list),
        follow_instruction_list=is_following_list,
    )


def test_instruction_following_loose(
    inp,
    prompt_to_response,
):
    """Tests response for an upper bound for following instructions."""
    response = prompt_to_response[inp.prompt]
    r = response.split("\n")
    response_remove_first = "\n".join(r[1:]).strip()
    response_remove_last = "\n".join(r[:-1]).strip()
    response_remove_both = "\n".join(r[1:-1]).strip()
    revised_response = response.replace("*", "")
    revised_response_remove_first = response_remove_first.replace("*", "")
    revised_response_remove_last = response_remove_last.replace("*", "")
    revised_response_remove_both = response_remove_both.replace("*", "")
    all_responses = [
        response,
        revised_response,
        response_remove_first,
        response_remove_last,
        response_remove_both,
        revised_response_remove_first,
        revised_response_remove_last,
        revised_response_remove_both,
    ]
    instruction_list = inp.instruction_id_list
    is_following_list = []

    for index, instruction_id in enumerate(instruction_list):
        instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id]
        instruction = instruction_cls(instruction_id)

        instruction.build_description(**inp.kwargs[index])
        args = instruction.get_instruction_args()
        if args and "prompt" in args:
            instruction.build_description(prompt=inp.prompt)

        is_following = False
        for r in all_responses:
            if r.strip() and instruction.check_following(r):
                is_following = True
                break

        is_following_list.append(is_following)

    return OutputExample(
        instruction_id_list=inp.instruction_id_list,
        prompt=inp.prompt,
        response=response,
        follow_all_instructions=all(is_following_list),
        follow_instruction_list=is_following_list,
    )


def read_prompt_to_response_dict(input_jsonl_filename):
    """Creates dictionary matching prompt and response."""
    return_dict = {}
    with open(input_jsonl_filename, "r") as file:
        for line in file:
            example = json.loads(line)
            return_dict[example["prompt"]] = example["response"]
    return return_dict


def print_report(outputs):
    """Prints a report on accuracy scores."""

    prompt_total = 0
    prompt_correct = 0
    instruction_total = 0
    instruction_correct = 0

    tier0_total = collections.defaultdict(int)
    tier0_correct = collections.defaultdict(int)

    tier1_total = collections.defaultdict(int)
    tier1_correct = collections.defaultdict(int)

    for example in outputs:
        follow_instruction_list = example.follow_instruction_list
        instruction_id_list = example.instruction_id_list

        prompt_total += 1
        if all(follow_instruction_list):
            prompt_correct += 1

        instruction_total += len(instruction_id_list)
        instruction_correct += sum(follow_instruction_list)

        for instruction_id, followed_or_not in zip(instruction_id_list, follow_instruction_list):
            instruction_id = instruction_id.split(":")[0]
            tier0_total[instruction_id] += 1
            if followed_or_not:
                tier0_correct[instruction_id] += 1

        for instruction_id, followed_or_not in zip(instruction_id_list, follow_instruction_list):
            tier1_total[instruction_id] += 1
            if followed_or_not:
                tier1_correct[instruction_id] += 1

    print(f"prompt-level: {prompt_correct / prompt_total}")
    print(f"instruction-level: {instruction_correct / instruction_total}")
    print()
    for instruction_id in sorted(tier0_total.keys()):
        accuracy = tier0_correct[instruction_id] / tier0_total[instruction_id]
        print(f"{instruction_id} {accuracy}")
    print()
    for instruction_id in sorted(tier1_total.keys()):
        accuracy = tier1_correct[instruction_id] / tier1_total[instruction_id]
        print(f"{instruction_id} {accuracy}")