Commit b5cbd91

fix lighteval task inspect command and tiny bench task (#992)

* fix
* revert unneeded changes

Co-authored-by: Clémentine Fourrier <[email protected]>

1 parent: 5aa09c5

File tree (4 files changed: +35 −35 lines)

  src/lighteval/main_tasks.py
  src/lighteval/metrics/metrics_corpus.py
  src/lighteval/tasks/tasks/ifbench/instructions.py
  src/lighteval/tasks/tasks/tiny_benchmarks/main.py

src/lighteval/main_tasks.py (3 additions, 2 deletions)

@@ -34,6 +34,7 @@
 @app.command()
 def inspect(
     tasks: Annotated[str, Argument(help="Id of tasks or path to a text file with a list of tasks")],
+    load_multilingual: Annotated[bool, Option(help="Whether to load multilingual tasks")] = False,
     custom_tasks: custom_tasks.type = custom_tasks.default,
     num_samples: Annotated[int, Option(help="Number of samples to display")] = 10,
     show_config: Annotated[bool, Option(help="Will display the full task config")] = False,
@@ -46,15 +47,15 @@ def inspect(

     from lighteval.tasks.registry import Registry

-    registry = Registry(custom_tasks=custom_tasks, load_multilingual=True)
+    registry = Registry(tasks=tasks, custom_tasks=custom_tasks, load_multilingual=load_multilingual)

     # Loading task
     task_dict = registry.load_tasks()
     for name, task in task_dict.items():
         print("-" * 10, name, "-" * 10)
         if show_config:
             print("-" * 10, "CONFIG")
-            task.cfg.print()
+            task.config.print()
         for ix, sample in enumerate(task.eval_docs()[: int(num_samples)]):
             if ix == 0:
                 print("-" * 10, "SAMPLES")

src/lighteval/metrics/metrics_corpus.py (1 addition, 1 deletion)

@@ -47,7 +47,7 @@

 class CorpusLevelComputation(ABC):
     @abstractmethod
-    def compute_corpus(self):
+    def compute_corpus(self, items):
         raise NotImplementedError

     def __str__(self):
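
The abstract method now declares the `items` argument that concrete corpus-level metrics (including the TinyCorpusAggregator changed below) already receive. A hypothetical, self-contained subclass sketch, only to illustrate the updated signature:

    from abc import ABC, abstractmethod

    class CorpusLevelComputation(ABC):  # re-declared locally for the sketch
        @abstractmethod
        def compute_corpus(self, items):
            raise NotImplementedError

    class MeanScore(CorpusLevelComputation):  # hypothetical example class
        def compute_corpus(self, items):
            # `items` is the list of per-sample results gathered over the corpus.
            return sum(items) / len(items)

    print(MeanScore().compute_corpus([1.0, 0.0, 1.0]))  # 0.666...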

src/lighteval/tasks/tasks/ifbench/instructions.py (1 addition, 1 deletion)

@@ -790,7 +790,7 @@ def check_following(self, value):
         """Checks if the response only includes words with prime length."""
         value = value.translate(str.maketrans("", "", string.punctuation))
         words = value.split()
-        primes = set(2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97)
+        primes = {2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97}
         for word in words:
             if len(word) not in primes:
                 return False
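
Why the one-character fix matters: set() takes at most one iterable argument, so the old call raised a TypeError at runtime instead of building the set of prime word lengths. A quick self-contained demonstration:

    try:
        set(2, 3, 5)  # old form: TypeError, set expected at most 1 argument
    except TypeError as err:
        print(err)

    primes = {2, 3, 5}  # new form: a set literal
    assert primes == set([2, 3, 5])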

src/lighteval/tasks/tasks/tiny_benchmarks/main.py (30 additions, 31 deletions)

@@ -27,16 +27,16 @@

 import numpy as np
 import requests
-from aenum import extend_enum
 from scipy.optimize import minimize

 import lighteval.tasks.default_prompts as prompt
-from lighteval.metrics.metrics import CorpusLevelMetricGrouping, Metrics
+from lighteval.metrics.metrics import CorpusLevelMetricGrouping
 from lighteval.metrics.metrics_corpus import CorpusLevelComputation
 from lighteval.metrics.metrics_sample import ExactMatches, LoglikelihoodAcc, SampleLevelComputation
 from lighteval.metrics.normalizations import gsm8k_normalizer
+from lighteval.models.model_output import ModelResponse
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.tasks.requests import SamplingMethod
+from lighteval.tasks.requests import Doc, SamplingMethod


 # Utility functions
@@ -96,18 +96,18 @@ def download(self):
         with open(path_dld, "wb") as file:
             file.write(response.content)

-    def compute(self, **args):
+    def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> float:
         if self.task == "gsm8k":
             res = ExactMatches(
                 strip_strings=True, normalize_pred=gsm8k_normalizer, normalize_gold=gsm8k_normalizer
-            ).compute(**args)
+            ).compute(doc, model_response, **kwargs)
             return dict.fromkeys(self.METRICS, res)
         else:
-            res = LoglikelihoodAcc().compute(**args)
+            res = LoglikelihoodAcc().compute(doc, model_response, **kwargs)
             return dict.fromkeys(self.METRICS, res)

-    def compute_corpus(self, y_input):
-        if len(y_input) == self.num_samples and self.estimates is not None:
+    def compute_corpus(self, items):
+        if len(items) == self.num_samples and self.estimates is not None:
             return self.estimates[self.task]

         # We load the weights for the relevant examples
@@ -144,7 +144,7 @@ def compute_corpus(self, y_input):
         # Creating vector y and estimating theta
         y = np.zeros(N)
         for i, j in enumerate(seen_examples):
-            y[j] = y_input[i]
+            y[j] = items[i]

         # Getting estimates
         theta = fit_theta(y, seen_examples, A, B)
@@ -170,7 +170,7 @@ def compute_corpus(self, y_input):
             estimates[scenario]["pirt"] = IRTp
             estimates[scenario]["gpirt"] = IRTpp

-        self.num_samples = len(y_input)
+        self.num_samples = len(items)
         self.estimates = estimates

         return estimates[self.task]
@@ -233,6 +233,25 @@ def compute_corpus(self, y_input):
     # },
 ]

+metrics = {}
+
+for task_param in task_params:
+    name = task_param["name"]
+    if name == "gsm8k":
+        category = SamplingMethod.GENERATIVE
+    else:
+        category = SamplingMethod.LOGPROBS
+
+    metrics[f"tinybench_metric_{name}"] = (
+        CorpusLevelMetricGrouping(
+            metric_name=TinyCorpusAggregator.METRICS,
+            higher_is_better=dict.fromkeys(TinyCorpusAggregator.METRICS, True),
+            sample_level_fn=TinyCorpusAggregator(name),
+            category=category,
+            corpus_level_fn=TinyCorpusAggregator(name),
+        ),
+    )
+
 TASKS_TABLE = []
 for task in task_params:
     name = task["name"]
@@ -251,28 +270,8 @@ def compute_corpus(self, y_input):
         evaluation_splits=task["evaluation_split"],
         few_shots_split=None,
         few_shots_select="random_sampling",
-        metrics=[f"tinybench_metric_{name}"],
+        metrics=metrics[f"tinybench_metric_{name}"],
         generation_size=generation_size,
         stop_sequence=stop_sequence,
     )
     TASKS_TABLE.append(task)
-
-
-# CUSTOM METRIC
-for task_param in task_params:
-    name = task_param["name"]
-    if name == "gsm8k":
-        category = SamplingMethod.GENERATIVE
-    else:
-        category = SamplingMethod.LOGPROBS
-
-    extend_enum(
-        Metrics,
-        f"tinybench_metric_{name}",
-        CorpusLevelMetricGrouping(
-            metric_name=TinyCorpusAggregator.METRICS,
-            higher_is_better=dict.fromkeys(TinyCorpusAggregator.METRICS, True),
-            sample_level_fn=TinyCorpusAggregator(name),
-            category=category,
-            corpus_level_fn=TinyCorpusAggregator(name),
-        ),
-    )
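
The registration pattern changed here: instead of mutating the global Metrics enum with aenum.extend_enum and referring to each metric by string name, the metric objects are now built up front in a plain dict and passed directly to each LightevalTaskConfig. A simplified, self-contained sketch of that pattern, using a hypothetical stand-in for CorpusLevelMetricGrouping and hypothetical task names:

    def make_metric(name: str) -> dict:
        # hypothetical stand-in for CorpusLevelMetricGrouping(...)
        return {"metric_name": f"tinybench_metric_{name}"}

    # Build every metric object once, keyed by name ...
    metrics = {f"tinybench_metric_{n}": make_metric(n) for n in ("gsm8k", "winogrande")}

    # ... then hand the object itself to the task config, rather than a string
    # that a global enum registry must resolve later.
    task_metrics = metrics["tinybench_metric_gsm8k"]
    print(task_metrics)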
