Skip to content

Commit cabef7c

Browse files
authored
Fixes wikitext prompts + some patches on tg models (#64)
- fix prompt for helm wikitext
- fix perplexity metric to fit the harness
- fix small bug in tgi models
- allow to skip docs if empty
- make each wikitext task more explicit in what it's doing
1 parent acffc1a commit cabef7c

File tree

9 files changed

+35
-23
lines changed

9 files changed

+35
-23
lines changed

src/lighteval/logging/info_loggers.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import numpy as np
99
import xxhash
1010

11-
from lighteval.logging.hierarchical_logger import hlog, hlog_warn
11+
from lighteval.logging.hierarchical_logger import hlog_warn
1212
from lighteval.metrics import MetricCategory
1313
from lighteval.metrics.stderr import get_stderr_function
1414
from lighteval.models.model_loader import ModelInfo
@@ -440,7 +440,7 @@ def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int =
440440
try:
441441
metric_result = task.aggregation()[metric_name](metric_values)
442442
except OverflowError:
443-
hlog(f"{task_name} {metric_name} OVERFLOW ERROR")
443+
hlog_warn(f"{task_name}, {metric_name} got an OVERFLOW ERROR when aggregating.")
444444
metric_result = float("nan")
445445

446446
if isinstance(metric_result, dict): # in which cases do we get a dict here?

src/lighteval/metrics/metrics_corpus.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,6 @@ def compute(self, items: list[PerplexityCorpusMetricInput]):
111111
if self.metric_type == "perplexity":
112112
return math.exp(-np.mean(logprobs))
113113
if self.metric_type == "weighted_perplexity":
114-
return math.exp(-np.average(logprobs, weights=weights))
114+
return math.exp(-sum(logprobs) / sum(weights))
115115
if self.metric_type == "bits_per_byte":
116-
return -np.average(logprobs, weights=weights) / math.log(2)
116+
return -sum(logprobs) / sum(weights) * 1 / math.log(2)

src/lighteval/models/endpoint_model.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -288,11 +288,10 @@ def loglikelihood(
288288
responses = self.__process_batch_logprob(batch)
289289
for ix, response in enumerate(responses):
290290
len_choice = len(batch[ix].tokenized_continuation)
291+
logits = [t.logprob for t in response.details.prefill[-len_choice:] if t.logprob is not None]
291292
results.append(
292293
LoglikelihoodReturn(
293-
result=[
294-
t.logprob for t in response.details.prefill[-len_choice:] if t.logprob is not None
295-
],
294+
result=sum(logits),
296295
input_tokens=[t.id for t in response.details.prefill[:-len_choice]],
297296
generated_tokens=[t.id for t in response.details.prefill[-len_choice:]],
298297
truncated_tokens_count=-1,
@@ -329,9 +328,10 @@ def loglikelihood_rolling(
329328
else:
330329
responses = self.__process_batch_logprob(batch, rolling=True)
331330
for response in responses:
331+
logits = [t.logprob for t in response.details.tokens[:-1]]
332332
results.append(
333333
LoglikelihoodReturn(
334-
result=[t.logprob for t in response.details.tokens[:-1]],
334+
result=sum(logits),
335335
input_tokens=[t.id for t in response.details.prefill],
336336
generated_tokens=[t.id for t in response.details.tokens[:-1]],
337337
truncated_tokens_count=-1,

src/lighteval/models/model_output.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66

77
@dataclass
8-
class ModelReturn: # @clefourrier: could probably an abstract class, but it might make the code too complex
8+
class ModelReturn:
99
result: Union[tuple, list, str]
1010
input_tokens: list[int] = field(default_factory=list) # model inputs
1111
generated_tokens: list[int] = field(default_factory=list) # model generations

src/lighteval/tasks/lighteval_task.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -261,7 +261,10 @@ def _get_docs_from_split(self, splits: list[str], few_shots=False) -> list[Doc]:
261261
# vs when it's used for the actual prompt. That's why we store whether we are currently using the
262262
# doc for a fewshot sample (few_shots=True) or not, which then leads to the creation of a different Doc.
263263
item["__few_shots"] = few_shots
264-
docs.extend(as_list(self.formatter(item, self.name)))
264+
cur_docs = self.formatter(item, self.name)
265+
if cur_docs is None:
266+
continue
267+
docs.extend(as_list(cur_docs))
265268
return docs
266269

267270
def fewshot_docs(self) -> list[Doc]:
@@ -375,7 +378,9 @@ def construct_requests(
375378
]
376379
if self.has_metric_category[MetricCategory.PERPLEXITY]:
377380
requests[RequestType.LOGLIKELIHOOD_ROLLING] += [
378-
LoglikelihoodRollingRequest(task_name=current_task_name, doc_id=document_id_seed, ctx=context)
381+
LoglikelihoodRollingRequest(
382+
task_name=current_task_name, example_index=document_id_seed, request_index=0, context=context
383+
)
379384
]
380385
if self.has_metric_category[MetricCategory.GENERATIVE]:
381386
requests[RequestType.GREEDY_UNTIL] += [

src/lighteval/tasks/registry.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
# original is the reimplementation of original evals
1717
# custom is to play around
18-
DEFAULT_SUITES = ["helm", "bigbench", "lighteval", "original", "custom", "community"]
18+
DEFAULT_SUITES = ["helm", "bigbench", "harness", "lighteval", "original", "custom", "community"]
1919

2020
TRUNCATE_FEW_SHOTS_DEFAULTS = True
2121

src/lighteval/tasks/tasks_prompt_formatting.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2017,7 +2017,17 @@ def wic(line, task_name: str = None):
20172017
)
20182018

20192019

2020-
def wikitext(line, task_name: str = None): # perplexity metric
2020+
def wikifact(line, task_name: str = None):
2021+
return Doc(task_name=task_name, query=f"{line['question']} ", gold_index=0, choices=[line["references"]])
2022+
2023+
2024+
def wikitext(line, task_name: str = None):
2025+
if line["text"] == "" or line["text"][0] == "=":
2026+
return None
2027+
return Doc(task_name=task_name, query=f"{line['text']} ", gold_index=0, choices=None)
2028+
2029+
2030+
def wikitext_harness(line, task_name: str = None): # perplexity metric
20212031
def wikitext_detokenizer(cur_string):
20222032
# contractions
20232033
cur_string = cur_string.replace("s '", "s'")
@@ -2060,12 +2070,8 @@ def wikitext_detokenizer(cur_string):
20602070
)
20612071

20622072

2063-
def wikifact(line, task_name: str = None):
2064-
return Doc(task_name=task_name, query=f"{line['question']} ", gold_index=0, choices=[line["references"]])
2065-
2066-
2067-
def wikitext_103(line, task_name: str = None):
2068-
return Doc(task_name=task_name, query=line["text"])
2073+
def wikitext_helm(line, task_name: str = None):
2074+
return Doc(task_name=task_name, choices=[""], gold_index=0, query=line["page"])
20692075

20702076

20712077
def winogrande(line, task_name: str = None):

src/lighteval/tasks/tasks_table.jsonl

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -939,8 +939,9 @@
939939
{"name":"wikifact:time_of_discovery_or_invention","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"time_of_discovery_or_invention","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false}
940940
{"name":"wikifact:twinned_administrative_body","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"twinned_administrative_body","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false}
941941
{"name":"wikifact:work_location","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"work_location","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false}
942-
{"name":"wikitext","suite":["lighteval"],"prompt_function":"wikitext","hf_repo":"EleutherAI\/wikitext_document_level","hf_subset":"wikitext-2-raw-v1","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false}
943-
{"name":"wikitext:103","suite":["helm"],"prompt_function":"wikitext_103","hf_repo":"lighteval\/wikitext_103","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false}
942+
{"name":"wikitext:2","suite":["lighteval"],"prompt_function":"wikitext","hf_repo":"wikitext","hf_subset":"wikitext-2-raw-v1","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false}
943+
{"name":"wikitext:103:document_level","suite":["harness"],"prompt_function":"wikitext_harness","hf_repo":"EleutherAI\/wikitext_document_level","hf_subset":"wikitext-103-raw-v1","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false}
944+
{"name":"wikitext:103:document_level","suite":["helm"],"prompt_function":"wikitext_helm","hf_repo":"EleutherAI\/wikitext_document_level","hf_subset":"wikitext-103-raw-v1","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false}
944945
{"name":"wino_x_german","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"wino_x_german","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false}
945946
{"name":"winogrande","suite":["lighteval"],"prompt_function":"winogrande","hf_repo":"winogrande","hf_subset":"winogrande_xl","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false}
946947
{"name":"winowhy","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_whitespace_after_query","hf_repo":"bigbench","hf_subset":"winowhy","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false}
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:65d1d1bfb86c291b80e4117845385c6af0d965dc0af5e7a05774c820b772ba98
3-
size 20246116
2+
oid sha256:02a5551e1137c799c9a1535112d221c7a77fd07b72c2b38b640164be7ea70828
3+
size 20246141

0 commit comments

Comments (0)