Skip to content

Commit cabef7c

Browse files
authored
Fixes wikitext prompts + some patches on tg models (#64)
- fix prompt for helm wikitext
- fix perplexity metric to fit the harness
- fix small bug in tgi models
- allow to skip docs if empty
- make each wikitext task more explicit in what it's doing
1 parent acffc1a commit cabef7c

File tree

9 files changed

+35
-23
lines changed

9 files changed

+35
-23
lines changed

src/lighteval/logging/info_loggers.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import numpy as np
99
import xxhash
1010

11-
from lighteval.logging.hierarchical_logger import hlog, hlog_warn
11+
from lighteval.logging.hierarchical_logger import hlog_warn
1212
from lighteval.metrics import MetricCategory
1313
from lighteval.metrics.stderr import get_stderr_function
1414
from lighteval.models.model_loader import ModelInfo
@@ -440,7 +440,7 @@ def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int =
440440
try:
441441
metric_result = task.aggregation()[metric_name](metric_values)
442442
except OverflowError:
443-
hlog(f"{task_name} {metric_name} OVERFLOW ERROR")
443+
hlog_warn(f"{task_name}, {metric_name} got an OVERFLOW ERROR when aggregating.")
444444
metric_result = float("nan")
445445

446446
if isinstance(metric_result, dict): # in which cases do we get a dict here?

src/lighteval/metrics/metrics_corpus.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,6 @@ def compute(self, items: list[PerplexityCorpusMetricInput]):
111111
if self.metric_type == "perplexity":
112112
return math.exp(-np.mean(logprobs))
113113
if self.metric_type == "weighted_perplexity":
114-
return math.exp(-np.average(logprobs, weights=weights))
114+
return math.exp(-sum(logprobs) / sum(weights))
115115
if self.metric_type == "bits_per_byte":
116-
return -np.average(logprobs, weights=weights) / math.log(2)
116+
return -sum(logprobs) / sum(weights) * 1 / math.log(2)

src/lighteval/models/endpoint_model.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -288,11 +288,10 @@ def loglikelihood(
288288
responses = self.__process_batch_logprob(batch)
289289
for ix, response in enumerate(responses):
290290
len_choice = len(batch[ix].tokenized_continuation)
291+
logits = [t.logprob for t in response.details.prefill[-len_choice:] if t.logprob is not None]
291292
results.append(
292293
LoglikelihoodReturn(
293-
result=[
294-
t.logprob for t in response.details.prefill[-len_choice:] if t.logprob is not None
295-
],
294+
result=sum(logits),
296295
input_tokens=[t.id for t in response.details.prefill[:-len_choice]],
297296
generated_tokens=[t.id for t in response.details.prefill[-len_choice:]],
298297
truncated_tokens_count=-1,
@@ -329,9 +328,10 @@ def loglikelihood_rolling(
329328
else:
330329
responses = self.__process_batch_logprob(batch, rolling=True)
331330
for response in responses:
331+
logits = [t.logprob for t in response.details.tokens[:-1]]
332332
results.append(
333333
LoglikelihoodReturn(
334-
result=[t.logprob for t in response.details.tokens[:-1]],
334+
result=sum(logits),
335335
input_tokens=[t.id for t in response.details.prefill],
336336
generated_tokens=[t.id for t in response.details.tokens[:-1]],
337337
truncated_tokens_count=-1,

src/lighteval/models/model_output.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66

77
@dataclass
8-
class ModelReturn: # @clefourrier: could probably an abstract class, but it might make the code too complex
8+
class ModelReturn:
99
result: Union[tuple, list, str]
1010
input_tokens: list[int] = field(default_factory=list) # model inputs
1111
generated_tokens: list[int] = field(default_factory=list) # model generations

src/lighteval/tasks/lighteval_task.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -261,7 +261,10 @@ def _get_docs_from_split(self, splits: list[str], few_shots=False) -> list[Doc]:
261261
# vs when it's used for the actual prompt. That's why we store whether we are currently using the
262262
# doc for a fewshot sample (few_shots=True) or not, which then leads to the creation of a different Doc.
263263
item["__few_shots"] = few_shots
264-
docs.extend(as_list(self.formatter(item, self.name)))
264+
cur_docs = self.formatter(item, self.name)
265+
if cur_docs is None:
266+
continue
267+
docs.extend(as_list(cur_docs))
265268
return docs
266269

267270
def fewshot_docs(self) -> list[Doc]:
@@ -375,7 +378,9 @@ def construct_requests(
375378
]
376379
if self.has_metric_category[MetricCategory.PERPLEXITY]:
377380
requests[RequestType.LOGLIKELIHOOD_ROLLING] += [
378-
LoglikelihoodRollingRequest(task_name=current_task_name, doc_id=document_id_seed, ctx=context)
381+
LoglikelihoodRollingRequest(
382+
task_name=current_task_name, example_index=document_id_seed, request_index=0, context=context
383+
)
379384
]
380385
if self.has_metric_category[MetricCategory.GENERATIVE]:
381386
requests[RequestType.GREEDY_UNTIL] += [

src/lighteval/tasks/registry.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
# original is the reimplementation of original evals
1717
# custom is to play around
18-
DEFAULT_SUITES = ["helm", "bigbench", "lighteval", "original", "custom", "community"]
18+
DEFAULT_SUITES = ["helm", "bigbench", "harness", "lighteval", "original", "custom", "community"]
1919

2020
TRUNCATE_FEW_SHOTS_DEFAULTS = True
2121

src/lighteval/tasks/tasks_prompt_formatting.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2017,7 +2017,17 @@ def wic(line, task_name: str = None):
20172017
)
20182018

20192019

2020-
def wikitext(line, task_name: str = None): # perplexity metric
2020+
def wikifact(line, task_name: str = None):
2021+
return Doc(task_name=task_name, query=f"{line['question']} ", gold_index=0, choices=[line["references"]])
2022+
2023+
2024+
def wikitext(line, task_name: str = None):
2025+
if line["text"] == "" or line["text"][0] == "=":
2026+
return None
2027+
return Doc(task_name=task_name, query=f"{line['text']} ", gold_index=0, choices=None)
2028+
2029+
2030+
def wikitext_harness(line, task_name: str = None): # perplexity metric
20212031
def wikitext_detokenizer(cur_string):
20222032
# contractions
20232033
cur_string = cur_string.replace("s '", "s'")
@@ -2060,12 +2070,8 @@ def wikitext_detokenizer(cur_string):
20602070
)
20612071

20622072

2063-
def wikifact(line, task_name: str = None):
2064-
return Doc(task_name=task_name, query=f"{line['question']} ", gold_index=0, choices=[line["references"]])
2065-
2066-
2067-
def wikitext_103(line, task_name: str = None):
2068-
return Doc(task_name=task_name, query=line["text"])
2073+
def wikitext_helm(line, task_name: str = None):
2074+
return Doc(task_name=task_name, choices=[""], gold_index=0, query=line["page"])
20692075

20702076

20712077
def winogrande(line, task_name: str = None):

src/lighteval/tasks/tasks_table.jsonl

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -939,8 +939,9 @@
939939
{"name":"wikifact:time_of_discovery_or_invention","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"time_of_discovery_or_invention","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false}
940940
{"name":"wikifact:twinned_administrative_body","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"twinned_administrative_body","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false}
941941
{"name":"wikifact:work_location","suite":["helm"],"prompt_function":"wikifact","hf_repo":"lighteval\/wikifact","hf_subset":"work_location","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":8,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false}
942-
{"name":"wikitext","suite":["lighteval"],"prompt_function":"wikitext","hf_repo":"EleutherAI\/wikitext_document_level","hf_subset":"wikitext-2-raw-v1","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false}
943-
{"name":"wikitext:103","suite":["helm"],"prompt_function":"wikitext_103","hf_repo":"lighteval\/wikitext_103","hf_subset":"default","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false}
942+
{"name":"wikitext:2","suite":["lighteval"],"prompt_function":"wikitext","hf_repo":"wikitext","hf_subset":"wikitext-2-raw-v1","hf_avail_splits":["train","validation","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false}
943+
{"name":"wikitext:103:document_level","suite":["harness"],"prompt_function":"wikitext_harness","hf_repo":"EleutherAI\/wikitext_document_level","hf_subset":"wikitext-103-raw-v1","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false}
944+
{"name":"wikitext:103:document_level","suite":["helm"],"prompt_function":"wikitext_helm","hf_repo":"EleutherAI\/wikitext_document_level","hf_subset":"wikitext-103-raw-v1","hf_avail_splits":["train","test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["word_perplexity","byte_perplexity","bits_per_byte"],"stop_sequence":["\n"],"output_regex":null,"frozen":false}
944945
{"name":"wino_x_german","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"wino_x_german","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false}
945946
{"name":"winogrande","suite":["lighteval"],"prompt_function":"winogrande","hf_repo":"winogrande","hf_subset":"winogrande_xl","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation"],"few_shots_split":null,"few_shots_select":"random_sampling","generation_size":-1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false}
946947
{"name":"winowhy","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_whitespace_after_query","hf_repo":"bigbench","hf_subset":"winowhy","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["loglikelihood_acc"],"stop_sequence":["\n"],"output_regex":null,"frozen":false}
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:65d1d1bfb86c291b80e4117845385c6af0d965dc0af5e7a05774c820b772ba98
3-
size 20246116
2+
oid sha256:02a5551e1137c799c9a1535112d221c7a77fd07b72c2b38b640164be7ea70828
3+
size 20246141

0 commit comments

Comments (0)