EleutherAI
diff --git a/‎lm_eval/tasks/README.md‎
Lines changed: 1 addition & 0 deletions b/‎lm_eval/tasks/README.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎lm_eval/tasks/uncheatable_eval/README.md‎
Lines changed: 48 additions & 0 deletions b/‎lm_eval/tasks/uncheatable_eval/README.md‎
Lines changed: 48 additions & 0 deletions
diff --git a/‎lm_eval/tasks/uncheatable_eval/_uncheatable_eval_base.yaml‎
Lines changed: 22 additions & 0 deletions b/‎lm_eval/tasks/uncheatable_eval/_uncheatable_eval_base.yaml‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎lm_eval/tasks/uncheatable_eval/uncheatable_eval.yaml‎
Lines changed: 22 additions & 0 deletions b/‎lm_eval/tasks/uncheatable_eval/uncheatable_eval.yaml‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎lm_eval/tasks/uncheatable_eval/uncheatable_eval_ao3_chinese.yaml‎
Lines changed: 9 additions & 0 deletions b/‎lm_eval/tasks/uncheatable_eval/uncheatable_eval_ao3_chinese.yaml‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎lm_eval/tasks/uncheatable_eval/uncheatable_eval_ao3_english.yaml‎
Lines changed: 9 additions & 0 deletions b/‎lm_eval/tasks/uncheatable_eval/uncheatable_eval_ao3_english.yaml‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎lm_eval/tasks/uncheatable_eval/uncheatable_eval_arxiv_computer_science.yaml‎
Lines changed: 9 additions & 0 deletions b/‎lm_eval/tasks/uncheatable_eval/uncheatable_eval_arxiv_computer_science.yaml‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎lm_eval/tasks/uncheatable_eval/uncheatable_eval_arxiv_physics.yaml‎
Lines changed: 9 additions & 0 deletions b/‎lm_eval/tasks/uncheatable_eval/uncheatable_eval_arxiv_physics.yaml‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎lm_eval/tasks/uncheatable_eval/uncheatable_eval_bbc_news.yaml‎
Lines changed: 9 additions & 0 deletions b/‎lm_eval/tasks/uncheatable_eval/uncheatable_eval_bbc_news.yaml‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎lm_eval/tasks/uncheatable_eval/uncheatable_eval_full.yaml‎
Lines changed: 29 additions & 0 deletions b/‎lm_eval/tasks/uncheatable_eval/uncheatable_eval_full.yaml‎
Lines changed: 29 additions & 0 deletions
@@ -179,6 +179,7 @@ provided to the individual README.md files for each subfolder.
 | [truthfulqa-multi](truthfulqa-multi/README.md)                           | Is a multilingual version of TruthfulQA, a QA task aimed at evaluating the truthfulness and factual accuracy of model responses.                                                                                                                                                                                                       | English, Spanish, Catalan, Basque, Galician                                                                                                                                                                                                                   |
 | [turkishmmlu](turkishmmlu/README.md)                                     | A multiple-choice QA test modeled after MMLU, written in Turkish based on Turkish high-school level exams.                                                                                                                                                                                                                             | Turkish                                                                                                                                                                                                                                                       |
 | [turblimp_core](turblimp/README.md)                                      | A benchmark evaluating language models' grammatical capabilities in Turkish based on comparing the probabilities of minimal pairs of grammatical and ungrammatical sentences.                                                                                                                                                          | Turkish                                                                                                                                                                                                                                                       |
+| [uncheatable_eval](uncheatable_eval/README.md)                           | Rolling perplexity benchmark built from Uncheatable Eval dumps covering Wikipedia, GitHub, BBC, arXiv, and AO3 domains scraped after mid-2024.                                                                                                                                                                                         | English, Spanish, French, German, Japanese, Arabic, Chinese, Python, C++                                                                                                                                                                                       |
 | [unitxt](unitxt/README.md)                                               | A number of tasks implemented using the unitxt library for flexible, shareable, and reusable data preparation and evaluation for generative AI.                                                                                                                                                                                        | English                                                                                                                                                                                                                                                       |
 | [unscramble](unscramble/README.md)                                       | Tasks involving the rearrangement of scrambled sentences to test syntactic understanding.                                                                                                                                                                                                                                              | English                                                                                                                                                                                                                                                       |
 | [webqs](webqs/README.md)                                                 | Web-based question answering tasks designed to evaluate internet search and retrieval.                                                                                                                                                                                                                                                 | English                                                                                                                                                                                                                                                       |
 
@@ -0,0 +1,48 @@
+# Uncheatable Eval
+
+These tasks evaluate autoregressive language models on [Uncheatable Eval](https://github.com/Jellyfish042/uncheatable_eval). Each task computes the rolling log-likelihood of recently published documents from Wikipedia, GitHub, BBC News, arXiv, and AO3, and reports word perplexity, byte perplexity, and bits per byte.
+
+### Citation
+
+```text
+@software{uncheatable_eval,
+  author       = {Jellyfish042},
+  title        = {Uncheatable Eval},
+  month        = may,
+  year         = 2024,
+  publisher    = {Zenodo},
+  version      = {0.1},
+  doi          = {10.5281/zenodo.11284692},
+  url          = {https://zenodo.org/record/11284692}
+}
+```
+
+### Groups, Tags, and Tasks
+
+#### Groups
+
+* `uncheatable_eval`: core group that aggregates Wikipedia (English), GitHub (Python/C++), BBC News, arXiv (physics + CS), and AO3 (English).
+* `uncheatable_eval_full`: full group that spans every available Uncheatable Eval dump: all supported Wikipedia languages plus GitHub, BBC News, arXiv, and AO3 (English + Chinese).
+
+#### Tags
+
+* `uncheatable_eval`
+
+#### Tasks
+
+* `uncheatable_eval_wikipedia_english`
+* `uncheatable_eval_wikipedia_spanish`
+* `uncheatable_eval_wikipedia_french`
+* `uncheatable_eval_wikipedia_german`
+* `uncheatable_eval_wikipedia_japanese`
+* `uncheatable_eval_wikipedia_arabic`
+* `uncheatable_eval_wikipedia_chinese`
+* `uncheatable_eval_github_python`
+* `uncheatable_eval_github_cpp`
+* `uncheatable_eval_bbc_news`
+* `uncheatable_eval_arxiv_physics`
+* `uncheatable_eval_arxiv_computer_science`
+* `uncheatable_eval_ao3_english`
+* `uncheatable_eval_ao3_chinese`
+
+### Changelog
@@ -0,0 +1,22 @@
+output_type: loglikelihood_rolling  # shared base config; the per-dataset task files load it via `include`
+test_split: test
+doc_to_text: ""  # empty template; only doc_to_target below carries content
+doc_to_target: "{{text}}"  # template that renders each document's `text` field
+description: >-  # NOTE(review): lm-eval documents `description` as prompt text, not task docs; confirm it is unused here
+  Rolling log-likelihood evaluation over deduplicated Uncheatable Eval
+  documents sourced from freshly scraped corpora.
+tag:  # NOTE(review): same name as the group in uncheatable_eval.yaml; confirm the harness resolves both
+  - uncheatable_eval
+should_decontaminate: false
+metric_list:  # all three metrics are declared lower-is-better
+  - metric: word_perplexity
+    aggregation: weighted_perplexity
+    higher_is_better: false
+  - metric: byte_perplexity
+    aggregation: weighted_perplexity
+    higher_is_better: false
+  - metric: bits_per_byte
+    aggregation: bits_per_byte
+    higher_is_better: false
+metadata:
+  version: 1.0
@@ -0,0 +1,22 @@
+group: uncheatable_eval  # NOTE(review): the base config's tag uses this same name; confirm there is no clash
+group_alias: Uncheatable Eval (core domains)
+task:  # core subset: English Wikipedia, GitHub, BBC News, arXiv, and English AO3
+  - uncheatable_eval_wikipedia_english
+  - uncheatable_eval_github_python
+  - uncheatable_eval_github_cpp
+  - uncheatable_eval_bbc_news
+  - uncheatable_eval_arxiv_physics
+  - uncheatable_eval_arxiv_computer_science
+  - uncheatable_eval_ao3_english
+aggregate_metric_list:  # NOTE(review): size-weighted mean of subtask scores, not a pooled-corpus perplexity; confirm
+  - metric: word_perplexity
+    aggregation: mean
+    weight_by_size: true
+  - metric: byte_perplexity
+    aggregation: mean
+    weight_by_size: true
+  - metric: bits_per_byte
+    aggregation: mean
+    weight_by_size: true
+metadata:
+  version: 1.0
@@ -0,0 +1,9 @@
+include: _uncheatable_eval_base.yaml  # Chinese AO3 task; shared settings come from the base config
+task: uncheatable_eval_ao3_chinese
+task_alias: UE_ao3_zh
+description: >-
+  Rolling perplexity on Uncheatable Eval Archive of Our Own fanfiction (Chinese)
+  scraped after mid-2024.
+custom_dataset: !function uncheatable_eval_utils.load_uncheatable_eval
+dataset_kwargs:  # presumably forwarded to load_uncheatable_eval to pick the dump; verify against the loader
+  dataset: ao3_chinese
@@ -0,0 +1,9 @@
+include: _uncheatable_eval_base.yaml  # English AO3 task; shared settings come from the base config
+task: uncheatable_eval_ao3_english
+task_alias: UE_ao3_en
+description: >-
+  Rolling perplexity on Uncheatable Eval Archive of Our Own fanfiction (English)
+  scraped after mid-2024.
+custom_dataset: !function uncheatable_eval_utils.load_uncheatable_eval
+dataset_kwargs:  # presumably forwarded to load_uncheatable_eval to pick the dump; verify against the loader
+  dataset: ao3_english
@@ -0,0 +1,9 @@
+include: _uncheatable_eval_base.yaml  # arXiv computer science task; shared settings come from the base config
+task: uncheatable_eval_arxiv_computer_science
+task_alias: UE_arxiv_cs
+description: >-
+  Rolling perplexity on Uncheatable Eval arXiv computer science papers and
+  abstracts downloaded after mid-2024.
+custom_dataset: !function uncheatable_eval_utils.load_uncheatable_eval
+dataset_kwargs:  # presumably forwarded to load_uncheatable_eval to pick the dump; verify against the loader
+  dataset: arxiv_computer_science
@@ -0,0 +1,9 @@
+include: _uncheatable_eval_base.yaml  # arXiv physics task; shared settings come from the base config
+task: uncheatable_eval_arxiv_physics
+task_alias: UE_arxiv_ph
+description: >-
+  Rolling perplexity on Uncheatable Eval arXiv physics papers and abstracts
+  downloaded after mid-2024.
+custom_dataset: !function uncheatable_eval_utils.load_uncheatable_eval
+dataset_kwargs:  # presumably forwarded to load_uncheatable_eval to pick the dump; verify against the loader
+  dataset: arxiv_physics
@@ -0,0 +1,9 @@
+include: _uncheatable_eval_base.yaml  # BBC News task; shared settings come from the base config
+task: uncheatable_eval_bbc_news
+task_alias: UE_bbc_news
+description: >-
+  Rolling perplexity on Uncheatable Eval BBC News articles harvested after
+  mid-2024.
+custom_dataset: !function uncheatable_eval_utils.load_uncheatable_eval
+dataset_kwargs:  # presumably forwarded to load_uncheatable_eval to pick the dump; verify against the loader
+  dataset: bbc_news
@@ -0,0 +1,29 @@
+group: uncheatable_eval_full  # group covering every task listed below
+group_alias: Uncheatable Eval (full)
+task:  # superset of the uncheatable_eval group: adds six more Wikipedia languages and Chinese AO3
+  - uncheatable_eval_wikipedia_english
+  - uncheatable_eval_wikipedia_spanish
+  - uncheatable_eval_wikipedia_french
+  - uncheatable_eval_wikipedia_german
+  - uncheatable_eval_wikipedia_japanese
+  - uncheatable_eval_wikipedia_arabic
+  - uncheatable_eval_wikipedia_chinese
+  - uncheatable_eval_github_python
+  - uncheatable_eval_github_cpp
+  - uncheatable_eval_bbc_news
+  - uncheatable_eval_arxiv_physics
+  - uncheatable_eval_arxiv_computer_science
+  - uncheatable_eval_ao3_english
+  - uncheatable_eval_ao3_chinese
+aggregate_metric_list:  # NOTE(review): size-weighted mean of subtask scores, not a pooled-corpus perplexity; confirm
+  - metric: word_perplexity
+    aggregation: mean
+    weight_by_size: true
+  - metric: byte_perplexity
+    aggregation: mean
+    weight_by_size: true
+  - metric: bits_per_byte
+    aggregation: mean
+    weight_by_size: true
+metadata:
+  version: 1.0