
Commit 03c44ad

add global_piqa; add acc_norm_bytes metric (#3368)
* add global_piqa
* add `acc_bytes`
1 parent e916aa4 commit 03c44ad

242 files changed (+1403 / -1 lines)


lm_eval/api/metrics.py

Lines changed: 10 additions & 0 deletions
```diff
@@ -179,6 +179,16 @@ def acc_mutual_info_fn(items):  # This is a passthrough function
     return items
 
 
+@register_metric(
+    metric="acc_bytes",
+    higher_is_better=True,
+    output_type=["loglikelihood", "multiple_choice"],
+    aggregation="mean",
+)
+def acc_bytes_fn(items):  # This is a passthrough function
+    return items
+
+
 ### the code used in the `exact_match_hf_evaluate` function is ported from
 ### https://github.com/huggingface/evaluate/blob/main/metrics/exact_match/exact_match.py
 ### which is under the apache license.
```
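Not part of the diff: because `acc_bytes_fn` is a passthrough and the decorator already fixes the aggregation and direction, a multiple-choice task only needs to list the metric to opt in. A minimal sketch of such an entry, assuming the standard `metric_list` schema used by other task configs in the harness:

```yaml
metric_list:
  - metric: acc_bytes        # byte-length-normalized accuracy registered above
    aggregation: mean
    higher_is_better: true
```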

lm_eval/api/task.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -1582,6 +1582,7 @@ def process_results(self, doc, results):
         # retrieve choices in List[str] form, to compute choice lengths, etc.
         choices = self.doc_to_choice(doc)
         completion_len = np.array([float(len(i)) for i in choices])
+        byte_length = np.array([float(len(i.encode("utf-8"))) for i in choices])
 
         if (
             2 * len(choices) == len(lls)
@@ -1598,6 +1599,7 @@ def process_results(self, doc, results):
 
         pred = np.argmax(lls)
         pred_norm = np.argmax(lls / completion_len)
+        pred_byte = np.argmax(lls / byte_length)
 
         if self.multiple_input:
             gold = self.doc_to_text(doc)
@@ -1627,10 +1629,12 @@ def process_results(self, doc, results):
         if self.multiple_target:
             acc = 1.0 if pred in gold else 0.0
             acc_norm = 1.0 if pred_norm in gold else 0.0
+            acc_bytes = 1.0 if pred_byte in gold else 0.0
             exact_match = int(any([is_greedy[i] if i != -100 else 0 for i in gold]))
         else:
             acc = 1.0 if pred == gold else 0.0
             acc_norm = 1.0 if pred_norm == gold else 0.0
+            acc_bytes = 1.0 if pred_byte == gold else 0.0
             # TODO: this gets score of 0 on arc_challenge for pythia-70m. need to test that this works properly
             exact_match = int(is_greedy[gold]) if gold != -100 else 0
 
@@ -1643,6 +1647,7 @@ def process_results(self, doc, results):
             **({"f1": (gold, pred)} if "f1" in use_metric else {}),
             **({"mcc": (gold, pred)} if "mcc" in use_metric else {}),
             **({"acc_norm": acc_norm} if "acc_norm" in use_metric else {}),
+            **({"acc_bytes": acc_bytes} if "acc_bytes" in use_metric else {}),
             **({"exact_match": exact_match} if "exact_match" in use_metric else {}),
             **(
                 {"brier_score": (gold, prob_norm)}
```

lm_eval/tasks/README.md

Lines changed: 2 additions & 1 deletion
```diff
@@ -26,7 +26,7 @@ provided to the individual README.md files for each subfolder.
 | [asdiv](asdiv/README.md) | Tasks involving arithmetic and mathematical reasoning challenges. | English |
 | [babi](babi/README.md) | Tasks designed as question and answering challenges based on simulated stories. | English |
 | [babilong](babilong/README.md) | Tasks designed to test whether models can find and reason over facts in long contexts. | English |
-| [bangla_mmlu](bangla/README.md) | Benchmark dataset for evaluating language models' performance on Bangla (Bengali) language tasks.Includes diverse NLP tasks to measure model understanding and generation capabilities in Bangla. | Bengali/Bangla |
+| [bangla_mmlu](bangla/README.md) | Benchmark dataset for evaluating language models' performance on Bangla (Bengali) language tasks.Includes diverse NLP tasks to measure model understanding and generation capabilities in Bangla. | Bengali/Bangla |
 | [basque_bench](basque_bench/README.md) | Collection of tasks in Basque encompassing various evaluation areas. | Basque |
 | [basqueglue](basqueglue/README.md) | Tasks designed to evaluate language understanding in Basque language. | Basque |
 | [bbh](bbh/README.md) | Tasks focused on deep semantic understanding through hypothesization and reasoning. | English, German |
@@ -70,6 +70,7 @@ provided to the individual README.md files for each subfolder.
 | [french_bench](french_bench/README.md) | Set of tasks designed to assess language model performance in French. | French |
 | [galician_bench](galician_bench/README.md) | Collection of tasks in Galician encompassing various evaluation areas. | Galician |
 | [global_mmlu](global_mmlu/README.md) | Collection of culturally sensitive and culturally agnostic MMLU tasks in 15 languages with human translations or post-edits. | Multiple (15 languages) |
+| [global_piqa](global_piqa/README.md) | Multilingual (non-parallel) commonsense reasoning benchmark covering 116 language varieties with culturally-specific examples from 65 countries | Multiple (116 languages) **Human authored** |
 | [glue](glue/README.md) | General Language Understanding Evaluation benchmark to test broad language abilities. | English |
 | [gpqa](gpqa/README.md) | Tasks designed for general public question answering and knowledge verification. | English |
 | [gsm8k](gsm8k/README.md) | A benchmark of grade school math problems aimed at evaluating reasoning capabilities. | English |
```
lm_eval/tasks/global_piqa/README.md

Lines changed: 49 additions & 0 deletions

```diff
@@ -0,0 +1,49 @@
+# Task-name
+
+## Paper
+
+Title: `Global PIQA`
+
+Abstract: `To date, there exist almost no culturally-specific evaluation benchmarks for large language models (LLMs) that cover a large number of languages and cultures. We present Global PIQA, a participatory commonsense reasoning benchmark for over 100 languages, constructed by hand by 320 researchers from 65 countries around the world. The 116 language varieties in Global PIQA cover five continents, 14 language families, and 23 writing systems. In the non-parallel split of Global PIQA, over 50% of examples reference local foods, customs, traditions, or other culturally-specific elements. Beyond its uses for LLM evaluation, we hope that Global PIQA provides a glimpse into the wide diversity of cultures in which human language is embedded.`
+
+`Short description of paper / benchmark goes here:`
+
+Homepage: `homepage to the benchmark's website goes here, if applicable`
+
+### Citation
+
+```text
+BibTeX-formatted citation goes here
+```
+
+### Groups, Tags, and Tasks
+
+#### Groups
+
+* `group_name`: `global_piqa_completions` Generation task using chat format
+* `group_name`: `global_piqa_prompted` Cloze-style completion format
+
+#### Tags
+
+* `tag_name`: `Short description`
+
+#### Tasks
+
+* `task_name`: `1-sentence description of what this particular task does`
+* `task_name2`: ...
+
+### Checklist
+
+For adding novel benchmarks/datasets to the library:
+
+* [x] Is the task an existing benchmark in the literature?
+  * [x] Have you referenced the original paper that introduced the task?
+  * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+
+If other tasks on this dataset are already supported:
+
+* [x] Is the "Main" variant of this task clearly denoted?
+* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
+
+### Changelog
```
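Not part of this commit: a hedged usage sketch for running one of the groups named above through the harness's Python entry point. The checkpoint name and batch size are illustrative only.

```python
import lm_eval

# Evaluate a Hugging Face model on one of the Global PIQA groups added here.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-1.4b",  # illustrative checkpoint
    tasks=["global_piqa_prompted"],
    batch_size=8,
)
print(results["results"])
```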
Lines changed: 55 additions & 0 deletions
```diff
@@ -0,0 +1,55 @@
+from pathlib import Path
+
+import datasets
+import yaml
+
+
+class IndentedDumper(yaml.Dumper):
+    def increase_indent(self, flow=False, indentless=False):
+        return super(IndentedDumper, self).increase_indent(flow, False)
+
+
+PREFACE = "global_piqa_completions"
+
+
+def format_subset(subset: str, preface: str = PREFACE) -> str:
+    return f"{preface}_{subset}"
+
+
+if __name__ == "__main__":
+    subsets = [
+        x
+        for x in datasets.get_dataset_config_names(
+            "mrlbenchmarks/global-piqa-nonparallel"
+        )
+        if not x.startswith("dev")
+    ]
+    PARENT = Path(__file__).parent
+    for s in subsets:
+        with open(PARENT / f"{s}.yaml", "w") as f:
+            yaml.dump(
+                {
+                    "include": "_template",
+                    "task": format_subset(s),
+                    "dataset_name": s,
+                },
+                f,
+            )
+
+    with open(PARENT / "_global_piqa.yaml", "w") as f:
+        yaml.dump(
+            {
+                "group": f"{PREFACE}",
+                "task": [{"task": format_subset(s), "task_alias": s} for s in subsets],
+                "aggregate_metric_list": [
+                    {"metric": m, "aggregation": "mean", "weight_by_size": True}
+                    for m in ["acc", "acc_norm", "acc_bytes"]
+                ],
+            },
+            f,
+            Dumper=IndentedDumper,
+            default_flow_style=False,
+            sort_keys=False,
+        )
+        f.write("metadata:\n")
+        f.write("  version: 1.0\n")
```
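For orientation (not part of the commit): a sketch of roughly what this script emits, assuming a hypothetical subset name `afr_latn`; the real subset names come from the dataset's config list.

```yaml
# afr_latn.yaml -- one file per subset (keys alphabetized by yaml.dump's default sort)
dataset_name: afr_latn
include: _template
task: global_piqa_completions_afr_latn
```

```yaml
# _global_piqa.yaml -- group config (key order preserved via sort_keys=False)
group: global_piqa_completions
task:
  - task: global_piqa_completions_afr_latn
    task_alias: afr_latn
  # ...one entry per subset
aggregate_metric_list:
  - metric: acc
    aggregation: mean
    weight_by_size: true
  - metric: acc_norm
    aggregation: mean
    weight_by_size: true
  - metric: acc_bytes
    aggregation: mean
    weight_by_size: true
metadata:
  version: 1.0
```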
