
Commit 03c44ad

add global_piqa; add acc_norm_bytes metric (#3368)
* add global_piqa
* add `acc_bytes`
1 parent e916aa4 commit 03c44ad

242 files changed (+1403 / -1 lines)


lm_eval/api/metrics.py

Lines changed: 10 additions & 0 deletions
```diff
@@ -179,6 +179,16 @@ def acc_mutual_info_fn(items):  # This is a passthrough function
     return items
 
 
+@register_metric(
+    metric="acc_bytes",
+    higher_is_better=True,
+    output_type=["loglikelihood", "multiple_choice"],
+    aggregation="mean",
+)
+def acc_bytes_fn(items):  # This is a passthrough function
+    return items
+
+
 ### the code used in the `exact_match_hf_evaluate` function is ported from
 ### https://github.com/huggingface/evaluate/blob/main/metrics/exact_match/exact_match.py
 ### which is under the apache license.
```
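Not part of the diff: because `acc_bytes_fn` is a passthrough and the decorator already fixes the aggregation and direction, a multiple-choice task only needs to list the metric to opt in. A minimal sketch of such an entry, assuming the standard `metric_list` schema used by other task configs in the harness:

```yaml
metric_list:
  - metric: acc_bytes        # byte-length-normalized accuracy registered above
    aggregation: mean
    higher_is_better: true
```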

lm_eval/api/task.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -1582,6 +1582,7 @@ def process_results(self, doc, results):
         # retrieve choices in List[str] form, to compute choice lengths, etc.
         choices = self.doc_to_choice(doc)
         completion_len = np.array([float(len(i)) for i in choices])
+        byte_length = np.array([float(len(i.encode("utf-8"))) for i in choices])
 
         if (
             2 * len(choices) == len(lls)
@@ -1598,6 +1599,7 @@ def process_results(self, doc, results):
 
         pred = np.argmax(lls)
         pred_norm = np.argmax(lls / completion_len)
+        pred_byte = np.argmax(lls / byte_length)
 
         if self.multiple_input:
             gold = self.doc_to_text(doc)
@@ -1627,10 +1629,12 @@ def process_results(self, doc, results):
         if self.multiple_target:
             acc = 1.0 if pred in gold else 0.0
             acc_norm = 1.0 if pred_norm in gold else 0.0
+            acc_bytes = 1.0 if pred_byte in gold else 0.0
             exact_match = int(any([is_greedy[i] if i != -100 else 0 for i in gold]))
         else:
             acc = 1.0 if pred == gold else 0.0
             acc_norm = 1.0 if pred_norm == gold else 0.0
+            acc_bytes = 1.0 if pred_byte == gold else 0.0
             # TODO: this gets score of 0 on arc_challenge for pythia-70m. need to test that this works properly
             exact_match = int(is_greedy[gold]) if gold != -100 else 0
 
@@ -1643,6 +1647,7 @@ def process_results(self, doc, results):
             **({"f1": (gold, pred)} if "f1" in use_metric else {}),
             **({"mcc": (gold, pred)} if "mcc" in use_metric else {}),
             **({"acc_norm": acc_norm} if "acc_norm" in use_metric else {}),
+            **({"acc_bytes": acc_bytes} if "acc_bytes" in use_metric else {}),
             **({"exact_match": exact_match} if "exact_match" in use_metric else {}),
             **(
                 {"brier_score": (gold, prob_norm)}
```

lm_eval/tasks/README.md

Lines changed: 2 additions & 1 deletion
```diff
@@ -26,7 +26,7 @@ provided to the individual README.md files for each subfolder.
 | [asdiv](asdiv/README.md) | Tasks involving arithmetic and mathematical reasoning challenges. | English |
 | [babi](babi/README.md) | Tasks designed as question and answering challenges based on simulated stories. | English |
 | [babilong](babilong/README.md) | Tasks designed to test whether models can find and reason over facts in long contexts. | English |
-| [bangla_mmlu](bangla/README.md) | Benchmark dataset for evaluating language models' performance on Bangla (Bengali) language tasks.Includes diverse NLP tasks to measure model understanding and generation capabilities in Bangla. | Bengali/Bangla |
+| [bangla_mmlu](bangla/README.md) | Benchmark dataset for evaluating language models' performance on Bangla (Bengali) language tasks.Includes diverse NLP tasks to measure model understanding and generation capabilities in Bangla. | Bengali/Bangla |
 | [basque_bench](basque_bench/README.md) | Collection of tasks in Basque encompassing various evaluation areas. | Basque |
 | [basqueglue](basqueglue/README.md) | Tasks designed to evaluate language understanding in Basque language. | Basque |
 | [bbh](bbh/README.md) | Tasks focused on deep semantic understanding through hypothesization and reasoning. | English, German |
@@ -70,6 +70,7 @@ provided to the individual README.md files for each subfolder.
 | [french_bench](french_bench/README.md) | Set of tasks designed to assess language model performance in French. | French |
 | [galician_bench](galician_bench/README.md) | Collection of tasks in Galician encompassing various evaluation areas. | Galician |
 | [global_mmlu](global_mmlu/README.md) | Collection of culturally sensitive and culturally agnostic MMLU tasks in 15 languages with human translations or post-edits. | Multiple (15 languages) |
+| [global_piqa](global_piqa/README.md) | Multilingual (non-parallel) commonsense reasoning benchmark covering 116 language varieties with culturally-specific examples from 65 countries | Multiple (116 languages) **Human authored** |
 | [glue](glue/README.md) | General Language Understanding Evaluation benchmark to test broad language abilities. | English |
 | [gpqa](gpqa/README.md) | Tasks designed for general public question answering and knowledge verification. | English |
 | [gsm8k](gsm8k/README.md) | A benchmark of grade school math problems aimed at evaluating reasoning capabilities. | English |
```
lm_eval/tasks/global_piqa/README.md

Lines changed: 49 additions & 0 deletions

```diff
@@ -0,0 +1,49 @@
+# Task-name
+
+## Paper
+
+Title: `Global PIQA`
+
+Abstract: `To date, there exist almost no culturally-specific evaluation benchmarks for large language models (LLMs) that cover a large number of languages and cultures. We present Global PIQA, a participatory commonsense reasoning benchmark for over 100 languages, constructed by hand by 320 researchers from 65 countries around the world. The 116 language varieties in Global PIQA cover five continents, 14 language families, and 23 writing systems. In the non-parallel split of Global PIQA, over 50% of examples reference local foods, customs, traditions, or other culturally-specific elements. Beyond its uses for LLM evaluation, we hope that Global PIQA provides a glimpse into the wide diversity of cultures in which human language is embedded.`
+
+`Short description of paper / benchmark goes here:`
+
+Homepage: `homepage to the benchmark's website goes here, if applicable`
+
+### Citation
+
+```text
+BibTeX-formatted citation goes here
+```
+
+### Groups, Tags, and Tasks
+
+#### Groups
+
+* `group_name`: `global_piqa_completions` Generation task using chat format
+* `group_name`: `global_piqa_prompted` Cloze-style completion format
+
+#### Tags
+
+* `tag_name`: `Short description`
+
+#### Tasks
+
+* `task_name`: `1-sentence description of what this particular task does`
+* `task_name2`: ...
+
+### Checklist
+
+For adding novel benchmarks/datasets to the library:
+
+* [x] Is the task an existing benchmark in the literature?
+  * [x] Have you referenced the original paper that introduced the task?
+  * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+
+If other tasks on this dataset are already supported:
+
+* [x] Is the "Main" variant of this task clearly denoted?
+* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
+
+### Changelog
```
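Not part of this commit: a hedged usage sketch for running one of the groups named above through the harness's Python entry point. The checkpoint name and batch size are illustrative only.

```python
import lm_eval

# Evaluate a Hugging Face model on one of the Global PIQA groups added here.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-1.4b",  # illustrative checkpoint
    tasks=["global_piqa_prompted"],
    batch_size=8,
)
print(results["results"])
```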
Lines changed: 55 additions & 0 deletions
```diff
@@ -0,0 +1,55 @@
+from pathlib import Path
+
+import datasets
+import yaml
+
+
+class IndentedDumper(yaml.Dumper):
+    def increase_indent(self, flow=False, indentless=False):
+        return super(IndentedDumper, self).increase_indent(flow, False)
+
+
+PREFACE = "global_piqa_completions"
+
+
+def format_subset(subset: str, preface: str = PREFACE) -> str:
+    return f"{preface}_{subset}"
+
+
+if __name__ == "__main__":
+    subsets = [
+        x
+        for x in datasets.get_dataset_config_names(
+            "mrlbenchmarks/global-piqa-nonparallel"
+        )
+        if not x.startswith("dev")
+    ]
+    PARENT = Path(__file__).parent
+    for s in subsets:
+        with open(PARENT / f"{s}.yaml", "w") as f:
+            yaml.dump(
+                {
+                    "include": "_template",
+                    "task": format_subset(s),
+                    "dataset_name": s,
+                },
+                f,
+            )
+
+    with open(PARENT / "_global_piqa.yaml", "w") as f:
+        yaml.dump(
+            {
+                "group": f"{PREFACE}",
+                "task": [{"task": format_subset(s), "task_alias": s} for s in subsets],
+                "aggregate_metric_list": [
+                    {"metric": m, "aggregation": "mean", "weight_by_size": True}
+                    for m in ["acc", "acc_norm", "acc_bytes"]
+                ],
+            },
+            f,
+            Dumper=IndentedDumper,
+            default_flow_style=False,
+            sort_keys=False,
+        )
+        f.write("metadata:\n")
+        f.write("  version: 1.0\n")
```
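For orientation (not part of the commit): a sketch of roughly what this script emits, assuming a hypothetical subset name `afr_latn`; the real subset names come from the dataset's config list.

```yaml
# afr_latn.yaml -- one file per subset (keys alphabetized by yaml.dump's default sort)
dataset_name: afr_latn
include: _template
task: global_piqa_completions_afr_latn
```

```yaml
# _global_piqa.yaml -- group config (key order preserved via sort_keys=False)
group: global_piqa_completions
task:
  - task: global_piqa_completions_afr_latn
    task_alias: afr_latn
  # ...one entry per subset
aggregate_metric_list:
  - metric: acc
    aggregation: mean
    weight_by_size: true
  - metric: acc_norm
    aggregation: mean
    weight_by_size: true
  - metric: acc_bytes
    aggregation: mean
    weight_by_size: true
metadata:
  version: 1.0
```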
