Commit 420e8ed

add seqio mixtures for bias/fairness eval (#530)
* add seqio mixtures for bias/fairness eval
* style
1 parent 80145e5 commit 420e8ed

File tree

promptsource/seqio_tasks/experiment_D4.csv
promptsource/seqio_tasks/tasks.py

2 files changed: +46 -17 lines

promptsource/seqio_tasks/experiment_D4.csv

Lines changed: 19 additions & 16 deletions
@@ -1,9 +1,12 @@
 HF_name,subset,task_by_convention,format,comment,seed_paper,september_check,do_train,do_eval,train_size,adjusted_train_size,D3_do_train,D3_do_eval,D3_adjusted_train_size,metric,multiple correct answer,Paper link,non_linguistic_knowledge,skip,Imported Task Name,imported category,input_length,_human_skill,Domain,Reference
 crows_pairs,,bias_and_fairness,,test set only; authors themselves acknowledge some problems,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
 jigsaw_toxicity_pred,,bias_and_fairness,,current https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data ; want https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
-super_glue,axg,bias_and_fairness,cls,test set only,,,,TRUE,,,,,,,,,,,,,,,,
+super_glue,axg,bias_and_fairness,cls,test set only,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
 winogender,,bias_and_fairness,cls,also as axg in super_glue,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
-winobias,,bias_and_fairness,cls,,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
+wino_bias,type1_anti,bias_and_fairness,cls,,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
+wino_bias,type2_anti,bias_and_fairness,cls,,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
+wino_bias,type1_pro,bias_and_fairness,cls,,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
+wino_bias,type2_pro,bias_and_fairness,cls,,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
 super_glue,wsc.fixed,coreference,cls,,,,,TRUE,554,0,TRUE,TRUE,554,accuracy,,https://arxiv.org/pdf/1905.00537.pdf,,,superglue-wsc,cls/other,single sentence,knowledge-? reading comprehension,,Levesque et al. 2012
 winograd_wsc,wsc273,coreference,ext,,GPT,,,TRUE,0,0,,,0,accuracy,,https://www.aaai.org/ocs/index.php/KR/KR12/paper/download/4492/4924,,,,,,,,Levesque et al. 2012
 winogrande,winogrande_xl,coreference,ext,,GPT,TRUE,,TRUE,40398,0,,,0,accuracy,,https://arxiv.org/pdf/1907.10641.pdf,,,WinoGrande,qa/multiple-choice qa,,knowledge-? reading comprehension,,Sakaguchi et al. 2020
@@ -17,11 +20,11 @@ super_glue,axb,NLI,cls,test set only,,TRUE,,TRUE,0,0,,,,,,,,,,,,,,
 glue,mrpc,paraphrase,cls,,,,TRUE,TRUE,3668,3668,TRUE,TRUE,3668,accuracy;f1_score,,https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/I05-50025B15D.pdf,,,glue-mrpc,cls/paraphrase,,paraphrase,,Dolan and Brockett 2005
 glue,qqp,paraphrase,cls,,,,TRUE,TRUE,363846,363846,TRUE,,363846,accuracy;f1_score,,https://aclanthology.org/I05-5002.pdf,,,glue-qqp,cls/paraphrase,,,,(link)
 paws,labeled_final,paraphrase,cls,,,,TRUE,,49401,49401,TRUE,,49401,,,,,,paws,cls/paraphrase,,,,Zhang et al. 2019
-ai2_arc,ARC-Challenge,QA_closed_book,cls,,GPT,,,TRUE,1119,0,TRUE,,1119,"accuracy_with_tie : For each question, a system receives 1 point if it
-chooses the correct answer and 1/k if it reports a k-way tie
+ai2_arc,ARC-Challenge,QA_closed_book,cls,,GPT,,,TRUE,1119,0,TRUE,,1119,"accuracy_with_tie : For each question, a system receives 1 point if it
+chooses the correct answer and 1/k if it reports a k-way tie
 (i.e., chooses multiple answers) that includes the correct answer.",,https://arxiv.org/pdf/1803.05457.pdf,mid-intensive,,ARC (chal.),qa/multiple-choice qa,,nontrivial_comprehension,,Clark et al. 2018
-ai2_arc,ARC-Easy,QA_closed_book,cls,,GPT,,,TRUE,2251,0,TRUE,,2251,"accuracy_with_tie: For each question, a system receives 1 point if it
-chooses the correct answer and 1/k if it reports a k-way tie
+ai2_arc,ARC-Easy,QA_closed_book,cls,,GPT,,,TRUE,2251,0,TRUE,,2251,"accuracy_with_tie: For each question, a system receives 1 point if it
+chooses the correct answer and 1/k if it reports a k-way tie
 (i.e., chooses multiple answers) that includes the correct answer.",,https://arxiv.org/pdf/1803.05457.pdf,mid-intensive,,ARC (easy),Multiple choice,,,,
 nq_open,,QA_closed_book,gen,,GPT,TRUE,,TRUE,87925,0,,TRUE,0,kilt-exact_match;average_accuracy_accross_answers,TRUE,https://direct.mit.edu/tacl/article/doi/10.1162/tacl_a_00276/43518/Natural-Questions-A-Benchmark-for-Question,intensive,,Natural Questions (open domain),,,trivia,,
 kilt_tasks,hotpotqa,QA_closed_book,gen,recast as closed-book due to input length,self,,TRUE,,88869,88869,,,,,,,,,kilt hotpotqa,qa/closed-book qa,,encyclopedia; multi-hop QA,,Yang et al. 2018
@@ -31,13 +34,13 @@ wiki_qa,,QA_closed_book,cls,,CrossFit,,TRUE,,20360,20360,,,,,,https://aclantholo
 adversarial_qa,dbidaf,QA_extractive,ext,,,TRUE,TRUE,,10000,10000,TRUE,,10000,,,https://aclanthology.org/2020.tacl-1.43/,,,adversarialqa,qa/machine reading comprehension,,,,Bartolo et al. 2020
 adversarial_qa,dbert,QA_extractive,ext,,,TRUE,TRUE,,10000,10000,TRUE,,10000,,,,,,,,,,,
 adversarial_qa,droberta,QA_extractive,ext,,,TRUE,TRUE,,10000,10000,TRUE,,10000,,,,,,,,,,,
-coqa,,QA_extractive,ext,GPT-easy,GPT,,,TRUE,7199,,,,,"macro_average_f1: for computing a model’s performance, each individual prediction is compared
-against n human answers resulting in n F1 scores,
-the maximum of which is chosen as the prediction’s
-F1.For each question, we average out F1 across
-these n sets, both for humans and models. In our
-final evaluation, we use n = 4 human answers for
-every question (the original answer and 3 additionally collected answers). The articles a, an and the
+coqa,,QA_extractive,ext,GPT-easy,GPT,,,TRUE,7199,,,,,"macro_average_f1: for computing a model’s performance, each individual prediction is compared
+against n human answers resulting in n F1 scores,
+the maximum of which is chosen as the prediction’s
+F1.For each question, we average out F1 across
+these n sets, both for humans and models. In our
+final evaluation, we use n = 4 human answers for
+every question (the original answer and 3 additionally collected answers). The articles a, an and the
 and punctuations are excluded in evaluation.",from the paper it seems it could contain multiple answers but the datasets has only one answer per question,https://arxiv.org/pdf/1808.07042.pdf,,,,,,,,
 duorc,SelfRC,QA_extractive,ext,,TaskEmbed;CrossFit,,TRUE,,60721,60721,,,,,,https://duorc.github.io/,,,DuoRC,qa/machine reading comprehension,,,Wikipedia/IMDB crowd,Saha et al. 2018
 duorc,ParaphraseRC,QA_extractive,ext,,TaskEmbed;CrossFit,,TRUE,,69524,69524,,,,,,https://arxiv.org/pdf/1804.07927.pdf,,,DuoRC,paraphrased QA,,,,Saha et al. 2018
@@ -55,8 +58,8 @@ drop,,QA_generative,gen,"nontrivial math; try history_690, it's pretty hard even
 cos_e,v1.11,QA_multiple_choice,cls,"same as commonsense_qa but with (poorly sourced) human explanations; questionable ""commonsense"" lots of world knowledge",Vania,TRUE,TRUE,,9741,9741,TRUE,,9741,,,,,,cos e,other/generate explanation,,,,Rajani et al. 2019
 cosmos_qa,,QA_multiple_choice,cls,,,TRUE,TRUE,,25262,25262,TRUE,,25262,,,,,,cosmos qa,qa/multiple-choice qa,,,,Huang et al. 2019
 dream,,QA_multiple_choice,cls,,,TRUE,TRUE,,6116,6116,TRUE,,6116,,,,,,dream,qa/multiple-choice qa,,,,Sun et al. 2019
-openbookqa,main,QA_multiple_choice,cls,interesting combo of pragmatics + scientific reasoning,GPT,,,TRUE,4957,0,TRUE,TRUE,4957,"accuracy_with_tie : For each question, a system receives 1 point if it
-chooses the correct answer and 1/k if it reports a k-way tie
+openbookqa,main,QA_multiple_choice,cls,interesting combo of pragmatics + scientific reasoning,GPT,,,TRUE,4957,0,TRUE,TRUE,4957,"accuracy_with_tie : For each question, a system receives 1 point if it
+chooses the correct answer and 1/k if it reports a k-way tie
 (i.e., chooses multiple answers) that includes the correct answer.",,https://aclanthology.org/D18-1260.pdf,modest,,openbookqa,qa/multiple-choice qa,,pragmatics,,Mihaylov et al. 2018
 qasc,,QA_multiple_choice,cls,,,TRUE,TRUE,,8134,8134,TRUE,,8134,,,,given?,,qasc,qa/multiple-choice qa,,,,Khot et al. 2020
 quail,,QA_multiple_choice,cls,,,TRUE,TRUE,,10246,10246,TRUE,,10246,,,,,,quail,qa/multiple-choice qa,,,,Rogers et al. 2020
@@ -235,4 +238,4 @@ glue,stsb,fine-grain regression,,,,,,,,,,,,,,,,revisit whether to exclude fine-g
 ,,,,"""naturally perturbed"" version of BoolQ",,,,,,,,,,,,https://arxiv.org/pdf/2004.04849.pdf,,double check: missing from HF datasets,NP-BoolQ,Binary yes/no,,,,
 ,,,,,,,,,,,,,,,,https://aclanthology.org/D19-1608.pdf,,double check: missing from HF datasets,quartz-no knowledge,qa/multiple-choice qa,,,,Tafjord et al. 2019b
 ,,,,contrast sets,,,,,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,double check: missing from HF datasets,Quoref-CS,Extractive QA,,,,
-,,,,contrast sets,,,,,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,double check: missing from HF datasets,ROPES-CS,Extractive QA,,,,
+,,,,contrast sets,,,,,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,double check: missing from HF datasets,ROPES-CS,Extractive QA,,,,
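
The hunks above add a bias_and_fairness slice to the experiment sheet: crows_pairs, jigsaw_toxicity_pred, super_glue/axg (now credited to the Eval WG), winogender, and the four wino_bias subsets (type1/type2, anti/pro), all flagged do_eval only. Below is a minimal sketch of how these rows are picked up, mirroring the condition this commit adds to tasks.py; the hard-coded path and the simplified subset handling are illustrative assumptions, and the real loader also normalizes empty subsets and honors the skip column.

import csv

# Sketch only: replicate the bias/fairness filter added in tasks.py (diffed
# below) against this CSV. The path and list-of-tuples shape are assumptions.
bias_fairness_eval = []
with open("promptsource/seqio_tasks/experiment_D4.csv") as exp_file:
    for row in csv.DictReader(exp_file):
        if (
            row["do_eval"] == "TRUE"
            and row["task_by_convention"] == "bias_and_fairness"
            and row["HF_name"] != "winogender"  # per the CSV, winogender also appears as super_glue/axg
        ):
            bias_fairness_eval.append((row["HF_name"], row["subset"] or None))

# Expected contents given the rows above: crows_pairs, jigsaw_toxicity_pred,
# super_glue/axg, and the four wino_bias subsets.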

promptsource/seqio_tasks/tasks.py

Lines changed: 27 additions & 1 deletion
@@ -144,6 +144,7 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map
 d4_eval: List[datatset_subset_tuple] = []
 d3_train_gpt: List[datatset_subset_tuple] = []
 d3_train_sglue: List[datatset_subset_tuple] = []
+bias_fairness_eval: List[datatset_subset_tuple] = []
 gsheet: Dict[datatset_subset_tuple, Dict] = {}
 experiment_path = pkg_resources.resource_filename(__name__, "experiment_D4.csv")
 with open(experiment_path) as exp_file:
@@ -162,8 +163,14 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map
             d3_train_gpt.append(dataset_subset)
         if row["D3_do_train"] == "TRUE" and row["HF_name"] == "super_glue":
             d3_train_sglue.append(dataset_subset)
+        if (
+            row["do_eval"] == "TRUE"
+            and row["task_by_convention"] == "bias_and_fairness"
+            and row["HF_name"] != "winogender"
+        ):
+            bias_fairness_eval.append(dataset_subset)
         gsheet[dataset_subset] = row
-all_datasets = d4_train + d4_eval + d3_train_gpt + d3_train_sglue
+all_datasets = d4_train + d4_eval + d3_train_gpt + d3_train_sglue + bias_fairness_eval

 all_templates = promptsource.templates.TemplateCollection()
 all_templates.remove("anli")  # Need to special-case ANLI due to weird split conventions
@@ -173,6 +180,7 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map
 gpt_train_mixture: List[str] = []
 sglue_train_mixture: List[str] = []
 d4_eval_mixture: List[str] = []
+bias_fairness_eval_mixture: List[str] = []
 mixture_cap: Dict[str, int] = {}
 single_original_task: Dict[Tuple[str, str], str] = {}
 all_original_tasks: List[str] = []
@@ -218,6 +226,8 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map
             if template.metadata.original_task:
                 d4_eval_mixture.append(task_name)
             # TODO use template.metadata.answer_choices or answer_choice_keys here for rank eval
+        if (dataset_name, subset_name) in bias_fairness_eval:
+            bias_fairness_eval_mixture.append(task_name)

 # Special case for ANLI, which has weirdly-named splits and rounds that should be subsets
 dataset_name, subset_name = ("anli", None)
@@ -393,3 +403,19 @@ def add_task(dataset_name, subset_name, template_name, task_name=None, split_map
     [task for task in all_original_tasks if task in d4_train_mixture and task not in TASK_BLACKLIST],
     default_rate=lambda t: mixture_cap[t.name],
 )
+
+seqio.MixtureRegistry.add(
+    "bias_fairness_eval",
+    bias_fairness_eval_mixture,
+    default_rate=functools.partial(seqio.mixing_rate_num_examples, maximum=500_000),
+)
+
+seqio.MixtureRegistry.add(
+    "bias_fairness_eval_score_eval",
+    [
+        task
+        for task in seqio.TaskRegistry.names()
+        if task.endswith("_score_eval") and task.split("_score_eval")[0] in bias_fairness_eval_mixture
+    ],
+    default_rate=functools.partial(seqio.mixing_rate_num_examples, maximum=500_000),
+)
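
These registrations expose two mixtures: bias_fairness_eval, containing the prompted bias/fairness tasks themselves, and bias_fairness_eval_score_eval, collecting their _score_eval counterparts used for rank-style scoring. The default_rate of functools.partial(seqio.mixing_rate_num_examples, maximum=500_000) weights each task by its example count, capped at 500,000, so no single task dominates. A minimal usage sketch follows, assuming that importing promptsource.seqio_tasks.tasks triggers the registrations above; the sequence lengths and split are illustrative choices, not values fixed by this commit.

import seqio
import promptsource.seqio_tasks.tasks  # noqa: F401  (import side effect: registers tasks and mixtures)

# Fetch the newly registered mixture and build a tf.data pipeline over it.
mixture = seqio.get_mixture_or_task("bias_fairness_eval")
ds = mixture.get_dataset(
    sequence_length={"inputs": 1024, "targets": 256},  # assumed lengths
    split="test",  # most rows in this slice are test-set only
    shuffle=False,
)
for example in ds.take(2):
    print(example["inputs"], example["targets"])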
