Skip to content

Commit eb0d898

Browse files
Adding BBH subsets back (#126)
Fixes #125, as the subsets were removed in a merge, and edits the test suite accordingly
1 parent 9a1697d commit eb0d898

File tree

3 files changed

+72
-31
lines changed

3 files changed

+72
-31
lines changed

src/lighteval/tasks/tasks_table.jsonl

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,42 @@
4545
{"name":"auto_categorization","suite":["bigbench","bigbench_json"],"prompt_function":"bigbench","hf_repo":"bigbench","hf_subset":"auto_categorization","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":1,"metric":["bleu"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true}
4646
{"name":"auto_debugging","suite":["bigbench_lite","bigbench","bigbench_json"],"prompt_function":"bigbench_linefeed_before_and_after_query","hf_repo":"bigbench","hf_subset":"auto_debugging","hf_avail_splits":["default","train","validation"],"evaluation_splits":["default"],"few_shots_split":null,"few_shots_select":null,"generation_size":100,"metric":["perfect_exact_match"],"stop_sequence":null,"output_regex":"[^\\.\\?\\!\\;\\n]+", "trust_dataset": true}
4747
{"name":"babi_qa","suite":["helm"],"prompt_function":"babi_qa","hf_repo":"facebook\/babi_qa","hf_subset":"en-valid-qa1","hf_avail_splits":["train","test","validation"],"evaluation_splits":["validation","test"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match"],"stop_sequence":["\n"],"output_regex":null,"frozen":false, "trust_dataset": true}
48+
{"name":"bigbench:causal_judgment","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"causal_judgement","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true}
49+
{"name":"bigbench:date_understanding","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"date_understanding","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true}
50+
{"name":"bigbench:disambiguation_qa","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"disambiguation_qa","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true}
51+
{"name":"bigbench:geometric_shapes","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"geometric_shapes","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true}
52+
{"name":"bigbench:logical_deduction_five_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"logical_deduction_five_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true}
53+
{"name":"bigbench:logical_deduction_seven_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"logical_deduction_seven_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true}
54+
{"name":"bigbench:logical_deduction_three_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"logical_deduction_three_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true}
55+
{"name":"bigbench:movie_recommendation","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"movie_recommendation","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true}
56+
{"name":"bigbench:navigate","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"navigate","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true}
57+
{"name":"bigbench:reasoning_about_colored_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"reasoning_about_colored_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true}
58+
{"name":"bigbench:ruin_names","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"ruin_names","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true}
59+
{"name":"bigbench:salient_translation_error_detection","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"salient_translation_error_detection","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true}
60+
{"name":"bigbench:snarks","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"snarks","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true}
61+
{"name":"bigbench:sports_understanding","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"sports_understanding","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true}
62+
{"name":"bigbench:temporal_sequences","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"temporal_sequences","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true}
63+
{"name":"bigbench:tracking_shuffled_objects_five_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_five_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true}
64+
{"name":"bigbench:tracking_shuffled_objects_seven_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_seven_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true}
65+
{"name":"bigbench:tracking_shuffled_objects_three_objects","suite":["lighteval"],"prompt_function":"bbh_lighteval","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_three_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc_single_token"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true}
66+
{"name":"bigbench:causal_judgment","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"causal_judgement","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true}
67+
{"name":"bigbench:date_understanding","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"date_understanding","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true}
68+
{"name":"bigbench:disambiguation_qa","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"disambiguation_qa","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true}
69+
{"name":"bigbench:geometric_shapes","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"geometric_shapes","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true}
70+
{"name":"bigbench:logical_deduction_five_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"logical_deduction_five_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true}
71+
{"name":"bigbench:logical_deduction_seven_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"logical_deduction_seven_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true}
72+
{"name":"bigbench:logical_deduction_three_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"logical_deduction_three_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true}
73+
{"name":"bigbench:movie_recommendation","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"movie_recommendation","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true}
74+
{"name":"bigbench:navigate","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"navigate","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true}
75+
{"name":"bigbench:reasoning_about_colored_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"reasoning_about_colored_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true}
76+
{"name":"bigbench:ruin_names","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"ruin_names","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true}
77+
{"name":"bigbench:salient_translation_error_detection","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"salient_translation_error_detection","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true}
78+
{"name":"bigbench:snarks","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"snarks","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true}
79+
{"name":"bigbench:sports_understanding","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"sports_understanding","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true}
80+
{"name":"bigbench:temporal_sequences","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"temporal_sequences","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true}
81+
{"name":"bigbench:tracking_shuffled_objects_five_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_five_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true}
82+
{"name":"bigbench:tracking_shuffled_objects_seven_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_seven_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true}
83+
{"name":"bigbench:tracking_shuffled_objects_three_objects","suite":["harness"],"prompt_function":"bbh_harness","hf_repo":"lighteval/bbh","hf_subset":"tracking_shuffled_objects_three_objects","hf_avail_splits":["train"],"evaluation_splits":["train"],"few_shots_split":null,"few_shots_select":null,"generation_size":-1,"metric":["loglikelihood_acc","loglikelihood_acc_norm"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "must_remove_duplicate_docs": true, "trust_dataset":true}
4884
{"name":"bbh:boolean_expressions","suite":["harness"],"prompt_function":"bbh_boolean_expressions","hf_repo":"lukaemon/bbh","hf_subset":"boolean_expressions","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true}
4985
{"name":"bbh:causal_judgment","suite":["harness"],"prompt_function":"bbh_causal_judgment","hf_repo":"lukaemon/bbh","hf_subset":"causal_judgement","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true}
5086
{"name":"bbh:date_understanding","suite":["harness"],"prompt_function":"bbh_date_understanding","hf_repo":"lukaemon/bbh","hf_subset":"date_understanding","hf_avail_splits":["test"],"evaluation_splits":["test"],"few_shots_split":null,"few_shots_select":null,"generation_size":20,"metric":["exact_match","quasi_exact_match","prefix_exact_match","prefix_quasi_exact_match","perfect_exact_match"],"stop_sequence":["</s>", "Q:", "\n\n"],"output_regex":null,"frozen":false, "trust_dataset":true}

0 commit comments

Comments
 (0)