|
| 1 | +# All Supervised Datasets |
| 2 | + |
| 3 | +Inspired by [all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2), we fine-tuned Indonesian sentence embedding models on a set of existing supervised datasets. The tasks included in the training dataset are: question-answering, textual entailment, retrieval, commonsense reasoning, and natural language inference. Currently, our script simply concatenates these datasets and our models are trained conventionally using the `MultipleNegativesRankingLoss`. |
| 4 | + |
| 5 | +## Training Data |
| 6 | + |
| 7 | +| Dataset | Task | Data Instance | Number of Training Tuples | |
| 8 | +| ------------------------------------------------------------------------------------ | :---------------------------: | :-------------------------------------------: | :-----------------------: | |
| 9 | +| [indonli](https://huggingface.co/datasets/indonli) | Natural Language Inference | `(premise, entailment, contradiction)` | 3,914 | |
| 10 | +| [indolem/indo_story_cloze](https://huggingface.co/datasets/indolem/indo_story_cloze) | Commonsense Reasoning | `(context, correct ending, incorrect ending)` | 1,000 | |
| 11 | +| [unicamp-dl/mmarco](https://huggingface.co/datasets/unicamp-dl/mmarco) | Passage Retrieval | `(query, positive passage, negative passage)` | 100,000 | |
| 12 | +| [miracl/miracl](https://huggingface.co/datasets/miracl/miracl) | Passage Retrieval | `(query, positive passage, negative passage)` | 8,086 | |
| 13 | +| [SEACrowd/wrete](https://huggingface.co/datasets/SEACrowd/wrete) | Textual Entailment | `(sentenceA, sentenceB)` | 183 | |
| 14 | +| [SEACrowd/indolem_ntp](https://huggingface.co/datasets/SEACrowd/indolem_ntp) | Textual Entailment | `(tweet, next tweet)` | 5,681 | |
| 15 | +| [khalidalt/tydiqa-goldp](https://huggingface.co/datasets/khalidalt/tydiqa-goldp) | Extractive Question-Answering | `(question, passage)`, `(question, answer)` | 11,404 | |
| 16 | +| [SEACrowd/facqa](https://huggingface.co/datasets/SEACrowd/facqa) | Extractive Question-Answering | `(question, passage)`, `(question, answer)` | 4,990 | |
| 17 | +| **Total** | | | **135,258** | |
| 18 | + |
| 19 | +## All Supervised Datasets with MultipleNegativesRankingLoss |
| 20 | + |
| 21 | +### IndoBERT Base |
| 22 | + |
| 23 | +```sh |
| 24 | +python train_all_mnrl.py \ |
| 25 | + --model-name indobenchmark/indobert-base-p1 \ |
| 26 | + --max-seq-length 128 \ |
| 27 | + --num-epochs 5 \ |
| 28 | + --train-batch-size-pairs 384 \ |
| 29 | + --train-batch-size-triplets 256 \ |
| 30 | + --learning-rate 2e-5 |
| 31 | +``` |
| 32 | + |
| 33 | +## References |
| 34 | + |
| 35 | +```bibtex |
| 36 | +@inproceedings{mahendra-etal-2021-indonli, |
| 37 | + title="{I}ndo{NLI}: A Natural Language Inference Dataset for {I}ndonesian", |
| 38 | + author="Mahendra, Rahmad and Aji, Alham Fikri and Louvan, Samuel and Rahman, Fahrurrozi and Vania, Clara", |
| 39 | + booktitle="Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing", |
| 40 | + month=nov, |
| 41 | + year="2021", |
| 42 | + address="Online and Punta Cana, Dominican Republic", |
| 43 | + publisher="Association for Computational Linguistics", |
| 44 | + url="https://aclanthology.org/2021.emnlp-main.821", |
| 45 | + pages="10511--10527", |
| 46 | +} |
| 47 | +``` |
| 48 | + |
| 49 | +```bibtex |
| 50 | +@inproceedings{koto2022cloze, |
| 51 | + title={Cloze evaluation for deeper understanding of commonsense stories in Indonesian}, |
| 52 | + author={Koto, Fajri and Baldwin, Timothy and Lau, Jey Han}, |
| 53 | + booktitle={Proceedings of the First Workshop on Commonsense Representation and Reasoning (CSRR 2022)}, |
| 54 | + pages={8--16}, |
| 55 | + year={2022} |
| 56 | +} |
| 57 | +``` |
| 58 | + |
| 59 | +```bibtex |
| 60 | +@misc{bonifacio2021mmarco, |
| 61 | + title={mMARCO: A Multilingual Version of MS MARCO Passage Ranking Dataset}, |
| 62 | + author={Luiz Henrique Bonifacio and Vitor Jeronymo and Hugo Queiroz Abonizio and Israel Campiotti and Marzieh Fadaee and Roberto Lotufo and Rodrigo Nogueira}, |
| 63 | + year={2021}, |
| 64 | + eprint={2108.13897}, |
| 65 | + archivePrefix={arXiv}, |
| 66 | + primaryClass={cs.CL} |
| 67 | +} |
| 68 | +``` |
| 69 | + |
| 70 | +```bibtex |
| 71 | +@article{10.1162/tacl_a_00595, |
| 72 | + author={Zhang, Xinyu and Thakur, Nandan and Ogundepo, Odunayo and Kamalloo, Ehsan and Alfonso-Hermelo, David and Li, Xiaoguang and Liu, Qun and Rezagholizadeh, Mehdi and Lin, Jimmy}, |
| 73 | + title="{MIRACL: A Multilingual Retrieval Dataset Covering 18 Diverse Languages}", |
| 74 | + journal={Transactions of the Association for Computational Linguistics}, |
| 75 | + volume={11}, |
| 76 | + pages={1114--1131}, |
| 77 | + year={2023}, |
| 78 | + month={09}, |
| 79 | + issn={2307-387X}, |
| 80 | + doi={10.1162/tacl_a_00595}, |
| 81 | + url={https://doi.org/10.1162/tacl\_a\_00595}, |
| 82 | + eprint={https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl\_a\_00595/2157340/tacl\_a\_00595.pdf}, |
| 83 | +} |
| 84 | +``` |
| 85 | + |
| 86 | +```bibtex |
| 87 | +@inproceedings{wilie2020indonlu, |
| 88 | + title={IndoNLU: Benchmark and Resources for Evaluating Indonesian Natural Language Understanding}, |
| 89 | + author={Wilie, Bryan and Vincentio, Karissa and Winata, Genta Indra and Cahyawijaya, Samuel and Li, Xiaohong and Lim, Zhi Yuan and Soleman, Sidik and Mahendra, Rahmad and Fung, Pascale and Bahar, Syafri and others}, |
| 90 | + booktitle={Proceedings of the 1st Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 10th International Joint Conference on Natural Language Processing}, |
| 91 | + pages={843--857}, |
| 92 | + year={2020} |
| 93 | +} |
| 94 | +``` |
| 95 | + |
| 96 | +```bibtex |
| 97 | +@article{DBLP:journals/corr/abs-2011-00677, |
| 98 | + author = {Fajri Koto and |
| 99 | + Afshin Rahimi and |
| 100 | + Jey Han Lau and |
| 101 | + Timothy Baldwin}, |
| 102 | + title = {IndoLEM and IndoBERT: {A} Benchmark Dataset and Pre-trained Language |
| 103 | + Model for Indonesian {NLP}}, |
| 104 | + journal = {CoRR}, |
| 105 | + volume = {abs/2011.00677}, |
| 106 | + year = {2020}, |
| 107 | + url = {https://arxiv.org/abs/2011.00677}, |
| 108 | + eprinttype = {arXiv}, |
| 109 | + eprint = {2011.00677}, |
| 110 | + timestamp = {Fri, 06 Nov 2020 15:32:47 +0100}, |
| 111 | + biburl = {https://dblp.org/rec/journals/corr/abs-2011-00677.bib}, |
| 112 | + bibsource = {dblp computer science bibliography, https://dblp.org} |
| 113 | +} |
| 114 | +``` |
| 115 | + |
| 116 | +```bibtex |
| 117 | +@inproceedings{ruder-etal-2021-xtreme, |
| 118 | + title = "{XTREME}-{R}: Towards More Challenging and Nuanced Multilingual Evaluation", |
| 119 | + author = "Ruder, Sebastian and |
| 120 | + Constant, Noah and |
| 121 | + Botha, Jan and |
| 122 | + Siddhant, Aditya and |
| 123 | + Firat, Orhan and |
| 124 | + Fu, Jinlan and |
| 125 | + Liu, Pengfei and |
| 126 | + Hu, Junjie and |
| 127 | + Garrette, Dan and |
| 128 | + Neubig, Graham and |
| 129 | + Johnson, Melvin", |
| 130 | + booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing", |
| 131 | + month = nov, |
| 132 | + year = "2021", |
| 133 | + address = "Online and Punta Cana, Dominican Republic", |
| 134 | + publisher = "Association for Computational Linguistics", |
| 135 | + url = "https://aclanthology.org/2021.emnlp-main.802", |
| 136 | + doi = "10.18653/v1/2021.emnlp-main.802", |
| 137 | + pages = "10215--10245", |
| 138 | +} |
| 139 | +``` |
| 140 | + |
| 141 | +```bibtex |
| 142 | +@inproceedings{purwarianti2007machine, |
| 143 | + title={A Machine Learning Approach for Indonesian Question Answering System}, |
| 144 | + author={Purwarianti, Ayu and Tsuchiya, Masatoshi and Nakagawa, Seiichi}, |
| 145 | + booktitle={Proceedings of Artificial Intelligence and Applications}, |
| 146 | + pages={573--578}, |
| 147 | + year={2007} |
| 148 | +} |
| 149 | +``` |
0 commit comments