
Commit 85dd893

Support Multi-Supervised All Datasets
1 parent 2ac056f commit 85dd893

File tree

4 files changed: +424 −0 lines changed

Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
# Modified from: https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/paraphrases/MultiDatasetDataLoader.py

import math
import logging
import random


class MultiDatasetDataLoader:
    def __init__(self, datasets, batch_size_pairs, batch_size_triplets=None, dataset_size_temp=-1):
        self.allow_swap = True
        # collate_fn is assigned externally (e.g. SentenceTransformer.fit sets its smart batching collate);
        # it must exist here, otherwise __iter__ would raise AttributeError.
        self.collate_fn = None
        self.batch_size_pairs = batch_size_pairs
        self.batch_size_triplets = batch_size_pairs if batch_size_triplets is None else batch_size_triplets

        # Compute dataset weights
        self.dataset_lengths = list(map(len, datasets))
        self.dataset_lengths_sum = sum(self.dataset_lengths)

        weights = []
        if dataset_size_temp > 0:  # Scale probability with dataset size
            for dataset in datasets:
                prob = len(dataset) / self.dataset_lengths_sum
                weights.append(max(1, int(math.pow(prob, 1 / dataset_size_temp) * 1000)))
        else:  # Equal weighting of all datasets
            weights = [100] * len(datasets)

        logging.info("Dataset lengths and weights: {}".format(list(zip(self.dataset_lengths, weights))))

        self.dataset_idx = []
        self.dataset_idx_pointer = 0

        for idx, weight in enumerate(weights):
            self.dataset_idx.extend([idx] * weight)
        random.shuffle(self.dataset_idx)

        self.datasets = []
        for dataset in datasets:
            random.shuffle(dataset)
            self.datasets.append(
                {
                    "elements": dataset,
                    "pointer": 0,
                }
            )

    def __iter__(self):
        for _ in range(int(self.__len__())):
            # Select dataset
            if self.dataset_idx_pointer >= len(self.dataset_idx):
                self.dataset_idx_pointer = 0
                random.shuffle(self.dataset_idx)

            dataset_idx = self.dataset_idx[self.dataset_idx_pointer]
            self.dataset_idx_pointer += 1

            # Select batch from this dataset
            dataset = self.datasets[dataset_idx]
            batch_size = self.batch_size_pairs if len(dataset["elements"][0].texts) == 2 else self.batch_size_triplets

            batch = []
            while len(batch) < batch_size:
                example = dataset["elements"][dataset["pointer"]]

                if self.allow_swap and random.random() > 0.5:
                    example.texts[0], example.texts[1] = example.texts[1], example.texts[0]

                batch.append(example)

                dataset["pointer"] += 1
                if dataset["pointer"] >= len(dataset["elements"]):
                    dataset["pointer"] = 0
                    random.shuffle(dataset["elements"])

            yield self.collate_fn(batch) if self.collate_fn is not None else batch

    def __len__(self):
        return int(self.dataset_lengths_sum / self.batch_size_pairs)

training/all/README.md

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
# All Supervised Datasets

Inspired by [all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2), we fine-tuned Indonesian sentence embedding models on a collection of existing supervised datasets. The tasks covered by the training data are question answering, textual entailment, retrieval, commonsense reasoning, and natural language inference. Currently, our script simply concatenates these datasets, and our models are trained conventionally with `MultipleNegativesRankingLoss`.
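The full training loop lives in `train_all_mnrl.py`; the snippet below is only a minimal sketch of how the pieces could fit together, not the script's verbatim contents. It assumes the data loader in this commit is importable as `MultiDatasetDataLoader` and reuses the base model and hyperparameters from the command further down; the output path is a placeholder.

```python
# Hedged sketch of the training setup (assumed wiring, not train_all_mnrl.py itself).
from sentence_transformers import SentenceTransformer, models, losses

from all_datasets import WReTE, IndoLEMNTP, TyDiQA, FacQA, mMARCO, MIRACL, IndoStoryCloze, IndoNLI
from MultiDatasetDataLoader import MultiDatasetDataLoader  # assumed module name

# IndoBERT bi-encoder with mean pooling
word_embedding_model = models.Transformer("indobenchmark/indobert-base-p1", max_seq_length=128)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# One list of InputExample objects per source dataset; each list is all pairs or all
# triplets, which the loader uses to choose between the two batch sizes.
datasets = [
    cls.train_samples()
    for cls in (WReTE, IndoLEMNTP, TyDiQA, FacQA, mMARCO, MIRACL, IndoStoryCloze, IndoNLI)
]

# The loader draws each batch from a single dataset, so in-batch negatives stay within one task.
train_dataloader = MultiDatasetDataLoader(datasets, batch_size_pairs=384, batch_size_triplets=256)
train_loss = losses.MultipleNegativesRankingLoss(model)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=5,
    optimizer_params={"lr": 2e-5},
    output_path="outputs/indobert-base-all-supervised",  # placeholder path
)
```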
## Training Data

| Dataset   |            Task             |   Type   | Number of Training Tuples |
| --------- | :-------------------------: | :------: | :-----------------------: |
| indonli   | Natural Language Inference | triplets |           3,914           |
|           |                             |          |                           |
|           |                             |          |                           |
|           |                             |          |                           |
|           |                             |          |                           |
|           |                             |          |                           |
|           |                             |          |                           |
|           |                             |          |                           |
| **Total** |                             |          |        **135,258**        |

## All Supervised Datasets with MultipleNegativesRankingLoss

### IndoBERT Base

```sh
python train_all_mnrl.py \
    --model-name indobenchmark/indobert-base-p1 \
    --max-seq-length 128 \
    --num-epochs 5 \
    --train-batch-size-pairs 384 \
    --train-batch-size-triplets 256 \
    --learning-rate 2e-5
```

## References

```bibtex
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}
```

training/all/all_datasets.py

Lines changed: 196 additions & 0 deletions
@@ -0,0 +1,196 @@
from typing import List
from dataclasses import dataclass
import random

from datasets import load_dataset
from sentence_transformers import InputExample

##############
# PAIRS
##############


@dataclass
class WReTE:
    dataset = load_dataset("SEACrowd/wrete", split="train", trust_remote_code=True)
    # filter for entailment pairs
    dataset = dataset.filter(lambda example: example["label"] == "Entail_or_Paraphrase")

    @staticmethod
    def train_samples() -> List[InputExample]:
        train_samples = []

        for datum in WReTE.dataset:
            train_samples.append(InputExample(texts=[datum["sent_A"], datum["sent_B"]]))

        return train_samples


@dataclass
class IndoLEMNTP:
    dataset = load_dataset("SEACrowd/indolem_ntp", split="train", trust_remote_code=True)
    # filter for entailment pairs
    dataset = dataset.filter(lambda example: example["label"] == 1)

    @staticmethod
    def train_samples() -> List[InputExample]:
        train_samples = []

        for datum in IndoLEMNTP.dataset:
            train_samples.append(InputExample(texts=[datum["tweets"], datum["next_tweet"]]))

        return train_samples


@dataclass
class TyDiQA:
    dataset = load_dataset("khalidalt/tydiqa-goldp", "indonesian", split="train", trust_remote_code=True).shuffle(
        seed=42
    )

    @staticmethod
    def train_samples() -> List[InputExample]:
        train_samples = []

        for datum in TyDiQA.dataset:
            train_samples.append(InputExample(texts=[datum["question_text"], datum["passage_text"]]))
            train_samples.append(InputExample(texts=[datum["question_text"], datum["answers"]["text"][0]]))

        return train_samples


@dataclass
class FacQA:
    dataset = load_dataset("SEACrowd/facqa", split="train", trust_remote_code=True)

    @staticmethod
    def train_samples() -> List[InputExample]:
        train_samples = []

        for datum in FacQA.dataset:
            question = " ".join(datum["question"])
            passage = " ".join(datum["passage"])
            answer = " ".join(t for t, l in zip(datum["passage"], datum["seq_label"]) if l != "O")

            train_samples.append(InputExample(texts=[question, passage]))
            train_samples.append(InputExample(texts=[question, answer]))

        return train_samples


##############
# TRIPLETS
##############


@dataclass
class mMARCO:
    dataset = load_dataset("unicamp-dl/mmarco", "indonesian", split="train", trust_remote_code=True)
    # limit to only 100,000 rows
    dataset = dataset.shuffle(seed=42).select(range(100_000))

    @staticmethod
    def train_samples() -> List[InputExample]:
        train_samples = []

        for datum in mMARCO.dataset:
            train_samples.append(
                InputExample(
                    texts=[
                        datum["query"],
                        datum["positive"],
                        datum["negative"],
                    ]
                )
            )

        return train_samples


@dataclass
class MIRACL:
    dataset = load_dataset("miracl/miracl", "id", split="train", trust_remote_code=True)

    @staticmethod
    def train_samples() -> List[InputExample]:
        train_samples = []

        for datum in MIRACL.dataset:
            query = datum["query"]
            positives = [doc["text"] for doc in datum["positive_passages"]]
            negatives = [doc["text"] for doc in datum["negative_passages"]]

            if len(negatives) > 0:
                train_samples.append(InputExample(texts=[query, random.choice(positives), random.choice(negatives)]))
                train_samples.append(InputExample(texts=[random.choice(positives), query, random.choice(negatives)]))

        return train_samples


@dataclass
class IndoStoryCloze:
    dataset = load_dataset("indolem/indo_story_cloze", split="train", trust_remote_code=True)

    @staticmethod
    def train_samples() -> List[InputExample]:
        train_samples = []

        for datum in IndoStoryCloze.dataset:
            context = ". ".join([datum["sentence-1"], datum["sentence-2"], datum["sentence-3"], datum["sentence-4"]])
            train_samples.append(
                InputExample(
                    texts=[
                        context,
                        datum["correct_ending"],
                        datum["incorrect_ending"],
                    ]
                )
            )

        return train_samples


@dataclass
class IndoNLI:
    dataset = load_dataset("indonli", split="train", trust_remote_code=True)
    id2label = {0: "entailment", 1: "neutral", 2: "contradiction"}

    @staticmethod
    def train_samples() -> List[InputExample]:
        def add_to_samples(sent1, sent2, label):
            if sent1 not in train_data:
                train_data[sent1] = {"contradiction": set(), "entailment": set(), "neutral": set()}
            train_data[sent1][label].add(sent2)

        train_data = {}
        train_samples = []

        for datum in IndoNLI.dataset:
            sent1 = datum["premise"].strip()
            sent2 = datum["hypothesis"].strip()

            add_to_samples(sent1, sent2, IndoNLI.id2label[datum["label"]])
            add_to_samples(sent2, sent1, IndoNLI.id2label[datum["label"]])  # Also add the opposite

        for sent1, others in train_data.items():
            if len(others["entailment"]) > 0 and len(others["contradiction"]) > 0:
                train_samples.append(
                    InputExample(
                        texts=[
                            sent1,
                            random.choice(list(others["entailment"])),
                            random.choice(list(others["contradiction"])),
                        ]
                    )
                )
                train_samples.append(
                    InputExample(
                        texts=[
                            random.choice(list(others["entailment"])),
                            sent1,
                            random.choice(list(others["contradiction"])),
                        ]
                    )
                )

        return train_samples
