Add support for the Democrat dataset

Aethor · Aethor · commit e85b8d55d409 · 2023-11-13T10:51:36.000+01:00
diff --git a/tibert/bertcoref.py b/tibert/bertcoref.py
@@ -400,6 +400,7 @@ def from_conll2012_file(
         max_span_size: int,
         tokens_split_idx: int,
         corefs_split_idx: int,
+        separator: str = "\t",
     ) -> CoreferenceDataset:
         """
         :param tokens_split_idx: index of the tokens column in the
@@ -435,7 +436,7 @@ def from_conll2012_file(
                     open_mentions = {}
                     continue
 
-                splitted = line.split("\t")
+                splitted = line.split(separator)
 
                 # - tokens
                 document_tokens.append(splitted[tokens_split_idx])
@@ -453,7 +454,7 @@ def from_conll2012_file(
                 #   - A ending parenthesis indicate the end of a mention
                 #   - The middle number indicates the ID of the coreference chain
                 #     the mention belongs to
-                if splitted[4] == "-":
+                if splitted[corefs_split_idx] == "-":
                     continue
 
                 coref_datas_list = splitted[corefs_split_idx].split("|")
@@ -635,6 +636,21 @@ def load_litbank_dataset(
     )
 
 
+def load_democrat_dataset(
+    root_path: str, tokenizer: PreTrainedTokenizerFast, max_span_size: int
+) -> CoreferenceDataset:
+    """Load the Democrat dataset from the boberle/coreference_databases repository."""
+    root_path = os.path.expanduser(root_path.rstrip("/"))
+    return CoreferenceDataset.from_conll2012_file(
+        f"{root_path}/democrat_dem1921/dem1921_base.conll",
+        tokenizer,
+        max_span_size,
+        3,
+        11,
+        separator=" ",
+    )
+
+
 def load_fr_litbank_dataset(
     root_path: str, tokenizer: PreTrainedTokenizerFast, max_span_size: int
 ):
diff --git a/tibert/run_train.py b/tibert/run_train.py
@@ -13,7 +13,7 @@
     train_coref_model,
     load_train_checkpoint,
 )
-from tibert.bertcoref import CoreferenceDataset
+from tibert.bertcoref import CoreferenceDataset, load_democrat_dataset
 
 ex = Experiment()
 
@@ -22,7 +22,7 @@
 def config():
     batch_size: int = 1
     epochs_nb: int = 30
-    # either "litbank" or "fr-litbank"
+    # either "litbank", "fr-litbank" or "democrat"
     dataset_name: str = "litbank"
     dataset_path: str = os.path.expanduser("~/litbank")
     mentions_per_tokens: float = 0.4
@@ -45,7 +45,7 @@ def main(
     _run: Run,
     batch_size: int,
     epochs_nb: int,
-    dataset_name: Literal["litbank", "fr-litbank"],
+    dataset_name: Literal["litbank", "fr-litbank", "democrat"],
     dataset_path: str,
     mentions_per_tokens: float,
     antecedents_nb: int,
@@ -74,6 +74,11 @@ def main(
             "tokenizer_class": CamembertTokenizerFast,
             "loading_function": load_fr_litbank_dataset,
         },
+        "democrat": {
+            "model_class": CamembertForCoreferenceResolution,
+            "tokenizer_class": CamembertTokenizerFast,
+            "loading_function": load_democrat_dataset,
+        },
     }
 
     if not dataset_name in dataset_configs: