
Commit b05e06f

Add mention detection scoring; report it and the CoNLL F1 during training
1 parent: 6f4a4b4

2 files changed: +73 −11 lines changed

tibert/score.py

Lines changed: 55 additions & 1 deletion
@@ -5,7 +5,7 @@
 from tibert.utils import spans_indexs
 
 if TYPE_CHECKING:
-    from tibert.bertcoref import CoreferenceDocument
+    from tibert.bertcoref import CoreferenceDocument, Mention
 
 
 def score_coref_predictions(
@@ -133,3 +133,57 @@ def precisions_recalls_f1s(
             "f1": mean(ceaf_f1s),
         },
     }
+
+
+def doc_mentions(doc: CoreferenceDocument) -> List[Mention]:
+    return [mention for chain in doc.coref_chains for mention in chain]
+
+
+def score_mention_detection(
+    preds: List[CoreferenceDocument], refs: List[CoreferenceDocument]
+) -> Tuple[float, float, float]:
+    """Compute mention detection precision, recall and F1.
+
+    :param preds: predictions
+    :param refs: references
+
+    :return: ``(precision, recall, f1)``
+    """
+    assert len(preds) > 0
+    assert len(refs) > 0
+
+    precision_l = []
+    recall_l = []
+    f1_l = []
+
+    for pred, ref in zip(preds, refs):
+
+        pred_mentions = doc_mentions(pred)
+        ref_mentions = doc_mentions(ref)
+
+        if len(pred_mentions) == 0:
+            continue
+        precision = len([m for m in pred_mentions if m in ref_mentions]) / len(
+            pred_mentions
+        )
+
+        if len(ref_mentions) == 0:
+            continue
+        recall = len([m for m in ref_mentions if m in pred_mentions]) / len(
+            ref_mentions
+        )
+
+        if precision + recall == 0:
+            continue
+
+        f1 = 2 * (precision * recall) / (precision + recall)
+
+        precision_l.append(precision)
+        recall_l.append(recall)
+        f1_l.append(f1)
+
+    if len(f1_l) == 0:
+        print("[warning] undefined F1 for all samples")
+        return (0.0, 0.0, 0.0)
+
+    return (mean(precision_l), mean(recall_l), mean(f1_l))
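
For reference, score_mention_detection treats mention detection as plain set overlap between the predicted and gold mention spans (pooled across all coreference chains), then macro-averages precision, recall and F1 over documents, skipping a document when any of the three would be undefined. A minimal sketch of the per-document arithmetic, using hypothetical (start, end) tuples as stand-ins for tibert's Mention objects:

# A minimal sketch of the per-document scoring above; the (start, end)
# tuples are hypothetical stand-ins for tibert's Mention objects.
pred_mentions = [(0, 2), (5, 6), (9, 11)]  # hypothetical predicted spans
ref_mentions = [(0, 2), (5, 6), (7, 8)]    # hypothetical reference spans

# Precision: fraction of predicted mentions found in the reference.
precision = len([m for m in pred_mentions if m in ref_mentions]) / len(pred_mentions)
# Recall: fraction of reference mentions that were predicted.
recall = len([m for m in ref_mentions if m in pred_mentions]) / len(ref_mentions)
# F1: harmonic mean of precision and recall.
f1 = 2 * (precision * recall) / (precision + recall)

print(precision, recall, f1)  # each ≈ 0.667 here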

tibert/train.py

Lines changed: 18 additions & 10 deletions
@@ -4,7 +4,7 @@
 from more_itertools.recipes import flatten
 import torch
 from torch.utils.data.dataloader import DataLoader
-from transformers import BertTokenizerFast  # type: ignore
+from transformers import BertTokenizerFast, CamembertTokenizerFast  # type: ignore
 from tqdm import tqdm
 from tibert import (
     BertForCoreferenceResolution,
@@ -14,14 +14,15 @@
     split_coreference_document,
     DataCollatorForSpanClassification,
     score_coref_predictions,
+    score_mention_detection,
 )
 from tibert.utils import gpu_memory_usage
 
 
 def train_coref_model(
     model: Union[BertForCoreferenceResolution, CamembertForCoreferenceResolution],
     dataset: CoreferenceDataset,
-    tokenizer: BertTokenizerFast,
+    tokenizer: Union[BertTokenizerFast, CamembertTokenizerFast],
     batch_size: int = 1,
     epochs_nb: int = 30,
     sents_per_documents_train: int = 11,
@@ -150,8 +151,11 @@ def train_coref_model(
                 )[0]
                 for doc in test_dataset.documents
             ]
-            metrics = score_coref_predictions(preds, refs)
 
+            metrics = score_coref_predictions(preds, refs)
+            conll_f1 = mean(
+                [metrics["MUC"]["f1"], metrics["B3"]["f1"], metrics["CEAF"]["f1"]]
+            )
             if _run:
                 _run.log_scalar("muc_precision", metrics["MUC"]["precision"])
                 _run.log_scalar("muc_recall", metrics["MUC"]["recall"])
@@ -162,23 +166,27 @@ def train_coref_model(
                 _run.log_scalar("ceaf_precision", metrics["CEAF"]["precision"])
                 _run.log_scalar("ceaf_recall", metrics["CEAF"]["recall"])
                 _run.log_scalar("ceaf_f1", metrics["CEAF"]["f1"])
-
+                _run.log_scalar("conll_f1", conll_f1)
             print(metrics)
 
-            # keep the best model
-            model_f1 = mean(
-                [metrics["MUC"]["f1"], metrics["B3"]["f1"], metrics["CEAF"]["f1"]]
+            m_precision, m_recall, m_f1 = score_mention_detection(preds, refs)
+            if _run:
+                _run.log_scalar("mention_detection_precision", m_precision)
+                _run.log_scalar("mention_detection_recall", m_recall)
+                _run.log_scalar("mention_detection_f1", m_f1)
+            print(
+                f"mention detection metrics: (precision: {m_precision}, recall: {m_recall}, f1: {m_f1})"
             )
 
         except Exception as e:
             print(e)
             traceback.print_exc()
-            model_f1 = 0
+            conll_f1 = 0
 
-        if model_f1 > best_f1 or best_f1 == 0:
+        if conll_f1 > best_f1 or best_f1 == 0:
             best_model = copy.deepcopy(model).to("cpu")
             if not model_save_path is None:
                 best_model.save_pretrained(model_save_path)
-            best_f1 = model_f1
+            best_f1 = conll_f1
 
     return best_model
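
For reference, the conll_f1 computed above is the unweighted mean of the MUC, B3 and CEAF F1 scores, following the CoNLL-2012 shared task convention. A quick worked example with hypothetical scores, shaped like the nested dict that score_coref_predictions returns:

from statistics import mean

# Hypothetical per-metric F1 scores (not real results).
metrics = {"MUC": {"f1": 0.80}, "B3": {"f1": 0.70}, "CEAF": {"f1": 0.60}}

# CoNLL F1 is the plain average of the three coreference F1 scores.
conll_f1 = mean([metrics["MUC"]["f1"], metrics["B3"]["f1"], metrics["CEAF"]["f1"]])
print(conll_f1)  # ≈ 0.70

Collapsing the three metrics into this single scalar gives the training loop one number to rank checkpoints by, which is what the conll_f1 > best_f1 check above does.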
