
Commit 1a2dbc8

Fix erroneous metrics reporting in training
1 parent 84e77ea commit 1a2dbc8

File tree

1 file changed: +48 -73 lines

tibert/train.py

Lines changed: 48 additions & 73 deletions
@@ -1,4 +1,4 @@
-from typing import Optional, Union
+from typing import Optional, Union, Literal
 import traceback, copy
 from statistics import mean
 from more_itertools.recipes import flatten
@@ -8,14 +8,14 @@
 from tqdm import tqdm
 from tibert import (
     BertForCoreferenceResolution,
-    BertCoreferenceResolutionOutput,
     CamembertForCoreferenceResolution,
     CoreferenceDataset,
     split_coreference_document,
     DataCollatorForSpanClassification,
     score_coref_predictions,
     score_mention_detection,
 )
+from tibert.predict import predict_coref
 from tibert.utils import gpu_memory_usage
 
 
@@ -29,9 +29,12 @@ def train_coref_model(
     bert_lr: float = 1e-5,
     task_lr: float = 2e-4,
     model_save_path: Optional[str] = None,
+    device_str: Literal["cpu", "cuda", "auto"] = "auto",
     _run: Optional["sacred.run.Run"] = None,
 ) -> BertForCoreferenceResolution:
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    if device_str == "auto":
+        device_str = "cuda" if torch.cuda.is_available() else "cpu"
+    device = torch.device(device_str)
 
     train_dataset = CoreferenceDataset(
         dataset.documents[: int(0.9 * len(dataset))],
@@ -68,9 +71,6 @@ def train_coref_model(
     train_dataloader = DataLoader(
         train_dataset, batch_size=batch_size, shuffle=True, collate_fn=data_collator
     )
-    test_dataloader = DataLoader(
-        test_dataset, batch_size=batch_size, shuffle=False, collate_fn=data_collator
-    )
 
     optimizer = torch.optim.AdamW(
         [
@@ -88,7 +88,7 @@
 
     model = model.to(device)
 
-    for epoch_i in range(epochs_nb):
+    for _ in range(epochs_nb):
         model = model.train()
 
         epoch_losses = []
@@ -120,73 +120,48 @@
         if _run:
             _run.log_scalar("epoch_mean_loss", mean(epoch_losses))
 
-        # metrics computation
-        model = model.eval()
-
-        with torch.no_grad():
-            try:
-                preds = []
-                losses = []
-
-                for batch in tqdm(test_dataloader):
-                    local_batch_size = batch["input_ids"].shape[0]
-                    batch = batch.to(device)
-                    out: BertCoreferenceResolutionOutput = model(**batch)
-                    batch_preds = out.coreference_documents(
-                        [
-                            [tokenizer.decode(t) for t in batch["input_ids"][i]]
-                            for i in range(local_batch_size)
-                        ]
-                    )
-                    preds += batch_preds
-
-                    assert not out.loss is None
-                    losses.append(out.loss.item())
-
-                _ = _run and _run.log_scalar("epoch_mean_test_loss", mean(losses))
-
-                refs = [
-                    doc.prepared_document(
-                        test_dataset.tokenizer, model.config.max_span_size
-                    )[0]
-                    for doc in test_dataset.documents
-                ]
-
-                metrics = score_coref_predictions(preds, refs)
-                conll_f1 = mean(
-                    [metrics["MUC"]["f1"], metrics["B3"]["f1"], metrics["CEAF"]["f1"]]
-                )
-                if _run:
-                    _run.log_scalar("muc_precision", metrics["MUC"]["precision"])
-                    _run.log_scalar("muc_recall", metrics["MUC"]["recall"])
-                    _run.log_scalar("muc_f1", metrics["MUC"]["f1"])
-                    _run.log_scalar("b3_precision", metrics["B3"]["precision"])
-                    _run.log_scalar("b3_recall", metrics["B3"]["recall"])
-                    _run.log_scalar("b3_f1", metrics["B3"]["f1"])
-                    _run.log_scalar("ceaf_precision", metrics["CEAF"]["precision"])
-                    _run.log_scalar("ceaf_recall", metrics["CEAF"]["recall"])
-                    _run.log_scalar("ceaf_f1", metrics["CEAF"]["f1"])
-                    _run.log_scalar("conll_f1", conll_f1)
-                print(metrics)
-
-                m_precision, m_recall, m_f1 = score_mention_detection(preds, refs)
-                if _run:
-                    _run.log_scalar("mention_detection_precision", m_precision)
-                    _run.log_scalar("mention_detection_recall", m_recall)
-                    _run.log_scalar("mention_detection_f1", m_f1)
-                print(
-                    f"mention detection metrics: (precision: {m_precision}, recall: {m_recall}, f1: {m_f1})"
-                )
+        # Metrics Computation
+        # -------------------
+        preds = predict_coref(
+            [doc.tokens for doc in test_dataset.documents],
+            model,
+            tokenizer,
+            batch_size=batch_size,
+            device_str=device_str,
+        )
+        metrics = score_coref_predictions(preds, test_dataset.documents)
 
-            except Exception as e:
-                print(e)
-                traceback.print_exc()
-                conll_f1 = 0
+        conll_f1 = mean(
+            [metrics["MUC"]["f1"], metrics["B3"]["f1"], metrics["CEAF"]["f1"]]
+        )
+        if _run:
+            _run.log_scalar("muc_precision", metrics["MUC"]["precision"])
+            _run.log_scalar("muc_recall", metrics["MUC"]["recall"])
+            _run.log_scalar("muc_f1", metrics["MUC"]["f1"])
+            _run.log_scalar("b3_precision", metrics["B3"]["precision"])
+            _run.log_scalar("b3_recall", metrics["B3"]["recall"])
+            _run.log_scalar("b3_f1", metrics["B3"]["f1"])
+            _run.log_scalar("ceaf_precision", metrics["CEAF"]["precision"])
+            _run.log_scalar("ceaf_recall", metrics["CEAF"]["recall"])
+            _run.log_scalar("ceaf_f1", metrics["CEAF"]["f1"])
+            _run.log_scalar("conll_f1", conll_f1)
+        print(metrics)
+
+        m_precision, m_recall, m_f1 = score_mention_detection(
+            preds, test_dataset.documents
+        )
+        if _run:
+            _run.log_scalar("mention_detection_precision", m_precision)
+            _run.log_scalar("mention_detection_recall", m_recall)
+            _run.log_scalar("mention_detection_f1", m_f1)
+        print(
+            f"mention detection metrics: (precision: {m_precision}, recall: {m_recall}, f1: {m_f1})"
+        )
 
-            if conll_f1 > best_f1 or best_f1 == 0:
-                best_model = copy.deepcopy(model).to("cpu")
-                if not model_save_path is None:
-                    best_model.save_pretrained(model_save_path)
-                best_f1 = conll_f1
+        if conll_f1 > best_f1 or best_f1 == 0:
+            best_model = copy.deepcopy(model).to("cpu")
+            if not model_save_path is None:
+                best_model.save_pretrained(model_save_path)
+            best_f1 = conll_f1
 
     return best_model
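
The substance of the fix sits in the last hunk: the hand-rolled evaluation loop, whose broad try/except reset conll_f1 to 0 on any error, is replaced by a call to tibert's predict_coref, and predictions are scored directly against test_dataset.documents rather than against re-prepared reference documents. As a rough sketch of the new evaluation path, assuming a trained model, its tokenizer, and a held-out test_dataset are already in hand (all three are placeholders here, as is batch_size=4):

    import torch
    from statistics import mean

    from tibert import score_coref_predictions, score_mention_detection
    from tibert.predict import predict_coref

    # Mirror the device resolution this commit adds to train_coref_model:
    # "auto" becomes "cuda" when available, otherwise "cpu".
    device_str = "cuda" if torch.cuda.is_available() else "cpu"

    # Predict on the raw tokens of each held-out document...
    preds = predict_coref(
        [doc.tokens for doc in test_dataset.documents],
        model,          # placeholder: a trained BertForCoreferenceResolution
        tokenizer,      # placeholder: the matching tokenizer
        batch_size=4,   # placeholder value
        device_str=device_str,
    )

    # ...and score against the documents themselves, as the new code does.
    metrics = score_coref_predictions(preds, test_dataset.documents)
    conll_f1 = mean(
        [metrics["MUC"]["f1"], metrics["B3"]["f1"], metrics["CEAF"]["f1"]]
    )
    m_precision, m_recall, m_f1 = score_mention_detection(
        preds, test_dataset.documents
    )

Note that, compared with the removed code, the new path also drops the separate test DataLoader and the epoch_mean_test_loss scalar: evaluation loss is no longer logged, only the coreference and mention detection metrics.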
