@@ -144,9 +144,12 @@ def evaluate(self,
         s_correct = 0
         s_total = 0
 
-        z_correct = 0
+        z_correct = 0  # z_tp
         z_total = 0
 
+        z_fp = 0
+        z_fn = 0
+
 
         with conll.open_(self.config, self.epochs_trained, data_split) \
                 as (gold_f, pred_f):
@@ -159,11 +162,16 @@ def evaluate(self,
                     continue
 
                 res = self.run(doc, run_zeros=True)
-                z_acc = ((res.zeros_scores[res.zeros_y != -100].reshape(-1) > 0.5) ==
-                         (res.zeros_y[res.zeros_y != -100].reshape(-1) == 1))
+
+                z_preds = (res.zeros_scores[res.zeros_y != -100].reshape(-1) > 0.5)
+                z_targets = (res.zeros_y[res.zeros_y != -100].reshape(-1) == 1)
+
+                z_acc = (z_preds == z_targets)
                 z_total += z_acc.size(-1)
                 z_correct += z_acc.sum().item()
-
+
+                z_fp += (z_preds & (~z_targets)).sum().item()
+                z_fn += ((~z_preds) & z_targets).sum().item()
 
                 if (res.coref_y.argmax(dim=1) == 1).all():
                     logger.warning(f"EVAL: skipping document with no corefs...")
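
For reference, a minimal standalone sketch of how the boolean masks above turn into precision and recall; the helper name zero_prf and the toy call are illustrative and not part of the patch, which only accumulates the raw counts:

import torch

def zero_prf(z_preds: torch.Tensor, z_targets: torch.Tensor):
    # Confusion counts over the boolean zero-anaphora predictions.
    tp = (z_preds & z_targets).sum().item()
    fp = (z_preds & ~z_targets).sum().item()
    fn = (~z_preds & z_targets).sum().item()
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    return precision, recall

# e.g. zero_prf(torch.tensor([True, False, True]), torch.tensor([True, True, False]))
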
@@ -205,12 +213,15 @@ def evaluate(self,
                     f" p: {s_lea[1]:.5f},"
                     f" r: {s_lea[2]:<.5f}"
                     f" | Z: "
-                    f" acc: {(z_correct / z_total):<.5f}"
+                    f" acc: {(z_correct / z_total):<.5f},"
+                    f" p: {(z_correct / (z_correct + z_fp)):<.5f},"
+                    f" r: {(z_correct / (z_correct + z_fn)):<.5f}"
         )
         logger.info(f"CoNLL-2012 3-Score Average : {w_checker.bakeoff:.5f}")
 
         return (running_loss / len(docs), *s_checker.total_lea, *w_checker.total_lea, *s_checker.mbc, *w_checker.mbc,
-                w_checker.bakeoff, s_checker.bakeoff, z_correct / z_total)
+                w_checker.bakeoff, s_checker.bakeoff, z_correct / z_total, (z_correct / (z_correct + z_fp)),
+                (z_correct / (z_correct + z_fn)))
 
     def load_weights(self,
                      path: Optional[str] = None,
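
For orientation, the tuple returned by evaluate() now ends with the three zero-anaphora numbers, which is what the scores[-3], scores[-2], scores[-1] indexing in the training-loop hunk further below relies on. A sketch, borrowing the scores name from that hunk:

# ..., w_checker.bakeoff, s_checker.bakeoff, zeros_acc, zeros_p, zeros_r
zeros_acc, zeros_p, zeros_r = scores[-3:]
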
@@ -545,7 +556,18 @@ def train(self, log=False):
                 if (res.zeros_y == 1).any():
                     zeros_preds = res.zeros_scores[res.zeros_y != -100].reshape(-1)
                     labels = res.zeros_y[res.zeros_y != -100].reshape(-1)
-                    zeros_loss = F.binary_cross_entropy(zeros_preds, labels.float())
+                    # reweight such that the zeros and nonzeros count for equal weighting;
+                    # that is, artificially balance the "number of samples" by weighting
+                    # the two classes equally
+
+                    weight_each_zero = 0.5 / labels.sum()
+                    weight_each_nonzero = 0.5 / (labels.size(-1) - labels.sum())
+
+                    weights = torch.empty_like(labels).float()
+                    weights[labels.bool()] = weight_each_zero
+                    weights[~labels.bool()] = weight_each_nonzero
+
+                    zeros_loss = F.binary_cross_entropy(zeros_preds, labels.float(), weight=weights)
                 else:
                     zeros_loss = 0.0  # don't apply loss if there's nothing to learn
 
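
As a standalone illustration of the balancing trick above: each class ends up with half of the total weight mass no matter how imbalanced the batch is. The toy tensors are hypothetical, and torch.where stands in for the in-place masked assignments:

import torch
import torch.nn.functional as F

probs = torch.tensor([0.9, 0.2, 0.7, 0.1, 0.4])   # stand-in for zeros_preds
labels = torch.tensor([1.0, 0.0, 1.0, 0.0, 0.0])  # stand-in for labels.float()

n_pos = labels.sum()            # number of zero-anaphora positives
n_neg = labels.numel() - n_pos  # number of negatives

# Positives share 0.5 of the weight, negatives share the other 0.5.
weights = torch.where(labels.bool(), 0.5 / n_pos, 0.5 / n_neg)

loss = F.binary_cross_entropy(probs, labels, weight=weights)

With the default mean reduction this averages the per-element weighted terms, so the rare class contributes a comparable share of the gradient signal.
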
@@ -595,7 +617,9 @@ def train(self, log=False):
             if log:
                 wandb.log({'dev_score': scores[1]})
                 wandb.log({'dev_bakeoff': scores[-2]})
-                wandb.log({'dev_zeros_acc': scores[-1]})
+                wandb.log({'dev_zeros_acc': scores[-3],
+                           'dev_zeros_p': scores[-2],
+                           'dev_zeros_r': scores[-1]})
 
             if best_f1 is None or scores[1] > best_f1: