
Commit 359a2e5

small debugging patches to support empty node prediction
1 parent 009c31c

5 files changed: +10 -28 lines


stanza/models/common/doc.py

Lines changed: 3 additions & 3 deletions
@@ -747,10 +747,10 @@ def all_words(self):
         words = self._words
         empty_words = self._empty_words
 
-        all = sorted(words + empty_words, key=lambda x:(x.id,)
-                     if isinstance(x.id, int) else x.id)
+        all_words = sorted(words + empty_words,
+                           key=lambda x:(x.id,) if isinstance(x.id, int) else x.id)
 
-        return all
+        return all_words
 
     @property
     def ents(self):
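
Besides being clearer, the rename stops shadowing Python's builtin all(). The sort key is what lets regular words (integer id) and empty nodes (tuple id, e.g. (8, 1) for CoNLL-U node 8.1) interleave correctly. A minimal sketch of that key, with bare ids standing in for Word objects:

regular_ids = [1, 2, 8, 9]      # ordinary words carry int ids
empty_ids = [(8, 1), (8, 2)]    # empty nodes 8.1 and 8.2 carry tuple ids

# Wrapping ints as 1-tuples makes the two kinds comparable:
# (8,) < (8, 1) < (8, 2) < (9,) under tuple ordering.
merged = sorted(regular_ids + empty_ids,
                key=lambda x: (x,) if isinstance(x, int) else x)
print(merged)  # [1, 2, 8, (8, 1), (8, 2), 9]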

stanza/models/coref/dataset.py

Lines changed: 0 additions & 14 deletions
@@ -38,9 +38,6 @@ def __init__(self, path, config, tokenizer):
             word2subword = []
             subwords = []
             word_id = []
-            nonblank_subwords = []   # a list of subwords, skipping _
-            previous_was_blank = []  # was the word before _?
-            was_blank = False        # a flag to set if we saw "_"
             for i, word in enumerate(doc["cased_words"]):
                 tokenized = self.tokenizer.tokenize(word)
                 if len(tokenized) == 0:
@@ -53,17 +50,6 @@ def __init__(self, path, config, tokenizer):
                 word2subword.append((len(subwords), len(subwords) + len(tokenized_word)))
                 subwords.extend(tokenized_word)
                 word_id.extend([i] * len(tokenized_word))
-                if word == "_":
-                    was_blank = True
-                else:
-                    nonblank_subwords.extend(tokenized_word)
-                    previous_was_blank.extend(
-                        [True if was_blank else False] + [False] * (len(tokenized_word) - 1)
-                    )
-                    was_blank = False
-
-            doc["nonblank_subwords"] = nonblank_subwords
-            doc["blank_prefix"] = previous_was_blank
 
             doc["word2subword"] = word2subword
             doc["subwords"] = subwords

stanza/models/coref/model.py

Lines changed: 3 additions & 6 deletions
@@ -512,13 +512,11 @@ def train(self, log=False):
                 else:
                     s_loss = torch.zeros_like(c_loss)
 
-                del res
-
                 (c_loss + s_loss + z_loss).backward()
 
                 running_c_loss += c_loss.item()
                 running_s_loss += s_loss.item()
-                if z_loss:
+                if res.zero_scores.size(0) != 0:
                     running_z_loss += z_loss.item()
 
                 # log every 100 docs
@@ -527,12 +525,11 @@ def train(self, log=False):
                         'train_c_loss': c_loss.item(),
                         'train_s_loss': s_loss.item(),
                     }
-                    if z_loss:
+                    if res.zero_scores.size(0) != 0:
                         logged['train_z_loss'] = z_loss.item()
                     wandb.log(logged)
 
-
-                del c_loss, s_loss, z_loss
+                del c_loss, s_loss, z_loss, res
 
                 for optim in self.optimizers.values():
                     optim.step()
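
The switch from `if z_loss:` to `if res.zero_scores.size(0) != 0:` sidesteps tensor truthiness: a scalar loss tensor that is exactly 0.0 is falsy, so a genuinely zero loss would silently skip the accounting. It also explains moving `del res` to the end, since the new condition reads res.zero_scores. A small sketch of the pitfall, assuming z_loss is a scalar tensor and zero_scores is empty when a batch has no zero-anaphora candidates:

import torch

z_loss = torch.tensor(0.0)
print(bool(z_loss))    # False: a 0.0 scalar tensor is falsy, so
                       # `if z_loss:` drops a legitimately zero loss

zero_scores = torch.empty(0)     # no zero-anaphora candidates this batch
print(zero_scores.size(0) != 0)  # False: the new check asks whether any
                                 # candidates exist, not what the loss is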

stanza/pipeline/coref_processor.py

Lines changed: 1 addition & 2 deletions
@@ -174,7 +174,7 @@ def process(self, document):
 
             # if we ended up with no best span, then our "representative text"
             # is just underscore
-            if best_span:
+            if best_span is not None:
                 representative = mentions[best_span]
                 representative_text = extract_text(document, representative.sentence, representative.start_word, representative.end_word)
             else:
@@ -205,7 +205,6 @@ def _handle_zero_anaphora(self, document, results, sent_ids, word_pos):
             cluster_word_ids.extend(cluster)
 
         # Find indices where zero_scores > 0
-        print(zero_scores)
         zero_indices = (zero_scores > 0.0).nonzero()
 
         # this dict maps (cluster_id, word_id) to (cluster_id, start, end)
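
The dropped print(zero_scores) was leftover debug output. The other change matters exactly when the best span is at index 0, a perfectly valid mention index:

best_span = 0                # a valid mention index
if best_span:                # falsy: index 0 is wrongly treated as "no span"
    print("truthiness check fires")
if best_span is not None:    # correct: only None means "no best span"
    print("identity check fires")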

stanza/utils/datasets/coref/convert_udcoref.py

Lines changed: 3 additions & 3 deletions
@@ -129,7 +129,7 @@ def process_documents(docs, augment=False):
                 if sentence_text[span[1]] == "_" and span[1] == span[2]:
                     is_zero.append([span[0], True])
                     zero = True
-                    # oo! thaht's a zero coref, we should merge it forwards
+                    # oo! that's a zero coref, we should merge it forwards
                     # i.e. we pick the next word as the head!
                     span = [span[0], span[1]+1, span[2]+1]
                     # crap! there's two zeros right next to each other
@@ -163,7 +163,7 @@ def process_documents(docs, augment=False):
                 # words from 0, so we have to subtract 1 from the stanza heads
                 #print(span, candidate_head, parsed_sentence.words[candidate_head].head - 1)
                 # treat the head of the phrase as the first word that has a head outside the phrase
-                if parsed_sentence.all_words[candidate_head].head and (
+                if (parsed_sentence.all_words[candidate_head].head is not None) and (
                     parsed_sentence.all_words[candidate_head].head - 1 < span[1] or
                     parsed_sentence.all_words[candidate_head].head - 1 > span[2]
                 ):
@@ -205,7 +205,7 @@ def process_documents(docs, augment=False):
             [(old_to_new[start], old_to_new[end - 1] + 1) for start, end in cluster]
             for cluster in span_clusters
         ]
-    except:
+    except (KeyError, TypeError) as _:  # two errors, either end-1 = -1, or start/end is None
         warnings.warn("Somehow, we are still coreffering to a zero. This is likely due to multiple zeros on top of each other. We are giving up.")
         continue
     word_clusters = [
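
Two of these changes harden truthiness and error handling. In CoNLL-U, head == 0 marks the root, so `word.head and (...)` silently treated root words as head-less, while `head is not None` only excludes a genuinely missing parse; and narrowing the bare `except:` keeps unrelated errors (including KeyboardInterrupt) from being swallowed. A sketch of the head check under those CoNLL-U conventions:

for head in (None, 0, 5):      # no parse / root word / ordinary head
    old = bool(head)           # old check: root (head == 0) is falsy
    new = head is not None     # new check: only a missing head fails
    print(head, old, new)
# None False False
# 0 False True   <- root words are no longer skipped
# 5 True True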
