|
10 | 10 |
|
11 | 11 | from stanza.utils.conll import CoNLL
|
12 | 12 |
|
| 13 | +import warnings |
13 | 14 | from random import Random
|
14 | 15 |
|
15 | 16 | import argparse
|
@@ -71,6 +72,7 @@ def process_documents(docs, augment=False):
|
71 | 72 | is_zero = []
|
72 | 73 | word_total = 0
|
73 | 74 | SPANS = re.compile(r"(\(\w+|[%\w]+\))")
|
| 75 | + do_ctn = False # if we broke in the loop |
74 | 76 | for parsed_sentence in doc.sentences:
|
75 | 77 | # spans regex
|
76 | 78 | # parse the misc column, leaving on "Entity" entries
|
@@ -130,6 +132,12 @@ def process_documents(docs, augment=False):
|
130 | 132 | # oo! that's a zero coref, we should merge it forwards
|
131 | 133 | # i.e. we pick the next word as the head!
|
132 | 134 | span = [span[0], span[1]+1, span[2]+1]
|
| 135 | + # crap! there's two zeros right next to each other |
| 136 | + # we are sad and confused so we give up in this case |
| 137 | + if len(sentence_text) > span[1] and sentence_text[span[1]] == "_": |
| 138 | + warnings.warn("Found two zeros next to each other in sequence; we are confused and therefore giving up.") |
| 139 | + do_ctn = True |
| 140 | + break |
133 | 141 | else:
|
134 | 142 | is_zero.append([span[0], False])
|
135 | 143 |
|
@@ -169,12 +177,46 @@ def process_documents(docs, augment=False):
|
169 | 177 | span_clusters[span[0]].append((span_start, span_end))
|
170 | 178 | word_clusters[span[0]].append(candidate_head)
|
171 | 179 | head2span.append((candidate_head, span_start, span_end))
|
| 180 | + if do_ctn: |
| 181 | + break |
172 | 182 | word_total += len(parsed_sentence.all_words)
|
| 183 | + if do_ctn: |
| 184 | + continue |
173 | 185 | span_clusters = sorted([sorted(values) for _, values in span_clusters.items()])
|
174 | 186 | word_clusters = sorted([sorted(values) for _, values in word_clusters.items()])
|
175 | 187 | head2span = sorted(head2span)
|
176 | 188 | is_zero = [i for _,i in sorted(is_zero)]
|
177 | 189 |
|
| 190 | + # remove zero tokens "_" from cased_words and adjust indices accordingly |
| 191 | + zero_positions = [i for i, w in enumerate(cased_words) if w == "_"] |
| 192 | + if zero_positions: |
| 193 | + old_to_new = {} |
| 194 | + new_idx = 0 |
| 195 | + for old_idx, w in enumerate(cased_words): |
| 196 | + if w != "_": |
| 197 | + old_to_new[old_idx] = new_idx |
| 198 | + new_idx += 1 |
| 199 | + cased_words = [w for w in cased_words if w != "_"] |
| 200 | + sent_id = [sent_id[i] for i in sorted(old_to_new.keys())] |
| 201 | + deprel = [deprel[i] for i in sorted(old_to_new.keys())] |
| 202 | + heads = [heads[i] for i in sorted(old_to_new.keys())] |
| 203 | + try: |
| 204 | + span_clusters = [ |
| 205 | + [(old_to_new[start], old_to_new[end - 1] + 1) for start, end in cluster] |
| 206 | + for cluster in span_clusters |
| 207 | + ] |
| 208 | + except: |
| 209 | + warnings.warn("Somehow, we are still coreffering to a zero. This is likely due to multiple zeros on top of each other. We are giving up.") |
| 210 | + continue |
| 211 | + word_clusters = [ |
| 212 | + [old_to_new[h] for h in cluster] |
| 213 | + for cluster in word_clusters |
| 214 | + ] |
| 215 | + head2span = [ |
| 216 | + (old_to_new[h], old_to_new[s], old_to_new[e - 1] + 1) |
| 217 | + for h, s, e in head2span |
| 218 | + ] |
| 219 | + |
178 | 220 | processed = {
|
179 | 221 | "document_id": doc_id,
|
180 | 222 | "cased_words": cased_words,
|
@@ -338,4 +380,3 @@ def main():
|
338 | 380 | ["./extern_data/coref/corefud_v1_3/hu_szegedkoref-corefud-dev.conllu"],
|
339 | 381 | ["./extern_data/coref/corefud_v1_3/hu_szegedkoref-corefud-dev.conllu"]
|
340 | 382 | )
|
341 |
| - |
|
0 commit comments