wip underscore converter

Jemoka · Jemoka · commit 59a0036ceed6 · 2025-07-31T00:07:52.000+02:00
diff --git a/stanza/utils/datasets/coref/convert_udcoref.py b/stanza/utils/datasets/coref/convert_udcoref.py
@@ -22,6 +22,7 @@
 UDCOREF_ADDN = 0 if not IS_UDCOREF_FORMAT else 1
 
 def process_documents(docs, augment=False):
+    # docs = sections
     processed_section = []
 
     for idx, (doc, doc_id, lang) in enumerate(tqdm(docs)):
@@ -67,6 +68,7 @@ def process_documents(docs, augment=False):
         span_clusters = defaultdict(list)
         word_clusters = defaultdict(list)
         head2span = []
+        is_zero = []
         word_total = 0
         SPANS = re.compile(r"(\(\w+|[%\w]+\))")
         for parsed_sentence in doc.sentences:
@@ -114,8 +116,23 @@ def process_documents(docs, augment=False):
                     coref_spans.append([int(k), i[0], i[1]])
             sentence_upos = [x.upos for x in parsed_sentence.all_words]
             sentence_heads = [x.head - 1 if x.head and x.head > 0 else None for x in parsed_sentence.all_words]
+            sentence_text = [x.text for x in parsed_sentence.all_words]
+
+            # if "_" in sentence_text and sentence_text.index("_") in [j for i in coref_spans for j in i]:
+            #     import ipdb
+            #     ipdb.set_trace()
 
             for span in coref_spans:
+                zero = False
+                if sentence_text[span[1]] == "_" and span[1] == span[2]:
+                    is_zero.append([span[0], True])
+                    zero = True
+                    # oo! that's a zero coref, we should merge it forwards
+                    # i.e. we pick the next word as the head!
+                    span = [span[0], span[1]+1, span[2]+1]
+                else:
+                    is_zero.append([span[0], False])
+
                 # input is expected to be start word, end word + 1
                 # counting from 0
                 # whereas the OntoNotes coref_span is [start_word, end_word] inclusive
@@ -124,10 +141,13 @@ def process_documents(docs, augment=False):
                 # if its a zero coref (i.e. coref, but the head in None), we call
                 # the beginning of the span (i.e. the zero itself) the head
 
-                try:
-                    candidate_head = find_cconj_head(sentence_heads, sentence_upos, span[1], span[2]+1)
-                except RecursionError:
+                if zero:
                     candidate_head = span[1]
+                else:
+                    try:
+                        candidate_head = find_cconj_head(sentence_heads, sentence_upos, span[1], span[2]+1)
+                    except RecursionError:
+                        candidate_head = span[1]
                     
                 if candidate_head is None:
                     for candidate_head in range(span[1], span[2] + 1):
@@ -153,6 +173,7 @@ def process_documents(docs, augment=False):
         span_clusters = sorted([sorted(values) for _, values in span_clusters.items()])
         word_clusters = sorted([sorted(values) for _, values in word_clusters.items()])
         head2span = sorted(head2span)
+        is_zero = [i for _,i in sorted(is_zero)]
 
         processed = {
             "document_id": doc_id,
@@ -165,7 +186,8 @@ def process_documents(docs, augment=False):
             "span_clusters": span_clusters,
             "word_clusters": word_clusters,
             "head2span": head2span,
-            "lang": lang
+            "lang": lang,
+            "is_zero": is_zero
         }
         processed_section.append(processed)
     return processed_section
@@ -183,6 +205,7 @@ def process_dataset(short_name, coref_output_path, split_test, train_files, dev_
             lang = load.split("/")[-1].split("_")[0]
             print("Ingesting %s from %s of lang %s" % (section, load, lang))
             docs = CoNLL.conll2multi_docs(load, ignore_gapping=False)
+            # sections = docs[:10]
             print("  Ingested %d documents" % len(docs))
             if split_test and section == 'train':
                 test_section = []
@@ -302,5 +325,17 @@ def main():
     process_dataset(project, coref_output_path, args.split_test, train_filenames, dev_filenames)
 
 if __name__ == '__main__':
-    main()
+    # main()
+
+    project = "test"
+
+    paths = get_default_paths()
+    coref_output_path = paths['COREF_DATA_DIR']
+    process_dataset(
+        project,
+        coref_output_path,
+        False,
+        ["./extern_data/coref/corefud_v1_3/hu_szegedkoref-corefud-dev.conllu"],
+        ["./extern_data/coref/corefud_v1_3/hu_szegedkoref-corefud-dev.conllu"]
+    )