Add a lang=he field to all of the documents produced by the Hebrew coref conversion

AngledLuffa · AngledLuffa · commit c55d3d30243d · 2025-10-04T22:57:19.000-07:00
diff --git a/stanza/utils/datasets/coref/convert_hebrew_iahlt.py b/stanza/utils/datasets/coref/convert_hebrew_iahlt.py
@@ -156,7 +156,7 @@ def main():
         input_filename = os.path.join(hebrew_base_path, input_filename)
         assert os.path.exists(input_filename)
         docs = read_doc(tokenizer, input_filename)
-        dataset = [process_document(pipe, doc.doc_id, "", doc.sentences, doc.coref_spans, None) for doc in tqdm(docs)]
+        dataset = [process_document(pipe, doc.doc_id, "", doc.sentences, doc.coref_spans, None, lang="he") for doc in tqdm(docs)]
 
         output_filename = os.path.join(paths["COREF_DATA_DIR"], output_filename)
         write_json_file(output_filename, dataset)
diff --git a/stanza/utils/datasets/coref/utils.py b/stanza/utils/datasets/coref/utils.py
@@ -56,7 +56,7 @@ def find_cconj_head(heads, upos, start, end):
         return cc_indexes[0] + start
     return None
 
-def process_document(pipe, doc_id, part_id, sentences, coref_spans, sentence_speakers, use_cconj_heads=True):
+def process_document(pipe, doc_id, part_id, sentences, coref_spans, sentence_speakers, use_cconj_heads=True, lang=None):
     """
     doc_id: a string naming the document
     part_id: if the document has a particular subpart (can be blank)
@@ -154,4 +154,6 @@ def process_document(pipe, doc_id, part_id, sentences, coref_spans, sentence_spe
     }
     if part_id is not None:
         processed["part_id"] = part_id
+    if lang is not None:
+        processed["lang"] = lang
     return processed