Commit be2f434

Add a version of the HE coref conversion script which mixes in udcoref, which seems to help with the results
1 parent 767d463 commit be2f434

3 files changed: +67, -7 lines

stanza/utils/datasets/coref/convert_hebrew_iahlt.py

Lines changed: 12 additions & 2 deletions
@@ -21,6 +21,7 @@
 52 F1, whereas if we use roberta-xlm, we get 50.
 """
 
+import argparse
 from collections import defaultdict, namedtuple
 import json
 import os
@@ -141,8 +142,15 @@ def write_json_file(output_filename, dataset):
     with open(output_filename, "w", encoding="utf-8") as fout:
         json.dump(dataset, fout, indent=2, ensure_ascii=False)
 
-def main():
+def main(args=None):
     paths = get_default_paths()
+    parser = argparse.ArgumentParser(
+        prog='Convert Hebrew IAHLT data',
+    )
+    parser.add_argument('--output_directory', default=None, type=str, help='Where to output the data (defaults to %s)' % paths['COREF_DATA_DIR'])
+    args = parser.parse_args(args=args)
+    coref_output_path = args.output_directory if args.output_directory else paths['COREF_DATA_DIR']
+    print("Will write IAHLT dataset to %s" % coref_output_path)
 
     coref_input_path = paths["COREF_BASE"]
     hebrew_base_path = os.path.join(coref_input_path, "hebrew", "coref", "train_val_test")
@@ -158,8 +166,10 @@ def main():
         docs = read_doc(tokenizer, input_filename)
         dataset = [process_document(pipe, doc.doc_id, "", doc.sentences, doc.coref_spans, None, lang="he") for doc in tqdm(docs)]
 
-        output_filename = os.path.join(paths["COREF_DATA_DIR"], output_filename)
+        output_filename = os.path.join(coref_output_path, output_filename)
        write_json_file(output_filename, dataset)
 
+    return output_files
+
 if __name__ == '__main__':
     main()
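
With main(args=None) now parsing an explicit argument list and returning the output filenames, the converter can be driven from another script as well as from the command line. A minimal sketch of that programmatic use, assuming the returned list is ordered train, dev, test as the mixed-dataset script below relies on:

import tempfile

from stanza.utils.datasets.coref import convert_hebrew_iahlt

with tempfile.TemporaryDirectory() as scratch_dir:
    # equivalent to running the script with --output_directory <scratch_dir>
    filenames = convert_hebrew_iahlt.main(["--output_directory", scratch_dir])
    print(filenames)  # expected: the three split filenames (train, dev, test)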
Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
+"""
+Build a dataset mixed with IAHLT Hebrew and UD Coref
+
+We find that the IAHLT dataset by itself, trained using Stanza 1.11
+with xlm-roberta-large and a lora finetuning layer, gets 49.7 F1.
+This is a bit lower than the value the IAHLT group originally had, as
+they reported 52. Interestingly, we find that mixing in the 1.3 UD
+Coref improves results, getting 51.7 under the same parameters
+
+This script runs the IAHLT conversion and the UD Coref conversion,
+then combines the files into one big training file
+"""
+
+import json
+import os
+import shutil
+import tempfile
+
+from stanza.utils.datasets.coref import convert_hebrew_iahlt
+from stanza.utils.datasets.coref import convert_udcoref
+from stanza.utils.default_paths import get_default_paths
+
+def main():
+    paths = get_default_paths()
+    coref_output_path = paths['COREF_DATA_DIR']
+    with tempfile.TemporaryDirectory() as temp_dir_path:
+        hebrew_filenames = convert_hebrew_iahlt.main(["--output_directory", temp_dir_path])
+        udcoref_filenames = convert_udcoref.main(["--project", "gerrom", "--output_directory", temp_dir_path])
+
+        with open(os.path.join(temp_dir_path, hebrew_filenames[0]), encoding="utf-8") as fin:
+            hebrew_train = json.load(fin)
+        udcoref_train_filename = os.path.join(temp_dir_path, udcoref_filenames[0])
+        with open(udcoref_train_filename, encoding="utf-8") as fin:
+            print("Reading extra udcoref json data from %s" % udcoref_train_filename)
+            udcoref_train = json.load(fin)
+        mixed_train = hebrew_train + udcoref_train
+        with open(os.path.join(coref_output_path, "he_mixed.train.json"), "w", encoding="utf-8") as fout:
+            json.dump(mixed_train, fout, indent=2, ensure_ascii=False)
+
+        shutil.copyfile(os.path.join(temp_dir_path, hebrew_filenames[1]),
+                        os.path.join(coref_output_path, "he_mixed.dev.json"))
+        shutil.copyfile(os.path.join(temp_dir_path, hebrew_filenames[2]),
+                        os.path.join(coref_output_path, "he_mixed.test.json"))
+
+if __name__ == '__main__':
+    main()
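
Both converters emit a JSON list of processed documents, so the mix is a plain list concatenation before the combined file is written back out. A quick sanity check of the result might look like this (a sketch, assuming the default COREF_DATA_DIR from get_default_paths):

import json
import os

from stanza.utils.default_paths import get_default_paths

paths = get_default_paths()
mixed_filename = os.path.join(paths['COREF_DATA_DIR'], "he_mixed.train.json")
with open(mixed_filename, encoding="utf-8") as fin:
    mixed_train = json.load(fin)
# he_mixed.train.json holds the IAHLT documents followed by the UD Coref documents
print("%d documents in %s" % (len(mixed_train), mixed_filename))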

stanza/utils/datasets/coref/convert_udcoref.py

Lines changed: 9 additions & 5 deletions
@@ -282,13 +282,16 @@ def process_dataset(short_name, coref_output_path, split_test, train_files, dev_
     sections.append(full_test_section)
 
 
+    output_filenames = []
     for section_data, section_name in zip(sections, section_names):
         converted_section = process_documents(section_data, augment=(section_name=="train"))
 
         os.makedirs(coref_output_path, exist_ok=True)
-        output_filename = os.path.join(coref_output_path, "%s.%s.json" % (short_name, section_name))
+        output_filenames.append("%s.%s.json" % (short_name, section_name))
+        output_filename = os.path.join(coref_output_path, output_filenames[-1])
         with open(output_filename, "w", encoding="utf-8") as fout:
             json.dump(converted_section, fout, indent=2)
+    return output_filenames
 
 def get_dataset_by_language(coref_input_path, langs):
     conll_path = os.path.join(coref_input_path, "CorefUD-1.3-public", "data")
@@ -301,21 +304,22 @@ def get_dataset_by_language(coref_input_path, langs):
     dev_filenames = sorted(dev_filenames)
     return train_filenames, dev_filenames
 
-def main():
+def main(args=None):
     paths = get_default_paths()
     parser = argparse.ArgumentParser(
         prog='Convert UDCoref Data',
     )
     parser.add_argument('--split_test', default=None, type=float, help='How much of the data to randomly split from train to make a test set')
+    parser.add_argument('--output_directory', default=None, type=str, help='Where to output the data (defaults to %s)' % paths['COREF_DATA_DIR'])
 
     group = parser.add_mutually_exclusive_group(required=True)
     group.add_argument('--directory', type=str, help="the name of the subfolder for data conversion")
     group.add_argument('--project', type=str, help="Look for and use a set of datasets for data conversion - Slavic or Hungarian")
     group.add_argument('--languages', type=str, help="Only use these specific languages from the coref directory")
 
-    args = parser.parse_args()
+    args = parser.parse_args(args=args)
     coref_input_path = paths['COREF_BASE']
-    coref_output_path = paths['COREF_DATA_DIR']
+    coref_output_path = args.output_directory if args.output_directory else paths['COREF_DATA_DIR']
 
     if args.languages:
         langs = args.languages.split(",")
@@ -369,7 +373,7 @@ def main():
         conll_path = args.directory
         train_filenames = sorted(glob.glob(os.path.join(conll_path, f"*train.conllu")))
         dev_filenames = sorted(glob.glob(os.path.join(conll_path, f"*dev.conllu")))
-    process_dataset(project, coref_output_path, args.split_test, train_filenames, dev_filenames)
+    return process_dataset(project, coref_output_path, args.split_test, train_filenames, dev_filenames)
 
 if __name__ == '__main__':
     main()
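
The same pattern as the IAHLT converter: process_dataset now collects the per-section filenames and main(args=None) returns them, which is what lets the mixed-dataset script above call this module directly. A short sketch of that call, using the same gerrom project argument:

import tempfile

from stanza.utils.datasets.coref import convert_udcoref

with tempfile.TemporaryDirectory() as scratch_dir:
    # the returned names follow the "%s.%s.json" % (short_name, section_name) pattern
    filenames = convert_udcoref.main(["--project", "gerrom", "--output_directory", scratch_dir])
    print(filenames)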
