Skip to content

Commit b7a7f77

Browse files
evgenyrp and gregtatum authored
MTdata fixes (#1118)
* Add support for mtdata bcp 47 codes
* Add support for mtdata bcp 47 codes
* Add retries for mtdata
* Add augmentation to mtdata dev datasets
* Fix typo

Co-authored-by: Greg Tatum <gregtatum@users.noreply.github.com>

---------

Co-authored-by: Greg Tatum <gregtatum@users.noreply.github.com>
1 parent 6545be6 commit b7a7f77

File tree

4 files changed

+44
-6
lines changed

4 files changed

+44
-6
lines changed

pipeline/data/parallel_downloaders.py

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import shutil
55
import subprocess
66
import tarfile
7+
import time
78
from enum import Enum
89
from pathlib import Path
910
import zipfile
@@ -80,14 +81,45 @@ def mtdata(src: str, trg: str, dataset: str, output_prefix: Path):
8081
tmp_dir = output_prefix.parent / "mtdata" / dataset
8182
tmp_dir.mkdir(parents=True, exist_ok=True)
8283

83-
run_command(["mtdata", "get", "-l", f"{src}-{trg}", "-tr", dataset, "-o", str(tmp_dir)])
84+
n = 3
85+
while True:
86+
try:
87+
run_command(
88+
["mtdata", "get", "-l", f"{src}-{trg}", "-tr", dataset, "-o", str(tmp_dir)]
89+
)
90+
break
91+
except Exception as ex:
92+
logger.warn(f"Error while downloading mtdata corpus: {ex}")
93+
if n == 1:
94+
logger.error("Exceeded the number of retries, downloading failed")
95+
raise
96+
n -= 1
97+
logger.info("Retrying in 60 seconds...")
98+
time.sleep(60)
99+
continue
84100

85101
for file in tmp_dir.rglob("*"):
86102
logger.info(file)
87103

88-
for lang in (src, trg):
89-
iso = iso3_code(lang, fail_error=True)
90-
file = tmp_dir / "train-parts" / f"{dataset}.{iso}"
104+
# some dataset names include BCP-47 country codes, e.g. OPUS-gnome-v1-eng-zho_CN
105+
src_suffix = None
106+
trg_suffix = None
107+
iso_src = iso3_code(src, fail_error=True)
108+
iso_trg = iso3_code(trg, fail_error=True)
109+
parts = dataset.split("-")
110+
code1, code2 = parts[-1], parts[-2]
111+
# make sure iso369 code matches the beginning of the mtdata langauge code (e.g. zho and zho_CN)
112+
if code1.startswith(iso_src) and code2.startswith(iso_trg):
113+
src_suffix = code1
114+
trg_suffix = code2
115+
elif code2.startswith(iso_src) and code1.startswith(iso_trg):
116+
src_suffix = code2
117+
trg_suffix = code1
118+
else:
119+
ValueError(f"Languages codes {code1}-{code2} do not match {iso_src}-{iso_trg}")
120+
121+
for lang, suffix in ((src, src_suffix), (trg, trg_suffix)):
122+
file = tmp_dir / "train-parts" / f"{dataset}.{suffix}"
91123
compressed_path = compress_file(file, keep_original=False, compression="zst")
92124
compressed_path.rename(output_prefix.with_suffix(f".{lang}.zst"))
93125

tests/fixtures/config.pytest.enzh.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ datasets:
4646
- opus_CCAligned/v1
4747
- opus_CCMatrix/v1
4848
- opus_ELRC-3075-wikipedia_health/v1
49+
- mtdata_OPUS-gnome-v1-eng-zho_CN
4950
mono-src:
5051
- news-crawl_news.2021
5152
- news-crawl_news.2020

tests/test_data_importer.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ def data_dir():
107107
"importer,src_lang,trg_lang,dataset",
108108
[
109109
("mtdata", "en", "ru", "Neulab-tedtalks_test-1-eng-rus"),
110+
("mtdata", "en", "zh", "OPUS-gnome-v1-eng-zho_CN"),
110111
("opus", "en", "ru", "ELRC-3075-wikipedia_health_v1"),
111112
("opus", "ru", "en", "ELRC-3075-wikipedia_health_v1"),
112113
("flores", "en", "ru", "dev"),

utils/config_generator.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
"swedish_work_environment",
4747
# Fails to load from mtdata.
4848
"lithuanian_legislation_seimas_lithuania",
49+
"Microsoft-ntrex",
4950
# Fails to load from OPUS.
5051
"SPC",
5152
# MTdata duplicates Flores that we pull directly
@@ -221,10 +222,13 @@ def add_train_data(
221222
for corpus_key, entry in entries.items():
222223
if entry.did.name in skip_datasets:
223224
continue
225+
modified_corpus_key = corpus_key
224226
# mtdata can have test and devtest data as well.
225227
if entry.did.name.endswith("test"):
226228
dataset = datasets["test"]
227229
elif entry.did.name.endswith("dev"):
230+
dataset_name = corpus_key[corpus_key.find("_") + 1 :]
231+
modified_corpus_key = f"mtdata_{aug_mix_modifier}_{dataset_name}"
228232
dataset = datasets["devtest"]
229233
else:
230234
dataset = datasets["train"]
@@ -243,7 +247,7 @@ def add_train_data(
243247

244248
if fast:
245249
# Just add the dataset when in fast mode.
246-
dataset.append(corpus_key)
250+
dataset.append(modified_corpus_key)
247251
else:
248252
byte_size = None
249253
display_size = None
@@ -264,7 +268,7 @@ def add_train_data(
264268
# Don't add the sentences to the total_sentences, as mtdata is less reliable
265269
# compared to opus.
266270
sentences = estimate_sentence_size(byte_size)
267-
dataset.append(corpus_key)
271+
dataset.append(modified_corpus_key)
268272
if byte_size:
269273
dataset.yaml_add_eol_comment( # type: ignore
270274
f"~{sentences:,} sentences ".rjust(70 - len(corpus_key), " ")

0 commit comments

Comments (0)