Skip to content

Commit b7a7f77

Browse files
evgenyrp and gregtatum authored
MTdata fixes (#1118)
* Add support for mtdata bcp 47 codes
* Add support for mtdata bcp 47 codes
* Add retries for mtdata
* Add augmentation to mtdata dev datasets
* Fix typo

Co-authored-by: Greg Tatum <gregtatum@users.noreply.github.com>

---------

Co-authored-by: Greg Tatum <gregtatum@users.noreply.github.com>
1 parent 6545be6 commit b7a7f77

File tree

4 files changed

+44
-6
lines changed

4 files changed

+44
-6
lines changed

pipeline/data/parallel_downloaders.py

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import shutil
55
import subprocess
66
import tarfile
7+
import time
78
from enum import Enum
89
from pathlib import Path
910
import zipfile
@@ -80,14 +81,45 @@ def mtdata(src: str, trg: str, dataset: str, output_prefix: Path):
8081
tmp_dir = output_prefix.parent / "mtdata" / dataset
8182
tmp_dir.mkdir(parents=True, exist_ok=True)
8283

83-
run_command(["mtdata", "get", "-l", f"{src}-{trg}", "-tr", dataset, "-o", str(tmp_dir)])
84+
n = 3
85+
while True:
86+
try:
87+
run_command(
88+
["mtdata", "get", "-l", f"{src}-{trg}", "-tr", dataset, "-o", str(tmp_dir)]
89+
)
90+
break
91+
except Exception as ex:
92+
logger.warn(f"Error while downloading mtdata corpus: {ex}")
93+
if n == 1:
94+
logger.error("Exceeded the number of retries, downloading failed")
95+
raise
96+
n -= 1
97+
logger.info("Retrying in 60 seconds...")
98+
time.sleep(60)
99+
continue
84100

85101
for file in tmp_dir.rglob("*"):
86102
logger.info(file)
87103

88-
for lang in (src, trg):
89-
iso = iso3_code(lang, fail_error=True)
90-
file = tmp_dir / "train-parts" / f"{dataset}.{iso}"
104+
# some dataset names include BCP-47 country codes, e.g. OPUS-gnome-v1-eng-zho_CN
105+
src_suffix = None
106+
trg_suffix = None
107+
iso_src = iso3_code(src, fail_error=True)
108+
iso_trg = iso3_code(trg, fail_error=True)
109+
parts = dataset.split("-")
110+
code1, code2 = parts[-1], parts[-2]
111+
# make sure iso369 code matches the beginning of the mtdata langauge code (e.g. zho and zho_CN)
112+
if code1.startswith(iso_src) and code2.startswith(iso_trg):
113+
src_suffix = code1
114+
trg_suffix = code2
115+
elif code2.startswith(iso_src) and code1.startswith(iso_trg):
116+
src_suffix = code2
117+
trg_suffix = code1
118+
else:
119+
ValueError(f"Languages codes {code1}-{code2} do not match {iso_src}-{iso_trg}")
120+
121+
for lang, suffix in ((src, src_suffix), (trg, trg_suffix)):
122+
file = tmp_dir / "train-parts" / f"{dataset}.{suffix}"
91123
compressed_path = compress_file(file, keep_original=False, compression="zst")
92124
compressed_path.rename(output_prefix.with_suffix(f".{lang}.zst"))
93125

tests/fixtures/config.pytest.enzh.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ datasets:
4646
- opus_CCAligned/v1
4747
- opus_CCMatrix/v1
4848
- opus_ELRC-3075-wikipedia_health/v1
49+
- mtdata_OPUS-gnome-v1-eng-zho_CN
4950
mono-src:
5051
- news-crawl_news.2021
5152
- news-crawl_news.2020

tests/test_data_importer.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ def data_dir():
107107
"importer,src_lang,trg_lang,dataset",
108108
[
109109
("mtdata", "en", "ru", "Neulab-tedtalks_test-1-eng-rus"),
110+
("mtdata", "en", "zh", "OPUS-gnome-v1-eng-zho_CN"),
110111
("opus", "en", "ru", "ELRC-3075-wikipedia_health_v1"),
111112
("opus", "ru", "en", "ELRC-3075-wikipedia_health_v1"),
112113
("flores", "en", "ru", "dev"),

utils/config_generator.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
"swedish_work_environment",
4747
# Fails to load from mtdata.
4848
"lithuanian_legislation_seimas_lithuania",
49+
"Microsoft-ntrex",
4950
# Fails to load from OPUS.
5051
"SPC",
5152
# MTdata duplicates Flores that we pull directly
@@ -221,10 +222,13 @@ def add_train_data(
221222
for corpus_key, entry in entries.items():
222223
if entry.did.name in skip_datasets:
223224
continue
225+
modified_corpus_key = corpus_key
224226
# mtdata can have test and devtest data as well.
225227
if entry.did.name.endswith("test"):
226228
dataset = datasets["test"]
227229
elif entry.did.name.endswith("dev"):
230+
dataset_name = corpus_key[corpus_key.find("_") + 1 :]
231+
modified_corpus_key = f"mtdata_{aug_mix_modifier}_{dataset_name}"
228232
dataset = datasets["devtest"]
229233
else:
230234
dataset = datasets["train"]
@@ -243,7 +247,7 @@ def add_train_data(
243247

244248
if fast:
245249
# Just add the dataset when in fast mode.
246-
dataset.append(corpus_key)
250+
dataset.append(modified_corpus_key)
247251
else:
248252
byte_size = None
249253
display_size = None
@@ -264,7 +268,7 @@ def add_train_data(
264268
# Don't add the sentences to the total_sentences, as mtdata is less reliable
265269
# compared to opus.
266270
sentences = estimate_sentence_size(byte_size)
267-
dataset.append(corpus_key)
271+
dataset.append(modified_corpus_key)
268272
if byte_size:
269273
dataset.yaml_add_eol_comment( # type: ignore
270274
f"~{sentences:,} sentences ".rjust(70 - len(corpus_key), " ")

0 commit comments

Comments (0)