Skip to content

Commit 7bd75ad

Browse files
authored
Do not convert Chinese Traditional to Simplified for the target language (#1049)
* Filter out Traditional Chinese when Chinese is the target language
* Filter out Traditional Chinese when Chinese is the target language in the mono corpus
* Fix handling of the parallel corpus
* Do not use environment variables
* Use pathlib
* Fix and rename paths
1 parent b649590 commit 7bd75ad

File tree

5 files changed

+296
-59
lines changed

5 files changed

+296
-59
lines changed

pipeline/data/cjk.py

Lines changed: 108 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@
1010

1111
from pipeline.common.datasets import Statistics
1212
from pipeline.common.downloads import read_lines, write_lines
13+
from pipeline.common.logging import get_logger
1314

15+
logger = get_logger(__file__)
1416

1517
CJK_LANGS = ["zh", "ja", "ko"]
1618

@@ -26,10 +28,13 @@ class ConversionStep(Statistics):
2628
When converting data, count how many sentences were converted, and how many were visited.
2729
"""
2830

29-
def __init__(
    self, description: str, converted=0, filtered=0, dataset_path: Optional[Path] = None
) -> None:
    """
    Track progress of one conversion step.

    Arguments:
        description: Human-readable explanation of what this step counts.
        converted: Initial count of sentences that were converted.
        filtered: Initial count of sentences that were filtered out.
        dataset_path: Optional path used by the Statistics base class.
    """
    super().__init__(dataset_path)
    self.description = description
    # Every sentence examined is counted here, converted or not.
    self.visited = 0
    self.converted = converted
    self.filtered = filtered
3439

3540

@@ -38,7 +43,7 @@ def __init__(self, dataset_path: Path, script: ChineseType) -> None:
3843
super().__init__(dataset_path)
3944
self.script = script
4045
self.script_conversion = ConversionStep(
41-
f"How many sentences in the dataset were converted to {script.name}",
46+
f"How many sentences in the dataset were converted to {script.name} or filtered",
4247
)
4348

4449

@@ -50,6 +55,9 @@ def __init__(self):
5055
def convert_file(
5156
self, input_path: Path, output_path: Path, to: ChineseType
5257
) -> DatasetStatistics:
58+
"""
59+
Convert all lines to one variant of Chinese
60+
"""
5361
stats = DatasetStatistics(output_path, to)
5462
with write_lines(output_path) as out_file, read_lines(input_path) as lines:
5563
for line in lines:
@@ -63,6 +71,51 @@ def convert_file(
6371
out_file.write(new_line)
6472
return stats
6573

74+
def filter_file(self, input_path: Path, output_path: Path, variant: ChineseType):
    """
    Filter everything except the specified variant of Chinese.

    Lines whose detected script does not match `variant` are dropped, and the
    drop count is recorded in the returned statistics.
    """
    stats = DatasetStatistics(output_path, variant)
    conversion = stats.script_conversion
    with read_lines(input_path) as lines, write_lines(output_path) as out_file:
        for line in lines:
            conversion.visited += 1
            # Keep only lines already written in the requested variant.
            if self._detect(line) == variant:
                out_file.write(line)
            else:
                conversion.filtered += 1

    return stats
89+
90+
def filter_parallel_corpus(
    self,
    zh_path: Path,
    other_path: Path,
    zh_output_path: Path,
    other_output_path: Path,
    variant: ChineseType,
):
    """
    Filter everything except the specified variant of Chinese in a parallel corpus.

    A sentence pair is kept only when the Chinese side matches `variant`; the
    aligned line of the other language is written alongside it so the two
    output files stay line-aligned.

    Arguments:
        zh_path: Input file with the Chinese side of the corpus.
        other_path: Input file with the other language, line-aligned with zh_path.
        zh_output_path: Output for the kept Chinese lines.
        other_output_path: Output for the kept other-language lines.
        variant: The Chinese script variant to keep.

    Returns the DatasetStatistics of the filtering step.
    Raises ValueError if the two inputs have different line counts.
    """
    stats = DatasetStatistics(zh_output_path, variant)
    with (
        write_lines(zh_output_path) as zh_out_file,
        write_lines(other_output_path) as other_out_file,
        read_lines(zh_path) as zh_lines,
        read_lines(other_path) as other_lines,
    ):
        # strict=True: a length mismatch means the corpus is misaligned —
        # fail loudly instead of silently truncating to the shorter file.
        for zh_line, other_line in zip(zh_lines, other_lines, strict=True):
            stats.script_conversion.visited += 1
            if self._detect(zh_line) == variant:
                zh_out_file.write(zh_line)
                other_out_file.write(other_line)
            else:
                stats.script_conversion.filtered += 1

    return stats
118+
66119
@staticmethod
67120
def _detect(text) -> ChineseType:
68121
res = hanzidentifier.identify(text)
@@ -80,3 +133,56 @@ def _convert_line(self, text: str, to: ChineseType) -> str:
80133
elif to == ChineseType.traditional:
81134
return self.s2t.convert(text)
82135
raise ValueError(f"Unsupported type: {to}")
136+
137+
138+
def handle_chinese_mono(file_destination: Path, is_src: bool, variant: ChineseType):
    """
    Normalize a monolingual Chinese dataset in place.

    When Chinese is the source language the file is converted to `variant`.
    When it is the target language, lines not already written in `variant` are
    filtered out instead, so the model is not trained to emit converted text.

    Arguments:
        file_destination: The dataset file; it is replaced in place.
        is_src: True when Chinese is the source language of the pair.
        variant: The Chinese script variant to convert to or keep.
    """
    converted_path = file_destination.with_suffix(".converted.zst")
    chinese_converter = ChineseConverter()
    if is_src:
        logger.info(f"Converting the output file to {variant}")
        stats = chinese_converter.convert_file(file_destination, converted_path, variant)
    else:
        logger.info(f"Filtering out everything except {variant} in the output file")
        stats = chinese_converter.filter_file(file_destination, converted_path, variant)
    # Swap the processed file into the original location.
    converted_path.replace(file_destination)
    conversion = stats.script_conversion
    # Use the module logger rather than a bare print, consistent with the
    # progress messages above.
    logger.info(
        f"Converted {conversion.converted}, Filtered: {conversion.filtered} Visited: {conversion.visited}"
    )
    stats.save_json()
152+
153+
154+
def handle_chinese_parallel(output_prefix: str, src: str, trg: str, variant: ChineseType):
    """
    Normalize the Chinese side of a parallel corpus in place.

    When Chinese is the source language, its file is converted to `variant`.
    When Chinese is the target language, sentence pairs whose Chinese side is
    not in `variant` are dropped from both files, so the model is not trained
    to produce converted text.

    Arguments:
        output_prefix: Prefix of the corpus files `{prefix}.{lang}.zst`.
        src: Source language code of the pair.
        trg: Target language code of the pair.
        variant: The Chinese script variant to convert to or keep.

    Raises ValueError when neither language is Chinese.
    """
    if "zh" not in (src, trg):
        raise ValueError("Run only for Chinese")

    chinese_converter = ChineseConverter()
    is_src = src == "zh"
    if is_src:
        # Chinese is the model input: converting what we feed in is safe.
        logger.info(f"Converting the output file to {variant}")
        input_path = Path(f"{output_prefix}.{src}.zst")
        converted_path = Path(f"{output_prefix}.converted.{src}.zst")
        stats = chinese_converter.convert_file(
            input_path=input_path,
            output_path=converted_path,
            to=variant,
        )
        converted_path.replace(input_path)
    else:
        # Chinese is the training target: filter both sides in lockstep so the
        # corpus stays aligned.
        logger.info(f"Filtering out everything except {variant} from a parallel corpus")
        trg_path = Path(f"{output_prefix}.{trg}.zst")
        src_path = Path(f"{output_prefix}.{src}.zst")
        trg_filtered_path = Path(f"{output_prefix}.filtered.{trg}.zst")
        src_filtered_path = Path(f"{output_prefix}.filtered.{src}.zst")
        stats = chinese_converter.filter_parallel_corpus(
            zh_path=trg_path,
            other_path=src_path,
            zh_output_path=trg_filtered_path,
            other_output_path=src_filtered_path,
            variant=variant,
        )
        src_filtered_path.replace(src_path)
        trg_filtered_path.replace(trg_path)
    conversion = stats.script_conversion
    # Use the module logger rather than a bare print, consistent with the
    # progress messages above.
    logger.info(
        f"Converted {conversion.converted}, Filtered: {conversion.filtered} Visited: {conversion.visited}"
    )
    stats.save_json()

pipeline/data/dataset_importer.py

Lines changed: 6 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,8 @@
1515
import os
1616
import random
1717
import re
18-
import shutil
1918
import subprocess
2019
import sys
21-
from pathlib import Path
2220
from typing import Dict, Iterable, List
2321

2422
from opustrainer.modifiers.noise import NoiseModifier
@@ -28,7 +26,7 @@
2826
from opustrainer.types import Modifier
2927

3028
from pipeline.common.downloads import compress_file, decompress_file
31-
from pipeline.data.cjk import ChineseConverter, ChineseType
29+
from pipeline.data.cjk import handle_chinese_parallel, ChineseType
3230

3331
random.seed(1111)
3432

@@ -244,23 +242,12 @@ def run_import(
244242
[os.path.join(current_dir, "download-corpus.sh"), no_aug_id, output_prefix],
245243
env={"SRC": src, "TRG": trg},
246244
)
247-
248-
# TODO: convert everything to Chinese simplified for now
245+
# TODO: convert everything to Chinese simplified for now when Chinese is the source language
249246
# TODO: https://github.com/mozilla/firefox-translations-training/issues/896
250-
for lang in (src, trg):
251-
if lang == "zh":
252-
print("Converting the output file to Chinese Simplified")
253-
chinese_converter = ChineseConverter()
254-
stats = chinese_converter.convert_file(
255-
Path(f"{output_prefix}.{lang}.zst"),
256-
Path(f"{output_prefix}.converted.{lang}.zst"),
257-
ChineseType.simplified,
258-
)
259-
shutil.move(f"{output_prefix}.converted.{lang}.zst", f"{output_prefix}.{lang}.zst")
260-
print(
261-
f"Converted {stats.script_conversion.converted} lines from {stats.script_conversion.visited} to Chinese Simplified"
262-
)
263-
stats.save_json()
247+
if "zh" in (src, trg):
248+
handle_chinese_parallel(
249+
output_prefix, src=src, trg=trg, variant=ChineseType.simplified
250+
)
264251

265252
if aug_modifer:
266253
print("Running augmentation")

pipeline/data/download-mono.py

Lines changed: 7 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121

2222
import argparse
2323
import os
24-
import shutil
2524
from contextlib import ExitStack
2625
from pathlib import Path
2726
from typing import Optional
@@ -35,7 +34,7 @@
3534
write_lines,
3635
)
3736
from pipeline.common.logging import get_logger
38-
from pipeline.data.cjk import ChineseConverter, ChineseType
37+
from pipeline.data.cjk import handle_chinese_mono, ChineseType
3938

4039
CURRENT_FOLDER = os.path.dirname(os.path.abspath(__file__))
4140
IMPORTERS_PATH = os.path.abspath(os.path.join(CURRENT_FOLDER, "mono"))
@@ -50,6 +49,8 @@ def main(args_list: Optional[list[str]] = None) -> None:
5049
)
5150
parser.add_argument("--dataset", type=str, help="The key for the dataset")
5251
parser.add_argument("--language", type=str, help="The BCP 47 language tag of the dataset")
52+
parser.add_argument("--src", type=bool, help="Source language of a language pair")
53+
parser.add_argument("--trg", type=bool, help="Target language of a language pair")
5354
parser.add_argument(
5455
"--max_sentences", type=int, help="The maximum number of sentences to retain"
5556
)
@@ -133,20 +134,12 @@ def main(args_list: Optional[list[str]] = None) -> None:
133134
):
134135
outfile.write(line)
135136

136-
# TODO: convert everything to Chinese simplified for now
137-
# TODO: https://github.com/mozilla/firefox-translations-training/issues/896
138137
if args.language == "zh":
139-
logger.info("Converting the output file to Chinese Simplified")
140-
chinese_converter = ChineseConverter()
141-
converted_path = file_destination.with_suffix(".converted.zst")
142-
stats = chinese_converter.convert_file(
143-
file_destination, converted_path, ChineseType.simplified
138+
# TODO: convert everything to Chinese simplified for now when Chinese is the source language
139+
# TODO: https://github.com/mozilla/firefox-translations-training/issues/896
140+
handle_chinese_mono(
141+
file_destination, is_src=args.src == "zh", variant=ChineseType.simplified
144142
)
145-
shutil.move(converted_path, file_destination)
146-
print(
147-
f"Converted {stats.script_conversion.converted} lines from {stats.script_conversion.visited} to Chinese Simplified"
148-
)
149-
stats.save_json()
150143

151144

152145
if __name__ == "__main__":

taskcluster/kinds/dataset/kind.yml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,8 @@ tasks:
192192
python3 $VCS_PATH/pipeline/data/download-mono.py
193193
--dataset {dataset}
194194
--language {src_locale}
195+
--src {src_locale}
196+
--trg {trg_locale}
195197
--max_sentences {max_sentences}
196198
--artifacts $TASK_WORKDIR/artifacts
197199
@@ -225,6 +227,8 @@ tasks:
225227
python3 $VCS_PATH/pipeline/data/download-mono.py
226228
--dataset {dataset}
227229
--language {trg_locale}
230+
--src {src_locale}
231+
--trg {trg_locale}
228232
--max_sentences {max_sentences}
229233
--artifacts $TASK_WORKDIR/artifacts
230234
@@ -258,6 +262,8 @@ tasks:
258262
python3 $VCS_PATH/pipeline/data/download-mono.py
259263
--dataset {dataset}
260264
--language {src_locale}
265+
--src {src_locale}
266+
--trg {trg_locale}
261267
--max_sentences {max_sentences}
262268
--artifacts $TASK_WORKDIR/artifacts
263269
@@ -291,6 +297,8 @@ tasks:
291297
python3 $VCS_PATH/pipeline/data/download-mono.py
292298
--dataset {dataset}
293299
--language {trg_locale}
300+
--src {src_locale}
301+
--trg {trg_locale}
294302
--max_sentences {max_sentences}
295303
--artifacts $TASK_WORKDIR/artifacts
296304
@@ -328,6 +336,8 @@ tasks:
328336
python3 $VCS_PATH/pipeline/data/download-mono.py
329337
--dataset {dataset}
330338
--language {src_locale}
339+
--src {src_locale}
340+
--trg {trg_locale}
331341
--max_sentences {max_sentences}
332342
--hplt_min_doc_score {hplt_min_doc_score}
333343
--artifacts $TASK_WORKDIR/artifacts
@@ -367,6 +377,8 @@ tasks:
367377
python3 $VCS_PATH/pipeline/data/download-mono.py
368378
--dataset {dataset}
369379
--language {trg_locale}
380+
--src {src_locale}
381+
--trg {trg_locale}
370382
--max_sentences {max_sentences}
371383
--hplt_min_doc_score {hplt_min_doc_score}
372384
--artifacts $TASK_WORKDIR/artifacts
@@ -430,6 +442,8 @@ tasks:
430442
python3 $VCS_PATH/pipeline/data/download-mono.py
431443
--dataset {dataset}
432444
--language {src_locale}
445+
--src {src_locale}
446+
--trg {trg_locale}
433447
--max_sentences {max_sentences}
434448
--artifacts $TASK_WORKDIR/artifacts
435449
@@ -464,5 +478,7 @@ tasks:
464478
python3 $VCS_PATH/pipeline/data/download-mono.py
465479
--dataset {dataset}
466480
--language {trg_locale}
481+
--src {src_locale}
482+
--trg {trg_locale}
467483
--max_sentences {max_sentences}
468484
--artifacts $TASK_WORKDIR/artifacts

0 commit comments

Comments
 (0)