Do not convert Chinese Traditional to Simplified for the target language (#1049)

evgenyrp · web-flow · commit 7bd75ad5a02d · 2025-03-31T18:26:45.000-07:00
* Filter out Traditional when Chinese is a target language

* Filter out Traditional when Chinese is a target language in mono corpus

* Fix handling parallel corpus

* Do not use env

* Use pathlib

* Fix and rename paths
diff --git a/pipeline/data/cjk.py b/pipeline/data/cjk.py
@@ -10,7 +10,9 @@
 
 from pipeline.common.datasets import Statistics
 from pipeline.common.downloads import read_lines, write_lines
+from pipeline.common.logging import get_logger
 
+logger = get_logger(__file__)
 
 CJK_LANGS = ["zh", "ja", "ko"]
 
@@ -26,10 +28,13 @@ class ConversionStep(Statistics):
     When converting data, count how many sentences were converted, and how many were visited.
     """
 
-    def __init__(self, description: str, converted=0, dataset_path: Optional[Path] = None) -> None:
+    def __init__(
+        self, description: str, converted=0, filtered=0, dataset_path: Optional[Path] = None
+    ) -> None:
         super().__init__(dataset_path)
         self.description = description
         self.converted = converted
+        self.filtered = filtered
         self.visited = 0
 
 
@@ -38,7 +43,7 @@ def __init__(self, dataset_path: Path, script: ChineseType) -> None:
         super().__init__(dataset_path)
         self.script = script
         self.script_conversion = ConversionStep(
-            f"How many sentences in the dataset were converted to {script.name}",
+            f"How many sentences in the dataset were converted to {script.name} or filtered",
         )
 
 
@@ -50,6 +55,9 @@ def __init__(self):
     def convert_file(
         self, input_path: Path, output_path: Path, to: ChineseType
     ) -> DatasetStatistics:
+        """
+        Convert all lines to one variant of Chinese
+        """
         stats = DatasetStatistics(output_path, to)
         with write_lines(output_path) as out_file, read_lines(input_path) as lines:
             for line in lines:
@@ -63,6 +71,51 @@ def convert_file(
                 out_file.write(new_line)
         return stats
 
+    def filter_file(self, input_path: Path, output_path: Path, variant: ChineseType):
+        """
+        Copy to output_path only the lines of input_path detected as `variant`; count the rest as filtered
+        """
+        stats = DatasetStatistics(output_path, variant)
+        with write_lines(output_path) as out_file, read_lines(input_path) as lines:
+            for line in lines:
+                stats.script_conversion.visited += 1
+                ch_type = self._detect(line)
+                if ch_type == variant:  # NOTE(review): exact match only - any other detection result is dropped; confirm intended
+                    out_file.write(line)
+                else:
+                    stats.script_conversion.filtered += 1
+
+        return stats  # statistics object carrying the visited/filtered counters updated above
+
+    def filter_parallel_corpus(
+        self,
+        zh_path: Path,
+        other_path: Path,
+        zh_output_path: Path,
+        other_output_path: Path,
+        variant: ChineseType,
+    ):
+        """
+        Keep only the sentence pairs whose Chinese side is detected as `variant`; raises ValueError if the files differ in length
+        """
+        stats = DatasetStatistics(zh_output_path, variant)
+        with (
+            write_lines(zh_output_path) as zh_out_file,
+            write_lines(other_output_path) as other_out_file,
+            read_lines(zh_path) as zh_lines,
+            read_lines(other_path) as other_lines,
+        ):
+            for zh_line, other_line in zip(zh_lines, other_lines, strict=True):  # fail loudly on a misaligned corpus
+                stats.script_conversion.visited += 1
+                ch_type = self._detect(zh_line)
+                if ch_type == variant:  # both sides are written together so the pair stays aligned
+                    zh_out_file.write(zh_line)
+                    other_out_file.write(other_line)
+                else:
+                    stats.script_conversion.filtered += 1
+
+        return stats  # statistics object carrying the visited/filtered counters updated above
+
     @staticmethod
     def _detect(text) -> ChineseType:
         res = hanzidentifier.identify(text)
@@ -80,3 +133,56 @@ def _convert_line(self, text: str, to: ChineseType) -> str:
         elif to == ChineseType.traditional:
             return self.s2t.convert(text)
         raise ValueError(f"Unsupported type: {to}")
+
+
+def handle_chinese_mono(file_destination: Path, is_src: bool, variant: ChineseType):
+    converted_path = file_destination.with_suffix(".converted.zst")  # temporary output, swapped in below
+    chinese_converter = ChineseConverter()
+    if is_src:  # Chinese is the source language: convert every line to `variant`
+        logger.info(f"Converting the output file to {variant}")
+        stats = chinese_converter.convert_file(file_destination, converted_path, variant)
+    else:  # Chinese is the target language: keep only lines detected as `variant`
+        logger.info(f"Filtering out everything except {variant} in the output file")
+        stats = chinese_converter.filter_file(file_destination, converted_path, variant)
+    converted_path.replace(file_destination)  # overwrite the original file with the processed one
+    print(
+        f"Converted {stats.script_conversion.converted}, Filtered: {stats.script_conversion.filtered} Visited: {stats.script_conversion.visited}"
+    )
+    stats.save_json()
+
+
+def handle_chinese_parallel(output_prefix: str, src: str, trg: str, variant: ChineseType):
+    if "zh" not in (src, trg):  # guard: one side of the language pair must be Chinese
+        raise ValueError("Run only for Chinese")
+
+    chinese_converter = ChineseConverter()
+    is_src = src == "zh"
+    if is_src:  # Chinese is the source: convert it to `variant`; the target file is not touched
+        logger.info(f"Converting the output file to {variant}")
+        input_path = Path(f"{output_prefix}.{src}.zst")
+        converted_path = Path(f"{output_prefix}.converted.{src}.zst")
+        stats = chinese_converter.convert_file(
+            input_path=input_path,
+            output_path=converted_path,
+            to=variant,
+        )
+        converted_path.replace(input_path)  # overwrite the original Chinese file with the converted one
+    else:  # Chinese is the target: drop whole sentence pairs whose Chinese side is not `variant`
+        logger.info(f"Filtering out everything except {variant} from a parallel corpus")
+        trg_path = Path(f"{output_prefix}.{trg}.zst")
+        src_path = Path(f"{output_prefix}.{src}.zst")
+        trg_filtered_path = Path(f"{output_prefix}.filtered.{trg}.zst")
+        src_filtered_path = Path(f"{output_prefix}.filtered.{src}.zst")
+        stats = chinese_converter.filter_parallel_corpus(
+            zh_path=trg_path,
+            other_path=src_path,
+            zh_output_path=trg_filtered_path,
+            other_output_path=src_filtered_path,
+            variant=variant,
+        )
+        src_filtered_path.replace(src_path)  # swap both filtered files in so the two sides stay aligned
+        trg_filtered_path.replace(trg_path)
+    print(
+        f"Converted {stats.script_conversion.converted}, Filtered: {stats.script_conversion.filtered} Visited: {stats.script_conversion.visited}"
+    )
+    stats.save_json()
diff --git a/pipeline/data/dataset_importer.py b/pipeline/data/dataset_importer.py
@@ -15,10 +15,8 @@
 import os
 import random
 import re
-import shutil
 import subprocess
 import sys
-from pathlib import Path
 from typing import Dict, Iterable, List
 
 from opustrainer.modifiers.noise import NoiseModifier
@@ -28,7 +26,7 @@
 from opustrainer.types import Modifier
 
 from pipeline.common.downloads import compress_file, decompress_file
-from pipeline.data.cjk import ChineseConverter, ChineseType
+from pipeline.data.cjk import handle_chinese_parallel, ChineseType
 
 random.seed(1111)
 
@@ -244,23 +242,12 @@ def run_import(
             [os.path.join(current_dir, "download-corpus.sh"), no_aug_id, output_prefix],
             env={"SRC": src, "TRG": trg},
         )
-
-        # TODO: convert everything to Chinese simplified for now
+        # TODO: convert everything to Chinese simplified for now when Chinese is the source language
         # TODO: https://github.com/mozilla/firefox-translations-training/issues/896
-        for lang in (src, trg):
-            if lang == "zh":
-                print("Converting the output file to Chinese Simplified")
-                chinese_converter = ChineseConverter()
-                stats = chinese_converter.convert_file(
-                    Path(f"{output_prefix}.{lang}.zst"),
-                    Path(f"{output_prefix}.converted.{lang}.zst"),
-                    ChineseType.simplified,
-                )
-                shutil.move(f"{output_prefix}.converted.{lang}.zst", f"{output_prefix}.{lang}.zst")
-                print(
-                    f"Converted {stats.script_conversion.converted} lines from {stats.script_conversion.visited} to Chinese Simplified"
-                )
-                stats.save_json()
+        if "zh" in (src, trg):
+            handle_chinese_parallel(
+                output_prefix, src=src, trg=trg, variant=ChineseType.simplified
+            )
 
         if aug_modifer:
             print("Running augmentation")
diff --git a/pipeline/data/download-mono.py b/pipeline/data/download-mono.py
@@ -21,7 +21,6 @@
 
 import argparse
 import os
-import shutil
 from contextlib import ExitStack
 from pathlib import Path
 from typing import Optional
@@ -35,7 +34,7 @@
     write_lines,
 )
 from pipeline.common.logging import get_logger
-from pipeline.data.cjk import ChineseConverter, ChineseType
+from pipeline.data.cjk import handle_chinese_mono, ChineseType
 
 CURRENT_FOLDER = os.path.dirname(os.path.abspath(__file__))
 IMPORTERS_PATH = os.path.abspath(os.path.join(CURRENT_FOLDER, "mono"))
@@ -50,6 +49,8 @@ def main(args_list: Optional[list[str]] = None) -> None:
     )
     parser.add_argument("--dataset", type=str, help="The key for the dataset")
     parser.add_argument("--language", type=str, help="The BCP 47 language tag of the dataset")
+    parser.add_argument("--src", type=str, help="Source language of a language pair")
+    parser.add_argument("--trg", type=str, help="Target language of a language pair")
     parser.add_argument(
         "--max_sentences", type=int, help="The maximum number of sentences to retain"
     )
@@ -133,20 +134,12 @@ def main(args_list: Optional[list[str]] = None) -> None:
         ):
             outfile.write(line)
 
-    # TODO: convert everything to Chinese simplified for now
-    # TODO: https://github.com/mozilla/firefox-translations-training/issues/896
     if args.language == "zh":
-        logger.info("Converting the output file to Chinese Simplified")
-        chinese_converter = ChineseConverter()
-        converted_path = file_destination.with_suffix(".converted.zst")
-        stats = chinese_converter.convert_file(
-            file_destination, converted_path, ChineseType.simplified
+        # TODO: convert everything to Chinese simplified for now when Chinese is the source language
+        # TODO: https://github.com/mozilla/firefox-translations-training/issues/896
+        handle_chinese_mono(
+            file_destination, is_src=args.src == "zh", variant=ChineseType.simplified
         )
-        shutil.move(converted_path, file_destination)
-        print(
-            f"Converted {stats.script_conversion.converted} lines from {stats.script_conversion.visited} to Chinese Simplified"
-        )
-        stats.save_json()
 
 
 if __name__ == "__main__":
diff --git a/taskcluster/kinds/dataset/kind.yml b/taskcluster/kinds/dataset/kind.yml
@@ -192,6 +192,8 @@ tasks:
                     python3 $VCS_PATH/pipeline/data/download-mono.py
                     --dataset {dataset}
                     --language {src_locale}
+                    --src {src_locale}
+                    --trg {trg_locale}
                     --max_sentences {max_sentences}
                     --artifacts $TASK_WORKDIR/artifacts
 
@@ -225,6 +227,8 @@ tasks:
                     python3 $VCS_PATH/pipeline/data/download-mono.py
                     --dataset {dataset}
                     --language {trg_locale}
+                    --src {src_locale}
+                    --trg {trg_locale}
                     --max_sentences {max_sentences}
                     --artifacts $TASK_WORKDIR/artifacts
 
@@ -258,6 +262,8 @@ tasks:
                     python3 $VCS_PATH/pipeline/data/download-mono.py
                     --dataset {dataset}
                     --language {src_locale}
+                    --src {src_locale}
+                    --trg {trg_locale}
                     --max_sentences {max_sentences}
                     --artifacts $TASK_WORKDIR/artifacts
 
@@ -291,6 +297,8 @@ tasks:
                     python3 $VCS_PATH/pipeline/data/download-mono.py
                     --dataset {dataset}
                     --language {trg_locale}
+                    --src {src_locale}
+                    --trg {trg_locale}
                     --max_sentences {max_sentences}
                     --artifacts $TASK_WORKDIR/artifacts
 
@@ -328,6 +336,8 @@ tasks:
                     python3 $VCS_PATH/pipeline/data/download-mono.py
                     --dataset {dataset}
                     --language {src_locale}
+                    --src {src_locale}
+                    --trg {trg_locale}
                     --max_sentences {max_sentences}
                     --hplt_min_doc_score {hplt_min_doc_score}
                     --artifacts $TASK_WORKDIR/artifacts
@@ -367,6 +377,8 @@ tasks:
                     python3 $VCS_PATH/pipeline/data/download-mono.py
                     --dataset {dataset}
                     --language {trg_locale}
+                    --src {src_locale}
+                    --trg {trg_locale}
                     --max_sentences {max_sentences}
                     --hplt_min_doc_score {hplt_min_doc_score}
                     --artifacts $TASK_WORKDIR/artifacts
@@ -430,6 +442,8 @@ tasks:
                     python3 $VCS_PATH/pipeline/data/download-mono.py
                     --dataset {dataset}
                     --language {src_locale}
+                    --src {src_locale}
+                    --trg {trg_locale}
                     --max_sentences {max_sentences}
                     --artifacts $TASK_WORKDIR/artifacts
 
@@ -464,5 +478,7 @@ tasks:
                     python3 $VCS_PATH/pipeline/data/download-mono.py
                     --dataset {dataset}
                     --language {trg_locale}
+                    --src {src_locale}
+                    --trg {trg_locale}
                     --max_sentences {max_sentences}
                     --artifacts $TASK_WORKDIR/artifacts
diff --git a/tests/test_cjk.py b/tests/test_cjk.py