Integrate the new KenLM and SentencePiece models (the `wikipedia` models from huggingface.co/edugp/kenlm)

HugoLaurencon · HugoLaurencon · commit e6978b48b326 · 2022-01-28T22:06:27.000+01:00
diff --git a/ac_dc/download_sentencepiece_kenlm_models.py b/ac_dc/download_sentencepiece_kenlm_models.py
@@ -1,7 +1,7 @@
-"""Download Sentencepiece and KenLM models for supported languages (48) from Facebook.
+"""Download Sentencepiece and KenLM models for supported languages.
 
 Usage:
-    python download_sentencepiece_kenlm_models.py --output_path /tmp/
+    python download_sentencepiece_kenlm_models.py --output_dir_path /tmp/
 
 All Sentencepiece and KenLM language models will be saved under /tmp.
 """
@@ -12,12 +12,12 @@
 from languages_id import langs_id
 
 
-def download_sentencepiece_kenlm_models(output_path: str) -> None:
+def download_sentencepiece_kenlm_models(output_dir_path: str) -> None:
     supported_sentencepiece_langs = langs_id["sentencepiece_id"].dropna().unique()
     for lang in supported_sentencepiece_langs:
         try:
             output_sentencepiece = subprocess.check_output(
-                f"wget http://dl.fbaipublicfiles.com/cc_net/lm/{lang}.sp.model -P {output_path}",
+                f"wget https://huggingface.co/edugp/kenlm/resolve/main/wikipedia/{lang}.sp.model -P {output_dir_path}",  # http://dl.fbaipublicfiles.com/cc_net/lm/{lang}.sp.model for FB models
                 shell=True,
             )
         except:
@@ -29,7 +29,7 @@ def download_sentencepiece_kenlm_models(output_path: str) -> None:
     for lang in supported_kenlm_langs:
         try:
             output_kenlm = subprocess.check_output(
-                f"wget http://dl.fbaipublicfiles.com/cc_net/lm/{lang}.arpa.bin -P {output_path}",
+                f"wget https://huggingface.co/edugp/kenlm/resolve/main/wikipedia/{lang}.arpa.bin -P {output_dir_path}",  # http://dl.fbaipublicfiles.com/cc_net/lm/{lang}.arpa.bin for FB models
                 shell=True,
             )
         except:
@@ -41,8 +41,11 @@ def download_sentencepiece_kenlm_models(output_path: str) -> None:
         description="Download Sentencepiece and KenLM models for supported languages."
     )
     parser.add_argument(
-        "--output_path", type=str, default="/tmp/", help="Output path to save models."
+        "--output_dir_path",
+        type=str,
+        default="/tmp/",
+        help="Output directory path to save models.",
     )
     args = parser.parse_args()
 
-    download_sentencepiece_kenlm_models(output_path=args.output_path)
+    download_sentencepiece_kenlm_models(output_dir_path=args.output_dir_path)
diff --git a/ac_dc/filtering.py b/ac_dc/filtering.py
@@ -687,7 +687,7 @@ def compute_perplexity_score(document, sentencepiece_model, kenlm_model):
             document=document,
             remove_non_printing_characters=True,
             strip=True,
-            lower_case=True,
+            lower_case=False,
             uniform_whitespace=True,
             replace_digits_with_zeros=True,
             replace_unicode_punctuation=True,
diff --git a/ac_dc/languages_id.py b/ac_dc/languages_id.py
@@ -26,17 +26,17 @@
         "stopwords_id": None,
         "flagged_words_id": None,
         "fasttext_id": "arz",
-        "sentencepiece_id": None,
-        "kenlm_id": None,
+        "sentencepiece_id": "arz",
+        "kenlm_id": "arz",
     },
     {
         "lang": "Assamese",
         "dataset_id": "as",
         "stopwords_id": None,
         "flagged_words_id": None,
         "fasttext_id": "as",
-        "sentencepiece_id": None,
-        "kenlm_id": None,
+        "sentencepiece_id": "as",
+        "kenlm_id": "as",
     },
     {
         "lang": "Bengali",
@@ -80,8 +80,8 @@
         "stopwords_id": "eu",
         "flagged_words_id": "eu",
         "fasttext_id": "eu",
-        "sentencepiece_id": None,
-        "kenlm_id": None,
+        "sentencepiece_id": "eu",
+        "kenlm_id": "eu",
     },
     {
         "lang": "French",
@@ -170,53 +170,53 @@
         "stopwords_id": "sw",
         "flagged_words_id": None,
         "fasttext_id": "sw",
-        "sentencepiece_id": None,
-        "kenlm_id": None,
+        "sentencepiece_id": "sw",
+        "kenlm_id": "sw",
     },
     {
         "lang": "Tamil",
         "dataset_id": "ta",
         "stopwords_id": None,
         "flagged_words_id": None,
         "fasttext_id": "ta",
-        "sentencepiece_id": None,
-        "kenlm_id": None,
+        "sentencepiece_id": "ta",
+        "kenlm_id": "ta",
     },
     {
         "lang": "Telugu",
         "dataset_id": "te",
         "stopwords_id": None,
         "flagged_words_id": "te",
         "fasttext_id": "te",
-        "sentencepiece_id": None,
-        "kenlm_id": None,
+        "sentencepiece_id": "te",
+        "kenlm_id": "te",
     },
     {
         "lang": "Urdu",
         "dataset_id": "ur",
         "stopwords_id": "ur",
         "flagged_words_id": None,
         "fasttext_id": "ur",
-        "sentencepiece_id": None,
-        "kenlm_id": None,
+        "sentencepiece_id": "ur",
+        "kenlm_id": "ur",
     },
     {
         "lang": "Vietnamese",
         "dataset_id": "vi",
         "stopwords_id": "vi",
         "flagged_words_id": "vi",
         "fasttext_id": "vi",
-        "sentencepiece_id": None,
-        "kenlm_id": None,
+        "sentencepiece_id": "vi",
+        "kenlm_id": "vi",
     },
     {
         "lang": "Yoruba",
         "dataset_id": "yo",
         "stopwords_id": "yo",
         "flagged_words_id": None,
         "fasttext_id": "yo",
-        "sentencepiece_id": None,
-        "kenlm_id": None,
+        "sentencepiece_id": "yo",
+        "kenlm_id": "yo",
     },
     {
         "lang": "Chinese",
diff --git a/ac_dc/visualization/get_data_for_visualization.py b/ac_dc/visualization/get_data_for_visualization.py
@@ -177,7 +177,7 @@ def compute_stats(self):
     path_fasttext_model = "ac_dc/lid.176.bin"
     path_sentencepiece_model = f"ac_dc/en.sp.model"
     path_kenlm_model = f"ac_dc/en.arpa.bin"
-    path_save_stats = f"ac_dc/visualization/en_examples_with_stats_register.json"
+    path_save_stats = f"ac_dc/visualization/en_examples_with_stats.json"
 
     dataset = load_dataset(
         dataset_name,