Skip to content

Commit e6978b4

Browse files
committed
integration of the new kenlm models
1 parent 7ed52f9 commit e6978b4

File tree

4 files changed

+30
-27
lines changed

4 files changed

+30
-27
lines changed

ac_dc/download_sentencepiece_kenlm_models.py

Lines changed: 10 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
1-
"""Download Sentencepiece and KenLM models for supported languages (48) from Facebook.
1+
"""Download Sentencepiece and KenLM models for supported languages.
22
33
Usage:
4-
python download_sentencepiece_kenlm_models.py --output_path /tmp/
4+
python download_sentencepiece_kenlm_models.py --output_dir_path /tmp/
55
66
All Sentencepiece and KenLM language models will be saved under /tmp.
77
"""
@@ -12,12 +12,12 @@
1212
from languages_id import langs_id
1313

1414

15-
def download_sentencepiece_kenlm_models(output_path: str) -> None:
15+
def download_sentencepiece_kenlm_models(output_dir_path: str) -> None:
1616
supported_sentencepiece_langs = langs_id["sentencepiece_id"].dropna().unique()
1717
for lang in supported_sentencepiece_langs:
1818
try:
1919
output_sentencepiece = subprocess.check_output(
20-
f"wget http://dl.fbaipublicfiles.com/cc_net/lm/{lang}.sp.model -P {output_path}",
20+
f"wget https://huggingface.co/edugp/kenlm/resolve/main/wikipedia/{lang}.sp.model -P {output_dir_path}", # http://dl.fbaipublicfiles.com/cc_net/lm/{lang}.sp.model for FB models
2121
shell=True,
2222
)
2323
except:
@@ -29,7 +29,7 @@ def download_sentencepiece_kenlm_models(output_path: str) -> None:
2929
for lang in supported_kenlm_langs:
3030
try:
3131
output_kenlm = subprocess.check_output(
32-
f"wget http://dl.fbaipublicfiles.com/cc_net/lm/{lang}.arpa.bin -P {output_path}",
32+
f"wget https://huggingface.co/edugp/kenlm/resolve/main/wikipedia/{lang}.arpa.bin -P {output_dir_path}", # http://dl.fbaipublicfiles.com/cc_net/lm/{lang}.arpa.bin for FB models
3333
shell=True,
3434
)
3535
except:
@@ -41,8 +41,11 @@ def download_sentencepiece_kenlm_models(output_path: str) -> None:
4141
description="Download Sentencepiece and KenLM models for supported languages."
4242
)
4343
parser.add_argument(
44-
"--output_path", type=str, default="/tmp/", help="Output path to save models."
44+
"--output_dir_path",
45+
type=str,
46+
default="/tmp/",
47+
help="Output directory path to save models.",
4548
)
4649
args = parser.parse_args()
4750

48-
download_sentencepiece_kenlm_models(output_path=args.output_path)
51+
download_sentencepiece_kenlm_models(output_dir_path=args.output_dir_path)

ac_dc/filtering.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -687,7 +687,7 @@ def compute_perplexity_score(document, sentencepiece_model, kenlm_model):
687687
document=document,
688688
remove_non_printing_characters=True,
689689
strip=True,
690-
lower_case=True,
690+
lower_case=False,
691691
uniform_whitespace=True,
692692
replace_digits_with_zeros=True,
693693
replace_unicode_punctuation=True,

ac_dc/languages_id.py

Lines changed: 18 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -26,17 +26,17 @@
2626
"stopwords_id": None,
2727
"flagged_words_id": None,
2828
"fasttext_id": "arz",
29-
"sentencepiece_id": None,
30-
"kenlm_id": None,
29+
"sentencepiece_id": "arz",
30+
"kenlm_id": "arz",
3131
},
3232
{
3333
"lang": "Assamese",
3434
"dataset_id": "as",
3535
"stopwords_id": None,
3636
"flagged_words_id": None,
3737
"fasttext_id": "as",
38-
"sentencepiece_id": None,
39-
"kenlm_id": None,
38+
"sentencepiece_id": "as",
39+
"kenlm_id": "as",
4040
},
4141
{
4242
"lang": "Bengali",
@@ -80,8 +80,8 @@
8080
"stopwords_id": "eu",
8181
"flagged_words_id": "eu",
8282
"fasttext_id": "eu",
83-
"sentencepiece_id": None,
84-
"kenlm_id": None,
83+
"sentencepiece_id": "eu",
84+
"kenlm_id": "eu",
8585
},
8686
{
8787
"lang": "French",
@@ -170,53 +170,53 @@
170170
"stopwords_id": "sw",
171171
"flagged_words_id": None,
172172
"fasttext_id": "sw",
173-
"sentencepiece_id": None,
174-
"kenlm_id": None,
173+
"sentencepiece_id": "sw",
174+
"kenlm_id": "sw",
175175
},
176176
{
177177
"lang": "Tamil",
178178
"dataset_id": "ta",
179179
"stopwords_id": None,
180180
"flagged_words_id": None,
181181
"fasttext_id": "ta",
182-
"sentencepiece_id": None,
183-
"kenlm_id": None,
182+
"sentencepiece_id": "ta",
183+
"kenlm_id": "ta",
184184
},
185185
{
186186
"lang": "Telugu",
187187
"dataset_id": "te",
188188
"stopwords_id": None,
189189
"flagged_words_id": "te",
190190
"fasttext_id": "te",
191-
"sentencepiece_id": None,
192-
"kenlm_id": None,
191+
"sentencepiece_id": "te",
192+
"kenlm_id": "te",
193193
},
194194
{
195195
"lang": "Urdu",
196196
"dataset_id": "ur",
197197
"stopwords_id": "ur",
198198
"flagged_words_id": None,
199199
"fasttext_id": "ur",
200-
"sentencepiece_id": None,
201-
"kenlm_id": None,
200+
"sentencepiece_id": "ur",
201+
"kenlm_id": "ur",
202202
},
203203
{
204204
"lang": "Vietnamese",
205205
"dataset_id": "vi",
206206
"stopwords_id": "vi",
207207
"flagged_words_id": "vi",
208208
"fasttext_id": "vi",
209-
"sentencepiece_id": None,
210-
"kenlm_id": None,
209+
"sentencepiece_id": "vi",
210+
"kenlm_id": "vi",
211211
},
212212
{
213213
"lang": "Yoruba",
214214
"dataset_id": "yo",
215215
"stopwords_id": "yo",
216216
"flagged_words_id": None,
217217
"fasttext_id": "yo",
218-
"sentencepiece_id": None,
219-
"kenlm_id": None,
218+
"sentencepiece_id": "yo",
219+
"kenlm_id": "yo",
220220
},
221221
{
222222
"lang": "Chinese",

ac_dc/visualization/get_data_for_visualization.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -177,7 +177,7 @@ def compute_stats(self):
177177
path_fasttext_model = "ac_dc/lid.176.bin"
178178
path_sentencepiece_model = f"ac_dc/en.sp.model"
179179
path_kenlm_model = f"ac_dc/en.arpa.bin"
180-
path_save_stats = f"ac_dc/visualization/en_examples_with_stats_register.json"
180+
path_save_stats = f"ac_dc/visualization/en_examples_with_stats.json"
181181

182182
dataset = load_dataset(
183183
dataset_name,

0 commit comments

Comments
 (0)