1111Original code repository: https://github.com/facebookresearch/stopes/blob/main/stopes/pipelines/monolingual/utils/predict_script.py
1212"""
1313
14- import re
1514import string
1615import typing as tp
1716from collections import Counter , defaultdict
18- from pathlib import Path
1917
2018
2119SCRIPT_RANGES = {
185183}
186184
187185
def get_script_map(language_script_file: Path) -> tp.Dict[str, str]:
    """Return a dict mapping each lang to its expected script, in a single read.

    Each row is expected to hold a language code and a script name separated
    by a tab; any further columns are ignored. Surrounding whitespace
    (including the line terminator) is stripped from both fields, so a row
    written as ``lang<TAB>script`` or ``lang<TAB> script`` yields the same
    entry. Rows without at least two columns (blank lines, malformed rows) are
    skipped instead of raising ``IndexError``.

    Args:
        language_script_file: UTF-8 text file with one language/script row per line.

    Returns:
        A ``defaultdict(str)``; looking up a language that is not in the file
        yields the empty string. If a language appears more than once, the
        last row wins.
    """
    lang_map: tp.Dict[str, str] = defaultdict(str)
    with language_script_file.open("r", encoding="utf-8") as ls:
        for row in ls:
            # Strip every field: the last column otherwise keeps its "\n",
            # which makes the stored script name never compare equal to the
            # bare script name.
            columns = [column.strip() for column in row.split("\t")]
            if len(columns) < 2 or not columns[0]:
                continue
            lang_map[columns[0]] = columns[1]
    return lang_map
196-
197-
def find_lang_script(lang: str, language_script_file: Path) -> tp.Optional[str]:
    """Return the expected script for a single lang.

    The file is scanned row by row and the scan stops at the first row whose
    first tab-separated column equals ``lang`` exactly. A prefix test would
    wrongly let a short code match a longer one that merely starts with the
    same characters. Surrounding whitespace (including the line terminator) is
    stripped from both columns before comparing and returning.

    Args:
        lang: Language code to look up.
        language_script_file: UTF-8 text file with one language/script row per line.

    Returns:
        The script name from the first matching row, or ``None`` when no row
        with at least two columns has ``lang`` as its first column.
    """
    with language_script_file.open("r", encoding="utf-8") as ls:
        for row in ls:
            columns = [column.strip() for column in row.split("\t")]
            # Rows with fewer than two columns carry no script; skip them
            # rather than failing on columns[1].
            if len(columns) >= 2 and columns[0] == lang:
                return columns[1]
    return None
206-
207-
208186ScoredScript = tp .Tuple [tp .Optional [str ], float ]
209187
210188
@@ -224,48 +202,19 @@ def get_script_predictor() -> tp.Callable[[str], ScoredScript]:
224202 for c in string .whitespace + string .punctuation + string .digits
225203 }
226204
def predict_script_org(sent: str) -> ScoredScript:
    """Guess the dominant script of ``sent`` and the share of characters it covers.

    ``replacement_map`` and ``hist_map`` are read from the enclosing scope,
    which is not fully visible here; ``hist_map`` is used as a mapping from a
    code point to the script names it belongs to (presumably built from
    SCRIPT_RANGES -- confirm against the enclosing function).

    Returns:
        ``(script, score)`` where ``score`` is the winning script's count
        divided by the sum of all per-script counts. ``(None, 0.0)`` is
        returned when no character maps to a script, and ``(None, 0)`` when
        several scripts were seen and the best score equals an exactly even
        split between them.
    """
    cleaned = sent.translate(replacement_map)

    # Keep most_common() ordering: it fixes the insertion order of the
    # tallies, which decides the winner when two scripts score the same.
    tallies: tp.Dict[str, int] = defaultdict(int)
    for symbol, occurrences in Counter(cleaned).most_common():
        for script_name in hist_map.get(ord(symbol), []):
            tallies[script_name] += occurrences

    # A character listed under several scripts contributes once per script,
    # so the denominator is the sum of the per-script tallies.
    grand_total = sum(tallies.values())

    best_script: tp.Optional[str] = None
    best_score = 0.0
    for script_name, hits in tallies.items():
        share = abs(hits / grand_total)
        if share > best_score:
            best_script, best_score = script_name, share

    n_scripts = len(tallies)
    if n_scripts > 1 and best_score == (1 / n_scripts):
        return (None, 0)

    return (best_script, best_score)
253-
254-
255205 def predict_script (sent : str ) -> ScoredScript :
256206 sent = sent .translate (replacement_map )
257207
258208 char_counts = Counter (sent )
259209 script_count : tp .Dict [str , int ] = defaultdict (int )
260- total = 0
261210
262211 for char , count in char_counts .items ():
263212 ordinal = ord (char )
264213 for script_name in hist_map .get (ordinal , []):
265214 script_count [script_name ] += count
266215
267216
268- # sort script_count
217+ # sort script_count alphabetically
269218 script_count = dict (sorted (script_count .items ()))
270219
271220 max_score = 0.0
@@ -277,16 +226,17 @@ def predict_script(sent: str) -> ScoredScript:
277226 max_script = script
278227
279228
280- # Report all the scores
229+ # sort all the scores
281230 sorted_scores = {script : abs (count / len (sent )) for script , count in script_count .items ()}
282231 sorted_scores = dict (sorted (sorted_scores .items (), key = lambda item : item [1 ], reverse = True ))
283232
284233 if len (sorted_scores ) > 1 :
285234 second_score = list (sorted_scores .values ())[1 ]
286235 interval = max_score - second_score
287236 tie = True if interval == 0 else False
288-
289237 return (max_script , max_score , {'details' : sorted_scores , 'tie' : tie , 'interval' : interval })
238+ elif max_score == 0 :
239+ return (None , 0 , {'details' : None , 'tie' : None , 'interval' : None })
290240 else :
291241 return (max_script , max_score , {'details' : sorted_scores , 'tie' : False , 'interval' : 1 })
292242
0 commit comments