3232# Setup
3333LOGGER , PATHS = shared .setup (__file__ )
3434
35- # CSV paths
35+ # Constants
3636FILE1_COUNT = os .path .join (PATHS ["data_phase" ], "internetarchive_1_count.csv" )
3737FILE2_LANGUAGE = os .path .join (
3838 PATHS ["data_phase" ], "internetarchive_2_count_by_language.csv"
3939)
40-
41- # CSV headers
4240HEADER1 = ["LICENSE" , "COUNT" ]
4341HEADER2 = ["LICENSE" , "LANGUAGE" , "COUNT" ]
42+ ISO639_CACHE = {}
43+ LANGUAGE_ALIAS_MAP = {
44+ "american english" : "English" ,
45+ "american" : "English" ,
46+ "anglais" : "English" ,
47+ "bosanski" : "Bosnian" ,
48+ "castellano" : "Spanish" ,
49+ "chinese sub" : "Chinese" ,
50+ "deutsch" : "German" ,
51+ "egligh" : "English" ,
52+ "eglish" : "English" ,
53+ "en_us es_es" : "Multiple languages" ,
54+ "english & chinese subbed" : "Multiple languages" ,
55+ "english (us)" : "English" ,
56+ "english - american" : "English" ,
57+ "english_handwritten" : "English" ,
58+ "engrish" : "English" ,
59+ "enlgish" : "English" ,
60+ "espanol" : "Spanish" ,
61+ "francais" : "French" ,
62+ "france" : "French" ,
63+ "greek" : "Greek" ,
64+ "hwbrew" : "Hebrew" ,
65+ "ilokano" : "Ilokano" ,
66+ "indian english" : "English" ,
67+ "italiano" : "Italian" ,
68+ "mandarin" : "Chinese" ,
69+ "multi" : "Multiple Languages" ,
70+ "multilanguage" : "Multiple languages" ,
71+ "multiple" : "Multiple Languages" ,
72+ "music" : "Undetermined" ,
73+ "n/a" : "Undetermined" ,
74+ "nederlands" : "Dutch" ,
75+ "no language (english)" : "Undetermined" ,
76+ "no speech" : "Undetermined" ,
77+ "no spoken language" : "Undetermined" ,
78+ "none" : "Undetermined" ,
79+ "polska" : "Polish" ,
80+ "português e espanhol" : "Multiple languages" ,
81+ "português" : "Portuguese" ,
82+ "pt_br" : "Portuguese" ,
83+ "sgn" : "Sign languages" ,
84+ "spain" : "Spanish" ,
85+ "swahili" : "Swahili" ,
86+ "uk english" : "English" ,
87+ "unknown" : "Undetermined" ,
88+ "us english" : "English" ,
89+ "us-en" : "English" ,
90+ "viẹetnamese" : "Vietnamese" ,
91+ "whatever we play it to be" : "Undetermined" ,
92+ "русский" : "Russian" ,
93+ "український" : "Ukrainian" ,
94+ }
4495LIMIT_DEFAULT = 100000
4596QUARTER = os .path .basename (PATHS ["data_quarter" ])
4697
47- ISO639_CACHE = {}
48-
4998
5099def parse_arguments ():
51100 LOGGER .info ("Parsing command-line options" )
@@ -212,24 +261,23 @@ def is_multi_language(raw_language):
212261
213262
214263def normalize_language (raw_language ):
264+ raw = str (raw_language ).strip ()
215265 if not raw_language :
216266 return "Undetermined"
217267
218- raw = str (raw_language ).strip ()
219-
220- # check multi-language
268+ # 1st: check multi-language
221269 if is_multi_language (raw ):
222270 return "Multiple languages"
223271
224- # strip noise and normalize
272+ # Prep for subsequent checks by stripping noise and normalizing
225273 cleaned = normalize_key (strip_noise (raw ))
226274
227- # --- Try ISO639 first ---
275+ # 2nd: check ISO639
228276 lang_obj = iso639_lookup (raw ) or iso639_lookup (cleaned )
229277 if lang_obj and getattr (lang_obj , "name" , None ):
230278 return lang_obj .name
231279
232- # Try Babel
280+ # 3rd: check Babel
233281 for cand in [raw , cleaned ]:
234282 if not cand :
235283 continue
@@ -240,63 +288,10 @@ def normalize_language(raw_language):
240288 except Exception :
241289 pass
242290
243- # --- Try Alias Map ---
244- ALIAS_MAP = {
245- "american english" : "English" ,
246- "american" : "English" ,
247- "anglais" : "English" ,
248- "bosanski" : "Bosnian" ,
249- "castellano" : "Spanish" ,
250- "chinese sub" : "Chinese" ,
251- "deutsch" : "German" ,
252- "egligh" : "English" ,
253- "eglish" : "English" ,
254- "en_us es_es" : "Multiple languages" ,
255- "english & chinese subbed" : "Multiple languages" ,
256- "english (us)" : "English" ,
257- "english - american" : "English" ,
258- "english_handwritten" : "English" ,
259- "engrish" : "English" ,
260- "enlgish" : "English" ,
261- "espanol" : "Spanish" ,
262- "francais" : "French" ,
263- "france" : "French" ,
264- "greek" : "Greek" ,
265- "hwbrew" : "Hebrew" ,
266- "ilokano" : "Ilokano" ,
267- "indian english" : "English" ,
268- "italiano" : "Italian" ,
269- "mandarin" : "Chinese" ,
270- "multi" : "Multiple Languages" ,
271- "multilanguage" : "Multiple languages" ,
272- "multiple" : "Multiple Languages" ,
273- "music" : "Undetermined" ,
274- "n/a" : "Undetermined" ,
275- "nederlands" : "Dutch" ,
276- "no language (english)" : "Undetermined" ,
277- "no speech" : "Undetermined" ,
278- "no spoken language" : "Undetermined" ,
279- "none" : "Undetermined" ,
280- "polska" : "Polish" ,
281- "português e espanhol" : "Multiple languages" ,
282- "português" : "Portuguese" ,
283- "pt_br" : "Portuguese" ,
284- "sgn" : "Sign languages" ,
285- "spain" : "Spanish" ,
286- "swahili" : "Swahili" ,
287- "uk english" : "English" ,
288- "unknown" : "Undetermined" ,
289- "us english" : "English" ,
290- "us-en" : "English" ,
291- "viẹetnamese" : "Vietnamese" ,
292- "whatever we play it to be" : "Undetermined" ,
293- "русский" : "Russian" ,
294- "український" : "Ukrainian" ,
295- }
296- ALIAS_MAP = {normalize_key (k ): v for k , v in ALIAS_MAP .items ()}
297-
298- if cleaned in ALIAS_MAP :
299- return ALIAS_MAP [cleaned ]
291+ # 4th: check language alias map
292+ alias_map = {normalize_key (k ): v for k , v in LANGUAGE_ALIAS_MAP .items ()}
293+ if cleaned in alias_map :
294+ return alias_map [cleaned ]
300295
301296 return "Undetermined"
302297
0 commit comments