3939)
4040HEADER1 = ["LICENSE" , "COUNT" ]
4141HEADER2 = ["LICENSE" , "LANGUAGE" , "COUNT" ]
42- ISO639_CACHE = {}
4342LANGUAGE_ALIAS_MAP = {
4443 "american english" : "English" ,
4544 "american" : "English" ,
4645 "anglais" : "English" ,
47- "bosanski" : "Bosnian" ,
4846 "castellano" : "Spanish" ,
4947 "chinese sub" : "Chinese" ,
50- "deutsch" : "German" ,
5148 "egligh" : "English" ,
5249 "eglish" : "English" ,
5350 "en_us es_es" : "Multiple languages" ,
5754 "english_handwritten" : "English" ,
5855 "engrish" : "English" ,
5956 "enlgish" : "English" ,
60- "espanol" : "Spanish" ,
61- "francais" : "French" ,
6257 "france" : "French" ,
63- "greek" : "Greek" ,
6458 "hwbrew" : "Hebrew" ,
6559 "ilokano" : "Ilokano" ,
6660 "indian english" : "English" ,
67- "italiano" : "Italian" ,
6861 "mandarin" : "Chinese" ,
6962 "multi" : "Multiple Languages" ,
7063 "multilanguage" : "Multiple languages" ,
7164 "multiple" : "Multiple Languages" ,
72- "music" : "Undetermined" ,
73- "n/a" : "Undetermined" ,
74- "nederlands" : "Dutch" ,
75- "no language (english)" : "Undetermined" ,
76- "no speech" : "Undetermined" ,
77- "no spoken language" : "Undetermined" ,
78- "none" : "Undetermined" ,
7965 "polska" : "Polish" ,
8066 "português e espanhol" : "Multiple languages" ,
81- "português" : "Portuguese" ,
82- "pt_br" : "Portuguese" ,
8367 "sgn" : "Sign languages" ,
8468 "spain" : "Spanish" ,
85- "swahili" : "Swahili" ,
8669 "uk english" : "English" ,
87- "unknown" : "Undetermined" ,
8870 "us english" : "English" ,
8971 "us-en" : "English" ,
9072 "viẹetnamese" : "Vietnamese" ,
91- "whatever we play it to be" : "Undetermined" ,
92- "русский" : "Russian" ,
9373 "український" : "Ukrainian" ,
9474}
75+ LANGUAGE_NAME_MAP = {} # Populated by create_language_name_map()
9576LANGUAGE_NOISE_WORDS = [
9677 "-handwritten" ,
9778 "-spoken" ,
@@ -157,6 +138,23 @@ def load_license_mapping():
157138 return license_mapping
158139
159140
def create_language_name_map():
    """Populate the module-level LANGUAGE_NAME_MAP from Babel locale data.

    For every locale identifier known to Babel, both the localized/native
    display name and the English name are normalized with normalize_key(),
    lower-cased, and mapped to the locale's language code
    (``locale.language``). When several locales produce the same key, the
    one processed last wins.

    The global LANGUAGE_NAME_MAP is rebound to a new dict whose items are
    sorted by key. Nothing is returned.
    """
    global LANGUAGE_NAME_MAP
    for locale_code in babel.localedata.locale_identifiers():
        locale = babel.Locale.parse(locale_code)
        # Handle the localized/native name first, then the English name.
        # Each is checked independently: an empty localized name must not
        # prevent the English name of the same locale from being recorded.
        for raw_name in (locale.display_name, locale.english_name):
            key = normalize_key(raw_name)
            if not key:
                continue
            LANGUAGE_NAME_MAP[key.lower()] = locale.language
    # Rebind to a key-sorted dict so iteration order is deterministic.
    LANGUAGE_NAME_MAP = dict(sorted(LANGUAGE_NAME_MAP.items()))
156+
157+
160158def normalize_license (licenseurl , license_mapping = None ):
161159 """Normalize licenseurl and map to standard license label."""
162160 if not isinstance (licenseurl , str ) or not licenseurl .strip ():
@@ -243,18 +241,26 @@ def normalize_language(raw_language):
243241
244242 # Prep for subsequent checks by stripping noise and normalizing
245243 cleaned = normalize_key (strip_noise (raw ))
244+ if cleaned in LANGUAGE_NAME_MAP :
245+ cleaned = LANGUAGE_NAME_MAP [cleaned ]
246+
247+ # 2nd: check language alias map
248+ alias_map = {normalize_key (k ): v for k , v in LANGUAGE_ALIAS_MAP .items ()}
249+ if cleaned in alias_map :
250+ return alias_map [cleaned ]
251+
246252 for language in [raw , cleaned ]:
247253 if not language :
248254 continue
249255
250- # 2nd : check ISO639
256+ # 3rd : check ISO639
251257 try :
252258 name = iso639 .Language .match (language ).name
253259 return name
254260 except iso639 .language .LanguageNotFoundError :
255261 pass
256262
257- # 3rd : check Babel
263+ # 4th : check Babel
258264 try :
259265 language_locale = language .replace ("-" , "_" )
260266 locale = babel .Locale .parse (language_locale , sep = "_" )
@@ -265,11 +271,6 @@ def normalize_language(raw_language):
265271 except ValueError :
266272 pass
267273
268- # 4th: check language alias map
269- alias_map = {normalize_key (k ): v for k , v in LANGUAGE_ALIAS_MAP .items ()}
270- if cleaned in alias_map :
271- return alias_map [cleaned ]
272-
273274 return "Undetermined"
274275
275276
@@ -412,6 +413,8 @@ def main():
412413 )
413414
414415 license_mapping = load_license_mapping ()
416+ create_language_name_map ()
417+
415418 license_data , language_data = query_internet_archive (
416419 args , session , license_mapping
417420 )
0 commit comments