1111import sys
1212import textwrap
1313import traceback
14+ import unicodedata
1415from collections import Counter
1516from time import sleep
1617from urllib .parse import urlparse
1718
1819# Third-party
1920from babel .core import Locale
2021from internetarchive import ArchiveSession
22+ from iso639 import Language
2123from pygments import highlight
2224from pygments .formatters import TerminalFormatter
2325from pygments .lexers import PythonTracebackLexer
3234# Setup
3335LOGGER , PATHS = shared .setup (__file__ )
3436
35- # Constants
37+ # CSV paths
3638FILE1_COUNT = os .path .join (PATHS ["data_phase" ], "internetarchive_1_count.csv" )
3739FILE2_LANGUAGE = os .path .join (
3840 PATHS ["data_phase" ], "internetarchive_2_count_by_language.csv"
4446
4547QUARTER = os .path .basename (PATHS ["data_quarter" ])
4648
# Cache of iso639_lookup() results, keyed by the stripped, lowercased search
# term. Failed lookups are stored as None so they are not retried.
ISO639_CACHE = {}
50+
4751
4852def parse_arguments ():
4953 LOGGER .info ("Parsing command-line options" )
@@ -95,40 +99,6 @@ def load_license_mapping():
9599 return license_mapping
96100
97101
98- def normalize_language (lang ):
99- try :
100- # Pre-clean: lowercase, replace hyphens with underscores,
101- # strip whitespace
102- cleaned = lang .strip ().lower ().replace ("-" , "_" )
103-
104- # Special handling for common variants
105- alias_map = {
106- "english" : "en" ,
107- "english_handwritten" : "en" ,
108- "serbo_croatian" : "sh" , # ISO 639-1 code for Serbo-Croatian
109- "zun" : "zun" , # Zuni — not in Babel, will fallback
110- "ada" : "ada" , # Ada — not in Babel, will fallback
111- "unknown" : "UNKNOWN" ,
112- }
113- cleaned = alias_map .get (cleaned , cleaned )
114-
115- # Custom name overrides for codes not supported by Babel
116- custom_names = {
117- "zun" : "Zuni" ,
118- "ada" : "Adangme" ,
119- "sh" : "Serbo-Croatian" ,
120- }
121-
122- try :
123- locale = Locale .parse (cleaned , sep = "_" )
124- return locale .get_language_name ("en" )
125- except Exception :
126- return custom_names .get (cleaned , "UNKNOWN" )
127-
128- except Exception :
129- return "UNKNOWN"
130-
131-
132102def normalize_license (licenseurl , license_mapping = None ):
133103 """Normalize licenseurl and map to standard license label."""
134104 if not isinstance (licenseurl , str ) or not licenseurl .strip ():
@@ -157,6 +127,174 @@ def normalize_license(licenseurl, license_mapping=None):
157127 return label
158128
159129
def normalize_key(s):
    """Normalize a string for use as a dictionary key.

    Steps: NFKD-decompose, drop combining marks (diacritics), replace
    punctuation with spaces, collapse whitespace runs, trim, and lowercase.
    The characters ``+``, ``-`` and ``/`` are kept so callers can still
    split on them.

    Args:
        s: Value to normalize; non-string values are converted with str().

    Returns:
        The normalized key, or "" when ``s`` is falsy.
    """
    if not s:
        return ""
    decomposed = unicodedata.normalize("NFKD", str(s))
    without_marks = "".join(
        ch for ch in decomposed if not unicodedata.combining(ch)
    )
    # Keep + / - for splits; every other punctuation mark becomes a space.
    spaced = re.sub(r"[^\w\s\+\-/]", " ", without_marks, flags=re.UNICODE)
    # Collapse whitespace and lowercase so that differently cased or spaced
    # spellings ("English (US)", "english us") produce the same key.
    return re.sub(r"\s+", " ", spaced).strip().lower()
143+
144+
def iso639_lookup(term):
    """Look up ``term`` via ``Language.match`` and memoize the outcome.

    Results are stored in ISO639_CACHE under the stripped, lowercased term,
    while the match itself is attempted on ``term`` exactly as given. Any
    exception raised by ``Language.match`` is treated as "no match".

    Args:
        term: Raw user-supplied language text.

    Returns:
        The matched object (the first element if the match result is a list
        or tuple), or None when ``term`` is falsy or nothing matched.
    """
    if not term:
        return None

    cache_key = term.strip().lower()
    try:
        return ISO639_CACHE[cache_key]
    except KeyError:
        pass  # not cached yet; fall through to a real lookup

    try:
        matched = Language.match(term, exact=False)
    except Exception:
        # Best-effort lookup: a failed match is cached as None below.
        matched = None

    # Result normalization: pick the first entry if the result is list-like.
    if matched and isinstance(matched, (list, tuple)):
        found = matched[0]
    elif matched:
        found = matched
    else:
        found = None

    ISO639_CACHE[cache_key] = found
    return found
166+
167+
def strip_noise(s):
    """Blank out descriptive noise in a raw language string.

    Case-insensitively replaces subtitle markers, "english patch",
    handwriting markers, and no-speech/instrumental markers with a single
    space each, then replaces parentheses and quote characters with spaces.
    Whitespace is neither collapsed nor trimmed here.
    """
    # Applied in this order; each match is replaced by one space.
    noise_patterns = (
        r"\b(subtitles?|subtitle|sub-titles|subbed|with subtitles?)\b",
        r"\b(english patch|english patch\))\b",
        r"\b(handwritten|hand write|hand-written|hand written)\b",
        r"\b(no voice|no spoken word|no speech|instrumental)\b",
    )
    for pattern in noise_patterns:
        s = re.sub(pattern, " ", s, flags=re.I)
    return re.sub(r"[()\"\']", " ", s)
192+
193+
def is_multi_language(raw_language):
    """Report whether a raw language string appears to list several languages.

    True when the string contains a comma, a semicolon, a slash, the
    standalone words "and" or "with" (any case), or an ampersand followed
    by whitespace.
    """
    separators = r",|;|\band\b|\bwith\b|\/|&\s+"
    found = re.search(separators, raw_language, flags=re.IGNORECASE)
    return found is not None
201+
202+
# Known spellings, typos, locale tags, and placeholders, mapped to the label
# reported for them. Keys are compared after _language_key() normalization.
_LANGUAGE_ALIASES = {
    "english": "English",
    "engrish": "English",
    "english_handwritten": "English",
    "enlgish": "English",
    "american english": "English",
    "en_us": "English",
    "en_es": "English",
    "Eglish": "English",
    "English (US)": "English",
    "sgn": "Sign languages",
    "русский": "Russian",
    "france": "French",
    "français": "French",
    "francais": "French",
    "italiano": "Italian",
    "ilokano": "Ilokano",
    "viẹetnamese": "Vietnamese",
    "português": "Portuguese",
    "pt-br": "Portuguese",
    "espanol": "Spanish",
    "español": "Spanish",
    "castellano": "Spanish",
    "es_formal": "Spanish",
    "es_es": "Spanish",
    "mandarin": "Chinese",
    "nederlands": "Dutch",
    "dutch": "Dutch",
    "swahili": "Swahili",
    "no language (english)": "Undetermined",
    "whatever we play it to be": "Undetermined",
    "english & chinese subbed": "Multiple languages",
    "mis": "Uncoded languages",
    "n/a": "Undetermined",
    "none": "Undetermined",
    "und": "Undetermined",
    "unknown": "Undetermined",
    "no speech": "Undetermined",
    "no spoken language": "Undetermined",
    "multi": "Multiple languages",
    "multilanguage": "Multiple languages",
    "multiple": "Multiple languages",
    "music": "Undetermined",
}

# Normalized view of _LANGUAGE_ALIASES; filled on first use so the table is
# not rebuilt for every record.
_NORMALIZED_LANGUAGE_ALIASES = {}


def _language_key(text):
    """Return the alias-lookup key for text: hyphens spaced, then normalized."""
    return normalize_key(text.replace("-", " "))


def _language_aliases():
    """Return the alias table keyed by _language_key(), building it once."""
    if not _NORMALIZED_LANGUAGE_ALIASES:
        _NORMALIZED_LANGUAGE_ALIASES.update(
            (_language_key(alias), label)
            for alias, label in _LANGUAGE_ALIASES.items()
        )
    return _NORMALIZED_LANGUAGE_ALIASES


def normalize_language(raw_language):
    """Map a raw language value to a single reporting label.

    Resolution order:
      1. Falsy input -> "Undetermined".
      2. The whole input matches a known alias -> that alias's label.
      3. The input looks like a list of languages -> "Multiple languages".
      4. The noise-stripped input matches a known alias -> its label.
      5. iso639_lookup() finds a match -> its ``name`` (or ``alpha2`` /
         ``alpha3`` if ``name`` is missing).
      6. Babel can parse it as a locale -> that locale's English language
         name.
      7. Otherwise -> "Undetermined".

    Args:
        raw_language: Raw language value; converted with str() if needed.

    Returns:
        A label string as described above.
    """
    if not raw_language:
        return "Undetermined"

    raw = str(raw_language).strip()
    aliases = _language_aliases()

    # An alias that matches the whole input wins over multi-language
    # detection; otherwise "n/a" is reported as multiple languages because
    # it contains "/".
    whole_match = aliases.get(_language_key(raw))
    if whole_match is not None:
        return whole_match

    if is_multi_language(raw):
        return "Multiple languages"

    # Strip noise (subtitles, parentheticals), then normalize exactly the
    # same way the alias keys were normalized.
    cleaned = _language_key(strip_noise(raw))
    if not cleaned:
        # Nothing but noise words or punctuation was supplied.
        return "Undetermined"
    if cleaned in aliases:
        return aliases[cleaned]

    # Try python-iso639; prefer the name, fall back to a code attribute.
    lang = iso639_lookup(cleaned)
    if lang:
        for attr in ("name", "alpha2", "alpha3"):
            value = getattr(lang, attr, None)
            if value:
                return value

    # Last resort: let Babel interpret the value as a locale identifier.
    try:
        locale = Locale.parse(cleaned, sep="_")
        english_name = locale.get_language_name("en")
        if english_name:
            return english_name
    except Exception:
        # Deliberate best-effort: an unparseable value is simply unknown.
        pass

    return "Undetermined"
296+
297+
160298def query_internet_archive (args ):
161299 license_counter = Counter ()
162300 language_counter = Counter ()
@@ -167,7 +305,7 @@ def query_internet_archive(args):
167305 query = "creativecommons.org"
168306 license_mapping = load_license_mapping ()
169307
170- rows = 50
308+ rows = 1000000
171309 total_rows = 0
172310 total_processed = 0
173311 max_retries = 3
@@ -224,12 +362,14 @@ def query_internet_archive(args):
224362 continue # Skip this result
225363
226364 # Extract and normalize language
227- raw_language = result .get ("language" , "UNKNOWN " )
365+ raw_language = result .get ("language" , "Undetermined " )
228366 if isinstance (raw_language , list ):
229- raw_language = raw_language [0 ] if raw_language else "UNKNOWN"
367+ raw_language = (
368+ raw_language [0 ] if raw_language else "Undetermined"
369+ )
230370
231371 normalized_lang = normalize_language (raw_language )
232- if normalized_lang == "UNKNOWN " :
372+ if normalized_lang == "Undetermined " :
233373 unmapped_language_counter [raw_language ] += 1
234374 continue # Skip this result
235375
0 commit comments