Skip to content

Commit 7db3e8b

Browse files
authored
Merge pull request #7 from jessbryte/internet-archive-update
Language normalization and Internet Archive documentation
2 parents 92e7629 + 5b257a4 commit 7db3e8b

File tree

2 files changed

+180
-39
lines changed

2 files changed

+180
-39
lines changed

Pipfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ pillow = ">=11.3.0" # Ensure dependency is secure
1919
Pyarrow = "*"
2020
Pygments = "*"
2121
python-dotenv = "*"
22+
python-iso639 = "*"
2223
requests = ">=2.31.0"
2324
seaborn = "*"
2425
urllib3 = ">=2.5.0"

scripts/1-fetch/internetarchive_fetch.py

Lines changed: 179 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,15 @@
1111
import sys
1212
import textwrap
1313
import traceback
14+
import unicodedata
1415
from collections import Counter
1516
from time import sleep
1617
from urllib.parse import urlparse
1718

1819
# Third-party
1920
from babel.core import Locale
2021
from internetarchive import ArchiveSession
22+
from iso639 import Language
2123
from pygments import highlight
2224
from pygments.formatters import TerminalFormatter
2325
from pygments.lexers import PythonTracebackLexer
@@ -32,7 +34,7 @@
3234
# Setup
3335
LOGGER, PATHS = shared.setup(__file__)
3436

35-
# Constants
37+
# CSV paths
3638
FILE1_COUNT = os.path.join(PATHS["data_phase"], "internetarchive_1_count.csv")
3739
FILE2_LANGUAGE = os.path.join(
3840
PATHS["data_phase"], "internetarchive_2_count_by_language.csv"
@@ -44,6 +46,8 @@
4446

4547
QUARTER = os.path.basename(PATHS["data_quarter"])
4648

49+
ISO639_CACHE = {}
50+
4751

4852
def parse_arguments():
4953
LOGGER.info("Parsing command-line options")
@@ -95,40 +99,6 @@ def load_license_mapping():
9599
return license_mapping
96100

97101

98-
def normalize_language(lang):
99-
try:
100-
# Pre-clean: lowercase, replace hyphens with underscores,
101-
# strip whitespace
102-
cleaned = lang.strip().lower().replace("-", "_")
103-
104-
# Special handling for common variants
105-
alias_map = {
106-
"english": "en",
107-
"english_handwritten": "en",
108-
"serbo_croatian": "sh", # ISO 639-1 code for Serbo-Croatian
109-
"zun": "zun", # Zuni — not in Babel, will fallback
110-
"ada": "ada", # Ada — not in Babel, will fallback
111-
"unknown": "UNKNOWN",
112-
}
113-
cleaned = alias_map.get(cleaned, cleaned)
114-
115-
# Custom name overrides for codes not supported by Babel
116-
custom_names = {
117-
"zun": "Zuni",
118-
"ada": "Adangme",
119-
"sh": "Serbo-Croatian",
120-
}
121-
122-
try:
123-
locale = Locale.parse(cleaned, sep="_")
124-
return locale.get_language_name("en")
125-
except Exception:
126-
return custom_names.get(cleaned, "UNKNOWN")
127-
128-
except Exception:
129-
return "UNKNOWN"
130-
131-
132102
def normalize_license(licenseurl, license_mapping=None):
133103
"""Normalize licenseurl and map to standard license label."""
134104
if not isinstance(licenseurl, str) or not licenseurl.strip():
@@ -157,6 +127,174 @@ def normalize_license(licenseurl, license_mapping=None):
157127
return label
158128

159129

130+
def normalize_key(s):
    """Normalize a string for use as a dictionary key.

    Applies NFKD Unicode normalization, removes diacritics, replaces
    punctuation (except ``+``, ``-``, ``/``) with spaces, collapses runs
    of whitespace, and lowercases the result.

    Args:
        s: arbitrary value; falsy values yield "".

    Returns:
        The normalized key string.
    """
    if not s:
        return ""
    s = str(s)
    s = unicodedata.normalize("NFKD", s)
    s = "".join(ch for ch in s if not unicodedata.combining(ch))
    s = re.sub(
        r"[^\w\s\+\-/]", " ", s, flags=re.UNICODE
    )  # keep + / - for splits
    # Fix: this step was commented out, contradicting the docstring and
    # leaving keys case- and whitespace-sensitive; restore it so lookups
    # against normalized alias maps behave as documented.
    s = re.sub(r"\s+", " ", s).strip().lower()
    return s
143+
144+
145+
def iso639_lookup(term):
    """Resolve *term* to a python-iso639 Language object, or None.

    Accepts raw user input; results (including misses) are memoized in
    the module-level ISO639_CACHE keyed on the stripped, lowercased term.
    """
    if not term:
        return None
    cache_key = term.strip().lower()
    try:
        return ISO639_CACHE[cache_key]
    except KeyError:
        pass
    try:
        match = Language.match(term, exact=False)
    except Exception:
        match = None
    # Defensive: if the lookup yields a sequence, keep only the first entry.
    resolved = None
    if match:
        resolved = match[0] if isinstance(match, (list, tuple)) else match
    ISO639_CACHE[cache_key] = resolved
    return resolved
166+
167+
168+
# strip common noise like "subtitles", "subtitle",
169+
# "(English)", "english patch", "handwritten"
170+
# strip common noise like "subtitles", "subtitle",
# "(English)", "english patch", "handwritten"
def strip_noise(s):
    """Remove descriptive noise from a raw language string.

    Strips subtitle mentions, "english patch", handwriting notes,
    no-speech markers, and parenthesis/quote characters, replacing each
    match with a space.  Whitespace is NOT collapsed here; callers are
    expected to normalize the result afterwards.
    """
    s = re.sub(
        r"\b(subtitles?|subtitle|sub-titles|subbed|with subtitles?)\b",
        " ",
        s,
        flags=re.I,
    )
    # Fix: the original alternation ("english patch|english patch\))")
    # had a redundant second branch that could never win — the first
    # alternative is its prefix and always matches first.
    s = re.sub(r"\benglish patch\b", " ", s, flags=re.I)
    s = re.sub(
        r"\b(handwritten|hand write|hand-written|hand written)\b",
        " ",
        s,
        flags=re.I,
    )
    s = re.sub(
        r"\b(no voice|no spoken word|no speech|instrumental)\b",
        " ",
        s,
        flags=re.I,
    )
    s = re.sub(r"[()\"\']", " ", s)
    return s
192+
193+
194+
def is_multi_language(raw_language):
    """Return True when the raw value appears to name several languages.

    Looks for common separators: comma, semicolon, slash, "and"/"with"
    as whole words, or an ampersand followed by whitespace.
    """
    separator_hit = re.search(
        r",|;|\band\b|\bwith\b|\/|&\s+", raw_language, flags=re.IGNORECASE
    )
    return separator_hit is not None
201+
202+
203+
# Alias table for raw language values that iso639/Babel cannot resolve
# (misspellings, endonyms, free text).  Built and normalized once at
# import time instead of on every normalize_language() call.
# NOTE: keys are passed through the same hyphen→space transform applied
# to inputs below, so hyphenated keys like "pt-br" actually match
# (previously they never could).
_LANGUAGE_ALIASES = {
    "english": "English",
    "engrish": "English",
    "english_handwritten": "English",
    "enlgish": "English",
    "american english": "English",
    "en_us": "English",
    "en_es": "English",
    "Eglish": "English",
    "English (US)": "English",
    "sgn": "Sign languages",
    "русский": "Russian",
    "france": "French",
    "français": "French",
    "francais": "French",
    "italiano": "Italian",
    "ilokano": "Ilokano",
    "viẹetnamese": "Vietnamese",
    "português": "Portuguese",
    "pt-br": "Portuguese",
    "espanol": "Spanish",
    "español": "Spanish",
    "castellano": "Spanish",
    "es_formal": "Spanish",
    "es_es": "Spanish",
    "mandarin": "Chinese",
    "nederlands": "Dutch",
    "dutch": "Dutch",
    "swahili": "Swahili",
    "no language (english)": "Undetermined",
    "whatever we play it to be": "Undetermined",
    "english & chinese subbed": "Multiple languages",
    "mis": "Uncoded languages",
    "n/a": "Undetermined",
    "none": "Undetermined",
    "und": "Undetermined",
    "unknown": "Undetermined",
    "no speech": "Undetermined",
    "no spoken language": "Undetermined",
    # Fix: these mapped to "Multiple Languages" (capital L) while the
    # multi-language branch below returns "Multiple languages" — the
    # mismatch would split one language across two counter buckets.
    "multi": "Multiple languages",
    "multilanguage": "Multiple languages",
    "multiple": "Multiple languages",
    "music": "Undetermined",
}
_LANGUAGE_ALIASES = {
    normalize_key(k.replace("-", " ")): v for k, v in _LANGUAGE_ALIASES.items()
}


def normalize_language(raw_language):
    """Map a raw Internet Archive language value to an English name.

    Resolution order: multi-language detection, alias table,
    python-iso639 lookup, Babel locale parsing.  Values that cannot be
    resolved yield "Undetermined".

    Args:
        raw_language: raw value from an item's ``language`` field.

    Returns:
        An English language name, "Multiple languages", or "Undetermined".
    """
    if not raw_language:
        return "Undetermined"

    raw = str(raw_language).strip()
    if is_multi_language(raw):
        return "Multiple languages"

    # Strip noise (subtitles, parentheticals, etc.) and normalize the
    # remainder for dictionary matching.
    cleaned = normalize_key(strip_noise(raw).replace("-", " "))

    if cleaned in _LANGUAGE_ALIASES:
        return _LANGUAGE_ALIASES[cleaned]

    # Try python-iso639: prefer the English name, fall back to a code.
    # (The original also re-looked-up 2/3-letter codes afterwards, but
    # iso639_lookup's cache guarantees the identical result — dead code,
    # removed.)
    lang = iso639_lookup(cleaned)
    if lang:
        name = getattr(lang, "name", None)
        if name:
            return name
        if getattr(lang, "alpha2", None):
            return lang.alpha2
        if getattr(lang, "alpha3", None):
            return lang.alpha3

    # Last resort: Babel locale parsing (e.g. "en_us" -> "English").
    try:
        english_name = Locale.parse(cleaned, sep="_").get_language_name("en")
        if english_name:
            return english_name
    except Exception:
        pass

    return "Undetermined"
296+
297+
160298
def query_internet_archive(args):
161299
license_counter = Counter()
162300
language_counter = Counter()
@@ -167,7 +305,7 @@ def query_internet_archive(args):
167305
query = "creativecommons.org"
168306
license_mapping = load_license_mapping()
169307

170-
rows = 50
308+
rows = 1000000
171309
total_rows = 0
172310
total_processed = 0
173311
max_retries = 3
@@ -224,12 +362,14 @@ def query_internet_archive(args):
224362
continue # Skip this result
225363

226364
# Extract and normalize language
227-
raw_language = result.get("language", "UNKNOWN")
365+
raw_language = result.get("language", "Undetermined")
228366
if isinstance(raw_language, list):
229-
raw_language = raw_language[0] if raw_language else "UNKNOWN"
367+
raw_language = (
368+
raw_language[0] if raw_language else "Undetermined"
369+
)
230370

231371
normalized_lang = normalize_language(raw_language)
232-
if normalized_lang == "UNKNOWN":
372+
if normalized_lang == "Undetermined":
233373
unmapped_language_counter[raw_language] += 1
234374
continue # Skip this result
235375

0 commit comments

Comments
 (0)