Skip to content

Commit f5fc128

Browse files
committed
add LANGUAGE_NAME_MAP and improve normalize_license flow
1 parent 1171ff7 commit f5fc128

File tree

1 file changed

+30
-27
lines changed

1 file changed

+30
-27
lines changed

scripts/1-fetch/internetarchive_fetch.py

Lines changed: 30 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -39,15 +39,12 @@
3939
)
4040
HEADER1 = ["LICENSE", "COUNT"]
4141
HEADER2 = ["LICENSE", "LANGUAGE", "COUNT"]
42-
ISO639_CACHE = {}
4342
LANGUAGE_ALIAS_MAP = {
4443
"american english": "English",
4544
"american": "English",
4645
"anglais": "English",
47-
"bosanski": "Bosnian",
4846
"castellano": "Spanish",
4947
"chinese sub": "Chinese",
50-
"deutsch": "German",
5148
"egligh": "English",
5249
"eglish": "English",
5350
"en_us es_es": "Multiple languages",
@@ -57,41 +54,25 @@
5754
"english_handwritten": "English",
5855
"engrish": "English",
5956
"enlgish": "English",
60-
"espanol": "Spanish",
61-
"francais": "French",
6257
"france": "French",
63-
"greek": "Greek",
6458
"hwbrew": "Hebrew",
6559
"ilokano": "Ilokano",
6660
"indian english": "English",
67-
"italiano": "Italian",
6861
"mandarin": "Chinese",
6962
"multi": "Multiple Languages",
7063
"multilanguage": "Multiple languages",
7164
"multiple": "Multiple Languages",
72-
"music": "Undetermined",
73-
"n/a": "Undetermined",
74-
"nederlands": "Dutch",
75-
"no language (english)": "Undetermined",
76-
"no speech": "Undetermined",
77-
"no spoken language": "Undetermined",
78-
"none": "Undetermined",
7965
"polska": "Polish",
8066
"português e espanhol": "Multiple languages",
81-
"português": "Portuguese",
82-
"pt_br": "Portuguese",
8367
"sgn": "Sign languages",
8468
"spain": "Spanish",
85-
"swahili": "Swahili",
8669
"uk english": "English",
87-
"unknown": "Undetermined",
8870
"us english": "English",
8971
"us-en": "English",
9072
"viẹetnamese": "Vietnamese",
91-
"whatever we play it to be": "Undetermined",
92-
"русский": "Russian",
9373
"український": "Ukrainian",
9474
}
75+
LANGUAGE_NAME_MAP = {} # Populated by create_language_name_map()
9576
LANGUAGE_NOISE_WORDS = [
9677
"-handwritten",
9778
"-spoken",
@@ -157,6 +138,23 @@ def load_license_mapping():
157138
return license_mapping
158139

159140

141+
def create_language_name_map():
    """Populate LANGUAGE_NAME_MAP from the locales known to Babel.

    For every locale identifier reported by Babel, both the
    localized/native display name and the English name are passed through
    normalize_key(), lower-cased, and mapped to the locale's language code
    (``locale.language``). The module-level LANGUAGE_NAME_MAP is then
    rebound to a copy sorted by key.

    Returns:
        None. The result is stored in the module-level LANGUAGE_NAME_MAP.
    """
    global LANGUAGE_NAME_MAP
    for locale_code in babel.localedata.locale_identifiers():
        locale = babel.Locale.parse(locale_code)
        # Handle the localized/native name first and the English name
        # second, so that on a key collision the English name wins (same
        # precedence as before). Each name is checked independently: a
        # missing native name must not prevent the English name from
        # being registered.
        for raw_name in (locale.display_name, locale.english_name):
            if not raw_name:
                # Babel can report no name for a locale; nothing to map.
                continue
            name = normalize_key(raw_name)
            if name:
                LANGUAGE_NAME_MAP[name.lower()] = locale.language
    LANGUAGE_NAME_MAP = dict(sorted(LANGUAGE_NAME_MAP.items()))
156+
157+
160158
def normalize_license(licenseurl, license_mapping=None):
161159
"""Normalize licenseurl and map to standard license label."""
162160
if not isinstance(licenseurl, str) or not licenseurl.strip():
@@ -243,18 +241,26 @@ def normalize_language(raw_language):
243241

244242
# Prep for subsequent checks by stripping noise and normalizing
245243
cleaned = normalize_key(strip_noise(raw))
244+
if cleaned in LANGUAGE_NAME_MAP:
245+
cleaned = LANGUAGE_NAME_MAP[cleaned]
246+
247+
# 2nd: check language alias map
248+
alias_map = {normalize_key(k): v for k, v in LANGUAGE_ALIAS_MAP.items()}
249+
if cleaned in alias_map:
250+
return alias_map[cleaned]
251+
246252
for language in [raw, cleaned]:
247253
if not language:
248254
continue
249255

250-
# 2nd: check ISO639
256+
# 3rd: check ISO639
251257
try:
252258
name = iso639.Language.match(language).name
253259
return name
254260
except iso639.language.LanguageNotFoundError:
255261
pass
256262

257-
# 3rd: check Babel
263+
# 4th: check Babel
258264
try:
259265
language_locale = language.replace("-", "_")
260266
locale = babel.Locale.parse(language_locale, sep="_")
@@ -265,11 +271,6 @@ def normalize_language(raw_language):
265271
except ValueError:
266272
pass
267273

268-
# 4th: check language alias map
269-
alias_map = {normalize_key(k): v for k, v in LANGUAGE_ALIAS_MAP.items()}
270-
if cleaned in alias_map:
271-
return alias_map[cleaned]
272-
273274
return "Undetermined"
274275

275276

@@ -412,6 +413,8 @@ def main():
412413
)
413414

414415
license_mapping = load_license_mapping()
416+
create_language_name_map()
417+
415418
license_data, language_data = query_internet_archive(
416419
args, session, license_mapping
417420
)

0 commit comments

Comments
 (0)