Skip to content

Commit aa5aba8

Browse files
committed
Clean up normalize_language()
- Move LANGUAGE_ALIAS_MAP to constants (renamed from ALIAS_MAP)
- Organize constants
- Clean up normalize_language comments
1 parent ea6eed8 commit aa5aba8

File tree

1 file changed

+63
-68
lines changed

1 file changed

+63
-68
lines changed

scripts/1-fetch/internetarchive_fetch.py

Lines changed: 63 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -32,20 +32,69 @@
3232
# Setup
3333
LOGGER, PATHS = shared.setup(__file__)
3434

35-
# CSV paths
35+
# Constants
3636
FILE1_COUNT = os.path.join(PATHS["data_phase"], "internetarchive_1_count.csv")
3737
FILE2_LANGUAGE = os.path.join(
3838
PATHS["data_phase"], "internetarchive_2_count_by_language.csv"
3939
)
40-
41-
# CSV headers
4240
HEADER1 = ["LICENSE", "COUNT"]
4341
HEADER2 = ["LICENSE", "LANGUAGE", "COUNT"]
42+
ISO639_CACHE = {}
43+
LANGUAGE_ALIAS_MAP = {
44+
"american english": "English",
45+
"american": "English",
46+
"anglais": "English",
47+
"bosanski": "Bosnian",
48+
"castellano": "Spanish",
49+
"chinese sub": "Chinese",
50+
"deutsch": "German",
51+
"egligh": "English",
52+
"eglish": "English",
53+
"en_us es_es": "Multiple languages",
54+
"english & chinese subbed": "Multiple languages",
55+
"english (us)": "English",
56+
"english - american": "English",
57+
"english_handwritten": "English",
58+
"engrish": "English",
59+
"enlgish": "English",
60+
"espanol": "Spanish",
61+
"francais": "French",
62+
"france": "French",
63+
"greek": "Greek",
64+
"hwbrew": "Hebrew",
65+
"ilokano": "Ilokano",
66+
"indian english": "English",
67+
"italiano": "Italian",
68+
"mandarin": "Chinese",
69+
"multi": "Multiple Languages",
70+
"multilanguage": "Multiple languages",
71+
"multiple": "Multiple Languages",
72+
"music": "Undetermined",
73+
"n/a": "Undetermined",
74+
"nederlands": "Dutch",
75+
"no language (english)": "Undetermined",
76+
"no speech": "Undetermined",
77+
"no spoken language": "Undetermined",
78+
"none": "Undetermined",
79+
"polska": "Polish",
80+
"português e espanhol": "Multiple languages",
81+
"português": "Portuguese",
82+
"pt_br": "Portuguese",
83+
"sgn": "Sign languages",
84+
"spain": "Spanish",
85+
"swahili": "Swahili",
86+
"uk english": "English",
87+
"unknown": "Undetermined",
88+
"us english": "English",
89+
"us-en": "English",
90+
"viẹetnamese": "Vietnamese",
91+
"whatever we play it to be": "Undetermined",
92+
"русский": "Russian",
93+
"український": "Ukrainian",
94+
}
4495
LIMIT_DEFAULT = 100000
4596
QUARTER = os.path.basename(PATHS["data_quarter"])
4697

47-
ISO639_CACHE = {}
48-
4998

5099
def parse_arguments():
51100
LOGGER.info("Parsing command-line options")
@@ -212,24 +261,23 @@ def is_multi_language(raw_language):
212261

213262

214263
def normalize_language(raw_language):
264+
raw = str(raw_language).strip()
215265
if not raw_language:
216266
return "Undetermined"
217267

218-
raw = str(raw_language).strip()
219-
220-
# check multi-language
268+
# 1st: check multi-language
221269
if is_multi_language(raw):
222270
return "Multiple languages"
223271

224-
# strip noise and normalize
272+
# Prep for subsequent checks by stripping noise and normalizing
225273
cleaned = normalize_key(strip_noise(raw))
226274

227-
# --- Try ISO639 first ---
275+
# 2nd: check ISO639
228276
lang_obj = iso639_lookup(raw) or iso639_lookup(cleaned)
229277
if lang_obj and getattr(lang_obj, "name", None):
230278
return lang_obj.name
231279

232-
# Try Babel
280+
# 3rd: check Babel
233281
for cand in [raw, cleaned]:
234282
if not cand:
235283
continue
@@ -240,63 +288,10 @@ def normalize_language(raw_language):
240288
except Exception:
241289
pass
242290

243-
# --- Try Alias Map ---
244-
ALIAS_MAP = {
245-
"american english": "English",
246-
"american": "English",
247-
"anglais": "English",
248-
"bosanski": "Bosnian",
249-
"castellano": "Spanish",
250-
"chinese sub": "Chinese",
251-
"deutsch": "German",
252-
"egligh": "English",
253-
"eglish": "English",
254-
"en_us es_es": "Multiple languages",
255-
"english & chinese subbed": "Multiple languages",
256-
"english (us)": "English",
257-
"english - american": "English",
258-
"english_handwritten": "English",
259-
"engrish": "English",
260-
"enlgish": "English",
261-
"espanol": "Spanish",
262-
"francais": "French",
263-
"france": "French",
264-
"greek": "Greek",
265-
"hwbrew": "Hebrew",
266-
"ilokano": "Ilokano",
267-
"indian english": "English",
268-
"italiano": "Italian",
269-
"mandarin": "Chinese",
270-
"multi": "Multiple Languages",
271-
"multilanguage": "Multiple languages",
272-
"multiple": "Multiple Languages",
273-
"music": "Undetermined",
274-
"n/a": "Undetermined",
275-
"nederlands": "Dutch",
276-
"no language (english)": "Undetermined",
277-
"no speech": "Undetermined",
278-
"no spoken language": "Undetermined",
279-
"none": "Undetermined",
280-
"polska": "Polish",
281-
"português e espanhol": "Multiple languages",
282-
"português": "Portuguese",
283-
"pt_br": "Portuguese",
284-
"sgn": "Sign languages",
285-
"spain": "Spanish",
286-
"swahili": "Swahili",
287-
"uk english": "English",
288-
"unknown": "Undetermined",
289-
"us english": "English",
290-
"us-en": "English",
291-
"viẹetnamese": "Vietnamese",
292-
"whatever we play it to be": "Undetermined",
293-
"русский": "Russian",
294-
"український": "Ukrainian",
295-
}
296-
ALIAS_MAP = {normalize_key(k): v for k, v in ALIAS_MAP.items()}
297-
298-
if cleaned in ALIAS_MAP:
299-
return ALIAS_MAP[cleaned]
291+
# 4th: check language alias map
292+
alias_map = {normalize_key(k): v for k, v in LANGUAGE_ALIAS_MAP.items()}
293+
if cleaned in alias_map:
294+
return alias_map[cleaned]
300295

301296
return "Undetermined"
302297

0 commit comments

Comments
 (0)