Skip to content

Commit ea6eed8

Browse files
committed
sort noise words and alias map
1 parent 11283cd commit ea6eed8

File tree

1 file changed

+51
-51
lines changed

1 file changed

+51
-51
lines changed

scripts/1-fetch/internetarchive_fetch.py

Lines changed: 51 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -167,27 +167,27 @@ def word_regex(word):
167167
return r"(\b|(?<=[\-_]))" + re.escape(word) + r"\b"
168168

169169
noise_words = [
170-
"subtitles?",
171-
"subtitle",
172-
"sub-titles",
173-
"subbed",
174-
"with subtitles?",
170+
"-handwritten",
171+
"-spoken",
172+
"=",
175173
"english patch",
176-
"handwritten",
177174
"hand write",
178-
"hand-written",
179175
"hand written",
180-
"-handwritten",
181-
"no voice",
182-
"no spoken word",
183-
"no speech",
176+
"hand-written",
177+
"handwritten",
184178
"instrumental",
185-
"universal",
186179
"language",
187-
"=",
180+
"no speech",
181+
"no spoken word",
182+
"no voice",
188183
"simple",
189184
"spoken",
190-
"-spoken",
185+
"sub-titles",
186+
"subbed",
187+
"subtitle",
188+
"subtitles?",
189+
"universal",
190+
"with subtitles?",
191191
]
192192

193193
# Combine all noise words into one regex
@@ -242,56 +242,56 @@ def normalize_language(raw_language):
242242

243243
# --- Try Alias Map ---
244244
ALIAS_MAP = {
245-
"engrish": "English",
246-
"english_handwritten": "English",
247-
"enlgish": "English",
248245
"american english": "English",
249-
"english - american": "English",
250246
"american": "English",
251-
"uk english": "English",
252-
"eglish": "English",
253-
"egligh": "English",
254-
"english (us)": "English",
255-
"us-en": "English",
256-
"sgn": "Sign languages",
257247
"anglais": "English",
258-
"us english": "English",
259-
"indian english": "English",
260-
"hwbrew": "Hebrew",
261-
"polska": "Polish",
262248
"bosanski": "Bosnian",
263-
"український": "Ukrainian",
249+
"castellano": "Spanish",
264250
"chinese sub": "Chinese",
265-
"spain": "Spanish",
266-
"português e espanhol": "Multiple languages",
267-
"русский": "Russian",
268251
"deutsch": "German",
269-
"france": "French",
270-
"francais": "French",
271-
"italiano": "Italian",
272-
"ilokano": "Ilokano",
273-
"viẹetnamese": "Vietnamese",
274-
"português": "Portuguese",
275-
"pt_br": "Portuguese",
252+
"egligh": "English",
253+
"eglish": "English",
254+
"en_us es_es": "Multiple languages",
255+
"english & chinese subbed": "Multiple languages",
256+
"english (us)": "English",
257+
"english - american": "English",
258+
"english_handwritten": "English",
259+
"engrish": "English",
260+
"enlgish": "English",
276261
"espanol": "Spanish",
277-
"castellano": "Spanish",
262+
"francais": "French",
263+
"france": "French",
278264
"greek": "Greek",
265+
"hwbrew": "Hebrew",
266+
"ilokano": "Ilokano",
267+
"indian english": "English",
268+
"italiano": "Italian",
279269
"mandarin": "Chinese",
280-
"nederlands": "Dutch",
281-
"swahili": "Swahili",
282-
"no language (english)": "Undetermined",
283-
"whatever we play it to be": "Undetermined",
284-
"en_us es_es": "Multiple languages",
285-
"english & chinese subbed": "Multiple languages",
286-
"n/a": "Undetermined",
287-
"none": "Undetermined",
288-
"unknown": "Undetermined",
289-
"no speech": "Undetermined",
290-
"no spoken language": "Undetermined",
291270
"multi": "Multiple Languages",
292271
"multilanguage": "Multiple languages",
293272
"multiple": "Multiple Languages",
294273
"music": "Undetermined",
274+
"n/a": "Undetermined",
275+
"nederlands": "Dutch",
276+
"no language (english)": "Undetermined",
277+
"no speech": "Undetermined",
278+
"no spoken language": "Undetermined",
279+
"none": "Undetermined",
280+
"polska": "Polish",
281+
"português e espanhol": "Multiple languages",
282+
"português": "Portuguese",
283+
"pt_br": "Portuguese",
284+
"sgn": "Sign languages",
285+
"spain": "Spanish",
286+
"swahili": "Swahili",
287+
"uk english": "English",
288+
"unknown": "Undetermined",
289+
"us english": "English",
290+
"us-en": "English",
291+
"viẹetnamese": "Vietnamese",
292+
"whatever we play it to be": "Undetermined",
293+
"русский": "Russian",
294+
"український": "Ukrainian",
295295
}
296296
ALIAS_MAP = {normalize_key(k): v for k, v in ALIAS_MAP.items()}
297297

0 commit comments

Comments
 (0)