@@ -167,27 +167,27 @@ def word_regex(word):
167167 return r"(\b|(?<=[\-_]))" + re .escape (word ) + r"\b"
168168
169169 noise_words = [
170- "subtitles?" ,
171- "subtitle" ,
172- "sub-titles" ,
173- "subbed" ,
174- "with subtitles?" ,
170+ "-handwritten" ,
171+ "-spoken" ,
172+ "=" ,
175173 "english patch" ,
176- "handwritten" ,
177174 "hand write" ,
178- "hand-written" ,
179175 "hand written" ,
180- "-handwritten" ,
181- "no voice" ,
182- "no spoken word" ,
183- "no speech" ,
176+ "hand-written" ,
177+ "handwritten" ,
184178 "instrumental" ,
185- "universal" ,
186179 "language" ,
187- "=" ,
180+ "no speech" ,
181+ "no spoken word" ,
182+ "no voice" ,
188183 "simple" ,
189184 "spoken" ,
190- "-spoken" ,
185+ "sub-titles" ,
186+ "subbed" ,
187+ "subtitle" ,
188+ "subtitles?" ,
189+ "universal" ,
190+ "with subtitles?" ,
191191 ]
192192
193193 # Combine all noise words into one regex
@@ -242,56 +242,56 @@ def normalize_language(raw_language):
242242
243243 # --- Try Alias Map ---
244244 ALIAS_MAP = {
245- "engrish" : "English" ,
246- "english_handwritten" : "English" ,
247- "enlgish" : "English" ,
248245 "american english" : "English" ,
249- "english - american" : "English" ,
250246 "american" : "English" ,
251- "uk english" : "English" ,
252- "eglish" : "English" ,
253- "egligh" : "English" ,
254- "english (us)" : "English" ,
255- "us-en" : "English" ,
256- "sgn" : "Sign languages" ,
257247 "anglais" : "English" ,
258- "us english" : "English" ,
259- "indian english" : "English" ,
260- "hwbrew" : "Hebrew" ,
261- "polska" : "Polish" ,
262248 "bosanski" : "Bosnian" ,
263- "український" : "Ukrainian" ,
249+ "castellano" : "Spanish" ,
264250 "chinese sub" : "Chinese" ,
265- "spain" : "Spanish" ,
266- "português e espanhol" : "Multiple languages" ,
267- "русский" : "Russian" ,
268251 "deutsch" : "German" ,
269- "france" : "French" ,
270- "francais" : "French" ,
271- "italiano" : "Italian" ,
272- "ilokano" : "Ilokano" ,
273- "viẹetnamese" : "Vietnamese" ,
274- "português" : "Portuguese" ,
275- "pt_br" : "Portuguese" ,
252+ "egligh" : "English" ,
253+ "eglish" : "English" ,
254+ "en_us es_es" : "Multiple languages" ,
255+ "english & chinese subbed" : "Multiple languages" ,
256+ "english (us)" : "English" ,
257+ "english - american" : "English" ,
258+ "english_handwritten" : "English" ,
259+ "engrish" : "English" ,
260+ "enlgish" : "English" ,
276261 "espanol" : "Spanish" ,
277- "castellano" : "Spanish" ,
262+ "francais" : "French" ,
263+ "france" : "French" ,
278264 "greek" : "Greek" ,
265+ "hwbrew" : "Hebrew" ,
266+ "ilokano" : "Ilokano" ,
267+ "indian english" : "English" ,
268+ "italiano" : "Italian" ,
279269 "mandarin" : "Chinese" ,
280- "nederlands" : "Dutch" ,
281- "swahili" : "Swahili" ,
282- "no language (english)" : "Undetermined" ,
283- "whatever we play it to be" : "Undetermined" ,
284- "en_us es_es" : "Multiple languages" ,
285- "english & chinese subbed" : "Multiple languages" ,
286- "n/a" : "Undetermined" ,
287- "none" : "Undetermined" ,
288- "unknown" : "Undetermined" ,
289- "no speech" : "Undetermined" ,
290- "no spoken language" : "Undetermined" ,
291270 "multi" : "Multiple Languages" ,
292271 "multilanguage" : "Multiple languages" ,
293272 "multiple" : "Multiple Languages" ,
294273 "music" : "Undetermined" ,
274+ "n/a" : "Undetermined" ,
275+ "nederlands" : "Dutch" ,
276+ "no language (english)" : "Undetermined" ,
277+ "no speech" : "Undetermined" ,
278+ "no spoken language" : "Undetermined" ,
279+ "none" : "Undetermined" ,
280+ "polska" : "Polish" ,
281+ "português e espanhol" : "Multiple languages" ,
282+ "português" : "Portuguese" ,
283+ "pt_br" : "Portuguese" ,
284+ "sgn" : "Sign languages" ,
285+ "spain" : "Spanish" ,
286+ "swahili" : "Swahili" ,
287+ "uk english" : "English" ,
288+ "unknown" : "Undetermined" ,
289+ "us english" : "English" ,
290+ "us-en" : "English" ,
291+ "viẹetnamese" : "Vietnamese" ,
292+ "whatever we play it to be" : "Undetermined" ,
293+ "русский" : "Russian" ,
294+ "український" : "Ukrainian" ,
295295 }
296296 ALIAS_MAP = {normalize_key (k ): v for k , v in ALIAS_MAP .items ()}
297297
0 commit comments