|
92 | 92 | "русский": "Russian", |
93 | 93 | "український": "Ukrainian", |
94 | 94 | } |
# Noise terms that strip_noise() removes from raw language labels.
#
# Each entry is matched as a literal, case-insensitive phrase: strip_noise()
# passes every entry through re.escape() before building its pattern. Regex
# quantifiers such as "?" therefore must NOT be used here (an escaped "?"
# only matches a literal question mark), so singular and plural forms are
# listed explicitly.
LANGUAGE_NOISE_WORDS = [
    "-handwritten",
    "-spoken",
    "=",
    "english patch",
    "hand write",
    "hand written",
    "hand-written",
    "handwritten",
    "instrumental",
    "language",
    "no speech",
    "no spoken word",
    "no voice",
    "simple",
    "spoken",
    "sub-titles",
    "subbed",
    "subtitle",
    "subtitles",
    "universal",
    "with subtitle",
    "with subtitles",
]
# NOTE(review): presumably the default cap on the number of records
# processed -- confirm against the call sites that read LIMIT_DEFAULT.
LIMIT_DEFAULT = 100000
# Final path component of the configured "data_quarter" path.
QUARTER = os.path.basename(PATHS["data_quarter"])
97 | 120 |
|
@@ -208,45 +231,24 @@ def iso639_lookup(term): |
208 | 231 | return None |
209 | 232 |
|
210 | 233 |
|
def strip_noise(language):
    """
    Remove noise terms from a raw language label.

    Every entry of LANGUAGE_NOISE_WORDS is regex-escaped, matched
    case-insensitively, and replaced with a single space. Parentheses and
    single/double quote characters are then replaced with spaces as well.
    Whitespace is neither collapsed nor trimmed here.
    """
    # A term may begin at a word boundary or directly after "-" or "_";
    # it must end at a word boundary
    prefix = r"(\b|(?<=[\-_]))"
    suffix = r"\b"
    alternatives = [
        prefix + re.escape(term) + suffix for term in LANGUAGE_NOISE_WORDS
    ]

    # One alternation covering every noise term, applied in a single pass
    noise_pattern = re.compile(r"|".join(alternatives), flags=re.I)
    without_noise = noise_pattern.sub(" ", language)

    # Replace parentheses and quote characters with spaces
    return re.sub(r"[()\"\']", " ", without_noise)
250 | 252 |
|
251 | 253 |
|
252 | 254 | def is_multi_language(raw_language): |
|
0 commit comments