Skip to content

Commit 251b96d

Browse files
committed
cleanup strip_noise()
- move LANGUAGE_NOISE_WORDS to constants (renamed from noise_words) - use descriptive language variable (renamed from s) - tidy documentation
1 parent aa5aba8 commit 251b96d

File tree

1 file changed

+33
-31
lines changed

1 file changed

+33
-31
lines changed

scripts/1-fetch/internetarchive_fetch.py

Lines changed: 33 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,29 @@
9292
"русский": "Russian",
9393
"український": "Ukrainian",
9494
}
95+
# Noise fragments removed from raw language values by strip_noise().
#
# Entries are plain literals, not regular expressions: strip_noise() passes
# each one through re.escape() and anchors it on word boundaries. Spell out
# every variant (for example both "subtitle" and "subtitles") rather than
# using regex syntax such as "?", which would only match a literal "?".
LANGUAGE_NOISE_WORDS = [
    "-handwritten",
    "-spoken",
    "=",
    "english patch",
    "hand write",
    "hand written",
    "hand-written",
    "handwritten",
    "instrumental",
    "language",
    "no speech",
    "no spoken word",
    "no voice",
    "simple",
    "spoken",
    "sub-titles",
    "subbed",
    "subtitle",
    "subtitles",
    "universal",
    "with subtitles",
]
95118
LIMIT_DEFAULT = 100000
96119
QUARTER = os.path.basename(PATHS["data_quarter"])
97120

@@ -208,45 +231,24 @@ def iso639_lookup(term):
208231
return None
209232

210233

211-
def strip_noise(language):
    """
    Remove common noise from a raw language value.

    Each entry of LANGUAGE_NOISE_WORDS is escaped (so it is matched as a
    literal, case-insensitively) and every occurrence is replaced with a
    single space. Parentheses and quote characters are then replaced with
    spaces as well. Whitespace is not trimmed or collapsed.
    """
    # A noise word may begin at a word boundary or directly after "-" / "_",
    # and must end on a word boundary
    prefix = r"(\b|(?<=[\-_]))"
    suffix = r"\b"
    alternatives = [
        prefix + re.escape(noise_word) + suffix
        for noise_word in LANGUAGE_NOISE_WORDS
    ]

    # One alternation covering every noise word
    noise_pattern = r"|".join(alternatives)
    without_noise = re.sub(noise_pattern, " ", language, flags=re.I)

    # Brackets and quotes become spaces
    return re.sub(r"[()\"\']", " ", without_noise)
250252

251253

252254
def is_multi_language(raw_language):

0 commit comments

Comments
 (0)