Skip to content

Commit 1171ff7

Browse files
committed
switch to python-iso639
1 parent 251b96d commit 1171ff7

File tree

3 files changed

+80
-98
lines changed

3 files changed

+80
-98
lines changed

Pipfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ GitPython = "*"
1010
google-api-python-client = "*"
1111
h11 = ">=0.16.0" # Ensure dependency is secure
1212
internetarchive = ">=5.5.1"
13-
iso639-lang = "*"
13+
python-iso639 = "*"
1414
jupyterlab = ">=3.6.7"
1515
matplotlib = "*"
1616
numpy = "*"

Pipfile.lock

Lines changed: 59 additions & 52 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

scripts/1-fetch/internetarchive_fetch.py

Lines changed: 20 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,9 @@
1616
from urllib.parse import urlparse
1717

1818
# Third-party
19-
from babel import Locale
19+
import babel
20+
import iso639
2021
from internetarchive import ArchiveSession
21-
from iso639 import Lang
2222
from pygments import highlight
2323
from pygments.formatters import TerminalFormatter
2424
from pygments.lexers import PythonTracebackLexer
@@ -201,36 +201,6 @@ def normalize_key(s):
201201
return s.strip().lower()
202202

203203

204-
def iso639_lookup(term):
205-
"""Return a Language object or None;
206-
cache results.
207-
Accepts raw user input."""
208-
if not term:
209-
return None
210-
key = term.strip().lower().replace("_", "-")
211-
if key in ISO639_CACHE:
212-
return ISO639_CACHE[key]
213-
214-
# Try direct code match
215-
try:
216-
result = Lang(key)
217-
ISO639_CACHE[key] = result
218-
return result
219-
except Exception:
220-
pass
221-
222-
# fallback to title-case name lookup
223-
try:
224-
result = Lang(term.strip().title())
225-
if result:
226-
ISO639_CACHE[key] = result
227-
return result
228-
except Exception:
229-
pass
230-
231-
return None
232-
233-
234204
def strip_noise(language):
235205
"""
236206
Strip common noise like "subtitles", "subtitle", "(English)",
@@ -264,7 +234,7 @@ def is_multi_language(raw_language):
264234

265235
def normalize_language(raw_language):
266236
raw = str(raw_language).strip()
267-
if not raw_language:
237+
if not raw:
268238
return "Undetermined"
269239

270240
# 1st: check multi-language
@@ -273,21 +243,26 @@ def normalize_language(raw_language):
273243

274244
# Prep for subsequent checks by striping noise and normalizing
275245
cleaned = normalize_key(strip_noise(raw))
246+
for language in [raw, cleaned]:
247+
if not language:
248+
continue
276249

277-
# 2nd: check ISO639
278-
lang_obj = iso639_lookup(raw) or iso639_lookup(cleaned)
279-
if lang_obj and getattr(lang_obj, "name", None):
280-
return lang_obj.name
250+
# 2nd: check ISO639
251+
try:
252+
name = iso639.Language.match(language).name
253+
return name
254+
except iso639.language.LanguageNotFoundError:
255+
pass
281256

282-
# 3rd: check Babel
283-
for cand in [raw, cleaned]:
284-
if not cand:
285-
continue
257+
# 3rd: check Babel
286258
try:
287-
cand_locale = cand.replace("-", "_")
288-
locale = Locale.parse(cand_locale, sep="_")
289-
return locale.get_language_name("en")
290-
except Exception:
259+
language_locale = language.replace("-", "_")
260+
locale = babel.Locale.parse(language_locale, sep="_")
261+
name = locale.get_language_name("en")
262+
return name
263+
except babel.core.UnknownLocaleError:
264+
pass
265+
except ValueError:
291266
pass
292267

293268
# 4th: check language alias map

0 commit comments

Comments
 (0)