|
| 1 | +import re |
| 2 | + |
1 | 3 | from iso639 import Language, LanguageNotFoundError |
2 | 4 | from pymediainfo import Track |
3 | 5 |
|
@@ -88,3 +90,54 @@ def get_language_obj(language_str: str) -> Language | None: |
88 | 90 | except LanguageNotFoundError: |
89 | 91 | return None |
90 | 92 | return None |
| 93 | + |
| 94 | + |
def detect_language_from_filename(filename: str) -> Language | None:
    """Detect a language from a 2-3 letter code embedded in a filename.

    Everything after the last "." is treated as the extension and dropped.
    The remaining name is scanned for 2-3 letter ASCII tokens delimited by
    ".", "_" or "-" (or by the start/end of the name), e.g. "movie.eng.srt",
    "movie_en_sub.srt" or "en_movie.srt".

    Candidates are tried in priority order: all-lowercase tokens first, then
    all-UPPERCASE tokens, then mixed-case tokens. Within the same priority,
    tokens delimited on both sides come first (left to right), then a token
    at the end of the name, then a token at the start of the name.

    Args:
        filename (str): The filename to search for language codes

    Returns:
        Language | None: The first candidate accepted by ``Language.match``
            (looked up in lowercase), or None if no candidate matches
    """
    # remove extension but keep original case (case drives the priority below)
    name_no_ext = filename.rsplit(".", 1)[0]

    # Separators are checked with zero-width lookarounds instead of being
    # consumed. re.finditer yields non-overlapping matches, so a consumed
    # separator would hide the next token that shares it: in "a.fr.en.b" the
    # match ".fr." would swallow the "." that "en" needs on its left.
    patterns = (
        r"(?<=[._-])([a-zA-Z]{2,3})(?=[._-])",  # .eng. or _en_ or -jpn-
        r"(?<=[._-])([a-zA-Z]{2,3})$",  # .eng or _en at end
        r"^([a-zA-Z]{2,3})(?=[._-])",  # eng. or en_ at start
    )

    # collect all candidates together with their case-based priority
    candidates = []
    for pattern in patterns:
        for match in re.finditer(pattern, name_no_ext):
            lang_code = match.group(1)
            if lang_code.islower():
                priority = 0  # highest priority
            elif lang_code.isupper():
                priority = 1  # medium priority
            else:
                priority = 2  # lowest priority (mixed case)
            candidates.append((priority, lang_code))

    # list.sort is stable, so candidates sharing a priority keep the
    # pattern/position order in which they were collected
    candidates.sort(key=lambda item: item[0])

    # return the first candidate the ISO 639 lookup recognises
    for _, lang_code in candidates:
        try:
            return Language.match(lang_code.lower())
        except LanguageNotFoundError:
            continue

    return None
0 commit comments