Skip to content

Commit 198bf02

Browse files
authored
Merge pull request #8 from LavX/ai_translate
fix(subtitles): improve language detection in batch translate
2 parents 7a25a59 + 2570b57 commit 198bf02

File tree

1 file changed

+140
-68
lines changed

1 file changed

+140
-68
lines changed

bazarr/api/subtitles/batch_translate.py

Lines changed: 140 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -128,8 +128,9 @@ def _process_episode(self, item, source_language, target_language, forced, hi, s
128128

129129
# Find source subtitle
130130
source_subtitle_path = subtitle_path
131+
detected_source_lang = None
131132
if not source_subtitle_path:
132-
source_subtitle_path = self._find_subtitle_by_language(
133+
source_subtitle_path, detected_source_lang = self._find_subtitle_by_language(
133134
episode.subtitles, source_language, video_path, media_type='series'
134135
)
135136

@@ -139,6 +140,10 @@ def _process_episode(self, item, source_language, target_language, forced, hi, s
139140
'error': f'No subtitle found for episode {sonarr_episode_id} (requested source: {source_language})'
140141
}
141142

143+
# Use detected language if available
144+
if detected_source_lang:
145+
source_language = detected_source_lang
146+
142147
# Queue translation
143148
try:
144149
result = translate_subtitles_file(
@@ -178,8 +183,9 @@ def _process_movie(self, item, source_language, target_language, forced, hi, sub
178183

179184
# Find source subtitle
180185
source_subtitle_path = subtitle_path
186+
detected_source_lang = None
181187
if not source_subtitle_path:
182-
source_subtitle_path = self._find_subtitle_by_language(
188+
source_subtitle_path, detected_source_lang = self._find_subtitle_by_language(
183189
movie.subtitles, source_language, video_path, media_type='movie'
184190
)
185191

@@ -189,6 +195,10 @@ def _process_movie(self, item, source_language, target_language, forced, hi, sub
189195
'error': f'No subtitle found for movie {radarr_id} (requested source: {source_language})'
190196
}
191197

198+
# Use detected language if available
199+
if detected_source_lang:
200+
source_language = detected_source_lang
201+
192202
# Queue translation
193203
try:
194204
result = translate_subtitles_file(
@@ -222,58 +232,44 @@ def _find_subtitle_by_language(self, subtitles, language_code, video_path, media
222232
media_type: Either 'movie' or 'series' for correct path mapping
223233
224234
Returns:
225-
Path to the subtitle file, or None if no subtitles available
235+
Tuple of (Path to the subtitle file, detected language code), or (None, None) if no subtitles available
226236
"""
227-
import json
237+
import ast
228238
import os
229239

230240
logger.debug(f'Looking for "{language_code}" subtitle. Subtitles data type: {type(subtitles)}')
231241

232-
if not subtitles:
233-
logger.debug('No subtitles data found in database for this media')
234-
return None
235-
236-
# Parse subtitles if it's a string (JSON)
237-
if isinstance(subtitles, str):
238-
try:
239-
subtitles = json.loads(subtitles)
240-
except json.JSONDecodeError:
241-
logger.error('Failed to parse subtitles JSON from database')
242-
return None
243-
244-
if not isinstance(subtitles, list):
245-
logger.debug(f'Subtitles is not a list: {type(subtitles)}')
246-
return None
247-
248-
logger.debug(f'Found {len(subtitles)} subtitle(s) in database')
249-
250-
# Collect available subtitles with their paths for better processing
251242
available_subtitles = []
252-
for sub in subtitles:
253-
if isinstance(sub, dict):
254-
sub_code = sub.get('code2', '')
255-
sub_path = sub.get('path', '')
256-
sub_hi = sub.get('hi', False)
257-
sub_forced = sub.get('forced', False)
258-
259-
if sub_path:
260-
available_subtitles.append({
261-
'code2': sub_code,
262-
'path': sub_path,
263-
'hi': sub_hi,
264-
'forced': sub_forced
265-
})
266-
267-
available_codes = [s['code2'] for s in available_subtitles if s['code2']]
268-
269-
if available_codes:
270-
logger.info(f'Available subtitle language codes: {available_codes}')
271-
else:
272-
logger.warning('No language codes found in subtitle data')
273243

274-
if not available_subtitles:
275-
logger.warning('No subtitle files with valid paths found')
276-
return None
244+
if subtitles:
245+
# Parse subtitles if it's a string (Python literal from DB)
246+
if isinstance(subtitles, str):
247+
try:
248+
subtitles = ast.literal_eval(subtitles)
249+
except (ValueError, SyntaxError):
250+
logger.error('Failed to parse subtitles from database')
251+
subtitles = []
252+
253+
if isinstance(subtitles, list):
254+
logger.debug(f'Found {len(subtitles)} subtitle(s) in database')
255+
256+
# Collect available subtitles with their paths for better processing
257+
for sub in subtitles:
258+
# DB format is [lang_str, path, size]
259+
if isinstance(sub, (list, tuple)) and len(sub) >= 2:
260+
lang_parts = sub[0].split(':')
261+
sub_code = lang_parts[0]
262+
sub_path = sub[1]
263+
sub_hi = len(lang_parts) > 1 and lang_parts[1].lower() == 'hi'
264+
sub_forced = len(lang_parts) > 1 and lang_parts[1].lower() == 'forced'
265+
266+
if sub_path:
267+
available_subtitles.append({
268+
'code2': sub_code,
269+
'path': sub_path,
270+
'hi': sub_hi,
271+
'forced': sub_forced
272+
})
277273

278274
# Helper function to resolve and validate subtitle path
279275
def resolve_subtitle_path(sub_path):
@@ -294,7 +290,7 @@ def resolve_subtitle_path(sub_path):
294290

295291
return None
296292

297-
# First pass: Look for exact language match
293+
# First pass: Look for exact language match in DB
298294
exact_matches = [s for s in available_subtitles if s['code2'] == language_code]
299295

300296
# Sort matches: prefer non-HI, non-forced first, then HI, then forced
@@ -305,29 +301,105 @@ def resolve_subtitle_path(sub_path):
305301
if resolved_path:
306302
logger.info(f'Found exact language match "{language_code}" at {resolved_path} '
307303
f'(hi={sub["hi"]}, forced={sub["forced"]})')
308-
return resolved_path
304+
return resolved_path, sub['code2']
309305

310-
# Second pass: If no exact match found, try any available subtitle
311-
logger.info(f'No exact match for "{language_code}" found. '
312-
f'Falling back to any available subtitle.')
306+
# Second pass: If no exact match found in DB, try any available subtitle from DB
307+
if available_subtitles:
308+
logger.info(f'No exact match for "{language_code}" found in DB. '
309+
f'Falling back to any available subtitle from DB.')
310+
311+
# Sort all available: prefer non-HI, non-forced, and prioritize common languages
312+
common_languages = ['en', 'eng'] # English often has good quality subs
313+
314+
def sort_key(sub):
315+
is_common = sub['code2'] in common_languages
316+
return (sub['forced'], sub['hi'], not is_common)
317+
318+
available_subtitles.sort(key=sort_key)
319+
320+
for sub in available_subtitles:
321+
resolved_path = resolve_subtitle_path(sub['path'])
322+
if resolved_path:
323+
logger.warning(f'Using fallback subtitle with language "{sub["code2"]}" at {resolved_path} '
324+
f'(hi={sub["hi"]}, forced={sub["forced"]}). '
325+
f'Requested language was "{language_code}".')
326+
return resolved_path, sub['code2']
327+
328+
# Third pass: Scan filesystem fallback
329+
logger.info(f'No usable subtitle found in DB. Scanning filesystem near {video_path}')
330+
filesystem_subs = self._scan_filesystem_for_subtitles(video_path)
331+
332+
if filesystem_subs:
333+
# Prefer English
334+
for sub in filesystem_subs:
335+
if sub['is_english']:
336+
logger.info(f'Found English subtitle on filesystem: {sub["path"]}')
337+
return sub['path'], 'en'
338+
339+
# Use first available
340+
sub = filesystem_subs[0]
341+
logger.info(f'Using non-English subtitle from filesystem: {sub["path"]} (detected: {sub["detected_language"]})')
342+
return sub['path'], sub['detected_language']
313343

314-
# Sort all available: prefer non-HI, non-forced, and prioritize common languages
315-
common_languages = ['en', 'eng'] # English often has good quality subs
344+
logger.warning(f'No usable subtitle files found in DB or on filesystem.')
345+
return None, None
346+
347+
def _scan_filesystem_for_subtitles(self, video_path):
348+
"""Scan filesystem for .srt files next to the video file."""
349+
import os
350+
import re
351+
352+
ENGLISH_PATTERNS = [
353+
r'\.en\.srt$', r'\.eng\.srt$', r'\.english\.srt$',
354+
r'[._-]en[._-]', r'[._-]eng[._-]', r'[._-]english[._-]',
355+
]
316356

317-
def sort_key(sub):
318-
is_common = sub['code2'] in common_languages
319-
return (sub['forced'], sub['hi'], not is_common)
357+
video_dir = os.path.dirname(video_path)
358+
video_name = os.path.splitext(os.path.basename(video_path))[0]
359+
results = []
320360

321-
available_subtitles.sort(key=sort_key)
361+
# Search directories
362+
search_dirs = [video_dir]
363+
for subfolder in ['Subs', 'Subtitles', 'subs', 'subtitles', video_name]:
364+
subdir = os.path.join(video_dir, subfolder)
365+
if os.path.isdir(subdir):
366+
search_dirs.append(subdir)
322367

323-
for sub in available_subtitles:
324-
resolved_path = resolve_subtitle_path(sub['path'])
325-
if resolved_path:
326-
logger.warning(f'Using fallback subtitle with language "{sub["code2"]}" at {resolved_path} '
327-
f'(hi={sub["hi"]}, forced={sub["forced"]}). '
328-
f'Requested language was "{language_code}".')
329-
return resolved_path
368+
for directory in search_dirs:
369+
try:
370+
for filename in os.listdir(directory):
371+
if filename.lower().endswith('.srt'):
372+
full_path = os.path.join(directory, filename)
373+
374+
# Detect language from filename
375+
is_english = any(re.search(p, filename.lower()) for p in ENGLISH_PATTERNS)
376+
detected_lang = 'en' if is_english else self._detect_language_from_content(full_path)
377+
378+
results.append({
379+
'path': full_path,
380+
'filename': filename,
381+
'is_english': is_english or detected_lang == 'en',
382+
'detected_language': detected_lang or 'und'
383+
})
384+
except OSError:
385+
continue
330386

331-
logger.warning(f'No usable subtitle files found. '
332-
f'Checked {len(available_subtitles)} subtitle(s), none exist on disk.')
333-
return None
387+
# Sort: English first
388+
results.sort(key=lambda x: (not x['is_english'], x['filename']))
389+
return results
390+
391+
def _detect_language_from_content(self, srt_path):
    """Make a best-effort guess of a subtitle file's language from its text.

    Only the first 8 KiB of the file are read. The sample's character
    encoding is detected with ``charset_normalizer`` and the decoded text is
    handed to ``guess_language``.

    Args:
        srt_path: Path of the subtitle file to inspect.

    Returns:
        The value returned by ``guess_language`` for the sampled text, or
        None when the file is empty or unreadable, its encoding cannot be
        determined, or no language could be identified.
    """
    from guess_language import guess_language
    from charset_normalizer import detect

    try:
        with open(srt_path, 'rb') as f:
            raw = f.read(8192)  # Read first 8KB
    except OSError as e:
        logger.debug(f'Could not read {srt_path} for language detection: {e}')
        return None

    if not raw:
        # Nothing to analyse in an empty file
        return None

    try:
        encoding = (detect(raw) or {}).get('encoding')
        if not encoding:
            return None
        # The sample can end in the middle of a multi-byte character, so
        # undecodable bytes are dropped instead of aborting the detection.
        text = raw.decode(encoding, errors='ignore')
        detected = guess_language(text)
    except Exception as e:
        # Detection is optional: report the failure instead of hiding it, but do not propagate it
        logger.debug(f'Language detection failed for {srt_path}: {e}')
        return None

    # NOTE(review): guess_language appears to signal "could not determine" with
    # the string "UNKNOWN" rather than None - confirm against the bundled
    # library. It is treated as "not detected" here so it is never used as a
    # language code.
    if not detected or str(detected).upper() == 'UNKNOWN':
        return None
    return detected

0 commit comments

Comments
 (0)