22# -*- coding: utf-8 -*-
33import logging
44import os
5+ import shutil
56from pathlib import Path
67from subprocess import PIPE , STDOUT , Popen , run , check_output
78from packaging import version
@@ -46,13 +47,14 @@ def run(self):
4647
4748
4849class ExtractSubtitleSRT (QtCore .QThread ):
49- def __init__ (self , app : FastFlixApp , main , index , signal , language ):
def __init__(self, app: FastFlixApp, main, index, signal, language, use_ocr=False):
    """Store everything the extraction thread needs before it is started.

    Args:
        app: Application object; its ``fastflix.config`` is read during extraction.
        main: Owning widget, handed to the base-class constructor and used for
            ``thread_logging_signal`` messages.
        index: Subtitle track index to extract.
        signal: Signal emitted when the thread has finished its work.
        language: Language code of the track; used when naming the OCR output.
        use_ocr: When true, a PGS track is additionally converted to SRT
            (only if PGS OCR is also enabled in the configuration).
    """
    super().__init__(main)
    self.app, self.main = app, main
    self.index, self.language = index, language
    self.signal = signal
    self.use_ocr = use_ocr
5658
5759 def run (self ):
5860 subtitle_format = self ._get_subtitle_format ()
@@ -63,6 +65,9 @@ def run(self):
6365 self .signal .emit ()
6466 return
6567
68+ # Flag to track if we need OCR conversion after extraction
69+ should_convert_to_srt = False
70+
6671 if subtitle_format == "srt" :
6772 extension = "srt"
6873 output_args = ["-c" , "srt" , "-f" , "srt" ]
@@ -75,6 +80,8 @@ def run(self):
7580 elif subtitle_format == "pgs" :
7681 extension = "sup"
7782 output_args = ["-c" , "copy" ]
83+ # If OCR is requested, we'll extract .sup first, then convert after
84+ should_convert_to_srt = self .use_ocr and self .app .fastflix .config .enable_pgs_ocr
7885 else :
7986 self .main .thread_logging_signal .emit (
8087 f"WARNING:{ t ('Subtitle Track' )} { self .index } { t ('is not in supported format (SRT, ASS, SSA, PGS), skipping extraction' )} : { subtitle_format } "
@@ -115,6 +122,13 @@ def run(self):
115122 )
116123 else :
117124 self .main .thread_logging_signal .emit (f"INFO:{ t ('Extracted subtitles successfully' )} " )
125+
126+ # If this is PGS and OCR was requested, convert the .sup to .srt
127+ if subtitle_format == "pgs" and should_convert_to_srt :
128+ if self ._convert_sup_to_srt (filename ):
129+ self .main .thread_logging_signal .emit (f"INFO:{ t ('Successfully converted to SRT with OCR' )} " )
130+ else :
131+ self .main .thread_logging_signal .emit (f"WARNING:{ t ('OCR conversion failed, kept .sup file' )} " )
118132 self .signal .emit ()
119133
120134 def _get_subtitle_format (self ):
@@ -164,6 +178,132 @@ def _get_subtitle_format(self):
164178 )
165179 return None
166180
def _check_pgsrip_dependencies(self) -> bool:
    """Report whether every tool needed for PGS OCR is configured.

    A dependency counts as missing when its path in the FastFlix configuration
    is empty. If anything is missing, one ERROR message naming the missing
    packages, followed by install hints, is emitted on ``thread_logging_signal``.

    Returns:
        True when tesseract, mkvmerge and pgsrip are all configured, else False.
    """
    config = self.app.fastflix.config

    # (package name shown to the user, configured location)
    # mkvmerge is listed because the original author found pgsrip requires it.
    required = (
        ("tesseract-ocr", config.tesseract_path),
        ("mkvtoolnix", config.mkvmerge_path),
        ("pgsrip", config.pgsrip_path),
    )
    missing = [package for package, location in required if not location]

    if not missing:
        return True

    self.main.thread_logging_signal.emit(
        f"ERROR:{t('Missing dependencies for PGS OCR')}: {', '.join(missing)}\n\n"
        "Install instructions:\n"
        " Windows: Run setup_pgs_ocr_windows.bat in FastFlix folder\n"
        " Linux: sudo apt install tesseract-ocr mkvtoolnix && pip install pgsrip\n"
        " macOS: brew install tesseract mkvtoolnix && pip install pgsrip\n\n"
        "Or download manually:\n"
        " Tesseract: https://github.com/UB-Mannheim/tesseract/wiki\n"
        " MKVToolNix: https://mkvtoolnix.download/downloads.html\n"
        " pgsrip: pip install pgsrip"
    )
    return False
212+
def _convert_sup_to_srt(self, sup_filepath: str) -> bool:
    """Convert an already-extracted .sup file to .srt using pgsrip OCR.

    The .sup file is first renamed so that its language tag uses the 2-letter
    code, then pgsrip is run on it with the configured tesseract exposed through
    the child process environment. When an .srt file has been produced the .sup
    file is removed. Failures are reported on ``thread_logging_signal`` and
    turned into a ``False`` return value instead of being raised.

    Args:
        sup_filepath: Path to the extracted .sup file

    Returns:
        True if conversion successful, False otherwise
    """
    # Check dependencies first; the check emits its own error message.
    if not self._check_pgsrip_dependencies():
        return False

    config = self.app.fastflix.config

    try:
        self.main.thread_logging_signal.emit(
            f"INFO:{t('Converting .sup to .srt using OCR')} (this may take 3-5 minutes)..."
        )

        # pgsrip is given a 2-letter code (e.g. "en"), the track carries a 3-letter one.
        from fastflix.language import Language

        try:
            lang_2letter = Language(self.language).pt1  # e.g. eng -> en
        except Exception:
            lang_2letter = ""
        if not lang_2letter:
            # Default to English when the lookup fails or yields no 2-letter code.
            lang_2letter = "en"

        # Rename the .sup so its language tag is the 2-letter code.
        sup_path = Path(sup_filepath)
        lang_tag = f".{self.language}."
        if lang_tag in sup_path.name:
            renamed = sup_path.with_name(sup_path.name.replace(lang_tag, f".{lang_2letter}."))
            if renamed != sup_path:
                # replace() overwrites a leftover file from an earlier run;
                # rename() would raise on Windows in that case.
                sup_path.replace(renamed)
                sup_path = renamed

        pgsrip_cmd = str(config.pgsrip_path) if config.pgsrip_path else "pgsrip"

        # Let the child process find tesseract: prepend its folder to PATH and
        # export the full executable path as TESSERACT_CMD.
        env = os.environ.copy()
        if config.tesseract_path:
            tesseract_dir = str(Path(config.tesseract_path).parent)
            env["PATH"] = f"{tesseract_dir}{os.pathsep}{env.get('PATH', '')}"
            env["TESSERACT_CMD"] = str(config.tesseract_path)

        # Remember the subtitles already in the folder so that only a file
        # written by this run can be taken for the OCR result.
        srt_before = {p: p.stat().st_mtime_ns for p in sup_path.parent.glob("*.srt")}

        pgsrip_result = run(
            [
                pgsrip_cmd,
                "--language",
                lang_2letter,
                "--force",  # Overwrite existing files
                str(sup_path),
            ],
            capture_output=True,
            text=True,
            errors="replace",  # undecodable tool output must not abort the conversion
            timeout=600,  # 10 minute timeout for OCR
            env=env,
        )

        if pgsrip_result.returncode != 0:
            error_msg = pgsrip_result.stderr if pgsrip_result.stderr else pgsrip_result.stdout
            raise RuntimeError(f"pgsrip failed with return code {pgsrip_result.returncode}: {error_msg}")

        # The result is expected next to the .sup file under the same name.
        expected_srt = sup_path.with_suffix(".srt")
        if not expected_srt.exists():
            # Otherwise accept only an .srt that is new or was rewritten during
            # this run, never an unrelated subtitle that was already there.
            produced = [
                p for p in sup_path.parent.glob("*.srt") if srt_before.get(p) != p.stat().st_mtime_ns
            ]
            if not produced:
                raise RuntimeError(f"pgsrip completed but no .srt file found in {sup_path.parent}")
            expected_srt = max(produced, key=lambda p: p.stat().st_mtime_ns)

        self.main.thread_logging_signal.emit(f"INFO:{t('OCR conversion successful')}: {expected_srt.name}")

        # The .sup is no longer needed; failing to delete it is not a conversion failure.
        try:
            sup_path.unlink()
        except OSError as err:
            self.main.thread_logging_signal.emit(f"WARNING:{t('Could not remove .sup file')}: {err}")
        else:
            self.main.thread_logging_signal.emit(f"INFO:{t('Removed .sup file, kept .srt')}")

        return True

    except Exception as err:
        # Boundary of the worker thread: report the problem instead of raising.
        self.main.thread_logging_signal.emit(f"ERROR:{t('OCR conversion failed')}: {err}")
        return False
306+
167307
168308class AudioNoramlize (QtCore .QThread ):
169309 def __init__ (self , app : FastFlixApp , main , audio_type , signal ):
0 commit comments