cdgriffith · mikeSGman · Oct 15, 2025 · Oct 19, 2025 · Oct 19, 2025 · Oct 20, 2025
diff --git a/FastFlix_Windows_OneFile.spec b/FastFlix_Windows_OneFile.spec
@@ -27,6 +27,9 @@ all_imports.remove("python-box")
 all_imports.append("box")
 all_imports.append("iso639")
 
+# Add pgsrip for OCR support
+all_imports.extend(["pgsrip", "pytesseract", "cv2", "numpy", "pysrt", "babelfish", "cleanit"])
+
 portable_file = "fastflix\\portable.py"
 with open(portable_file, "w") as portable:
     portable.write(" ")

diff --git a/fastflix/models/config.py b/fastflix/models/config.py
@@ -99,6 +99,77 @@ def where(filename: str, portable_mode=False) -> Path | None:
     return None
 
 
+def _find_in_program_files(folder, exe_name):
+    """Return the first existing <drive>:/Program Files[ (x86)]/<folder>/<exe_name>, else None"""
+    import string
+    for letter in string.ascii_uppercase:
+        if not Path(f"{letter}:/").exists():
+            continue
+        for base in ("Program Files", "Program Files (x86)"):
+            candidate = Path(f"{letter}:/{base}/{folder}/{exe_name}")
+            if candidate.exists():
+                return candidate
+    return None
+
+
+def _find_tesseract_in_registry():
+    """Return tesseract.exe from the registry InstallDir (machine hive first, then user), else None"""
+    try:
+        import winreg
+    except ImportError:
+        return None
+    for root_key in (winreg.HKEY_LOCAL_MACHINE, winreg.HKEY_CURRENT_USER):
+        try:
+            # The context manager closes the key even when QueryValueEx raises
+            with winreg.OpenKey(root_key, r"SOFTWARE\Tesseract-OCR") as key:
+                install_path = winreg.QueryValueEx(key, "InstallDir")[0]
+        except OSError:
+            continue
+        tesseract_exe = Path(install_path) / "tesseract.exe"
+        if tesseract_exe.exists():
+            return tesseract_exe
+    return None
+
+
+def find_ocr_tool(name):
+    """Find OCR tools (tesseract, mkvmerge, pgsrip) similar to how we find FFmpeg
+
+    Search order: the FF_<NAME> environment variable, the system PATH, common
+    Windows install locations (tesseract and mkvmerge only), then the FastFlix
+    OCR user data folder and its "bin" subfolder.
+
+    ``name`` is the executable name without extension, e.g. "tesseract".
+    Returns the Path to the tool, or None when it could not be located.
+    """
+    # Check environment variable
+    if ocr_location := os.getenv(f"FF_{name.upper()}"):
+        return Path(ocr_location).absolute()
+
+    # Check system PATH
+    if (ocr_location := shutil.which(name)) is not None:
+        return Path(ocr_location).absolute()
+
+    # Windows only: probe common install locations that may not be on PATH
+    if win_based and name == "tesseract":
+        if found := _find_in_program_files("Tesseract-OCR", "tesseract.exe") or _find_tesseract_in_registry():
+            return found
+    elif win_based and name == "mkvmerge":
+        if found := _find_in_program_files("MKVToolNix", "mkvmerge.exe"):
+            return found
+
+    # Check in FastFlix OCR tools folder, then its bin subfolder
+    ocr_folder = Path(user_data_dir("FastFlix_OCR", appauthor=False, roaming=True))
+    wanted = (name.lower(), f"{name.lower()}.exe")
+    for folder in (ocr_folder, ocr_folder / "bin"):
+        if not folder.is_dir():
+            continue
+        for file in folder.iterdir():
+            if file.is_file() and file.name.lower() in wanted:
+                return file
+
+    return None
+
+
 class Config(BaseModel):
     version: str = __version__
     config_path: Path = Field(default_factory=get_config)
@@ -168,6 +239,13 @@ class Config(BaseModel):
 
     disable_cover_extraction: bool = False
 
+    # PGS to SRT OCR Settings
+    enable_pgs_ocr: bool = False
+    tesseract_path: Path | None = Field(default_factory=lambda: find_ocr_tool("tesseract"))
+    mkvmerge_path: Path | None = Field(default_factory=lambda: find_ocr_tool("mkvmerge"))
+    pgsrip_path: Path | None = Field(default_factory=lambda: find_ocr_tool("pgsrip"))
+    pgs_ocr_language: str = "eng"
+
     def encoder_opt(self, profile_name, profile_option_name):
         encoder_settings = getattr(self.profiles[self.selected_profile], profile_name)
         if encoder_settings:

diff --git a/fastflix/widgets/background_tasks.py b/fastflix/widgets/background_tasks.py
@@ -2,6 +2,7 @@
 # -*- coding: utf-8 -*-
 import logging
 import os
+import shutil
 from pathlib import Path
 from subprocess import PIPE, STDOUT, Popen, run, check_output
 from packaging import version
@@ -46,13 +47,14 @@ def run(self):
 
 
 class ExtractSubtitleSRT(QtCore.QThread):
-    def __init__(self, app: FastFlixApp, main, index, signal, language):
+    def __init__(self, app: FastFlixApp, main, index, signal, language, use_ocr=False):
         super().__init__(main)
         self.main = main
         self.app = app
         self.index = index
         self.signal = signal
         self.language = language
+        self.use_ocr = use_ocr
 
     def run(self):
         subtitle_format = self._get_subtitle_format()
@@ -63,6 +65,9 @@ def run(self):
             self.signal.emit()
             return
 
+        # Flag to track if we need OCR conversion after extraction
+        should_convert_to_srt = False
+
         if subtitle_format == "srt":
             extension = "srt"
             output_args = ["-c", "srt", "-f", "srt"]
@@ -75,6 +80,8 @@ def run(self):
         elif subtitle_format == "pgs":
             extension = "sup"
             output_args = ["-c", "copy"]
+            # If OCR is requested, we'll extract .sup first, then convert after
+            should_convert_to_srt = self.use_ocr and self.app.fastflix.config.enable_pgs_ocr
         else:
             self.main.thread_logging_signal.emit(
                 f"WARNING:{t('Subtitle Track')} {self.index} {t('is not in supported format (SRT, ASS, SSA, PGS), skipping extraction')}: {subtitle_format}"
@@ -115,6 +122,13 @@ def run(self):
                 )
             else:
                 self.main.thread_logging_signal.emit(f"INFO:{t('Extracted subtitles successfully')}")
+
+                # If this is PGS and OCR was requested, convert the .sup to .srt
+                if subtitle_format == "pgs" and should_convert_to_srt:
+                    if self._convert_sup_to_srt(filename):
+                        self.main.thread_logging_signal.emit(f"INFO:{t('Successfully converted to SRT with OCR')}")
+                    else:
+                        self.main.thread_logging_signal.emit(f"WARNING:{t('OCR conversion failed, kept .sup file')}")
         self.signal.emit()
 
     def _get_subtitle_format(self):
@@ -164,6 +178,132 @@ def _get_subtitle_format(self):
             )
             return None
 
+    def _check_pgsrip_dependencies(self) -> bool:
+        """Verify that every external tool needed for pgsrip OCR has a configured path.
+
+        Emits an ERROR log entry listing the missing packages with install hints.
+
+        Returns:
+            True when the tesseract, mkvmerge and pgsrip paths are all set, else False.
+        """
+        config = self.app.fastflix.config
+        # (configured path, package name shown to the user), in reporting order
+        requirements = (
+            (config.tesseract_path, "tesseract-ocr"),
+            (config.mkvmerge_path, "mkvtoolnix"),
+            (config.pgsrip_path, "pgsrip"),
+        )
+        missing = [package for tool_path, package in requirements if not tool_path]
+        if not missing:
+            return True
+
+        self.main.thread_logging_signal.emit(
+            f"ERROR:{t('Missing dependencies for PGS OCR')}: {', '.join(missing)}\n\n"
+            f"Install instructions:\n"
+            f"  Windows: Run setup_pgs_ocr_windows.bat in FastFlix folder\n"
+            f"  Linux: sudo apt install tesseract-ocr mkvtoolnix && pip install pgsrip\n"
+            f"  macOS: brew install tesseract mkvtoolnix && pip install pgsrip\n\n"
+            f"Or download manually:\n"
+            f"  Tesseract: https://github.com/UB-Mannheim/tesseract/wiki\n"
+            f"  MKVToolNix: https://mkvtoolnix.download/downloads.html\n"
+            f"  pgsrip: pip install pgsrip"
+        )
+        return False
+
+    def _convert_sup_to_srt(self, sup_filepath: str) -> bool:
+        """Convert an already-extracted .sup file to .srt using pgsrip OCR
+
+        The .sup is renamed to carry a 2-letter language code and handed to pgsrip with
+        tesseract exposed through the environment; it is deleted once an .srt exists.
+
+        Args:
+            sup_filepath: Path to the extracted .sup file
+
+        Returns:
+            True if conversion successful, False otherwise (the failure is logged)
+        """
+        # Check dependencies first
+        if not self._check_pgsrip_dependencies():
+            return False
+
+        config = self.app.fastflix.config
+        try:
+            self.main.thread_logging_signal.emit(
+                f"INFO:{t('Converting .sup to .srt using OCR')} (this may take 3-5 minutes)..."
+            )
+
+            # pgsrip uses 2-letter codes in filenames (e.g., "en" not "eng")
+            from fastflix.language import Language
+
+            try:
+                lang_2letter = Language(self.language).pt1  # Convert eng -> en
+            except Exception:
+                lang_2letter = None
+            # Default to English if the lookup fails or yields no 2-letter code
+            lang_2letter = lang_2letter or "en"
+
+            # Rename .sup file to use 2-letter language code (what pgsrip expects)
+            sup_path = Path(sup_filepath)
+            marker = f".{self.language}."
+            if lang_2letter != self.language and marker in sup_path.name:
+                renamed = sup_path.with_name(sup_path.name.replace(marker, f".{lang_2letter}."))
+                # replace() overwrites a stale target, where rename() fails on Windows
+                sup_path = sup_path.replace(renamed)
+
+            # Remember existing .srt files so only output of this run is accepted later
+            srt_before = {srt: srt.stat().st_mtime_ns for srt in sup_path.parent.glob("*.srt")}
+
+            # Add tesseract directory to PATH so pytesseract can find it
+            env = os.environ.copy()
+            if config.tesseract_path:
+                tesseract_dir = str(Path(config.tesseract_path).parent)
+                env["PATH"] = f"{tesseract_dir}{os.pathsep}{env.get('PATH', '')}"
+                env["TESSERACT_CMD"] = str(config.tesseract_path)
+
+            pgsrip_result = run(
+                [
+                    str(config.pgsrip_path) if config.pgsrip_path else "pgsrip",
+                    "--language", lang_2letter,  # Use 2-letter code (e.g., "en", "es", "fr")
+                    "--force",  # Overwrite existing files
+                    str(sup_path),
+                ],
+                capture_output=True,
+                text=True,
+                timeout=600,  # 10 minute timeout for OCR
+                env=env,  # Pass environment with TESSERACT_CMD
+            )
+            if pgsrip_result.returncode != 0:
+                error_msg = pgsrip_result.stderr or pgsrip_result.stdout
+                raise RuntimeError(f"pgsrip failed with return code {pgsrip_result.returncode}: {error_msg}")
+
+            # pgsrip creates .srt file in same directory as .sup file
+            expected_srt = sup_path.with_suffix(".srt")
+            if not expected_srt.exists():
+                # Accept only .srt files new or modified since the snapshot, never an unrelated one
+                srt_files = [
+                    srt
+                    for srt in sup_path.parent.glob("*.srt")
+                    if srt_before.get(srt) != srt.stat().st_mtime_ns
+                ]
+                if not srt_files:
+                    raise RuntimeError(f"pgsrip completed but no .srt file found in {sup_path.parent}")
+                expected_srt = max(srt_files, key=lambda srt: srt.stat().st_mtime_ns)
+
+            self.main.thread_logging_signal.emit(f"INFO:{t('OCR conversion successful')}: {expected_srt.name}")
+
+            # The .srt supersedes the .sup; deleting it is best-effort
+            try:
+                sup_path.unlink()
+            except OSError as unlink_err:
+                self.main.thread_logging_signal.emit(f"WARNING:Could not remove {sup_path.name}: {unlink_err}")
+            else:
+                self.main.thread_logging_signal.emit(f"INFO:{t('Removed .sup file, kept .srt')}")
+
+            return True
+        except Exception as err:
+            self.main.thread_logging_signal.emit(f"ERROR:{t('OCR conversion failed')}: {err}")
+            return False
+
 
 class AudioNoramlize(QtCore.QThread):
     def __init__(self, app: FastFlixApp, main, audio_type, signal):

diff --git a/fastflix/widgets/panels/subtitle_panel.py b/fastflix/widgets/panels/subtitle_panel.py
@@ -106,8 +106,32 @@ def __init__(self, app, parent, index, enabled=True, first=False):
             {t("Cannot remove afterwards!")}
             """
         )
-        self.widgets.extract = QtWidgets.QPushButton(t("Extract"))
-        self.widgets.extract.clicked.connect(self.extract)
+
+        # Setup extract button with OCR option for PGS subtitles
+        if sub_track.subtitle_type == "pgs":
+            self.widgets.extract = QtWidgets.QPushButton(t("Extract"))
+            extract_menu = QtWidgets.QMenu(self)
+
+            # Always offer .sup extraction (fast, no dependencies)
+            extract_menu.addAction(t("Extract as .sup (image - fast)"), lambda: self.extract(use_ocr=False))
+
+            # Check if OCR dependencies are available
+            ocr_action = extract_menu.addAction(t("Convert to .srt (OCR - 3-5 min)"), lambda: self.extract(use_ocr=True))
+
+            # Enable OCR option only if user enabled it AND dependencies are available
+            if not self.app.fastflix.config.enable_pgs_ocr:
+                ocr_action.setEnabled(False)
+                ocr_action.setToolTip(t("Enable in Settings > 'Enable PGS to SRT OCR conversion'"))
+            elif not (self.app.fastflix.config.tesseract_path and
+                      self.app.fastflix.config.mkvmerge_path and
+                      self.app.fastflix.config.pgsrip_path):
+                ocr_action.setEnabled(False)
+                ocr_action.setToolTip(t("Missing dependencies: tesseract, mkvtoolnix, or pgsrip"))
+
+            self.widgets.extract.setMenu(extract_menu)
+        else:
+            self.widgets.extract = QtWidgets.QPushButton(t("Extract"))
+            self.widgets.extract.clicked.connect(self.extract)
 
         self.gif_label = QtWidgets.QLabel(self)
         self.movie = QtGui.QMovie(loading_movie)
@@ -167,9 +191,10 @@ def init_move_buttons(self):
         layout.addWidget(self.widgets.down_button)
         return layout
 
-    def extract(self):
+    def extract(self, use_ocr=False):
         worker = ExtractSubtitleSRT(
-            self.parent.app, self.parent.main, self.index, self.extract_completed_signal, language=self.language
+            self.parent.app, self.parent.main, self.index, self.extract_completed_signal,
+            language=self.language, use_ocr=use_ocr
         )
         worker.start()
         self.gif_label.show()