ByteTrix · kovyrin · Oct 14, 2025 · Oct 14, 2025 · Oct 14, 2025 · Oct 14, 2025
diff --git a/.env.example b/.env.example
@@ -61,6 +61,9 @@ RESUME_PARTIAL=true
 # Enable detailed logging for troubleshooting
 DEBUG=false
 
+# Download subtitles/captions when available (default: true)
+SUBTITLE_DOWNLOAD_ENABLED=true
+
 # ===============================================
 # ADVANCED SETTINGS
 # ===============================================
@@ -83,4 +86,4 @@ COURSE_DATA_FILE=""
 # ALL_VIDEO_FORMATS=false
 
 # Log level (DEBUG, INFO, WARNING, ERROR)
-# LOG_LEVEL="INFO"
+# LOG_LEVEL="INFO"
diff --git a/README.md b/README.md
@@ -52,6 +52,7 @@ A modern, feature-rich Python utility to download courses from Thinkific platfor
 | 📄 **HTML Content** | ✅ Full | `downloader.py` | Clean extraction, formatting |
 | 📚 **PDF Documents** | ✅ Full | `downloader.py` | Direct download, validation |
 | 🎵 **Audio Files** | ✅ Full | `downloader.py` | MP3, M4A support |
+| 📝 **Subtitles (Wistia)** | ✅ Full | `wistia_downloader.py` | Multi-language caption downloads |
 | 🎯 **Quizzes** | ✅ Basic | `downloader.py` | Structure extraction |
 | 🎨 **Presentations** | ✅ Full | FFmpeg merge | Multi-slide processing |
 
@@ -70,6 +71,7 @@ A modern, feature-rich Python utility to download courses from Thinkific platfor
 - **Resume Support** - Skip existing files, continue interrupted downloads
 - **Atomic Resume/Backup** - Status file is always safely backed up and updated, works on Windows, Mac, Linux
 - **Multiple Quality Options** - Choose video quality (720p, 1080p, etc.)
+- **Subtitle Downloads** - Automatically grab Wistia caption tracks in multiple languages
 - **Comprehensive Logging** - Debug mode for troubleshooting
 
 ### 🛡️ **Safety & Compliance**
@@ -201,6 +203,7 @@ RATE_LIMIT_MB_S=            # Rate limit in MB/s (empty = unlimited)
 VALIDATE_DOWNLOADS=true     # Enable file integrity validation
 RESUME_PARTIAL=true         # Enable resume for partial downloads
 DEBUG=false                 # Enable debug logging
+SUBTITLE_DOWNLOAD_ENABLED=true # Download subtitles/captions when available
 
 # ===============================================
 # ADVANCED SETTINGS

diff --git a/thinkific_downloader/config.py b/thinkific_downloader/config.py
@@ -37,6 +37,7 @@ class Settings:
     resume_partial: bool = True
     debug: bool = False
     course_name: str = "Course"
+    subtitle_download_enabled: bool = True
 
     @classmethod
     def from_env(cls):
@@ -67,6 +68,7 @@ def from_env(cls):
         validate_downloads = os.getenv('VALIDATE_DOWNLOADS', 'true').lower() in ('1', 'true', 'yes', 'on')
         resume_partial = os.getenv('RESUME_PARTIAL', 'true').lower() in ('1', 'true', 'yes', 'on')
         debug = os.getenv('DEBUG', 'false').lower() in ('1', 'true', 'yes', 'on')
+        subtitle_download_enabled = os.getenv('SUBTITLE_DOWNLOAD_ENABLED', 'true').lower() in ('1', 'true', 'yes', 'on')
 
         # Clean cookie data to remove Unicode characters that cause encoding issues
         if cookie_data:
@@ -101,5 +103,6 @@ def from_env(cls):
             download_delay=download_delay,
             validate_downloads=validate_downloads,
             resume_partial=resume_partial,
-            debug=debug
+            debug=debug,
+            subtitle_download_enabled=subtitle_download_enabled
         )
diff --git a/thinkific_downloader/downloader.py b/thinkific_downloader/downloader.py
@@ -386,6 +386,9 @@ def download_file_chunked(src_url: str, dst_name: str, chunk_mb: int = 1):
 def init_course(data: Dict[str, Any]):
     """Initialize course structure and collect ALL download tasks first."""
     global COURSE_CONTENTS, ROOT_PROJECT_DIR, BASE_HOST, DOWNLOAD_TASKS
+
+    # Ensure settings/download manager are initialized so feature flags are available
+    init_settings()
 
     # Initialize download tasks list
     DOWNLOAD_TASKS = []
@@ -417,6 +420,21 @@ def init_course(data: Dict[str, Any]):
                 analyzed_chapters = set(cache_data.get('analyzed_chapters', []))
                 saved_tasks = cache_data.get('download_tasks', [])
                 print(f"📋 Found previous progress: {len(analyzed_chapters)} chapters analyzed, {len(saved_tasks)} tasks cached")
+                # If subtitle downloads are enabled but cached tasks do not contain subtitles,
+                # treat cache as outdated so we can regenerate tasks with captions.
+                if SETTINGS and SETTINGS.subtitle_download_enabled and saved_tasks:
+                    has_subtitle_tasks = any(
+                        (task.get('content_type') or '').lower() == 'subtitle'
+                        for task in saved_tasks
+                    )
+                    if not has_subtitle_tasks:
+                        print("🆕 Subtitle support enabled — refreshing cached analysis to include captions.")
+                        analyzed_chapters = set()
+                        saved_tasks = []
+                        try:
+                            cache_file.unlink()
+                        except Exception:
+                            pass
         except:
             analyzed_chapters = set()
             saved_tasks = []
@@ -835,9 +853,16 @@ def collect_video_task_wistia(wistia_id: str, file_name: str, dest_dir: Path):
             video_url = selected.get('url')
             if video_url:
                 ext = '.mp4'  # Default extension
-                resolved_name = filter_filename(file_name) + ext
+                resolved_name = filter_filename(file_name)
+                if not resolved_name.lower().endswith(ext):
+                    resolved_name += ext
                 print(f"   📹 Found video: {resolved_name}")
                 add_download_task(video_url, dest_dir / resolved_name, "video")
+                try:
+                    from .wistia_downloader import queue_wistia_subtitle_downloads
+                    queue_wistia_subtitle_downloads(data.get('media') or {}, dest_dir, resolved_name)
+                except Exception as subtitle_error:
+                    print(f"   ⚠️  Unable to queue subtitles for {resolved_name}: {subtitle_error}")
     except Exception as e:
         print(f"   ❌ Failed to collect Wistia video {wistia_id}: {e}")
 
@@ -1282,4 +1307,4 @@ def main(argv: List[str]):
 
 
 if __name__ == '__main__':
-    main(sys.argv)
+    main(sys.argv)
diff --git a/thinkific_downloader/wistia_downloader.py b/thinkific_downloader/wistia_downloader.py
@@ -1,19 +1,233 @@
 import json
+import os
 import re
-import requests
 import zlib
-from typing import Optional, List
 from pathlib import Path
-import os
+from typing import Any, Dict, List, Optional
+from urllib.parse import urlparse
+
+import requests
+
 from .file_utils import filter_filename
-from .download_manager import DownloadManager
 # Local imports inside functions to avoid circular dependency during module import
 
 # Handles video proxy and wistia direct downloads
 
 WISTIA_JSON_URL = "https://fast.wistia.com/embed/medias/{id}.json"
 
 VIDEO_PROXY_JSONP_ID_PATTERN = re.compile(r"medias/(\w+)\.jsonp")
+# Extension used for caption files when neither the track metadata nor the
+# track URL supplies one.
+DEFAULT_SUBTITLE_EXTENSION = "vtt"
+# Matches runs of characters other than ASCII letters, digits and hyphens;
+# used to sanitize the language part of subtitle filenames.
+_LANGUAGE_SANITIZE_PATTERN = re.compile(r'[^A-Za-z0-9\-]+')
+
+
+def _normalize_wistia_track_url(url: Optional[str]) -> Optional[str]:
+    """Turn a Wistia caption track reference into an absolute URL.
+
+    Args:
+        url: Raw track URL from the media JSON. It may be absolute,
+            protocol-relative (``//host/...``), root-relative (``/path``)
+            or a bare relative path.
+
+    Returns:
+        The whitespace-stripped URL made absolute, or ``None`` when ``url``
+        is not a non-blank string. URLs that already start with ``http://``
+        or ``https://`` (any letter case) are returned as-is after
+        stripping; every other form is anchored on ``fast.wistia.com``
+        with an ``https`` scheme.
+    """
+    if not isinstance(url, str):
+        return None
+
+    candidate = url.strip()
+    if not candidate:
+        return None
+
+    # Protocol-relative reference: only the scheme is missing.
+    if candidate.startswith('//'):
+        return f"https:{candidate}"
+    # Root-relative path: resolve it against the Wistia host.
+    if candidate.startswith('/'):
+        return f"https://fast.wistia.com{candidate}"
+    # Already absolute: nothing to do.
+    if re.match(r'^https?://', candidate, re.IGNORECASE):
+        return candidate
+    # Bare relative path.
+    return f"https://fast.wistia.com/{candidate.lstrip('/')}"
+
+
+def _build_caption_url(hashed_id: Optional[str], language: Optional[str], extension: Optional[str] = None) -> Optional[str]:
+    """Construct a Wistia caption URL when only hashedId and language are available.
+
+    Args:
+        hashed_id: Wistia media identifier; used verbatim in the URL path.
+        language: Language code sent as the ``language`` query parameter.
+        extension: Caption file extension, with or without a leading dot.
+            Falls back to ``DEFAULT_SUBTITLE_EXTENSION`` when empty.
+
+    Returns:
+        The caption URL, or ``None`` when ``hashed_id`` or ``language`` is
+        missing.
+    """
+    # Local import keeps this block self-contained.
+    from urllib.parse import quote
+
+    if not hashed_id or not language:
+        return None
+
+    ext = (extension or DEFAULT_SUBTITLE_EXTENSION).lstrip('.') or DEFAULT_SUBTITLE_EXTENSION
+    # Percent-encode the language so a value containing spaces, '&' or '#'
+    # cannot corrupt the query string. Plain codes such as "eng" or "en-US"
+    # are left untouched by the encoding.
+    encoded_language = quote(str(language), safe='')
+    return f"https://fast.wistia.com/embed/captions/{hashed_id}.{ext}?language={encoded_language}"
+
+
+def _infer_track_extension(url: str, fallback: str = DEFAULT_SUBTITLE_EXTENSION) -> str:
+    """Guess a caption file extension from the path component of ``url``.
+
+    Args:
+        url: Track URL; only its path is inspected, the query is ignored.
+        fallback: Extension returned when none can be derived.
+
+    Returns:
+        The lower-cased suffix of the URL path without its leading dot, or
+        ``fallback`` when the path has no suffix or the URL cannot be parsed.
+    """
+    try:
+        path_suffix = Path(urlparse(url).path).suffix
+    except Exception:
+        # Best effort: any parsing problem simply yields the fallback.
+        return fallback
+
+    if not path_suffix:
+        return fallback
+    return path_suffix.lstrip('.').lower() or fallback
+
+
+def extract_wistia_subtitle_tracks(media: Dict[str, Any]) -> List[Dict[str, Optional[str]]]:
+    """Collect subtitle/caption track descriptors from Wistia media JSON.
+
+    The following keys of ``media`` are scanned, in this order: ``captions``,
+    ``text_tracks``, ``textTracks``, caption-like entries of ``assets`` and
+    ``availableTranscripts``. A track without a usable URL gets one built
+    from the media's hashed id and the track language when both are known;
+    otherwise the track is dropped.
+
+    Args:
+        media: Parsed Wistia media object. Anything that is not a dict
+            yields an empty list.
+
+    Returns:
+        A list of dicts with the keys ``url``, ``language``, ``label`` and
+        ``ext``, unique by URL and in first-seen order. When a URL is seen
+        again, its missing ``language``/``label``/``ext`` values are filled
+        in from the later occurrence.
+    """
+    if not isinstance(media, dict):
+        return []
+
+    media_hash = media.get('hashedId') or media.get('hashed_id')
+    caption_kinds = ('caption', 'captions', 'subtitle', 'subtitles')
+    collected: List[Dict[str, Optional[str]]] = []
+
+    def pick(mapping, *keys):
+        # Same result as ``mapping.get(k1) or mapping.get(k2) or ...``:
+        # the first truthy value, else whatever the last key produced.
+        value = None
+        for key in keys:
+            value = mapping.get(key)
+            if value:
+                break
+        return value
+
+    def register(url, language, label, ext):
+        resolved = _normalize_wistia_track_url(url)
+        if not resolved and media_hash and language:
+            # No usable URL in the metadata: fall back to the captions endpoint.
+            resolved = _build_caption_url(media_hash, language, ext)
+        if resolved:
+            collected.append({
+                'url': resolved,
+                'language': language,
+                'label': label,
+                'ext': (ext or '').lstrip('.') or None
+            })
+
+    def register_text_tracks(entries, label_keys):
+        # Shared walker for the snake_case and camelCase text-track lists.
+        for entry in entries:
+            if not isinstance(entry, dict):
+                continue
+            language = pick(entry, 'language', 'lang')
+            label = pick(entry, *label_keys)
+            sources = entry.get('sources') or []
+            if not sources:
+                register(pick(entry, 'url', 'src'), language, label, entry.get('ext'))
+                continue
+            for source in sources:
+                if isinstance(source, dict):
+                    register(
+                        pick(source, 'url', 'src'),
+                        language,
+                        label,
+                        source.get('ext') or entry.get('ext')
+                    )
+
+    for caption in media.get('captions') or []:
+        if isinstance(caption, dict):
+            register(
+                pick(caption, 'url', 'src'),
+                pick(caption, 'language', 'lang'),
+                pick(caption, 'languageName', 'label', 'name'),
+                caption.get('ext')
+            )
+
+    register_text_tracks(media.get('text_tracks') or [], ('name', 'label'))
+    register_text_tracks(media.get('textTracks') or [], ('name', 'label', 'title'))
+
+    for asset in media.get('assets') or []:
+        if not isinstance(asset, dict):
+            continue
+        asset_type = (asset.get('type') or '').lower()
+        asset_kind = (asset.get('kind') or '').lower()
+        if asset_type in caption_kinds or asset_kind in caption_kinds:
+            register(
+                pick(asset, 'url', 'src'),
+                pick(asset, 'language', 'lang'),
+                pick(asset, 'display_name', 'name'),
+                asset.get('ext')
+            )
+
+    if media_hash:
+        for transcript in media.get('availableTranscripts') or []:
+            if not isinstance(transcript, dict) or not transcript.get('hasCaptions'):
+                continue
+            language = pick(transcript, 'language', 'wistiaLanguageCode', 'bcp47LanguageTag')
+            if not language:
+                continue
+            register(
+                _build_caption_url(media_hash, language, DEFAULT_SUBTITLE_EXTENSION),
+                language,
+                pick(transcript, 'name', 'familyName') or language,
+                DEFAULT_SUBTITLE_EXTENSION
+            )
+
+    # De-duplicate by URL: the first occurrence wins, later ones only fill gaps.
+    by_url: Dict[str, Dict[str, Optional[str]]] = {}
+    for track in collected:
+        kept = by_url.setdefault(track['url'], track)
+        if kept is track:
+            continue
+        for field in ('language', 'label', 'ext'):
+            if not kept.get(field) and track.get(field):
+                kept[field] = track[field]
+
+    return list(by_url.values())
+
+
+def queue_wistia_subtitle_downloads(media: Dict[str, Any], dest_dir: Path, video_base_name: str):
+    """Queue subtitle download tasks for a Wistia media object.
+
+    One task of content type ``subtitle`` is queued per track returned by
+    ``extract_wistia_subtitle_tracks``. Files are named
+    ``<base>.<language>.<ext>``, where ``<base>`` is the sanitized stem of
+    ``video_base_name``.
+
+    Args:
+        media: Parsed Wistia media JSON.
+        dest_dir: Target directory; a non-``Path`` value is converted.
+        video_base_name: Name of the related video file. Its stem prefixes
+            every subtitle filename; when the stem is empty the media's
+            ``name``/``hashedId`` or ``captions`` is used instead.
+
+    Returns:
+        None. Nothing is queued when subtitle downloads are disabled in the
+        settings or the media exposes no caption tracks.
+    """
+    # Imported lazily to avoid a circular import with the downloader module.
+    from . import downloader as downloader_module
+
+    if not isinstance(dest_dir, Path):
+        dest_dir = Path(dest_dir)
+
+    # Read SETTINGS through the module *after* init_settings() has run.
+    # ``from .downloader import SETTINGS`` would copy the value as it was
+    # before initialisation, so a first-time init would go unnoticed and the
+    # disable flag would be ignored.
+    # NOTE(review): assumes init_settings() rebinds the module-level
+    # SETTINGS global - confirm against downloader.py.
+    downloader_module.init_settings()
+    settings = downloader_module.SETTINGS
+    if settings and not getattr(settings, 'subtitle_download_enabled', True):
+        return
+
+    tracks = extract_wistia_subtitle_tracks(media)
+    if not tracks:
+        return
+
+    base_name = Path(video_base_name).stem
+    if not base_name:
+        fallback_name = media.get('name') or media.get('hashedId') or 'captions'
+        base_name = filter_filename(str(fallback_name))
+    else:
+        base_name = filter_filename(base_name)
+
+    if not base_name:
+        base_name = "captions"
+
+    # Lower-cased names already queued in this call, so two tracks can never
+    # target the same file (also on case-insensitive filesystems).
+    used_names = set()
+    counter = 1
+    for track in tracks:
+        url = track.get('url')
+        if not url:
+            continue
+
+        ext = (track.get('ext') or _infer_track_extension(url)).lstrip('.').lower() or DEFAULT_SUBTITLE_EXTENSION
+        language_part = track.get('language') or track.get('label') or ''
+        if isinstance(language_part, (list, dict)):
+            language_part = ''
+        language_part = str(language_part or '')
+        language_part = _LANGUAGE_SANITIZE_PATTERN.sub('-', language_part).strip('-')
+
+        if not language_part:
+            language_part = 'captions' if counter == 1 else f"captions-{counter}"
+
+        subtitle_filename = filter_filename(f"{base_name}.{language_part}.{ext}")
+        if not subtitle_filename:
+            subtitle_filename = filter_filename(f"{base_name}.captions-{counter}.{ext}")
+
+        # Two tracks with the same language and extension would otherwise be
+        # written to the same path; disambiguate with the track counter.
+        if subtitle_filename.lower() in used_names:
+            subtitle_filename = filter_filename(f"{base_name}.{language_part}-{counter}.{ext}")
+        used_names.add(subtitle_filename.lower())
+
+        print(f"   [Subs] Queued subtitles: {subtitle_filename}")
+        downloader_module.add_download_task(url, dest_dir / subtitle_filename, "subtitle")
+        counter += 1
 
 
 def video_downloader_videoproxy(video_url: str, file_name: str, quality: str = "720p"):
@@ -143,6 +357,7 @@ def infer_ext(asset: dict) -> str:
         return '.mp4'
 
     resolved_base = filter_filename(file_name if file_name else media.get('name') or wistia_id)
+    current_dir = Path.cwd()
 
     if all_formats_flag:
         print(f"Downloading all available Wistia assets for {resolved_base}")
@@ -172,6 +387,7 @@ def infer_ext(asset: dict) -> str:
                 DOWNLOAD_MANAGER.download_file(a_url, Path(filter_filename(out_name)))
             else:
                 print("Download manager not initialized")
+        queue_wistia_subtitle_downloads(media, current_dir, resolved_base)
         return
 
     # Single quality path
@@ -200,6 +416,6 @@ def infer_ext(asset: dict) -> str:
 
     # Queue video for parallel download with absolute path to current directory
     from .downloader import add_download_task
-    current_dir = Path.cwd()  # Capture current working directory
     full_path = current_dir / resolved_name  # Create absolute path
     add_download_task(video_url, full_path, "video")
+    queue_wistia_subtitle_downloads(media, current_dir, resolved_name)