diff --git a/.env.example b/.env.example index 654f1ab..32056dc 100644 --- a/.env.example +++ b/.env.example @@ -61,6 +61,9 @@ RESUME_PARTIAL=true # Enable detailed logging for troubleshooting DEBUG=false +# Download subtitles/captions when available (default: true) +SUBTITLE_DOWNLOAD_ENABLED=true + # =============================================== # ADVANCED SETTINGS # =============================================== @@ -83,4 +86,4 @@ COURSE_DATA_FILE="" # ALL_VIDEO_FORMATS=false # Log level (DEBUG, INFO, WARNING, ERROR) -# LOG_LEVEL="INFO" \ No newline at end of file +# LOG_LEVEL="INFO" diff --git a/README.md b/README.md index a05530a..925986d 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,7 @@ A modern, feature-rich Python utility to download courses from Thinkific platfor | šŸ“„ **HTML Content** | āœ… Full | `downloader.py` | Clean extraction, formatting | | šŸ“š **PDF Documents** | āœ… Full | `downloader.py` | Direct download, validation | | šŸŽµ **Audio Files** | āœ… Full | `downloader.py` | MP3, M4A support | +| šŸ“ **Subtitles (Wistia)** | āœ… Full | `wistia_downloader.py` | Multi-language caption downloads | | šŸŽÆ **Quizzes** | āœ… Basic | `downloader.py` | Structure extraction | | šŸŽØ **Presentations** | āœ… Full | FFmpeg merge | Multi-slide processing | @@ -70,6 +71,7 @@ A modern, feature-rich Python utility to download courses from Thinkific platfor - **Resume Support** - Skip existing files, continue interrupted downloads - **Atomic Resume/Backup** - Status file is always safely backed up and updated, works on Windows, Mac, Linux - **Multiple Quality Options** - Choose video quality (720p, 1080p, etc.) 
+- **Subtitle Downloads** - Automatically grab Wistia caption tracks in multiple languages - **Comprehensive Logging** - Debug mode for troubleshooting ### šŸ›”ļø **Safety & Compliance** @@ -201,6 +203,7 @@ RATE_LIMIT_MB_S= # Rate limit in MB/s (empty = unlimited) VALIDATE_DOWNLOADS=true # Enable file integrity validation RESUME_PARTIAL=true # Enable resume for partial downloads DEBUG=false # Enable debug logging +SUBTITLE_DOWNLOAD_ENABLED=true # Download subtitles/captions when available # =============================================== # ADVANCED SETTINGS diff --git a/thinkific_downloader/config.py b/thinkific_downloader/config.py index c1e1286..ab75fd4 100644 --- a/thinkific_downloader/config.py +++ b/thinkific_downloader/config.py @@ -37,6 +37,7 @@ class Settings: resume_partial: bool = True debug: bool = False course_name: str = "Course" + subtitle_download_enabled: bool = True @classmethod def from_env(cls): @@ -67,6 +68,7 @@ def from_env(cls): validate_downloads = os.getenv('VALIDATE_DOWNLOADS', 'true').lower() in ('1', 'true', 'yes', 'on') resume_partial = os.getenv('RESUME_PARTIAL', 'true').lower() in ('1', 'true', 'yes', 'on') debug = os.getenv('DEBUG', 'false').lower() in ('1', 'true', 'yes', 'on') + subtitle_download_enabled = os.getenv('SUBTITLE_DOWNLOAD_ENABLED', 'true').lower() in ('1', 'true', 'yes', 'on') # Clean cookie data to remove Unicode characters that cause encoding issues if cookie_data: @@ -101,5 +103,6 @@ def from_env(cls): download_delay=download_delay, validate_downloads=validate_downloads, resume_partial=resume_partial, - debug=debug + debug=debug, + subtitle_download_enabled=subtitle_download_enabled ) diff --git a/thinkific_downloader/downloader.py b/thinkific_downloader/downloader.py index f2c2191..4b93d69 100644 --- a/thinkific_downloader/downloader.py +++ b/thinkific_downloader/downloader.py @@ -382,10 +382,74 @@ def download_file_chunked(src_url: str, dst_name: str, chunk_mb: int = 1): add_download_task(src_url, dst_path, 
def _task_content_type(task: Dict[str, Any], default: str = '') -> str:
    """Return the task's lower-cased ``content_type``, or *default* when unusable.

    A missing, empty, or non-string ``content_type`` yields *default*.
    """
    value = task.get('content_type')
    if isinstance(value, str) and value:
        return value.lower()
    return default


def _load_cached_progress(cache_file: Path):
    """Return previously analyzed chapters and queued tasks from the resume cache.

    Args:
        cache_file: Path to the JSON resume cache.

    Returns:
        A ``(analyzed_chapters, saved_tasks)`` tuple: a set of chapter entries
        and a list of task dicts. Both are empty when the cache is missing,
        unreadable, malformed, or has just been invalidated.
    """
    analyzed_chapters: set = set()
    saved_tasks: List[Dict[str, Any]] = []

    if not cache_file.exists():
        return analyzed_chapters, saved_tasks

    try:
        with open(cache_file, 'r', encoding='utf-8') as f:
            cache_data = json.load(f)
    except (ValueError, OSError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return analyzed_chapters, saved_tasks

    # A cache that parses but has the wrong shape is treated like a corrupt
    # one instead of crashing start-up with AttributeError/TypeError.
    if not isinstance(cache_data, dict):
        return analyzed_chapters, saved_tasks

    raw_chapters = cache_data.get('analyzed_chapters') or []
    raw_tasks = cache_data.get('download_tasks') or []
    if not isinstance(raw_chapters, list) or not isinstance(raw_tasks, list):
        return analyzed_chapters, saved_tasks

    try:
        analyzed_chapters = set(raw_chapters)
    except TypeError:
        # Unhashable chapter entries (e.g. nested lists) mean the cache is unusable.
        return set(), []

    saved_tasks = [task for task in raw_tasks if isinstance(task, dict)]
    print(f"📋 Found previous progress: {len(analyzed_chapters)} chapters analyzed, {len(saved_tasks)} tasks cached")

    # If subtitle downloads were newly enabled, invalidate the cache so the
    # analysis phase can regenerate tasks that include captions.
    # NOTE(review): a course whose videos have no captions never produces
    # subtitle tasks, so this check presumably fires on every run and forces a
    # full re-analysis each time. Fixing that needs a marker written by the
    # cache writer, which is not visible here. Confirm and follow up.
    if saved_tasks and SETTINGS and SETTINGS.subtitle_download_enabled:
        has_subtitle_tasks = any(
            _task_content_type(task) == 'subtitle' for task in saved_tasks
        )
        if not has_subtitle_tasks:
            print("🆕 Subtitle support enabled — refreshing cached analysis to include captions.")
            try:
                cache_file.unlink()
            except OSError as exc:
                print(f" ⚠️ Warning: Failed to delete cache file for refresh: {exc}")
            return set(), []

    return analyzed_chapters, saved_tasks


def _restore_saved_tasks(saved_tasks: List[Dict[str, Any]]):
    """Restore cached download tasks, respecting the subtitle feature flag.

    Entries that are not dicts or lack ``url``/``dest_path`` are skipped with a
    warning instead of aborting the restore. Subtitle tasks are dropped when
    subtitle downloads are disabled in ``SETTINGS``.

    Args:
        saved_tasks: Task dicts as stored in the resume cache.
    """
    if not saved_tasks:
        return

    subtitles_disabled = bool(SETTINGS) and not SETTINGS.subtitle_download_enabled
    restored_tasks: List[Dict[str, Any]] = []
    skipped_count = 0
    malformed_count = 0

    for task in saved_tasks:
        if not isinstance(task, dict) or not task.get('url') or not task.get('dest_path'):
            malformed_count += 1
            continue
        # A missing content type means "video", matching the restore default below.
        if subtitles_disabled and _task_content_type(task, 'video') == 'subtitle':
            skipped_count += 1
            continue
        restored_tasks.append(task)

    if malformed_count > 0:
        print(f" ⚠️ Warning: Ignoring {malformed_count} malformed cached task(s).")
    if skipped_count > 0:
        print(f"⏭️ Skipping {skipped_count} cached subtitle task(s) because subtitle downloads are disabled.")

    if not restored_tasks:
        return

    print(f"📥 Restoring {len(restored_tasks)} previously collected download tasks...")
    for task_data in restored_tasks:
        add_download_task(
            task_data['url'],
            Path(task_data['dest_path']),
            # ``or`` also covers an explicit null content_type in the cache.
            task_data.get('content_type') or 'video',
        )
WISTIA_JSON_URL = "https://fast.wistia.com/embed/medias/{id}.json"
VIDEO_PROXY_JSONP_ID_PATTERN = re.compile(r"medias/(\w+)\.jsonp")

DEFAULT_SUBTITLE_EXTENSION = "vtt"
_LANGUAGE_SANITIZE_PATTERN = re.compile(r'[^A-Za-z0-9\-]+')
_ABSOLUTE_URL_PATTERN = re.compile(r'^https?://', re.IGNORECASE)
# Suffixes treated as a media extension when deriving a subtitle base name.
_MEDIA_SUFFIXES = frozenset({
    '.mp4', '.m4v', '.mov', '.webm', '.mkv', '.avi', '.flv', '.wmv', '.ts',
    '.m3u8', '.mpg', '.mpeg', '.mp3', '.m4a', '.aac', '.ogg', '.wav', '.bin',
})


def _normalize_wistia_track_url(url: Optional[str]) -> Optional[str]:
    """Normalize a Wistia caption track URL to an absolute URL.

    Protocol-relative URLs get an ``https:`` scheme; host-relative and bare
    paths are resolved against ``https://fast.wistia.com``. URLs that already
    start with ``http://`` or ``https://`` are returned unchanged (only
    stripped of surrounding whitespace).

    Returns:
        The absolute URL, or ``None`` when *url* is empty or not a string.
    """
    if not url or not isinstance(url, str):
        return None

    normalized = url.strip()
    if not normalized:
        return None

    if normalized.startswith('//'):
        return f"https:{normalized}"
    if normalized.startswith('/'):
        return f"https://fast.wistia.com{normalized}"
    if not _ABSOLUTE_URL_PATTERN.match(normalized):
        return f"https://fast.wistia.com/{normalized}"
    return normalized


def _build_caption_url(hashed_id: Optional[str], language: Optional[str], extension: Optional[str] = None) -> Optional[str]:
    """Construct a Wistia caption URL when only hashedId and language are available.

    The language is percent-encoded so values containing spaces, ``&`` or
    ``#`` cannot corrupt the query string; plain codes such as ``eng`` or
    ``pt-BR`` are emitted unchanged.

    Returns:
        The caption URL, or ``None`` when *hashed_id* or *language* is missing.
    """
    from urllib.parse import quote

    if not hashed_id or not language:
        return None

    raw_ext = extension if isinstance(extension, str) else ''
    ext = raw_ext.lstrip('.') or DEFAULT_SUBTITLE_EXTENSION
    return f"https://fast.wistia.com/embed/captions/{hashed_id}.{ext}?language={quote(str(language), safe='')}"


def _infer_track_extension(url: str, fallback: str = DEFAULT_SUBTITLE_EXTENSION) -> str:
    """Infer the lower-cased file extension (without dot) from a track URL.

    Returns *fallback* when the URL path has no suffix or cannot be parsed.
    """
    try:
        suffix = Path(urlparse(url).path).suffix
    except (AttributeError, TypeError, ValueError):
        # urlparse raises ValueError for malformed hosts (e.g. bad IPv6 literals).
        return fallback
    return suffix.lstrip('.').lower() or fallback


def _dict_entries(value: Any):
    """Yield the dict items of *value* when it is a list or tuple; otherwise nothing."""
    if isinstance(value, (list, tuple)):
        for item in value:
            if isinstance(item, dict):
                yield item


def extract_wistia_subtitle_tracks(media: Dict[str, Any]) -> List[Dict[str, Optional[str]]]:
    """Extract subtitle/caption track metadata from Wistia media JSON.

    Looks at ``captions``, ``text_tracks``, ``textTracks``, caption-typed
    ``assets`` and ``availableTranscripts``. Tracks without a URL fall back to
    a URL built from the media's hashed id and the track language. Fields that
    are not lists (or entries that are not dicts) are ignored.

    Args:
        media: The ``media`` object of a Wistia embed JSON document.

    Returns:
        One dict per distinct track URL with keys ``url``, ``language``,
        ``label`` and ``ext`` (extension without dot, or ``None``), in first-seen
        order. Later duplicates only fill in fields the first one lacked.
    """
    if not isinstance(media, dict):
        return []

    hashed_id = media.get('hashedId') or media.get('hashed_id')
    tracks: List[Dict[str, Optional[str]]] = []

    def add_track(url: Optional[str], language: Optional[str], label: Optional[str], ext: Optional[str]):
        clean_ext = ext.lstrip('.') if isinstance(ext, str) else ''
        normalized = _normalize_wistia_track_url(url)
        if not normalized and hashed_id and language:
            normalized = _build_caption_url(hashed_id, language, clean_ext)
        if not normalized:
            return
        tracks.append({
            'url': normalized,
            'language': language,
            'label': label,
            'ext': clean_ext or None,
        })

    def collect_from_captions(caption_items: Any):
        for track in _dict_entries(caption_items):
            add_track(
                track.get('url') or track.get('src'),
                track.get('language') or track.get('lang'),
                track.get('languageName') or track.get('label') or track.get('name'),
                track.get('ext'),
            )

    def collect_from_text_tracks(track_items: Any, label_keys: Iterable[str]):
        label_key_order = tuple(label_keys)
        for track in _dict_entries(track_items):
            language = track.get('language') or track.get('lang')
            label = next((track.get(key) for key in label_key_order if track.get(key)), None)
            sources = track.get('sources')
            if isinstance(sources, (list, tuple)) and sources:
                for source in _dict_entries(sources):
                    add_track(
                        source.get('url') or source.get('src'),
                        language,
                        label,
                        source.get('ext') or track.get('ext'),
                    )
            else:
                # No usable source list: the track itself carries the URL.
                add_track(
                    track.get('url') or track.get('src'),
                    language,
                    label,
                    track.get('ext'),
                )

    def collect_from_assets(asset_items: Any):
        subtitle_flags = {'caption', 'captions', 'subtitle', 'subtitles'}
        for asset in _dict_entries(asset_items):
            asset_type = str(asset.get('type') or '').lower()
            asset_kind = str(asset.get('kind') or '').lower()
            if asset_type in subtitle_flags or asset_kind in subtitle_flags:
                add_track(
                    asset.get('url') or asset.get('src'),
                    asset.get('language') or asset.get('lang'),
                    asset.get('display_name') or asset.get('name'),
                    asset.get('ext'),
                )

    def collect_from_transcripts(transcripts: Any):
        if not hashed_id:
            return
        for transcript in _dict_entries(transcripts):
            if not transcript.get('hasCaptions'):
                continue
            language = (
                transcript.get('language')
                or transcript.get('wistiaLanguageCode')
                or transcript.get('bcp47LanguageTag')
            )
            if not language:
                continue
            add_track(
                _build_caption_url(hashed_id, language, DEFAULT_SUBTITLE_EXTENSION),
                language,
                transcript.get('name') or transcript.get('familyName') or language,
                DEFAULT_SUBTITLE_EXTENSION,
            )

    collect_from_captions(media.get('captions'))
    collect_from_text_tracks(media.get('text_tracks'), ('name', 'label'))
    collect_from_text_tracks(media.get('textTracks'), ('name', 'label', 'title'))
    collect_from_assets(media.get('assets'))
    collect_from_transcripts(media.get('availableTranscripts'))

    # De-duplicate by URL, keeping first-seen order and merging missing fields.
    unique_tracks: Dict[str, Dict[str, Optional[str]]] = {}
    for track in tracks:
        existing = unique_tracks.setdefault(track['url'], track)
        if existing is track:
            continue
        for field in ('language', 'label', 'ext'):
            if not existing.get(field) and track.get(field):
                existing[field] = track[field]

    return list(unique_tracks.values())


def _strip_media_suffix(file_name: str) -> str:
    """Return the final path component of *file_name* without a known media extension.

    Unlike ``Path.stem`` this leaves dotted titles intact: ``"1. Introduction"``
    stays as is, while ``"1. Introduction.mp4"`` loses only ``.mp4``.
    """
    leaf = Path(file_name).name
    suffix = Path(leaf).suffix
    if suffix and suffix.lower() in _MEDIA_SUFFIXES:
        return leaf[:-len(suffix)]
    return leaf


def build_wistia_subtitle_tasks(
    media: Dict[str, Any],
    dest_dir: Path,
    video_base_name: str,
    settings: Optional[Any] = None,
) -> List[Dict[str, Any]]:
    """Construct subtitle download task dicts for a Wistia media object.

    Args:
        media: The ``media`` object of a Wistia embed JSON document.
        dest_dir: Directory the subtitle files should be written to.
        video_base_name: Video file name (with or without a media extension)
            the subtitle names are derived from.
        settings: Optional settings object; when it has a falsy
            ``subtitle_download_enabled`` attribute no tasks are produced.

    Returns:
        A list of dicts with keys ``url``, ``dest_path`` (a ``Path``),
        ``content_type`` (always ``'subtitle'``), ``label`` and ``language``.
        Every task gets a distinct ``dest_path`` file name.
    """
    if not isinstance(dest_dir, Path):
        dest_dir = Path(dest_dir)

    if settings and not getattr(settings, 'subtitle_download_enabled', True):
        return []

    tracks = extract_wistia_subtitle_tracks(media)
    if not tracks:
        return []

    base_name = _strip_media_suffix(str(video_base_name or ''))
    if not base_name:
        fallback_name = media.get('name') or media.get('hashedId') or 'captions'
        base_name = filter_filename(str(fallback_name))
    else:
        base_name = filter_filename(base_name)

    if not base_name:
        base_name = "captions"

    tasks: List[Dict[str, Any]] = []
    used_names = set()
    for track in tracks:
        url = track.get('url')
        if not url:
            continue

        counter = len(tasks) + 1
        ext = (track.get('ext') or _infer_track_extension(url)).lstrip('.').lower() or DEFAULT_SUBTITLE_EXTENSION
        language_raw = track.get('language') or track.get('label')
        if isinstance(language_raw, str):
            language_part = _LANGUAGE_SANITIZE_PATTERN.sub('-', language_raw).strip('-')
        else:
            language_part = ''

        if not language_part:
            language_part = 'captions' if counter == 1 else f"captions-{counter}"

        subtitle_filename = filter_filename(f"{base_name}.{language_part}.{ext}")
        if not subtitle_filename:
            subtitle_filename = filter_filename(f"{base_name}.captions-{counter}.{ext}")

        # Two tracks with the same language and extension would otherwise
        # target the same file; add a numeric suffix. The loop is bounded so a
        # lossy filter_filename cannot spin forever.
        attempt = 1
        while subtitle_filename.casefold() in used_names and attempt <= len(tracks):
            attempt += 1
            numbered = f"{base_name}.{language_part}-{attempt}.{ext}"
            subtitle_filename = filter_filename(numbered) or numbered
        used_names.add(subtitle_filename.casefold())

        tasks.append({
            'url': url,
            'dest_path': dest_dir / subtitle_filename,
            'content_type': 'subtitle',
            'label': track.get('label'),
            'language': track.get('language'),
        })

    return tasks
add_download_task - current_dir = Path.cwd() # Capture current working directory + from .downloader import SETTINGS, add_download_task full_path = current_dir / resolved_name # Create absolute path add_download_task(video_url, full_path, "video") + subtitle_tasks = build_wistia_subtitle_tasks(media, current_dir, resolved_name, SETTINGS) + for task in subtitle_tasks: + print(f" [Subs] Queued subtitles: {task['dest_path'].name}") + add_download_task(task['url'], task['dest_path'], task.get('content_type', 'subtitle'))