From a4dca4a8ec66ec9c77eb5cae447585305899562b Mon Sep 17 00:00:00 2001 From: Oleksiy Kovyrin Date: Tue, 14 Oct 2025 10:25:10 -0400 Subject: [PATCH 01/19] feat(wistia): add subtitle download support --- .env.example | 5 +- README.md | 3 + thinkific_downloader/config.py | 5 +- thinkific_downloader/downloader.py | 29 ++- thinkific_downloader/wistia_downloader.py | 226 +++++++++++++++++++++- 5 files changed, 259 insertions(+), 9 deletions(-) diff --git a/.env.example b/.env.example index 654f1ab..32056dc 100644 --- a/.env.example +++ b/.env.example @@ -61,6 +61,9 @@ RESUME_PARTIAL=true # Enable detailed logging for troubleshooting DEBUG=false +# Download subtitles/captions when available (default: true) +SUBTITLE_DOWNLOAD_ENABLED=true + # =============================================== # ADVANCED SETTINGS # =============================================== @@ -83,4 +86,4 @@ COURSE_DATA_FILE="" # ALL_VIDEO_FORMATS=false # Log level (DEBUG, INFO, WARNING, ERROR) -# LOG_LEVEL="INFO" \ No newline at end of file +# LOG_LEVEL="INFO" diff --git a/README.md b/README.md index a05530a..925986d 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,7 @@ A modern, feature-rich Python utility to download courses from Thinkific platfor | šŸ“„ **HTML Content** | āœ… Full | `downloader.py` | Clean extraction, formatting | | šŸ“š **PDF Documents** | āœ… Full | `downloader.py` | Direct download, validation | | šŸŽµ **Audio Files** | āœ… Full | `downloader.py` | MP3, M4A support | +| šŸ“ **Subtitles (Wistia)** | āœ… Full | `wistia_downloader.py` | Multi-language caption downloads | | šŸŽÆ **Quizzes** | āœ… Basic | `downloader.py` | Structure extraction | | šŸŽØ **Presentations** | āœ… Full | FFmpeg merge | Multi-slide processing | @@ -70,6 +71,7 @@ A modern, feature-rich Python utility to download courses from Thinkific platfor - **Resume Support** - Skip existing files, continue interrupted downloads - **Atomic Resume/Backup** - Status file is always safely backed up and updated, works on 
Windows, Mac, Linux - **Multiple Quality Options** - Choose video quality (720p, 1080p, etc.) +- **Subtitle Downloads** - Automatically grab Wistia caption tracks in multiple languages - **Comprehensive Logging** - Debug mode for troubleshooting ### šŸ›”ļø **Safety & Compliance** @@ -201,6 +203,7 @@ RATE_LIMIT_MB_S= # Rate limit in MB/s (empty = unlimited) VALIDATE_DOWNLOADS=true # Enable file integrity validation RESUME_PARTIAL=true # Enable resume for partial downloads DEBUG=false # Enable debug logging +SUBTITLE_DOWNLOAD_ENABLED=true # Download subtitles/captions when available # =============================================== # ADVANCED SETTINGS diff --git a/thinkific_downloader/config.py b/thinkific_downloader/config.py index c1e1286..ab75fd4 100644 --- a/thinkific_downloader/config.py +++ b/thinkific_downloader/config.py @@ -37,6 +37,7 @@ class Settings: resume_partial: bool = True debug: bool = False course_name: str = "Course" + subtitle_download_enabled: bool = True @classmethod def from_env(cls): @@ -67,6 +68,7 @@ def from_env(cls): validate_downloads = os.getenv('VALIDATE_DOWNLOADS', 'true').lower() in ('1', 'true', 'yes', 'on') resume_partial = os.getenv('RESUME_PARTIAL', 'true').lower() in ('1', 'true', 'yes', 'on') debug = os.getenv('DEBUG', 'false').lower() in ('1', 'true', 'yes', 'on') + subtitle_download_enabled = os.getenv('SUBTITLE_DOWNLOAD_ENABLED', 'true').lower() in ('1', 'true', 'yes', 'on') # Clean cookie data to remove Unicode characters that cause encoding issues if cookie_data: @@ -101,5 +103,6 @@ def from_env(cls): download_delay=download_delay, validate_downloads=validate_downloads, resume_partial=resume_partial, - debug=debug + debug=debug, + subtitle_download_enabled=subtitle_download_enabled ) diff --git a/thinkific_downloader/downloader.py b/thinkific_downloader/downloader.py index f2c2191..a6bb286 100644 --- a/thinkific_downloader/downloader.py +++ b/thinkific_downloader/downloader.py @@ -386,6 +386,9 @@ def 
download_file_chunked(src_url: str, dst_name: str, chunk_mb: int = 1): def init_course(data: Dict[str, Any]): """Initialize course structure and collect ALL download tasks first.""" global COURSE_CONTENTS, ROOT_PROJECT_DIR, BASE_HOST, DOWNLOAD_TASKS + + # Ensure settings/download manager are initialized so feature flags are available + init_settings() # Initialize download tasks list DOWNLOAD_TASKS = [] @@ -417,6 +420,21 @@ def init_course(data: Dict[str, Any]): analyzed_chapters = set(cache_data.get('analyzed_chapters', [])) saved_tasks = cache_data.get('download_tasks', []) print(f"šŸ“‹ Found previous progress: {len(analyzed_chapters)} chapters analyzed, {len(saved_tasks)} tasks cached") + # If subtitle downloads are enabled but cached tasks do not contain subtitles, + # treat cache as outdated so we can regenerate tasks with captions. + if SETTINGS and SETTINGS.subtitle_download_enabled and saved_tasks: + has_subtitle_tasks = any( + (task.get('content_type') or '').lower() == 'subtitle' + for task in saved_tasks + ) + if not has_subtitle_tasks: + print("šŸ†• Subtitle support enabled — refreshing cached analysis to include captions.") + analyzed_chapters = set() + saved_tasks = [] + try: + cache_file.unlink() + except Exception: + pass except: analyzed_chapters = set() saved_tasks = [] @@ -835,9 +853,16 @@ def collect_video_task_wistia(wistia_id: str, file_name: str, dest_dir: Path): video_url = selected.get('url') if video_url: ext = '.mp4' # Default extension - resolved_name = filter_filename(file_name) + ext + resolved_name = filter_filename(file_name) + if not resolved_name.lower().endswith(ext): + resolved_name += ext print(f" šŸ“¹ Found video: {resolved_name}") add_download_task(video_url, dest_dir / resolved_name, "video") + try: + from .wistia_downloader import queue_wistia_subtitle_downloads + queue_wistia_subtitle_downloads(data.get('media') or {}, dest_dir, resolved_name) + except Exception as subtitle_error: + print(f" āš ļø Unable to queue subtitles 
for {resolved_name}: {subtitle_error}") except Exception as e: print(f" āŒ Failed to collect Wistia video {wistia_id}: {e}") @@ -1282,4 +1307,4 @@ def main(argv: List[str]): if __name__ == '__main__': - main(sys.argv) \ No newline at end of file + main(sys.argv) diff --git a/thinkific_downloader/wistia_downloader.py b/thinkific_downloader/wistia_downloader.py index f53ae65..eb4ff20 100644 --- a/thinkific_downloader/wistia_downloader.py +++ b/thinkific_downloader/wistia_downloader.py @@ -1,12 +1,14 @@ import json +import os import re -import requests import zlib -from typing import Optional, List from pathlib import Path -import os +from typing import Any, Dict, List, Optional +from urllib.parse import urlparse + +import requests + from .file_utils import filter_filename -from .download_manager import DownloadManager # Local imports inside functions to avoid circular dependency during module import # Handles video proxy and wistia direct downloads @@ -14,6 +16,218 @@ WISTIA_JSON_URL = "https://fast.wistia.com/embed/medias/{id}.json" VIDEO_PROXY_JSONP_ID_PATTERN = re.compile(r"medias/(\w+)\.jsonp") +DEFAULT_SUBTITLE_EXTENSION = "vtt" +_LANGUAGE_SANITIZE_PATTERN = re.compile(r'[^A-Za-z0-9\-]+') + + +def _normalize_wistia_track_url(url: Optional[str]) -> Optional[str]: + """Normalize Wistia caption track URLs to absolute HTTPS URLs.""" + if not url or not isinstance(url, str): + return None + + normalized = url.strip() + if not normalized: + return None + + if normalized.startswith('//'): + normalized = f"https:{normalized}" + elif normalized.startswith('/'): + normalized = f"https://fast.wistia.com{normalized}" + elif not re.match(r'^https?://', normalized, re.IGNORECASE): + normalized = f"https://fast.wistia.com/{normalized.lstrip('/')}" + + return normalized + + +def _build_caption_url(hashed_id: Optional[str], language: Optional[str], extension: Optional[str] = None) -> Optional[str]: + """Construct a Wistia caption URL when only hashedId and language are 
available.""" + if not hashed_id or not language: + return None + + ext = (extension or DEFAULT_SUBTITLE_EXTENSION).lstrip('.') or DEFAULT_SUBTITLE_EXTENSION + return f"https://fast.wistia.com/embed/captions/{hashed_id}.{ext}?language={language}" + + +def _infer_track_extension(url: str, fallback: str = DEFAULT_SUBTITLE_EXTENSION) -> str: + """Infer file extension from track URL.""" + try: + parsed = urlparse(url) + suffix = Path(parsed.path).suffix + if suffix: + return suffix.lstrip('.').lower() or fallback + except Exception: + pass + return fallback + + +def extract_wistia_subtitle_tracks(media: Dict[str, Any]) -> List[Dict[str, Optional[str]]]: + """Extract subtitle/caption track metadata from Wistia media JSON.""" + if not isinstance(media, dict): + return [] + + hashed_id = media.get('hashedId') or media.get('hashed_id') + tracks: List[Dict[str, Optional[str]]] = [] + + def add_track(url: Optional[str], language: Optional[str], label: Optional[str], ext: Optional[str]): + normalized = _normalize_wistia_track_url(url) + if not normalized and hashed_id and language: + normalized = _build_caption_url(hashed_id, language, ext) + if not normalized: + return + tracks.append({ + 'url': normalized, + 'language': language, + 'label': label, + 'ext': (ext or '').lstrip('.') or None + }) + + for track in media.get('captions') or []: + if isinstance(track, dict): + add_track( + track.get('url') or track.get('src'), + track.get('language') or track.get('lang'), + track.get('languageName') or track.get('label') or track.get('name'), + track.get('ext') + ) + + for track in media.get('text_tracks') or []: + if not isinstance(track, dict): + continue + sources = track.get('sources') or [] + if sources: + for source in sources: + if isinstance(source, dict): + add_track( + source.get('url') or source.get('src'), + track.get('language') or track.get('lang'), + track.get('name') or track.get('label'), + source.get('ext') or track.get('ext') + ) + else: + add_track( + 
track.get('url') or track.get('src'), + track.get('language') or track.get('lang'), + track.get('name') or track.get('label'), + track.get('ext') + ) + + for track in media.get('textTracks') or []: + if not isinstance(track, dict): + continue + sources = track.get('sources') or [] + if sources: + for source in sources: + if isinstance(source, dict): + add_track( + source.get('url') or source.get('src'), + track.get('language') or track.get('lang'), + track.get('name') or track.get('label') or track.get('title'), + source.get('ext') or track.get('ext') + ) + else: + add_track( + track.get('url') or track.get('src'), + track.get('language') or track.get('lang'), + track.get('name') or track.get('label') or track.get('title'), + track.get('ext') + ) + + for asset in media.get('assets') or []: + if isinstance(asset, dict): + asset_type = (asset.get('type') or '').lower() + asset_kind = (asset.get('kind') or '').lower() + if asset_type in ('caption', 'captions', 'subtitle', 'subtitles') or asset_kind in ('caption', 'captions', 'subtitle', 'subtitles'): + add_track( + asset.get('url') or asset.get('src'), + asset.get('language') or asset.get('lang'), + asset.get('display_name') or asset.get('name'), + asset.get('ext') + ) + + available_transcripts = media.get('availableTranscripts') or [] + if hashed_id and available_transcripts: + for transcript in available_transcripts: + if not isinstance(transcript, dict) or not transcript.get('hasCaptions'): + continue + language = transcript.get('language') or transcript.get('wistiaLanguageCode') or transcript.get('bcp47LanguageTag') + if not language: + continue + add_track( + _build_caption_url(hashed_id, language, DEFAULT_SUBTITLE_EXTENSION), + language, + transcript.get('name') or transcript.get('familyName') or language, + DEFAULT_SUBTITLE_EXTENSION + ) + + unique_tracks: Dict[str, Dict[str, Optional[str]]] = {} + for track in tracks: + url = track['url'] + if not url: + continue + if url not in unique_tracks: + 
unique_tracks[url] = track + else: + existing = unique_tracks[url] + # Prefer track data that includes language/label/ext + if not existing.get('language') and track.get('language'): + existing['language'] = track['language'] + if not existing.get('label') and track.get('label'): + existing['label'] = track['label'] + if not existing.get('ext') and track.get('ext'): + existing['ext'] = track['ext'] + + return list(unique_tracks.values()) + + +def queue_wistia_subtitle_downloads(media: Dict[str, Any], dest_dir: Path, video_base_name: str): + """Queue subtitle download tasks for a Wistia media object.""" + from .downloader import SETTINGS, add_download_task, init_settings + + if not isinstance(dest_dir, Path): + dest_dir = Path(dest_dir) + + init_settings() + settings = SETTINGS + if settings and hasattr(settings, 'subtitle_download_enabled') and not settings.subtitle_download_enabled: + return + + tracks = extract_wistia_subtitle_tracks(media) + if not tracks: + return + + base_name = Path(video_base_name).stem + if not base_name: + fallback_name = media.get('name') or media.get('hashedId') or 'captions' + base_name = filter_filename(str(fallback_name)) + else: + base_name = filter_filename(base_name) + + if not base_name: + base_name = "captions" + + counter = 1 + for track in tracks: + url = track.get('url') + if not url: + continue + + ext = (track.get('ext') or _infer_track_extension(url)).lstrip('.').lower() or DEFAULT_SUBTITLE_EXTENSION + language_part = track.get('language') or track.get('label') or '' + if isinstance(language_part, (list, dict)): + language_part = '' + language_part = str(language_part or '') + language_part = _LANGUAGE_SANITIZE_PATTERN.sub('-', language_part).strip('-') + + if not language_part: + language_part = 'captions' if counter == 1 else f"captions-{counter}" + + subtitle_filename = filter_filename(f"{base_name}.{language_part}.{ext}") + if not subtitle_filename: + subtitle_filename = 
filter_filename(f"{base_name}.captions-{counter}.{ext}") + + print(f" [Subs] Queued subtitles: {subtitle_filename}") + add_download_task(url, dest_dir / subtitle_filename, "subtitle") + counter += 1 def video_downloader_videoproxy(video_url: str, file_name: str, quality: str = "720p"): @@ -143,6 +357,7 @@ def infer_ext(asset: dict) -> str: return '.mp4' resolved_base = filter_filename(file_name if file_name else media.get('name') or wistia_id) + current_dir = Path.cwd() if all_formats_flag: print(f"Downloading all available Wistia assets for {resolved_base}") @@ -172,6 +387,7 @@ def infer_ext(asset: dict) -> str: DOWNLOAD_MANAGER.download_file(a_url, Path(filter_filename(out_name))) else: print("Download manager not initialized") + queue_wistia_subtitle_downloads(media, current_dir, resolved_base) return # Single quality path @@ -200,6 +416,6 @@ def infer_ext(asset: dict) -> str: # Queue video for parallel download with absolute path to current directory from .downloader import add_download_task - current_dir = Path.cwd() # Capture current working directory full_path = current_dir / resolved_name # Create absolute path add_download_task(video_url, full_path, "video") + queue_wistia_subtitle_downloads(media, current_dir, resolved_name) From f2cd39d9ce037d0aafbee768e6c322e842ffd5a9 Mon Sep 17 00:00:00 2001 From: Oleksiy Kovyrin Date: Tue, 14 Oct 2025 12:05:20 -0400 Subject: [PATCH 02/19] Respect subtitle flag when restoring cached tasks --- thinkific_downloader/downloader.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/thinkific_downloader/downloader.py b/thinkific_downloader/downloader.py index a6bb286..91f3771 100644 --- a/thinkific_downloader/downloader.py +++ b/thinkific_downloader/downloader.py @@ -449,9 +449,23 @@ def init_course(data: Dict[str, Any]): # Restore saved download tasks if saved_tasks: - print(f"šŸ“„ Restoring {len(saved_tasks)} previously collected download tasks...") - for task_data in saved_tasks: - 
add_download_task(task_data['url'], Path(task_data['dest_path']), task_data.get('content_type', 'video')) + restored_tasks = saved_tasks + if SETTINGS and hasattr(SETTINGS, 'subtitle_download_enabled') and not SETTINGS.subtitle_download_enabled: + filtered_tasks = [] + skipped_count = 0 + for task in saved_tasks: + content_type = (task.get('content_type') or 'video').lower() + if content_type == 'subtitle': + skipped_count += 1 + continue + filtered_tasks.append(task) + restored_tasks = filtered_tasks + if skipped_count: + print(f"ā­ļø Skipping {skipped_count} cached subtitle task(s) because subtitle downloads are disabled.") + if restored_tasks: + print(f"šŸ“„ Restoring {len(restored_tasks)} previously collected download tasks...") + for task_data in restored_tasks: + add_download_task(task_data['url'], Path(task_data['dest_path']), task_data.get('content_type', 'video')) collect_all_download_tasks(data, analyzed_chapters, cache_file) From 80129f80607dcf5ab56e3f6d33ac30302e6cf4ae Mon Sep 17 00:00:00 2001 From: Oleksiy Kovyrin Date: Tue, 14 Oct 2025 12:05:40 -0400 Subject: [PATCH 03/19] Update thinkific_downloader/downloader.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- thinkific_downloader/downloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinkific_downloader/downloader.py b/thinkific_downloader/downloader.py index 91f3771..69ea6a5 100644 --- a/thinkific_downloader/downloader.py +++ b/thinkific_downloader/downloader.py @@ -435,7 +435,7 @@ def init_course(data: Dict[str, Any]): cache_file.unlink() except Exception: pass - except: + except (json.JSONDecodeError, OSError): analyzed_chapters = set() saved_tasks = [] From 7f4a58abd5b2815b3a278b13ab07d5aefc532356 Mon Sep 17 00:00:00 2001 From: Oleksiy Kovyrin Date: Tue, 14 Oct 2025 12:05:57 -0400 Subject: [PATCH 04/19] Update thinkific_downloader/downloader.py Co-authored-by: gemini-code-assist[bot] 
<176961590+gemini-code-assist[bot]@users.noreply.github.com> --- thinkific_downloader/downloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinkific_downloader/downloader.py b/thinkific_downloader/downloader.py index 69ea6a5..5900403 100644 --- a/thinkific_downloader/downloader.py +++ b/thinkific_downloader/downloader.py @@ -433,7 +433,7 @@ def init_course(data: Dict[str, Any]): saved_tasks = [] try: cache_file.unlink() - except Exception: + except OSError: pass except (json.JSONDecodeError, OSError): analyzed_chapters = set() From 60f827370954899cab6d707901cab6a060691ae1 Mon Sep 17 00:00:00 2001 From: Oleksiy Kovyrin Date: Tue, 14 Oct 2025 12:06:11 -0400 Subject: [PATCH 05/19] Update thinkific_downloader/wistia_downloader.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- thinkific_downloader/wistia_downloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinkific_downloader/wistia_downloader.py b/thinkific_downloader/wistia_downloader.py index eb4ff20..b9832ce 100644 --- a/thinkific_downloader/wistia_downloader.py +++ b/thinkific_downloader/wistia_downloader.py @@ -55,7 +55,7 @@ def _infer_track_extension(url: str, fallback: str = DEFAULT_SUBTITLE_EXTENSION) suffix = Path(parsed.path).suffix if suffix: return suffix.lstrip('.').lower() or fallback - except Exception: + except (AttributeError, TypeError): pass return fallback From e02aca2d95eeca201077bc6917b20979e103de72 Mon Sep 17 00:00:00 2001 From: Oleksiy Kovyrin Date: Tue, 14 Oct 2025 12:08:11 -0400 Subject: [PATCH 06/19] Refactor Wistia track extraction helper --- thinkific_downloader/wistia_downloader.py | 72 +++++++++++------------ 1 file changed, 35 insertions(+), 37 deletions(-) diff --git a/thinkific_downloader/wistia_downloader.py b/thinkific_downloader/wistia_downloader.py index b9832ce..7cdfc17 100644 --- a/thinkific_downloader/wistia_downloader.py +++ 
b/thinkific_downloader/wistia_downloader.py @@ -3,7 +3,7 @@ import re import zlib from pathlib import Path -from typing import Any, Dict, List, Optional +from typing import Any, Dict, Iterable, List, Optional from urllib.parse import urlparse import requests @@ -90,47 +90,45 @@ def add_track(url: Optional[str], language: Optional[str], label: Optional[str], track.get('ext') ) - for track in media.get('text_tracks') or []: - if not isinstance(track, dict): - continue - sources = track.get('sources') or [] - if sources: - for source in sources: - if isinstance(source, dict): - add_track( - source.get('url') or source.get('src'), - track.get('language') or track.get('lang'), - track.get('name') or track.get('label'), - source.get('ext') or track.get('ext') - ) - else: - add_track( - track.get('url') or track.get('src'), - track.get('language') or track.get('lang'), - track.get('name') or track.get('label'), - track.get('ext') - ) + def process_track_collection(collection: Optional[Iterable[Dict[str, Any]]], label_keys: Iterable[str]): + if not collection: + return - for track in media.get('textTracks') or []: - if not isinstance(track, dict): - continue - sources = track.get('sources') or [] - if sources: - for source in sources: - if isinstance(source, dict): + def _get_label(track_dict: Dict[str, Any]) -> Optional[str]: + for key in label_keys: + value = track_dict.get(key) + if value: + return value + return None + + for track in collection: + if not isinstance(track, dict): + continue + + language = track.get('language') or track.get('lang') + label = _get_label(track) + sources = track.get('sources') or [] + + if sources: + for source in sources: + if not isinstance(source, dict): + continue add_track( source.get('url') or source.get('src'), - track.get('language') or track.get('lang'), - track.get('name') or track.get('label') or track.get('title'), + language, + label, source.get('ext') or track.get('ext') ) - else: - add_track( - track.get('url') or 
track.get('src'), - track.get('language') or track.get('lang'), - track.get('name') or track.get('label') or track.get('title'), - track.get('ext') - ) + else: + add_track( + track.get('url') or track.get('src'), + language, + label, + track.get('ext') + ) + + process_track_collection(media.get('text_tracks'), ('name', 'label')) + process_track_collection(media.get('textTracks'), ('name', 'label', 'title')) for asset in media.get('assets') or []: if isinstance(asset, dict): From 72573acd9ed17f1f601d2908fbcaaa6e46d7a134 Mon Sep 17 00:00:00 2001 From: Oleksiy Kovyrin Date: Tue, 14 Oct 2025 12:28:11 -0400 Subject: [PATCH 07/19] Factor cache restore helpers --- thinkific_downloader/downloader.py | 110 +++++++++++++++++------------ 1 file changed, 66 insertions(+), 44 deletions(-) diff --git a/thinkific_downloader/downloader.py b/thinkific_downloader/downloader.py index 5900403..d086bd7 100644 --- a/thinkific_downloader/downloader.py +++ b/thinkific_downloader/downloader.py @@ -382,6 +382,70 @@ def download_file_chunked(src_url: str, dst_name: str, chunk_mb: int = 1): add_download_task(src_url, dst_path, "file") +def _load_cached_progress(cache_file: Path): + """Return previously analyzed chapters and queued tasks from the resume cache.""" + analyzed_chapters = set() + saved_tasks: List[Dict[str, Any]] = [] + + if not cache_file.exists(): + return analyzed_chapters, saved_tasks + + try: + with open(cache_file, 'r', encoding='utf-8') as f: + cache_data = json.load(f) + + analyzed_chapters = set(cache_data.get('analyzed_chapters', [])) + saved_tasks = cache_data.get('download_tasks', []) + print(f"šŸ“‹ Found previous progress: {len(analyzed_chapters)} chapters analyzed, {len(saved_tasks)} tasks cached") + + # If subtitle downloads were newly enabled, invalidate cache so we can regenerate tasks. 
+ if SETTINGS and SETTINGS.subtitle_download_enabled and saved_tasks: + has_subtitle_tasks = any( + (task.get('content_type') or '').lower() == 'subtitle' + for task in saved_tasks + ) + if not has_subtitle_tasks: + print("šŸ†• Subtitle support enabled — refreshing cached analysis to include captions.") + analyzed_chapters = set() + saved_tasks = [] + try: + cache_file.unlink() + except OSError: + pass + except (json.JSONDecodeError, OSError): + analyzed_chapters = set() + saved_tasks = [] + + return analyzed_chapters, saved_tasks + + +def _restore_saved_tasks(saved_tasks: List[Dict[str, Any]]): + """Restore cached download tasks, respecting the subtitle feature flag.""" + if not saved_tasks: + return + + restored_tasks = saved_tasks + if SETTINGS and hasattr(SETTINGS, 'subtitle_download_enabled') and not SETTINGS.subtitle_download_enabled: + filtered_tasks: List[Dict[str, Any]] = [] + skipped_count = 0 + for task in saved_tasks: + content_type = (task.get('content_type') or 'video').lower() + if content_type == 'subtitle': + skipped_count += 1 + continue + filtered_tasks.append(task) + restored_tasks = filtered_tasks + if skipped_count: + print(f"ā­ļø Skipping {skipped_count} cached subtitle task(s) because subtitle downloads are disabled.") + + if not restored_tasks: + return + + print(f"šŸ“„ Restoring {len(restored_tasks)} previously collected download tasks...") + for task_data in restored_tasks: + add_download_task(task_data['url'], Path(task_data['dest_path']), task_data.get('content_type', 'video')) + + def init_course(data: Dict[str, Any]): """Initialize course structure and collect ALL download tasks first.""" @@ -412,32 +476,7 @@ def init_course(data: Dict[str, Any]): analyzed_chapters = set() saved_tasks = [] - if cache_file.exists(): - try: - import json - with open(cache_file, 'r', encoding='utf-8') as f: - cache_data = json.load(f) - analyzed_chapters = set(cache_data.get('analyzed_chapters', [])) - saved_tasks = cache_data.get('download_tasks', []) 
- print(f"šŸ“‹ Found previous progress: {len(analyzed_chapters)} chapters analyzed, {len(saved_tasks)} tasks cached") - # If subtitle downloads are enabled but cached tasks do not contain subtitles, - # treat cache as outdated so we can regenerate tasks with captions. - if SETTINGS and SETTINGS.subtitle_download_enabled and saved_tasks: - has_subtitle_tasks = any( - (task.get('content_type') or '').lower() == 'subtitle' - for task in saved_tasks - ) - if not has_subtitle_tasks: - print("šŸ†• Subtitle support enabled — refreshing cached analysis to include captions.") - analyzed_chapters = set() - saved_tasks = [] - try: - cache_file.unlink() - except OSError: - pass - except (json.JSONDecodeError, OSError): - analyzed_chapters = set() - saved_tasks = [] + analyzed_chapters, saved_tasks = _load_cached_progress(cache_file) # Derive base host from landing_page_url if available landing = data['course'].get('landing_page_url') @@ -448,24 +487,7 @@ def init_course(data: Dict[str, Any]): print("\nšŸ” Phase 1: Analyzing course content and collecting download links...") # Restore saved download tasks - if saved_tasks: - restored_tasks = saved_tasks - if SETTINGS and hasattr(SETTINGS, 'subtitle_download_enabled') and not SETTINGS.subtitle_download_enabled: - filtered_tasks = [] - skipped_count = 0 - for task in saved_tasks: - content_type = (task.get('content_type') or 'video').lower() - if content_type == 'subtitle': - skipped_count += 1 - continue - filtered_tasks.append(task) - restored_tasks = filtered_tasks - if skipped_count: - print(f"ā­ļø Skipping {skipped_count} cached subtitle task(s) because subtitle downloads are disabled.") - if restored_tasks: - print(f"šŸ“„ Restoring {len(restored_tasks)} previously collected download tasks...") - for task_data in restored_tasks: - add_download_task(task_data['url'], Path(task_data['dest_path']), task_data.get('content_type', 'video')) + _restore_saved_tasks(saved_tasks) collect_all_download_tasks(data, analyzed_chapters, 
cache_file) From bd00862dc1d2a86a0a1e606f0dcecfd11088c37a Mon Sep 17 00:00:00 2001 From: Oleksiy Kovyrin Date: Tue, 14 Oct 2025 12:30:28 -0400 Subject: [PATCH 08/19] Decompose Wistia track processing helpers --- thinkific_downloader/wistia_downloader.py | 57 ++++++++++------------- 1 file changed, 24 insertions(+), 33 deletions(-) diff --git a/thinkific_downloader/wistia_downloader.py b/thinkific_downloader/wistia_downloader.py index 7cdfc17..fbf5694 100644 --- a/thinkific_downloader/wistia_downloader.py +++ b/thinkific_downloader/wistia_downloader.py @@ -90,42 +90,33 @@ def add_track(url: Optional[str], language: Optional[str], label: Optional[str], track.get('ext') ) - def process_track_collection(collection: Optional[Iterable[Dict[str, Any]]], label_keys: Iterable[str]): - if not collection: - return - - def _get_label(track_dict: Dict[str, Any]) -> Optional[str]: - for key in label_keys: - value = track_dict.get(key) - if value: - return value - return None + def iter_track_dicts(collection: Optional[Iterable[Dict[str, Any]]]): + for item in collection or []: + if isinstance(item, dict): + yield item + + def extract_label(track_dict: Dict[str, Any], label_keys: Iterable[str]) -> Optional[str]: + for key in label_keys: + value = track_dict.get(key) + if value: + return value + return None - for track in collection: - if not isinstance(track, dict): - continue + def iter_track_sources(track_dict: Dict[str, Any]): + sources = track_dict.get('sources') or [] + if not sources: + yield track_dict.get('url') or track_dict.get('src'), track_dict.get('ext') + return + for source in sources: + if isinstance(source, dict): + yield source.get('url') or source.get('src'), source.get('ext') or track_dict.get('ext') + def process_track_collection(collection: Optional[Iterable[Dict[str, Any]]], label_keys: Iterable[str]): + for track in iter_track_dicts(collection): language = track.get('language') or track.get('lang') - label = _get_label(track) - sources = 
track.get('sources') or [] - - if sources: - for source in sources: - if not isinstance(source, dict): - continue - add_track( - source.get('url') or source.get('src'), - language, - label, - source.get('ext') or track.get('ext') - ) - else: - add_track( - track.get('url') or track.get('src'), - language, - label, - track.get('ext') - ) + label = extract_label(track, label_keys) + for source_url, source_ext in iter_track_sources(track): + add_track(source_url, language, label, source_ext) process_track_collection(media.get('text_tracks'), ('name', 'label')) process_track_collection(media.get('textTracks'), ('name', 'label', 'title')) From c705cc3c4c431a85fbedb75a85b1556b8eb7dcf0 Mon Sep 17 00:00:00 2001 From: Oleksiy Kovyrin Date: Tue, 14 Oct 2025 13:05:22 -0400 Subject: [PATCH 09/19] Update thinkific_downloader/downloader.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- thinkific_downloader/downloader.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/thinkific_downloader/downloader.py b/thinkific_downloader/downloader.py index d086bd7..870647f 100644 --- a/thinkific_downloader/downloader.py +++ b/thinkific_downloader/downloader.py @@ -425,17 +425,14 @@ def _restore_saved_tasks(saved_tasks: List[Dict[str, Any]]): return restored_tasks = saved_tasks - if SETTINGS and hasattr(SETTINGS, 'subtitle_download_enabled') and not SETTINGS.subtitle_download_enabled: - filtered_tasks: List[Dict[str, Any]] = [] - skipped_count = 0 - for task in saved_tasks: - content_type = (task.get('content_type') or 'video').lower() - if content_type == 'subtitle': - skipped_count += 1 - continue - filtered_tasks.append(task) - restored_tasks = filtered_tasks - if skipped_count: + if SETTINGS and not SETTINGS.subtitle_download_enabled: + all_tasks_count = len(restored_tasks) + restored_tasks = [ + task for task in restored_tasks + if (task.get('content_type') or 'video').lower() != 'subtitle' + 
] + skipped_count = all_tasks_count - len(restored_tasks) + if skipped_count > 0: print(f"ā­ļø Skipping {skipped_count} cached subtitle task(s) because subtitle downloads are disabled.") if not restored_tasks: From 08b0129d7017ae37c4f2ab23a262a9094f9f57b3 Mon Sep 17 00:00:00 2001 From: Oleksiy Kovyrin Date: Tue, 14 Oct 2025 13:06:05 -0400 Subject: [PATCH 10/19] Update thinkific_downloader/wistia_downloader.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- thinkific_downloader/wistia_downloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinkific_downloader/wistia_downloader.py b/thinkific_downloader/wistia_downloader.py index fbf5694..f64d02c 100644 --- a/thinkific_downloader/wistia_downloader.py +++ b/thinkific_downloader/wistia_downloader.py @@ -125,7 +125,7 @@ def process_track_collection(collection: Optional[Iterable[Dict[str, Any]]], lab if isinstance(asset, dict): asset_type = (asset.get('type') or '').lower() asset_kind = (asset.get('kind') or '').lower() - if asset_type in ('caption', 'captions', 'subtitle', 'subtitles') or asset_kind in ('caption', 'captions', 'subtitle', 'subtitles'): + if asset_type in {'caption', 'captions', 'subtitle', 'subtitles'} or asset_kind in {'caption', 'captions', 'subtitle', 'subtitles'}: add_track( asset.get('url') or asset.get('src'), asset.get('language') or asset.get('lang'), From 47192c43ad6251034f904ccf9f9da0adfdb1d72b Mon Sep 17 00:00:00 2001 From: Oleksiy Kovyrin Date: Tue, 14 Oct 2025 13:06:26 -0400 Subject: [PATCH 11/19] Update thinkific_downloader/downloader.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- thinkific_downloader/downloader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/thinkific_downloader/downloader.py b/thinkific_downloader/downloader.py index 870647f..d389817 100644 --- a/thinkific_downloader/downloader.py +++ 
b/thinkific_downloader/downloader.py @@ -410,8 +410,8 @@ def _load_cached_progress(cache_file: Path): saved_tasks = [] try: cache_file.unlink() - except OSError: - pass + except OSError as e: + print(f" āš ļø Warning: Failed to delete cache file for refresh: {e}") except (json.JSONDecodeError, OSError): analyzed_chapters = set() saved_tasks = [] From 98112af0031924e8b40c99f6cafe52eb360e4a94 Mon Sep 17 00:00:00 2001 From: Oleksiy Kovyrin Date: Tue, 14 Oct 2025 13:13:51 -0400 Subject: [PATCH 12/19] Improve resume cache handling for subtitles --- thinkific_downloader/downloader.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/thinkific_downloader/downloader.py b/thinkific_downloader/downloader.py index d389817..a79599d 100644 --- a/thinkific_downloader/downloader.py +++ b/thinkific_downloader/downloader.py @@ -410,8 +410,8 @@ def _load_cached_progress(cache_file: Path): saved_tasks = [] try: cache_file.unlink() - except OSError as e: - print(f" āš ļø Warning: Failed to delete cache file for refresh: {e}") + except OSError as exc: + print(f" āš ļø Warning: Failed to delete cache file for refresh: {exc}") except (json.JSONDecodeError, OSError): analyzed_chapters = set() saved_tasks = [] @@ -424,14 +424,14 @@ def _restore_saved_tasks(saved_tasks: List[Dict[str, Any]]): if not saved_tasks: return - restored_tasks = saved_tasks + restored_tasks = list(saved_tasks) if SETTINGS and not SETTINGS.subtitle_download_enabled: - all_tasks_count = len(restored_tasks) + total_tasks = len(restored_tasks) restored_tasks = [ task for task in restored_tasks if (task.get('content_type') or 'video').lower() != 'subtitle' ] - skipped_count = all_tasks_count - len(restored_tasks) + skipped_count = total_tasks - len(restored_tasks) if skipped_count > 0: print(f"ā­ļø Skipping {skipped_count} cached subtitle task(s) because subtitle downloads are disabled.") From 462b3b6fab583a836e0b2f22557a9e20c47974aa Mon Sep 17 00:00:00 2001 From: Oleksiy Kovyrin Date: 
Tue, 14 Oct 2025 13:13:59 -0400 Subject: [PATCH 13/19] Return Wistia subtitle tasks for callers --- thinkific_downloader/downloader.py | 12 +- thinkific_downloader/wistia_downloader.py | 139 +++++++++++++--------- 2 files changed, 93 insertions(+), 58 deletions(-) diff --git a/thinkific_downloader/downloader.py b/thinkific_downloader/downloader.py index a79599d..4b93d69 100644 --- a/thinkific_downloader/downloader.py +++ b/thinkific_downloader/downloader.py @@ -892,8 +892,16 @@ def collect_video_task_wistia(wistia_id: str, file_name: str, dest_dir: Path): print(f" šŸ“¹ Found video: {resolved_name}") add_download_task(video_url, dest_dir / resolved_name, "video") try: - from .wistia_downloader import queue_wistia_subtitle_downloads - queue_wistia_subtitle_downloads(data.get('media') or {}, dest_dir, resolved_name) + from .wistia_downloader import build_wistia_subtitle_tasks + subtitle_tasks = build_wistia_subtitle_tasks( + data.get('media') or {}, + dest_dir, + resolved_name, + SETTINGS, + ) + for task in subtitle_tasks: + print(f" [Subs] Queued subtitles: {Path(task['dest_path']).name}") + add_download_task(task['url'], Path(task['dest_path']), task.get('content_type', 'subtitle')) except Exception as subtitle_error: print(f" āš ļø Unable to queue subtitles for {resolved_name}: {subtitle_error}") except Exception as e: diff --git a/thinkific_downloader/wistia_downloader.py b/thinkific_downloader/wistia_downloader.py index f64d02c..6421f86 100644 --- a/thinkific_downloader/wistia_downloader.py +++ b/thinkific_downloader/wistia_downloader.py @@ -81,8 +81,10 @@ def add_track(url: Optional[str], language: Optional[str], label: Optional[str], 'ext': (ext or '').lstrip('.') or None }) - for track in media.get('captions') or []: - if isinstance(track, dict): + def collect_from_captions(caption_items: Optional[Iterable[Dict[str, Any]]]): + for track in caption_items or []: + if not isinstance(track, dict): + continue add_track( track.get('url') or track.get('src'), 
track.get('language') or track.get('lang'), @@ -90,42 +92,40 @@ def add_track(url: Optional[str], language: Optional[str], label: Optional[str], track.get('ext') ) - def iter_track_dicts(collection: Optional[Iterable[Dict[str, Any]]]): - for item in collection or []: - if isinstance(item, dict): - yield item - - def extract_label(track_dict: Dict[str, Any], label_keys: Iterable[str]) -> Optional[str]: - for key in label_keys: - value = track_dict.get(key) - if value: - return value - return None - - def iter_track_sources(track_dict: Dict[str, Any]): - sources = track_dict.get('sources') or [] - if not sources: - yield track_dict.get('url') or track_dict.get('src'), track_dict.get('ext') - return - for source in sources: - if isinstance(source, dict): - yield source.get('url') or source.get('src'), source.get('ext') or track_dict.get('ext') - - def process_track_collection(collection: Optional[Iterable[Dict[str, Any]]], label_keys: Iterable[str]): - for track in iter_track_dicts(collection): + def collect_from_text_tracks(track_items: Optional[Iterable[Dict[str, Any]]], label_keys: Iterable[str]): + label_key_order = tuple(label_keys) + for track in track_items or []: + if not isinstance(track, dict): + continue language = track.get('language') or track.get('lang') - label = extract_label(track, label_keys) - for source_url, source_ext in iter_track_sources(track): - add_track(source_url, language, label, source_ext) - - process_track_collection(media.get('text_tracks'), ('name', 'label')) - process_track_collection(media.get('textTracks'), ('name', 'label', 'title')) + label = next((track.get(key) for key in label_key_order if track.get(key)), None) + sources = track.get('sources') or [] + if sources: + for source in sources: + if not isinstance(source, dict): + continue + add_track( + source.get('url') or source.get('src'), + language, + label, + source.get('ext') or track.get('ext') + ) + else: + add_track( + track.get('url') or track.get('src'), + language, + 
label, + track.get('ext') + ) - for asset in media.get('assets') or []: - if isinstance(asset, dict): + def collect_from_assets(asset_items: Optional[Iterable[Dict[str, Any]]]): + subtitle_flags = {'caption', 'captions', 'subtitle', 'subtitles'} + for asset in asset_items or []: + if not isinstance(asset, dict): + continue asset_type = (asset.get('type') or '').lower() asset_kind = (asset.get('kind') or '').lower() - if asset_type in {'caption', 'captions', 'subtitle', 'subtitles'} or asset_kind in {'caption', 'captions', 'subtitle', 'subtitles'}: + if asset_type in subtitle_flags or asset_kind in subtitle_flags: add_track( asset.get('url') or asset.get('src'), asset.get('language') or asset.get('lang'), @@ -133,12 +133,17 @@ def process_track_collection(collection: Optional[Iterable[Dict[str, Any]]], lab asset.get('ext') ) - available_transcripts = media.get('availableTranscripts') or [] - if hashed_id and available_transcripts: - for transcript in available_transcripts: + def collect_from_transcripts(transcripts: Optional[Iterable[Dict[str, Any]]]): + if not hashed_id: + return + for transcript in transcripts or []: if not isinstance(transcript, dict) or not transcript.get('hasCaptions'): continue - language = transcript.get('language') or transcript.get('wistiaLanguageCode') or transcript.get('bcp47LanguageTag') + language = ( + transcript.get('language') + or transcript.get('wistiaLanguageCode') + or transcript.get('bcp47LanguageTag') + ) if not language: continue add_track( @@ -148,6 +153,12 @@ def process_track_collection(collection: Optional[Iterable[Dict[str, Any]]], lab DEFAULT_SUBTITLE_EXTENSION ) + collect_from_captions(media.get('captions')) + collect_from_text_tracks(media.get('text_tracks'), ('name', 'label')) + collect_from_text_tracks(media.get('textTracks'), ('name', 'label', 'title')) + collect_from_assets(media.get('assets')) + collect_from_transcripts(media.get('availableTranscripts')) + unique_tracks: Dict[str, Dict[str, Optional[str]]] = {} 
for track in tracks: url = track['url'] @@ -168,21 +179,22 @@ def process_track_collection(collection: Optional[Iterable[Dict[str, Any]]], lab return list(unique_tracks.values()) -def queue_wistia_subtitle_downloads(media: Dict[str, Any], dest_dir: Path, video_base_name: str): - """Queue subtitle download tasks for a Wistia media object.""" - from .downloader import SETTINGS, add_download_task, init_settings - +def build_wistia_subtitle_tasks( + media: Dict[str, Any], + dest_dir: Path, + video_base_name: str, + settings: Optional[Any] = None, +) -> List[Dict[str, Any]]: + """Construct subtitle download task dicts for a Wistia media object.""" if not isinstance(dest_dir, Path): dest_dir = Path(dest_dir) - init_settings() - settings = SETTINGS - if settings and hasattr(settings, 'subtitle_download_enabled') and not settings.subtitle_download_enabled: - return + if settings and not getattr(settings, 'subtitle_download_enabled', True): + return [] tracks = extract_wistia_subtitle_tracks(media) if not tracks: - return + return [] base_name = Path(video_base_name).stem if not base_name: @@ -194,6 +206,7 @@ def queue_wistia_subtitle_downloads(media: Dict[str, Any], dest_dir: Path, video if not base_name: base_name = "captions" + tasks: List[Dict[str, Any]] = [] counter = 1 for track in tracks: url = track.get('url') @@ -201,11 +214,11 @@ def queue_wistia_subtitle_downloads(media: Dict[str, Any], dest_dir: Path, video continue ext = (track.get('ext') or _infer_track_extension(url)).lstrip('.').lower() or DEFAULT_SUBTITLE_EXTENSION - language_part = track.get('language') or track.get('label') or '' - if isinstance(language_part, (list, dict)): + language_raw = track.get('language') or track.get('label') + if isinstance(language_raw, str): + language_part = _LANGUAGE_SANITIZE_PATTERN.sub('-', language_raw).strip('-') + else: language_part = '' - language_part = str(language_part or '') - language_part = _LANGUAGE_SANITIZE_PATTERN.sub('-', language_part).strip('-') if not 
language_part: language_part = 'captions' if counter == 1 else f"captions-{counter}" @@ -214,10 +227,17 @@ def queue_wistia_subtitle_downloads(media: Dict[str, Any], dest_dir: Path, video if not subtitle_filename: subtitle_filename = filter_filename(f"{base_name}.captions-{counter}.{ext}") - print(f" [Subs] Queued subtitles: {subtitle_filename}") - add_download_task(url, dest_dir / subtitle_filename, "subtitle") + tasks.append({ + 'url': url, + 'dest_path': dest_dir / subtitle_filename, + 'content_type': 'subtitle', + 'label': track.get('label'), + 'language': track.get('language'), + }) counter += 1 + return tasks + def video_downloader_videoproxy(video_url: str, file_name: str, quality: str = "720p"): from .downloader import http_get # delayed import @@ -376,7 +396,11 @@ def infer_ext(asset: dict) -> str: DOWNLOAD_MANAGER.download_file(a_url, Path(filter_filename(out_name))) else: print("Download manager not initialized") - queue_wistia_subtitle_downloads(media, current_dir, resolved_base) + from .downloader import SETTINGS, add_download_task + subtitle_tasks = build_wistia_subtitle_tasks(media, current_dir, resolved_base, SETTINGS) + for task in subtitle_tasks: + print(f" [Subs] Queued subtitles: {task['dest_path'].name}") + add_download_task(task['url'], task['dest_path'], task.get('content_type', 'subtitle')) return # Single quality path @@ -404,7 +428,10 @@ def infer_ext(asset: dict) -> str: print(f"URL : {video_url}\nFile Name : {resolved_name}") # Queue video for parallel download with absolute path to current directory - from .downloader import add_download_task + from .downloader import SETTINGS, add_download_task full_path = current_dir / resolved_name # Create absolute path add_download_task(video_url, full_path, "video") - queue_wistia_subtitle_downloads(media, current_dir, resolved_name) + subtitle_tasks = build_wistia_subtitle_tasks(media, current_dir, resolved_name, SETTINGS) + for task in subtitle_tasks: + print(f" [Subs] Queued subtitles: 
{task['dest_path'].name}") + add_download_task(task['url'], task['dest_path'], task.get('content_type', 'subtitle')) From 750d3614ab9c7fc6b4426210c70d7e10acdb8cb1 Mon Sep 17 00:00:00 2001 From: Oleksiy Kovyrin Date: Tue, 14 Oct 2025 17:49:38 -0400 Subject: [PATCH 14/19] Add PRD and tasks for offline course viewer --- docs/tasks/prd-local-course-viewer.md | 85 +++++++++++++++++++++ docs/tasks/tasks-prd-local-course-viewer.md | 64 ++++++++++++++++ 2 files changed, 149 insertions(+) create mode 100644 docs/tasks/prd-local-course-viewer.md create mode 100644 docs/tasks/tasks-prd-local-course-viewer.md diff --git a/docs/tasks/prd-local-course-viewer.md b/docs/tasks/prd-local-course-viewer.md new file mode 100644 index 0000000..35173c8 --- /dev/null +++ b/docs/tasks/prd-local-course-viewer.md @@ -0,0 +1,85 @@ +# Local Course Viewer PRD + +## 1. Introduction / Overview +Create a Python-based generator that turns a downloaded Thinkific course into a self-contained static website for offline consumption. The script should read the provided course metadata JSON (e.g., `beginner-chess-mastery.json`) and the corresponding assets already stored under `downloads//`, validate that everything needed is present, and produce an easy-to-navigate two-pane interface. The generated site must work when opened directly from the filesystem (no server) and allow a learner to browse chapters, play videos, and read text lessons completely offline. + +## 2. Goals +- Provide a one-command workflow that accepts a Thinkific course metadata JSON file and emits an offline-ready static site in the matching `downloads//` directory. +- Mirror the course hierarchy (chapters → lessons) in a left-hand navigation tree with quick access to each lesson. +- Render lesson content appropriately in the main pane: embedded video playback (with captions) for video lessons, and readable formatted text for HTML lessons. 
+- Package all required assets (CSS, JS, fonts) locally so the experience works without network access.
+
+## 3. User Stories
+1. **As a learner traveling without reliable internet**, I want to open `downloads/<course-slug>/index.html` and continue the course offline, so I can make use of the content anywhere.
+2. **As a downloader maintainer**, I want the generator to fail fast if lesson assets are missing, so I can fix gaps before distributing the course dump.
+3. **As a learner**, I want to jump between lessons quickly using a chapter tree, so I can find specific topics without scrolling through a long page.
+4. **As a learner**, I want video lessons to include captions when available, so I can follow along in noisy environments.
+5. **As a learner**, I want links to attachments (e.g., PDFs) surfaced with each lesson, so I can access supporting materials.
+
+## 4. Functional Requirements
+1. **CLI entrypoint**
+   - Provide a Python command (e.g., `python -m thinkific_downloader.generate_site <metadata.json>`) that accepts at minimum: path to the metadata JSON, optional `--downloads-dir` override (default `downloads/`), and optional `--output-subdir` name (default the course slug).
+2. **Metadata ingestion and validation**
+   - Parse the JSON and confirm required keys exist (`course.slug`, `chapters`, `contents`).
+   - Build an in-memory course model linking chapters to lesson content via IDs.
+   - Emit actionable errors when the JSON structure is unexpected.
+3. **Asset validation**
+   - Locate the base course folder at `downloads/<course-slug>/` (configurable via CLI).
+   - For each lesson, verify the expected asset directory exists (matching lesson slug or already-downloaded folder naming).
+   - Confirm that required primary assets exist: `.mp4` for videos, `.html` for text lessons, plus optional assets (`.vtt`, PDFs, images).
+   - Surface a consolidated report of missing assets before generation.
+4. **Output structure**
+   - Generate a static site rooted at `downloads/<course-slug>/index.html`.
+ - Place shared assets under a subfolder (e.g., `downloads//site-assets/`) containing CSS, JS, icons, and fonts (if any). + - Preserve or reuse existing lesson folders; do not modify original media files. +5. **Navigation UI** + - Render an always-expanded chapter list in the left sidebar reflecting chapter order (`position`) without collapse/expand controls. + - List lessons within each chapter in order, distinguishing lesson types (video vs text) with an icon or label. + - Highlight the currently selected lesson and keep the selection in sync when switching content. +6. **Lesson rendering** + - For video lessons, embed the local `.mp4` via `