diff --git a/.env.example b/.env.example index 654f1ab..32056dc 100644 --- a/.env.example +++ b/.env.example @@ -61,6 +61,9 @@ RESUME_PARTIAL=true # Enable detailed logging for troubleshooting DEBUG=false +# Download subtitles/captions when available (default: true) +SUBTITLE_DOWNLOAD_ENABLED=true + # =============================================== # ADVANCED SETTINGS # =============================================== @@ -83,4 +86,4 @@ COURSE_DATA_FILE="" # ALL_VIDEO_FORMATS=false # Log level (DEBUG, INFO, WARNING, ERROR) -# LOG_LEVEL="INFO" \ No newline at end of file +# LOG_LEVEL="INFO" diff --git a/.gitignore b/.gitignore index a7fc79a..bb7c60f 100644 --- a/.gitignore +++ b/.gitignore @@ -85,6 +85,9 @@ downloads/ *.mkv ffmpeg.log +# Allow HTML templates used by the offline site generator +!thinkific_downloader/templates/*.html + # But allow certain JSON files !package.json !requirements.json diff --git a/README.md b/README.md index a05530a..925986d 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,7 @@ A modern, feature-rich Python utility to download courses from Thinkific platfor | šŸ“„ **HTML Content** | āœ… Full | `downloader.py` | Clean extraction, formatting | | šŸ“š **PDF Documents** | āœ… Full | `downloader.py` | Direct download, validation | | šŸŽµ **Audio Files** | āœ… Full | `downloader.py` | MP3, M4A support | +| šŸ“ **Subtitles (Wistia)** | āœ… Full | `wistia_downloader.py` | Multi-language caption downloads | | šŸŽÆ **Quizzes** | āœ… Basic | `downloader.py` | Structure extraction | | šŸŽØ **Presentations** | āœ… Full | FFmpeg merge | Multi-slide processing | @@ -70,6 +71,7 @@ A modern, feature-rich Python utility to download courses from Thinkific platfor - **Resume Support** - Skip existing files, continue interrupted downloads - **Atomic Resume/Backup** - Status file is always safely backed up and updated, works on Windows, Mac, Linux - **Multiple Quality Options** - Choose video quality (720p, 1080p, etc.) 
+- **Subtitle Downloads** - Automatically grab Wistia caption tracks in multiple languages - **Comprehensive Logging** - Debug mode for troubleshooting ### šŸ›”ļø **Safety & Compliance** @@ -201,6 +203,7 @@ RATE_LIMIT_MB_S= # Rate limit in MB/s (empty = unlimited) VALIDATE_DOWNLOADS=true # Enable file integrity validation RESUME_PARTIAL=true # Enable resume for partial downloads DEBUG=false # Enable debug logging +SUBTITLE_DOWNLOAD_ENABLED=true # Download subtitles/captions when available # =============================================== # ADVANCED SETTINGS diff --git a/thinkific_downloader/__main__.py b/thinkific_downloader/__main__.py index ea48286..5d0a6d3 100644 --- a/thinkific_downloader/__main__.py +++ b/thinkific_downloader/__main__.py @@ -1,10 +1,123 @@ #!/usr/bin/env python3 """ -Command line entry point for Thinkific Downloader +Command line entry point for Thinkific Downloader and offline site generator. + +Usage examples: + python -m thinkific_downloader + python -m thinkific_downloader --json beginner-course.json + python -m thinkific_downloader generate-site beginner-course.json --clean """ +from __future__ import annotations + +import argparse import sys -from thinkific_downloader.downloader import main +from pathlib import Path +from typing import List, Optional + +from thinkific_downloader.downloader import main as downloader_main +from thinkific_downloader.site_generator import ( + SiteGenerationError, + generate_site, + load_course, +) + +# Note: keep console output lightweight so it mirrors existing downloader UX. 
+ + +def _run_generate_site(argv: List[str]) -> int: + parser = argparse.ArgumentParser( + prog="thinkific_downloader generate-site", + description="Validate downloaded Thinkific course assets and build an offline viewer.", + ) + parser.add_argument( + "metadata", + help="Path to the course metadata JSON file (e.g., beginner-chess-mastery.json).", + ) + parser.add_argument( + "--downloads-dir", + dest="downloads_dir", + help="Override the downloads root directory (defaults to /../downloads).", + ) + parser.add_argument( + "--output-dir", + dest="output_dir", + help="Directory to write the generated site (defaults to downloads//).", + ) + parser.add_argument( + "--assets-dirname", + dest="assets_dirname", + default="site-assets", + help="Subdirectory name for bundled CSS/JS assets (default: site-assets).", + ) + parser.add_argument( + "--clean", + action="store_true", + help="Remove previously generated site files before rendering.", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Validate metadata and assets without writing any files.", + ) + parser.add_argument( + "-q", + "--quiet", + action="store_true", + help="Suppress success output; errors will still be printed.", + ) + + args = parser.parse_args(argv) + + metadata_path = Path(args.metadata).expanduser() + downloads_dir: Optional[Path] = None + output_dir: Optional[Path] = None + + if args.downloads_dir: + downloads_dir = Path(args.downloads_dir).expanduser() + if args.output_dir: + output_dir = Path(args.output_dir).expanduser() + + try: + if args.dry_run: + load_course(metadata_path, downloads_root=downloads_dir) + if not args.quiet: + print("āœ… Course assets validated (dry run).") + return 0 + + generated_index = generate_site( + metadata_path, + downloads_root=downloads_dir, + output_dir=output_dir, + clean=args.clean, + assets_dirname=args.assets_dirname, + ) + if not args.quiet: + print(f"āœ… Offline course generated: {generated_index}") + return 0 + + except 
SiteGenerationError as exc: + print("āœ– Site generation failed:") + for error in exc.errors: + print(f" - {error}") + return 1 + except FileNotFoundError as exc: + print(f"āœ– {exc}") + return 1 + except Exception as exc: # pragma: no cover - unexpected edge cases + print(f"āœ– Unexpected error: {exc}") + return 1 + + +def main(argv: Optional[List[str]] = None) -> None: + argv = argv or sys.argv + if len(argv) > 1 and argv[1] in {"generate-site", "generate_site"}: + exit_code = _run_generate_site(argv[2:]) + sys.exit(exit_code) + + # Fallback to the legacy downloader behaviour. + downloader_main(argv) + if __name__ == "__main__": - main(sys.argv) \ No newline at end of file + main(sys.argv) diff --git a/thinkific_downloader/config.py b/thinkific_downloader/config.py index c1e1286..ab75fd4 100644 --- a/thinkific_downloader/config.py +++ b/thinkific_downloader/config.py @@ -37,6 +37,7 @@ class Settings: resume_partial: bool = True debug: bool = False course_name: str = "Course" + subtitle_download_enabled: bool = True @classmethod def from_env(cls): @@ -67,6 +68,7 @@ def from_env(cls): validate_downloads = os.getenv('VALIDATE_DOWNLOADS', 'true').lower() in ('1', 'true', 'yes', 'on') resume_partial = os.getenv('RESUME_PARTIAL', 'true').lower() in ('1', 'true', 'yes', 'on') debug = os.getenv('DEBUG', 'false').lower() in ('1', 'true', 'yes', 'on') + subtitle_download_enabled = os.getenv('SUBTITLE_DOWNLOAD_ENABLED', 'true').lower() in ('1', 'true', 'yes', 'on') # Clean cookie data to remove Unicode characters that cause encoding issues if cookie_data: @@ -101,5 +103,6 @@ def from_env(cls): download_delay=download_delay, validate_downloads=validate_downloads, resume_partial=resume_partial, - debug=debug + debug=debug, + subtitle_download_enabled=subtitle_download_enabled ) diff --git a/thinkific_downloader/downloader.py b/thinkific_downloader/downloader.py index f2c2191..4b93d69 100644 --- a/thinkific_downloader/downloader.py +++ b/thinkific_downloader/downloader.py @@ 
-382,10 +382,74 @@ def download_file_chunked(src_url: str, dst_name: str, chunk_mb: int = 1): add_download_task(src_url, dst_path, "file") +def _load_cached_progress(cache_file: Path): + """Return previously analyzed chapters and queued tasks from the resume cache.""" + analyzed_chapters = set() + saved_tasks: List[Dict[str, Any]] = [] + + if not cache_file.exists(): + return analyzed_chapters, saved_tasks + + try: + with open(cache_file, 'r', encoding='utf-8') as f: + cache_data = json.load(f) + + analyzed_chapters = set(cache_data.get('analyzed_chapters', [])) + saved_tasks = cache_data.get('download_tasks', []) + print(f"šŸ“‹ Found previous progress: {len(analyzed_chapters)} chapters analyzed, {len(saved_tasks)} tasks cached") + + # If subtitle downloads were newly enabled, invalidate cache so we can regenerate tasks. + if SETTINGS and SETTINGS.subtitle_download_enabled and saved_tasks: + has_subtitle_tasks = any( + (task.get('content_type') or '').lower() == 'subtitle' + for task in saved_tasks + ) + if not has_subtitle_tasks: + print("šŸ†• Subtitle support enabled — refreshing cached analysis to include captions.") + analyzed_chapters = set() + saved_tasks = [] + try: + cache_file.unlink() + except OSError as exc: + print(f" āš ļø Warning: Failed to delete cache file for refresh: {exc}") + except (json.JSONDecodeError, OSError): + analyzed_chapters = set() + saved_tasks = [] + + return analyzed_chapters, saved_tasks + + +def _restore_saved_tasks(saved_tasks: List[Dict[str, Any]]): + """Restore cached download tasks, respecting the subtitle feature flag.""" + if not saved_tasks: + return + + restored_tasks = list(saved_tasks) + if SETTINGS and not SETTINGS.subtitle_download_enabled: + total_tasks = len(restored_tasks) + restored_tasks = [ + task for task in restored_tasks + if (task.get('content_type') or 'video').lower() != 'subtitle' + ] + skipped_count = total_tasks - len(restored_tasks) + if skipped_count > 0: + print(f"ā­ļø Skipping {skipped_count} 
cached subtitle task(s) because subtitle downloads are disabled.") + + if not restored_tasks: + return + + print(f"šŸ“„ Restoring {len(restored_tasks)} previously collected download tasks...") + for task_data in restored_tasks: + add_download_task(task_data['url'], Path(task_data['dest_path']), task_data.get('content_type', 'video')) + + def init_course(data: Dict[str, Any]): """Initialize course structure and collect ALL download tasks first.""" global COURSE_CONTENTS, ROOT_PROJECT_DIR, BASE_HOST, DOWNLOAD_TASKS + + # Ensure settings/download manager are initialized so feature flags are available + init_settings() # Initialize download tasks list DOWNLOAD_TASKS = [] @@ -409,17 +473,7 @@ def init_course(data: Dict[str, Any]): analyzed_chapters = set() saved_tasks = [] - if cache_file.exists(): - try: - import json - with open(cache_file, 'r', encoding='utf-8') as f: - cache_data = json.load(f) - analyzed_chapters = set(cache_data.get('analyzed_chapters', [])) - saved_tasks = cache_data.get('download_tasks', []) - print(f"šŸ“‹ Found previous progress: {len(analyzed_chapters)} chapters analyzed, {len(saved_tasks)} tasks cached") - except: - analyzed_chapters = set() - saved_tasks = [] + analyzed_chapters, saved_tasks = _load_cached_progress(cache_file) # Derive base host from landing_page_url if available landing = data['course'].get('landing_page_url') @@ -430,10 +484,7 @@ def init_course(data: Dict[str, Any]): print("\nšŸ” Phase 1: Analyzing course content and collecting download links...") # Restore saved download tasks - if saved_tasks: - print(f"šŸ“„ Restoring {len(saved_tasks)} previously collected download tasks...") - for task_data in saved_tasks: - add_download_task(task_data['url'], Path(task_data['dest_path']), task_data.get('content_type', 'video')) + _restore_saved_tasks(saved_tasks) collect_all_download_tasks(data, analyzed_chapters, cache_file) @@ -835,9 +886,24 @@ def collect_video_task_wistia(wistia_id: str, file_name: str, dest_dir: Path): 
video_url = selected.get('url') if video_url: ext = '.mp4' # Default extension - resolved_name = filter_filename(file_name) + ext + resolved_name = filter_filename(file_name) + if not resolved_name.lower().endswith(ext): + resolved_name += ext print(f" šŸ“¹ Found video: {resolved_name}") add_download_task(video_url, dest_dir / resolved_name, "video") + try: + from .wistia_downloader import build_wistia_subtitle_tasks + subtitle_tasks = build_wistia_subtitle_tasks( + data.get('media') or {}, + dest_dir, + resolved_name, + SETTINGS, + ) + for task in subtitle_tasks: + print(f" [Subs] Queued subtitles: {Path(task['dest_path']).name}") + add_download_task(task['url'], Path(task['dest_path']), task.get('content_type', 'subtitle')) + except Exception as subtitle_error: + print(f" āš ļø Unable to queue subtitles for {resolved_name}: {subtitle_error}") except Exception as e: print(f" āŒ Failed to collect Wistia video {wistia_id}: {e}") @@ -1282,4 +1348,4 @@ def main(argv: List[str]): if __name__ == '__main__': - main(sys.argv) \ No newline at end of file + main(sys.argv) diff --git a/thinkific_downloader/site_generator.py b/thinkific_downloader/site_generator.py new file mode 100644 index 0000000..89cb5b9 --- /dev/null +++ b/thinkific_downloader/site_generator.py @@ -0,0 +1,753 @@ +""" +Utilities for turning a downloaded Thinkific course into an offline static website. + +This module currently focuses on: +1. Parsing course metadata JSON files and validating that the associated local assets + (videos, text lessons, attachments) exist. +2. Rendering a basic two-pane static site (HTML + CSS + JS stubs) that can be opened + directly from the filesystem. + +Further CLI plumbing and richer client-side behaviour will be added in subsequent steps. 
+""" + +from __future__ import annotations + +import base64 +import json +import re +import shutil +from dataclasses import dataclass, field +from datetime import datetime +from html import escape as html_escape +from pathlib import Path +from string import Template +from typing import Dict, Iterable, List, Optional, Sequence, Tuple +from urllib.parse import quote + +from .file_utils import filter_filename + +# File categorisation helpers +VIDEO_EXTENSIONS = {".mp4", ".m4v", ".mov", ".webm"} +CAPTION_EXTENSIONS = {".vtt", ".srt"} +TEXT_EXTENSIONS = {".html", ".htm"} +IGNORED_FILENAMES = {".ds_store"} + +LESSON_SUFFIX_PATTERN = re.compile(r"[._-](lesson|text)$", re.IGNORECASE) +NUMERIC_PREFIX_PATTERN = re.compile(r"^\d+\.?\s*") + + +class SiteGenerationError(Exception): + """Collects validation failures encountered during site generation.""" + + def __init__(self, errors: Sequence[str]): + self.errors = list(errors) + message = "Site generation encountered issues:\n" + "\n".join(f"- {err}" for err in self.errors) + super().__init__(message) + + +@dataclass +class LessonAssets: + """Represents the local files associated with a lesson.""" + + videos: List[Path] = field(default_factory=list) + captions: List[Path] = field(default_factory=list) + html_file: Optional[Path] = None + attachments: List[Path] = field(default_factory=list) + + +@dataclass +class Lesson: + """Course lesson enriched with local filesystem references.""" + + id: int + name: str + slug: str + position: int + chapter_id: int + lesson_type: str # "video" or "text" + display_name: str + duration_seconds: Optional[int] + description: Optional[str] + directory: Path + assets: LessonAssets = field(default_factory=LessonAssets) + + @property + def is_video(self) -> bool: + return self.lesson_type == "video" + + @property + def is_text(self) -> bool: + return self.lesson_type == "text" + + +@dataclass +class Chapter: + """A Thinkific chapter containing lessons.""" + + id: int + name: str + position: int + 
directory: Path + lessons: List[Lesson] = field(default_factory=list) + + +@dataclass +class Course: + """Top-level course representation ready for rendering.""" + + id: int + name: str + slug: str + output_dir: Path + metadata_path: Path + landing_page_url: Optional[str] + chapters: List[Chapter] = field(default_factory=list) + + def iter_lessons(self) -> Iterable[Lesson]: + for chapter in self.chapters: + yield from chapter.lessons + + @property + def first_lesson(self) -> Optional[Lesson]: + for chapter in self.chapters: + if chapter.lessons: + return chapter.lessons[0] + return None + + +def load_course(metadata_path: Path | str, downloads_root: Path | str | None = None) -> Course: + """ + Load course metadata and validate the presence of corresponding local assets. + + :param metadata_path: Path to the Thinkific course JSON dump. + :param downloads_root: Optional override for the downloads directory root. + :returns: Course model containing chapters, lessons, and asset references. + :raises SiteGenerationError: if required assets are missing or structure mismatches are detected. 
+ """ + metadata_path = Path(metadata_path) + if downloads_root is None: + downloads_root = metadata_path.parent / "downloads" + downloads_root = Path(downloads_root) + + if not metadata_path.exists(): + raise FileNotFoundError(f"Metadata file not found: {metadata_path}") + + with metadata_path.open("r", encoding="utf-8") as fh: + data = json.load(fh) + + course_info = data.get("course") or {} + course_slug = course_info.get("slug") + if not course_slug: + raise SiteGenerationError(["Course slug missing from metadata."]) + + course_dir = downloads_root / course_slug + + errors: List[str] = [] + if not course_dir.exists(): + errors.append(f"Course directory not found: {course_dir}") + + contents_map: Dict[int, Dict] = {content["id"]: content for content in data.get("contents", [])} + chapters: List[Chapter] = [] + + for chapter_data in sorted(data.get("chapters", []), key=lambda c: c.get("position", 0)): + chapter_id = chapter_data.get("id") + chapter_name = chapter_data.get("name", f"Chapter {chapter_id}") + chapter_position = chapter_data.get("position", 0) + chapter_dir_name = f"{chapter_position + 1}. 
{filter_filename(chapter_name)}" + chapter_dir = course_dir / chapter_dir_name + + if not chapter_dir.exists(): + errors.append( + f"Missing chapter directory for '{chapter_name}' (expected '{chapter_dir_name}')" + ) + continue + + chapter = Chapter( + id=chapter_id, + name=chapter_name, + position=chapter_position, + directory=chapter_dir, + lessons=[], + ) + + lesson_dirs = sorted( + [entry for entry in chapter_dir.iterdir() if entry.is_dir()], + key=lambda path: path.name.lower(), + ) + claimed_dirs: set[Path] = set() + + lesson_ids = chapter_data.get("content_ids", []) + lessons_for_chapter = [ + contents_map.get(lesson_id) for lesson_id in lesson_ids if contents_map.get(lesson_id) + ] + lessons_for_chapter.sort(key=lambda lesson: lesson.get("position", 0)) + + for index, lesson_data in enumerate(lessons_for_chapter): + content_type = lesson_data.get("contentable_type") + if content_type not in {"Lesson", "HtmlItem"}: + # Ignore unsupported content types (quizzes, surveys, etc.) for now. 
+ continue + + lesson_name = lesson_data.get("name", f"Lesson {lesson_data.get('id')}") + lesson_kind = _classify_lesson_type(lesson_data) + + lesson_dir = _find_lesson_directory( + lesson_dirs=lesson_dirs, + claimed_dirs=claimed_dirs, + lesson_name=lesson_name, + lesson_index=index, + ) + + if lesson_dir is None: + errors.append( + f"Missing lesson directory for '{lesson_name}' in chapter '{chapter_name}'" + ) + continue + + assets = _scan_lesson_assets(lesson_dir) + + if lesson_kind == "video" and not assets.videos: + errors.append( + f"Video files not found for lesson '{lesson_name}' at {lesson_dir}" + ) + if lesson_kind == "text" and assets.html_file is None: + errors.append( + f"HTML content not found for text lesson '{lesson_name}' at {lesson_dir}" + ) + + duration_seconds = _extract_duration_seconds(lesson_data) + description = _extract_description(lesson_data) + + lesson = Lesson( + id=lesson_data.get("id"), + name=lesson_name, + slug=lesson_data.get("slug", filter_filename(lesson_name)), + position=lesson_data.get("position", index), + chapter_id=chapter_id, + lesson_type=lesson_kind, + display_name=lesson_data.get("display_name", lesson_kind.title()), + duration_seconds=duration_seconds, + description=description, + directory=lesson_dir, + assets=assets, + ) + chapter.lessons.append(lesson) + + chapters.append(chapter) + + if not chapters: + errors.append("No chapters discovered in metadata.") + + total_lessons = sum(len(chapter.lessons) for chapter in chapters) + if total_lessons == 0: + errors.append("No lessons were successfully mapped to local directories.") + + if errors: + raise SiteGenerationError(errors) + + return Course( + id=course_info.get("id"), + name=course_info.get("name", "Thinkific Course"), + slug=course_slug, + output_dir=course_dir, + metadata_path=metadata_path, + landing_page_url=course_info.get("landing_page_url"), + chapters=chapters, + ) + + +def generate_site( + metadata_path: Path | str, + downloads_root: Path | str | None = 
None, + output_dir: Path | str | None = None, + *, + clean: bool = False, + assets_dirname: str = "site-assets", +) -> Path: + """ + High-level helper that loads a course and renders the static site. + + :returns: Path to the generated index.html file. + """ + course = load_course(metadata_path, downloads_root=downloads_root) + target_dir = Path(output_dir) if output_dir else course.output_dir + target_dir.mkdir(parents=True, exist_ok=True) + _render_course( + course=course, + output_dir=target_dir, + clean=clean, + assets_dirname=assets_dirname, + ) + return target_dir / "index.html" + + +def _render_course(course: Course, output_dir: Path, *, clean: bool, assets_dirname: str) -> None: + """Render HTML/CSS/JS assets for a course.""" + templates_dir = Path(__file__).with_name("templates") + static_dir = Path(__file__).with_name("static") + + assets_dir = output_dir / assets_dirname + + if clean and assets_dir.exists(): + shutil.rmtree(assets_dir) + assets_dir.mkdir(parents=True, exist_ok=True) + + if clean: + index_path = output_dir / "index.html" + if index_path.exists(): + index_path.unlink() + + # Copy static assets + (assets_dir / "viewer.css").write_text( + (static_dir / "viewer.css").read_text(encoding="utf-8"), + encoding="utf-8", + ) + (assets_dir / "viewer.js").write_text( + (static_dir / "viewer.js").read_text(encoding="utf-8"), + encoding="utf-8", + ) + + # Prepare template fragments + base_template = Template((templates_dir / "base.html").read_text(encoding="utf-8")) + lesson_template = Template((templates_dir / "lesson.html").read_text(encoding="utf-8")) + + sidebar_html = _render_sidebar(course) + lesson_templates_html, initial_lesson_html = _render_lessons( + course=course, + lesson_template=lesson_template, + output_dir=output_dir, + ) + + course_payload = _build_course_payload(course) + + subtitle_html = "" + if course.landing_page_url: + subtitle_html = f'' + + index_html = base_template.substitute( + title=html_escape(course.name), + 
subtitle=subtitle_html, + sidebar=sidebar_html, + initial_lesson=initial_lesson_html, + lesson_templates=lesson_templates_html, + course_json=json.dumps(course_payload, ensure_ascii=False), + css_path=f"{assets_dirname}/viewer.css", + js_path=f"{assets_dirname}/viewer.js", + ) + + (output_dir / "index.html").write_text(index_html, encoding="utf-8") + + manifest = { + "generated_at": datetime.utcnow().isoformat(timespec="seconds") + "Z", + "course": { + "id": course.id, + "name": course.name, + "slug": course.slug, + }, + "files": [ + "index.html", + f"{assets_dirname}/viewer.css", + f"{assets_dirname}/viewer.js", + ], + "lessons": [ + { + "id": lesson.id, + "name": lesson.name, + "type": lesson.lesson_type, + "directory": str(lesson.directory.relative_to(output_dir)), + } + for lesson in course.iter_lessons() + ], + } + (assets_dir / "manifest.json").write_text( + json.dumps(manifest, ensure_ascii=False, indent=2), + encoding="utf-8", + ) + + +def _render_sidebar(course: Course) -> str: + """Create the sidebar navigation markup.""" + lines: List[str] = [ + '") + return "\n".join(lines) + + +def _render_lessons( + course: Course, + lesson_template: Template, + output_dir: Path, +) -> Tuple[str, str]: + """Render lesson templates and return (templates_html, initial_lesson_html).""" + templates: List[str] = ['") + return "\n".join(templates), initial_html + + +def _render_lesson(lesson: Lesson, template: Template, output_dir: Path) -> str: + """Render a single lesson section.""" + body_html = _render_lesson_body(lesson, output_dir) + attachments_html = _render_attachments(lesson, output_dir) + + meta_fragments: List[str] = [] + if lesson.duration_seconds: + meta_fragments.append( + f'{_format_duration(lesson.duration_seconds)}' + ) + if lesson.description: + meta_fragments.append( + f'

{html_escape(lesson.description)}

' + ) + lesson_meta = "" + if meta_fragments: + lesson_meta = '
' + "".join(meta_fragments) + "
" + + return template.substitute( + lesson_id=lesson.id, + lesson_type=lesson.lesson_type, + lesson_title=html_escape(lesson.name), + lesson_meta=lesson_meta, + lesson_body=body_html, + attachments=attachments_html, + ) + + +def _render_lesson_body(lesson: Lesson, output_dir: Path) -> str: + """Generate the primary lesson content markup.""" + if lesson.is_video and lesson.assets.videos: + video_sources = [] + for video_path in lesson.assets.videos: + rel_url = _relative_url(video_path, output_dir) + video_sources.append(f'') + + caption_tracks = [] + for idx, caption in enumerate(lesson.assets.captions): + srclang, label = _guess_caption_language(caption) + default_attr = " default" if idx == 0 else "" + caption_src = _build_caption_data_uri(caption) + caption_tracks.append( + f'' + ) + + return ( + '
' + '" + "
" + ) + + if lesson.is_text and lesson.assets.html_file: + html_content = lesson.assets.html_file.read_text(encoding="utf-8") + return f'
{html_content}
' + + return ( + '
' + "

This lesson type is not yet supported for offline viewing.

" + "
" + ) + + +def _render_attachments(lesson: Lesson, output_dir: Path) -> str: + """Render lesson attachment links, if any.""" + if not lesson.assets.attachments: + return "" + + items = [] + for attachment in lesson.assets.attachments: + rel_url = _relative_url(attachment, output_dir) + items.append( + f'
  • {html_escape(attachment.name)}
  • ' + ) + return ( + '
    ' + "

    Downloads

    " + "
      " + + "".join(items) + + "
    " + "
    " + ) + + +def _build_course_payload(course: Course) -> Dict: + """Build a lightweight JSON payload for client-side consumption.""" + payload = { + "id": course.id, + "name": course.name, + "slug": course.slug, + "chapters": [], + } + for chapter in course.chapters: + payload["chapters"].append( + { + "id": chapter.id, + "name": chapter.name, + "lessons": [ + { + "id": lesson.id, + "name": lesson.name, + "type": lesson.lesson_type, + } + for lesson in chapter.lessons + ], + } + ) + return payload + + +def _classify_lesson_type(lesson_data: Dict) -> str: + """Normalise lesson type labels from metadata.""" + label = ( + lesson_data.get("lesson_type_label") + or lesson_data.get("display_name") + or "" + ).lower() + content_type = (lesson_data.get("contentable_type") or "").lower() + if "video" in label: + return "video" + if "text" in label or "html" in label or content_type == "htmlitem": + return "text" + if content_type == "lesson": + return "video" + return "other" + + +def _find_lesson_directory( + lesson_dirs: List[Path], + claimed_dirs: set[Path], + lesson_name: str, + lesson_index: int, +) -> Optional[Path]: + """Find the best matching directory for a lesson by name and order.""" + target_key = _normalise_dir_key(lesson_name) + + # First pass: exact match on the normalised directory name. + for directory in lesson_dirs: + if directory in claimed_dirs: + continue + if _normalise_existing_dir(directory.name) == target_key: + claimed_dirs.add(directory) + return directory + + # Second pass: substring overlap. + for directory in lesson_dirs: + if directory in claimed_dirs: + continue + existing_key = _normalise_existing_dir(directory.name) + if target_key in existing_key or existing_key in target_key: + claimed_dirs.add(directory) + return directory + + # Fallback: choose by ordering to keep generation moving. 
+ for directory in lesson_dirs: + if directory not in claimed_dirs: + claimed_dirs.add(directory) + return directory + + return None + + +def _normalise_existing_dir(name: str) -> str: + """Normalise an existing directory name down to its semantic slug.""" + name = NUMERIC_PREFIX_PATTERN.sub("", name) + name = LESSON_SUFFIX_PATTERN.sub("", name) + return filter_filename(name) + + +def _normalise_dir_key(name: str) -> str: + """Normalise metadata lesson names to align with directory naming conventions.""" + return filter_filename(name) + + +def _scan_lesson_assets(lesson_dir: Path) -> LessonAssets: + """Inspect a lesson directory and categorise its files.""" + videos: List[Path] = [] + captions: List[Path] = [] + html_files: List[Path] = [] + attachments: List[Path] = [] + + for file_path in sorted(lesson_dir.iterdir(), key=lambda p: p.name.lower()): + if not file_path.is_file(): + continue + if file_path.name.lower() in IGNORED_FILENAMES: + continue + + suffix = file_path.suffix.lower() + if suffix in VIDEO_EXTENSIONS: + videos.append(file_path) + continue + if suffix in CAPTION_EXTENSIONS: + captions.append(file_path) + continue + if suffix in TEXT_EXTENSIONS: + html_files.append(file_path) + continue + + attachments.append(file_path) + + primary_html = html_files[0] if html_files else None + # Treat additional HTML files as attachments to keep them accessible. 
+ for extra_html in html_files[1:]: + attachments.append(extra_html) + + return LessonAssets( + videos=videos, + captions=captions, + html_file=primary_html, + attachments=attachments, + ) + + +def _relative_url(path: Path, base: Path) -> str: + """Convert an absolute path to a file:// friendly relative URL.""" + try: + relative_path = path.relative_to(base) + except ValueError: + relative_path = path + return quote(str(relative_path).replace("\\", "/")) + + +def _guess_caption_language(path: Path) -> Tuple[str, str]: + """Heuristically derive subtitle metadata from the filename.""" + stem = path.stem + if "." in stem: + lang = stem.split(".")[-1] + else: + lang = "en" + lang = lang.lower() + + canonical = _map_language_code(lang) + label = canonical.upper() + lang = canonical + + return lang, label + + +def _map_language_code(lang: str) -> str: + """Map common language fragments to two-letter ISO codes.""" + language_map = { + "eng": "en", + "english": "en", + "en-us": "en", + "en-gb": "en", + "es": "es", + "spa": "es", + "spanish": "es", + "fr": "fr", + "fre": "fr", + "fra": "fr", + "french": "fr", + "de": "de", + "ger": "de", + "deu": "de", + "german": "de", + "it": "it", + "ita": "it", + "italian": "it", + "pt": "pt", + "por": "pt", + "pt-br": "pt", + "pt-pt": "pt", + "portuguese": "pt", + "ru": "ru", + "rus": "ru", + "russian": "ru", + "zh": "zh", + "chi": "zh", + "zho": "zh", + "chinese": "zh", + } + + if lang in language_map: + return language_map[lang] + if len(lang) > 2: + return lang[:2] + if not lang: + return "en" + return lang + + +def _build_caption_data_uri(path: Path) -> str: + """Embed caption file content into a data URI to avoid file:// origin issues.""" + data = path.read_bytes() + encoded = base64.b64encode(data).decode("ascii") + return f"data:text/vtt;base64,{encoded}" + + +def _format_duration(seconds: int | float) -> str: + """Render a human-friendly duration string.""" + total_seconds = int(float(seconds)) + hours, remainder = 
divmod(total_seconds, 3600) + minutes, secs = divmod(remainder, 60) + if hours: + return f"{hours:d}:{minutes:02d}:{secs:02d}" + return f"{minutes:d}:{secs:02d}" + + +def _extract_duration_seconds(lesson_data: Dict) -> Optional[int]: + """Extract duration from metadata when available.""" + meta = lesson_data.get("meta_data") or {} + duration = meta.get("duration_in_seconds") + if duration is None: + return None + try: + return int(float(duration)) + except (TypeError, ValueError): + return None + + +def _extract_description(lesson_data: Dict) -> Optional[str]: + """Pull optional description fields from lesson metadata.""" + return ( + lesson_data.get("description") + or (lesson_data.get("meta_data") or {}).get("description") + or None + ) + + +__all__ = [ + "Course", + "Chapter", + "Lesson", + "LessonAssets", + "SiteGenerationError", + "generate_site", + "load_course", +] diff --git a/thinkific_downloader/static/viewer.css b/thinkific_downloader/static/viewer.css new file mode 100644 index 0000000..85aefec --- /dev/null +++ b/thinkific_downloader/static/viewer.css @@ -0,0 +1,277 @@ +:root { + color-scheme: light; + font-family: "Inter", "Segoe UI", -apple-system, BlinkMacSystemFont, "Helvetica Neue", Arial, sans-serif; + --sidebar-bg: #0f172a; + --sidebar-text: #e2e8f0; + --sidebar-accent: #38bdf8; + --content-bg: #f8fafc; + --content-text: #0f172a; + --divider: rgba(15, 23, 42, 0.12); +} + +*, +*::before, +*::after { + box-sizing: border-box; +} + +body { + margin: 0; + background: var(--content-bg); + color: var(--content-text); + line-height: 1.6; +} + +a { + color: var(--sidebar-accent); + text-decoration: none; +} + +a:hover, +a:focus { + text-decoration: underline; +} + +.viewer { + display: flex; + min-height: 100vh; + width: 100%; +} + +.sidebar { + width: 320px; + background: var(--sidebar-bg); + color: var(--sidebar-text); + padding: 24px; + display: flex; + flex-direction: column; + gap: 24px; +} + +.course-header { + display: flex; + flex-direction: 
column; + gap: 8px; +} + +.course-title { + margin: 0; + font-size: 1.5rem; + font-weight: 600; + letter-spacing: 0.01em; +} + +.course-link { + margin: 0; + font-size: 0.85rem; + color: rgba(226, 232, 240, 0.72); + word-break: break-word; +} + +.sidebar-nav { + flex: 1 1 auto; + overflow-y: auto; + padding-right: 4px; +} + +.sidebar-chapter + .sidebar-chapter { + margin-top: 24px; +} + +.chapter-title { + margin: 0 0 8px; + font-size: 0.95rem; + font-weight: 600; + text-transform: uppercase; + letter-spacing: 0.08em; + color: rgba(226, 232, 240, 0.8); +} + +.lesson-list { + list-style: none; + margin: 0; + padding: 0; + display: flex; + flex-direction: column; + gap: 4px; +} + +.lesson-link { + width: 100%; + border: 0; + background: transparent; + color: inherit; + text-align: left; + padding: 8px 12px; + border-radius: 6px; + font-size: 0.95rem; + line-height: 1.4; + cursor: pointer; + transition: background 0.12s ease, color 0.12s ease; +} + +.lesson-link:hover, +.lesson-link:focus { + outline: none; + background: rgba(148, 163, 184, 0.18); + color: #fff; +} + +.lesson-link.is-active { + background: rgba(56, 189, 248, 0.16); + color: #fff; +} + +.content-pane { + flex: 1 1 auto; + padding: 32px 48px; + overflow-y: auto; + background: var(--content-bg); + color: var(--content-text); +} + +.lesson { + max-width: 900px; + margin: 0 auto; + display: flex; + flex-direction: column; + gap: 24px; +} + +.lesson-header { + border-bottom: 1px solid var(--divider); + padding-bottom: 12px; +} + +.lesson-title { + margin: 0 0 8px; + font-size: 1.75rem; + font-weight: 600; + line-height: 1.3; +} + +.lesson-meta { + display: flex; + flex-wrap: wrap; + gap: 8px 16px; + font-size: 0.9rem; + color: rgba(15, 23, 42, 0.72); +} + +.lesson-duration { + display: inline-flex; + align-items: center; + gap: 6px; +} + +.lesson-duration::before { + content: "ā±"; + font-size: 0.85rem; +} + +.lesson-description { + margin: 0; +} + +.lesson-content { + display: flex; + flex-direction: 
column; + gap: 24px; +} + +.video-wrapper { + background: #000; + border-radius: 12px; + overflow: hidden; + box-shadow: 0 18px 40px rgba(15, 23, 42, 0.18); +} + +.lesson-video { + width: 100%; + height: auto; + display: block; + background: #000; +} + +.lesson-article { + font-size: 1.05rem; + line-height: 1.7; + color: inherit; +} + +.lesson-article img, +.lesson-article video, +.lesson-article iframe { + max-width: 100%; + height: auto; +} + +.lesson-article h1, +.lesson-article h2, +.lesson-article h3, +.lesson-article h4, +.lesson-article h5, +.lesson-article h6 { + color: var(--content-text); + margin-top: 1.6em; +} + +.lesson-article p { + margin: 1em 0; +} + +.lesson-attachments { + border-top: 1px solid var(--divider); + padding-top: 16px; +} + +.lesson-attachments h3 { + margin: 0 0 8px; + font-size: 1rem; + text-transform: uppercase; + letter-spacing: 0.08em; +} + +.lesson-attachments ul { + list-style: none; + margin: 0; + padding: 0; + display: flex; + flex-direction: column; + gap: 6px; +} + +.attachment-link { + display: inline-flex; + align-items: center; + gap: 8px; + font-size: 0.95rem; +} + +.attachment-link::before { + content: "⬇"; + font-size: 0.85rem; +} + +.lesson-unavailable { + padding: 16px; + border-radius: 8px; + background: rgba(56, 189, 248, 0.15); +} + +@media (max-width: 960px) { + .viewer { + flex-direction: column; + } + + .sidebar { + width: 100%; + max-height: 320px; + overflow-y: auto; + border-bottom: 1px solid rgba(148, 163, 184, 0.24); + } + + .content-pane { + padding: 24px 20px; + } +} diff --git a/thinkific_downloader/static/viewer.js b/thinkific_downloader/static/viewer.js new file mode 100644 index 0000000..97620d5 --- /dev/null +++ b/thinkific_downloader/static/viewer.js @@ -0,0 +1,183 @@ +(function () { + var LESSON_HASH_PREFIX = "lesson-"; + + function onReady(fn) { + if (document.readyState === "loading") { + document.addEventListener("DOMContentLoaded", fn, { once: true }); + } else { + fn(); + } + } + + function 
initViewer() {
    // Cache the three structural elements the viewer depends on; bail out
    // quietly if this page was not generated by the site generator.
    var container = document.getElementById("lesson-container");
    var sidebar = document.querySelector(".sidebar-nav");
    var templatesRoot = document.getElementById("lesson-templates");

    if (!container || !sidebar || !templatesRoot) {
      return;
    }

    // Marker attribute so external code/tests can detect the viewer booted.
    container.setAttribute("data-viewer-initialised", "true");

    var lessonButtons = Array.prototype.slice.call(
      sidebar.querySelectorAll(".lesson-link")
    );
    if (!lessonButtons.length) {
      return;
    }

    // Single piece of mutable state: which lesson is currently rendered.
    var state = {
      currentLessonId: null,
    };

    // Each lesson's markup lives in a <template id="lesson-template-{id}">.
    function getTemplateForLesson(lessonId) {
      return document.getElementById("lesson-template-" + lessonId);
    }

    // Parse "#lesson-{id}" from a location hash; null when it doesn't match.
    function getHashLessonId(hash) {
      if (!hash) return null;
      var value = hash.charAt(0) === "#" ? hash.slice(1) : hash;
      if (value.indexOf(LESSON_HASH_PREFIX) !== 0) return null;
      return value.slice(LESSON_HASH_PREFIX.length);
    }

    // Reflect the active lesson in the URL hash. Skipped when the change
    // originated FROM the hash (avoids feedback with the hashchange listener).
    // replaceState is preferred so lesson switches don't pollute history.
    function updateHash(lessonId, fromHash) {
      if (fromHash) return;
      var targetHash = "#" + LESSON_HASH_PREFIX + lessonId;
      if (window.history && window.history.replaceState) {
        window.history.replaceState(null, "", targetHash);
      } else {
        window.location.hash = targetHash;
      }
    }

    // Stop and rewind any playing media before swapping lesson content.
    function pauseActiveMedia() {
      var media = container.querySelectorAll("video, audio");
      for (var i = 0; i < media.length; i += 1) {
        try {
          media[i].pause();
          media[i].currentTime = 0;
        } catch (err) {
          /* noop */
        }
      }
    }

    // Highlight the sidebar button for lessonId and mark it for assistive
    // technology via aria-current.
    function setActiveButton(lessonId) {
      lessonButtons.forEach(function (button) {
        var isActive = button.dataset.lessonId === String(lessonId);
        button.classList.toggle("is-active", isActive);
        if (isActive) {
          button.setAttribute("aria-current", "page");
        } else {
          button.removeAttribute("aria-current");
        }
      });
    }

    // Move keyboard focus to the freshly rendered lesson. The heading gets a
    // temporary tabindex so it can receive focus, then the attribute is removed.
    function focusLessonContent() {
      container.focus({ preventScroll: true });
      container.scrollTop = 0;
      var heading = container.querySelector(".lesson-title");
      if (heading) {
        heading.setAttribute("tabindex", "-1");
        heading.focus({ preventScroll: true });
        heading.removeAttribute("tabindex");
      }
    }

    // Render lessonId into the container by cloning its <template>.
    // options.force re-renders even if already current (used on hash
    // navigation); options.fromHash suppresses the hash write-back.
    function renderLesson(lessonId, options) {
      if (!lessonId) return;
      if (!options || !options.force) {
        if (state.currentLessonId === lessonId) return;
      }

      var template = getTemplateForLesson(lessonId);
      if (!template || !("content" in template)) {
        console.warn("Missing template for lesson", lessonId);
        return;
      }

      // Order matters: stop media, clear, then insert the cloned fragment.
      pauseActiveMedia();
      container.innerHTML = "";
      container.appendChild(template.content.cloneNode(true));

      state.currentLessonId = lessonId;
      setActiveButton(lessonId);
      focusLessonContent();
      updateHash(lessonId, options && options.fromHash);
    }

    function handleLessonClick(event) {
      var lessonId = event.currentTarget.dataset.lessonId;
      renderLesson(lessonId);
    }

    // Roving keyboard navigation over the sidebar buttons:
    // arrows wrap around, Home/End jump, Space/Enter activate.
    function handleLessonKeydown(event) {
      var key = event.key;
      var currentIndex = lessonButtons.indexOf(event.currentTarget);
      if (key === "ArrowDown") {
        event.preventDefault();
        // Falls back to the first button when at the end (wrap-around).
        var next = lessonButtons[currentIndex + 1] || lessonButtons[0];
        next.focus();
        return;
      }
      if (key === "ArrowUp") {
        event.preventDefault();
        // Index -1 yields undefined, so this wraps to the last button.
        var prev =
          lessonButtons[currentIndex - 1] ||
          lessonButtons[lessonButtons.length - 1];
        prev.focus();
        return;
      }
      if (key === "Home") {
        event.preventDefault();
        lessonButtons[0].focus();
        return;
      }
      if (key === "End") {
        event.preventDefault();
        lessonButtons[lessonButtons.length - 1].focus();
        return;
      }
      if (key === " " || key === "Enter") {
        event.preventDefault();
        var lessonId = event.currentTarget.dataset.lessonId;
        renderLesson(lessonId);
      }
    }

    function bindEvents() {
      lessonButtons.forEach(function (button) {
        button.addEventListener("click", handleLessonClick);
        button.addEventListener("keydown", handleLessonKeydown);
      });

      // Back/forward or manually edited hashes re-render the target lesson.
      window.addEventListener("hashchange", function () {
        var lessonId = getHashLessonId(window.location.hash);
        if (lessonId) {
          renderLesson(lessonId, { fromHash: true, force: true });
        }
      });
    }

    function
initInitialLesson() { + var hashLessonId = getHashLessonId(window.location.hash); + var activeButton = lessonButtons[0]; + for (var i = 0; i < lessonButtons.length; i += 1) { + if (lessonButtons[i].classList.contains("is-active")) { + activeButton = lessonButtons[i]; + break; + } + } + var initialId = hashLessonId || (activeButton && activeButton.dataset.lessonId); + renderLesson(initialId, { fromHash: true, force: true }); + } + + bindEvents(); + initInitialLesson(); + } + + onReady(initViewer); +})(); diff --git a/thinkific_downloader/templates/base.html b/thinkific_downloader/templates/base.html new file mode 100644 index 0000000..4986e9d --- /dev/null +++ b/thinkific_downloader/templates/base.html @@ -0,0 +1,28 @@ + + + + + $title + + + + +
    + +
    + $initial_lesson +
    +
    + $lesson_templates + + + + diff --git a/thinkific_downloader/templates/lesson.html b/thinkific_downloader/templates/lesson.html new file mode 100644 index 0000000..b0a121f --- /dev/null +++ b/thinkific_downloader/templates/lesson.html @@ -0,0 +1,10 @@ +
    +
    +

    $lesson_title

    + $lesson_meta +
    +
    + $lesson_body +
    + $attachments +
    diff --git a/thinkific_downloader/wistia_downloader.py b/thinkific_downloader/wistia_downloader.py index f53ae65..6421f86 100644 --- a/thinkific_downloader/wistia_downloader.py +++ b/thinkific_downloader/wistia_downloader.py @@ -1,12 +1,14 @@ import json +import os import re -import requests import zlib -from typing import Optional, List from pathlib import Path -import os +from typing import Any, Dict, Iterable, List, Optional +from urllib.parse import urlparse + +import requests + from .file_utils import filter_filename -from .download_manager import DownloadManager # Local imports inside functions to avoid circular dependency during module import # Handles video proxy and wistia direct downloads @@ -14,6 +16,227 @@ WISTIA_JSON_URL = "https://fast.wistia.com/embed/medias/{id}.json" VIDEO_PROXY_JSONP_ID_PATTERN = re.compile(r"medias/(\w+)\.jsonp") +DEFAULT_SUBTITLE_EXTENSION = "vtt" +_LANGUAGE_SANITIZE_PATTERN = re.compile(r'[^A-Za-z0-9\-]+') + + +def _normalize_wistia_track_url(url: Optional[str]) -> Optional[str]: + """Normalize Wistia caption track URLs to absolute HTTPS URLs.""" + if not url or not isinstance(url, str): + return None + + normalized = url.strip() + if not normalized: + return None + + if normalized.startswith('//'): + normalized = f"https:{normalized}" + elif normalized.startswith('/'): + normalized = f"https://fast.wistia.com{normalized}" + elif not re.match(r'^https?://', normalized, re.IGNORECASE): + normalized = f"https://fast.wistia.com/{normalized.lstrip('/')}" + + return normalized + + +def _build_caption_url(hashed_id: Optional[str], language: Optional[str], extension: Optional[str] = None) -> Optional[str]: + """Construct a Wistia caption URL when only hashedId and language are available.""" + if not hashed_id or not language: + return None + + ext = (extension or DEFAULT_SUBTITLE_EXTENSION).lstrip('.') or DEFAULT_SUBTITLE_EXTENSION + return f"https://fast.wistia.com/embed/captions/{hashed_id}.{ext}?language={language}" + + +def 
_infer_track_extension(url: str, fallback: str = DEFAULT_SUBTITLE_EXTENSION) -> str: + """Infer file extension from track URL.""" + try: + parsed = urlparse(url) + suffix = Path(parsed.path).suffix + if suffix: + return suffix.lstrip('.').lower() or fallback + except (AttributeError, TypeError): + pass + return fallback + + +def extract_wistia_subtitle_tracks(media: Dict[str, Any]) -> List[Dict[str, Optional[str]]]: + """Extract subtitle/caption track metadata from Wistia media JSON.""" + if not isinstance(media, dict): + return [] + + hashed_id = media.get('hashedId') or media.get('hashed_id') + tracks: List[Dict[str, Optional[str]]] = [] + + def add_track(url: Optional[str], language: Optional[str], label: Optional[str], ext: Optional[str]): + normalized = _normalize_wistia_track_url(url) + if not normalized and hashed_id and language: + normalized = _build_caption_url(hashed_id, language, ext) + if not normalized: + return + tracks.append({ + 'url': normalized, + 'language': language, + 'label': label, + 'ext': (ext or '').lstrip('.') or None + }) + + def collect_from_captions(caption_items: Optional[Iterable[Dict[str, Any]]]): + for track in caption_items or []: + if not isinstance(track, dict): + continue + add_track( + track.get('url') or track.get('src'), + track.get('language') or track.get('lang'), + track.get('languageName') or track.get('label') or track.get('name'), + track.get('ext') + ) + + def collect_from_text_tracks(track_items: Optional[Iterable[Dict[str, Any]]], label_keys: Iterable[str]): + label_key_order = tuple(label_keys) + for track in track_items or []: + if not isinstance(track, dict): + continue + language = track.get('language') or track.get('lang') + label = next((track.get(key) for key in label_key_order if track.get(key)), None) + sources = track.get('sources') or [] + if sources: + for source in sources: + if not isinstance(source, dict): + continue + add_track( + source.get('url') or source.get('src'), + language, + label, + 
source.get('ext') or track.get('ext') + ) + else: + add_track( + track.get('url') or track.get('src'), + language, + label, + track.get('ext') + ) + + def collect_from_assets(asset_items: Optional[Iterable[Dict[str, Any]]]): + subtitle_flags = {'caption', 'captions', 'subtitle', 'subtitles'} + for asset in asset_items or []: + if not isinstance(asset, dict): + continue + asset_type = (asset.get('type') or '').lower() + asset_kind = (asset.get('kind') or '').lower() + if asset_type in subtitle_flags or asset_kind in subtitle_flags: + add_track( + asset.get('url') or asset.get('src'), + asset.get('language') or asset.get('lang'), + asset.get('display_name') or asset.get('name'), + asset.get('ext') + ) + + def collect_from_transcripts(transcripts: Optional[Iterable[Dict[str, Any]]]): + if not hashed_id: + return + for transcript in transcripts or []: + if not isinstance(transcript, dict) or not transcript.get('hasCaptions'): + continue + language = ( + transcript.get('language') + or transcript.get('wistiaLanguageCode') + or transcript.get('bcp47LanguageTag') + ) + if not language: + continue + add_track( + _build_caption_url(hashed_id, language, DEFAULT_SUBTITLE_EXTENSION), + language, + transcript.get('name') or transcript.get('familyName') or language, + DEFAULT_SUBTITLE_EXTENSION + ) + + collect_from_captions(media.get('captions')) + collect_from_text_tracks(media.get('text_tracks'), ('name', 'label')) + collect_from_text_tracks(media.get('textTracks'), ('name', 'label', 'title')) + collect_from_assets(media.get('assets')) + collect_from_transcripts(media.get('availableTranscripts')) + + unique_tracks: Dict[str, Dict[str, Optional[str]]] = {} + for track in tracks: + url = track['url'] + if not url: + continue + if url not in unique_tracks: + unique_tracks[url] = track + else: + existing = unique_tracks[url] + # Prefer track data that includes language/label/ext + if not existing.get('language') and track.get('language'): + existing['language'] = track['language'] 
+ if not existing.get('label') and track.get('label'): + existing['label'] = track['label'] + if not existing.get('ext') and track.get('ext'): + existing['ext'] = track['ext'] + + return list(unique_tracks.values()) + + +def build_wistia_subtitle_tasks( + media: Dict[str, Any], + dest_dir: Path, + video_base_name: str, + settings: Optional[Any] = None, +) -> List[Dict[str, Any]]: + """Construct subtitle download task dicts for a Wistia media object.""" + if not isinstance(dest_dir, Path): + dest_dir = Path(dest_dir) + + if settings and not getattr(settings, 'subtitle_download_enabled', True): + return [] + + tracks = extract_wistia_subtitle_tracks(media) + if not tracks: + return [] + + base_name = Path(video_base_name).stem + if not base_name: + fallback_name = media.get('name') or media.get('hashedId') or 'captions' + base_name = filter_filename(str(fallback_name)) + else: + base_name = filter_filename(base_name) + + if not base_name: + base_name = "captions" + + tasks: List[Dict[str, Any]] = [] + counter = 1 + for track in tracks: + url = track.get('url') + if not url: + continue + + ext = (track.get('ext') or _infer_track_extension(url)).lstrip('.').lower() or DEFAULT_SUBTITLE_EXTENSION + language_raw = track.get('language') or track.get('label') + if isinstance(language_raw, str): + language_part = _LANGUAGE_SANITIZE_PATTERN.sub('-', language_raw).strip('-') + else: + language_part = '' + + if not language_part: + language_part = 'captions' if counter == 1 else f"captions-{counter}" + + subtitle_filename = filter_filename(f"{base_name}.{language_part}.{ext}") + if not subtitle_filename: + subtitle_filename = filter_filename(f"{base_name}.captions-{counter}.{ext}") + + tasks.append({ + 'url': url, + 'dest_path': dest_dir / subtitle_filename, + 'content_type': 'subtitle', + 'label': track.get('label'), + 'language': track.get('language'), + }) + counter += 1 + + return tasks def video_downloader_videoproxy(video_url: str, file_name: str, quality: str = "720p"): 
@@ -143,6 +366,7 @@ def infer_ext(asset: dict) -> str: return '.mp4' resolved_base = filter_filename(file_name if file_name else media.get('name') or wistia_id) + current_dir = Path.cwd() if all_formats_flag: print(f"Downloading all available Wistia assets for {resolved_base}") @@ -172,6 +396,11 @@ def infer_ext(asset: dict) -> str: DOWNLOAD_MANAGER.download_file(a_url, Path(filter_filename(out_name))) else: print("Download manager not initialized") + from .downloader import SETTINGS, add_download_task + subtitle_tasks = build_wistia_subtitle_tasks(media, current_dir, resolved_base, SETTINGS) + for task in subtitle_tasks: + print(f" [Subs] Queued subtitles: {task['dest_path'].name}") + add_download_task(task['url'], task['dest_path'], task.get('content_type', 'subtitle')) return # Single quality path @@ -199,7 +428,10 @@ def infer_ext(asset: dict) -> str: print(f"URL : {video_url}\nFile Name : {resolved_name}") # Queue video for parallel download with absolute path to current directory - from .downloader import add_download_task - current_dir = Path.cwd() # Capture current working directory + from .downloader import SETTINGS, add_download_task full_path = current_dir / resolved_name # Create absolute path add_download_task(video_url, full_path, "video") + subtitle_tasks = build_wistia_subtitle_tasks(media, current_dir, resolved_name, SETTINGS) + for task in subtitle_tasks: + print(f" [Subs] Queued subtitles: {task['dest_path'].name}") + add_download_task(task['url'], task['dest_path'], task.get('content_type', 'subtitle'))