Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@ RESUME_PARTIAL=true
# Enable detailed logging for troubleshooting
DEBUG=false

# Download subtitles/captions when available (default: true)
SUBTITLE_DOWNLOAD_ENABLED=true

# ===============================================
# ADVANCED SETTINGS
# ===============================================
Expand All @@ -83,4 +86,4 @@ COURSE_DATA_FILE=""
# ALL_VIDEO_FORMATS=false

# Log level (DEBUG, INFO, WARNING, ERROR)
# LOG_LEVEL="INFO"
# LOG_LEVEL="INFO"
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ A modern, feature-rich Python utility to download courses from Thinkific platfor
| 📄 **HTML Content** | ✅ Full | `downloader.py` | Clean extraction, formatting |
| 📚 **PDF Documents** | ✅ Full | `downloader.py` | Direct download, validation |
| 🎵 **Audio Files** | ✅ Full | `downloader.py` | MP3, M4A support |
| 📝 **Subtitles (Wistia)** | ✅ Full | `wistia_downloader.py` | Multi-language caption downloads |
| 🎯 **Quizzes** | ✅ Basic | `downloader.py` | Structure extraction |
| 🎨 **Presentations** | ✅ Full | FFmpeg merge | Multi-slide processing |

Expand All @@ -70,6 +71,7 @@ A modern, feature-rich Python utility to download courses from Thinkific platfor
- **Resume Support** - Skip existing files, continue interrupted downloads
- **Atomic Resume/Backup** - Status file is always safely backed up and updated; works on Windows, Mac, and Linux
- **Multiple Quality Options** - Choose video quality (720p, 1080p, etc.)
- **Subtitle Downloads** - Automatically grab Wistia caption tracks in multiple languages
- **Comprehensive Logging** - Debug mode for troubleshooting

### 🛡️ **Safety & Compliance**
Expand Down Expand Up @@ -201,6 +203,7 @@ RATE_LIMIT_MB_S= # Rate limit in MB/s (empty = unlimited)
VALIDATE_DOWNLOADS=true # Enable file integrity validation
RESUME_PARTIAL=true # Enable resume for partial downloads
DEBUG=false # Enable debug logging
SUBTITLE_DOWNLOAD_ENABLED=true # Download subtitles/captions when available

# ===============================================
# ADVANCED SETTINGS
Expand Down
5 changes: 4 additions & 1 deletion thinkific_downloader/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ class Settings:
resume_partial: bool = True
debug: bool = False
course_name: str = "Course"
subtitle_download_enabled: bool = True

@classmethod
def from_env(cls):
Expand Down Expand Up @@ -67,6 +68,7 @@ def from_env(cls):
validate_downloads = os.getenv('VALIDATE_DOWNLOADS', 'true').lower() in ('1', 'true', 'yes', 'on')
resume_partial = os.getenv('RESUME_PARTIAL', 'true').lower() in ('1', 'true', 'yes', 'on')
debug = os.getenv('DEBUG', 'false').lower() in ('1', 'true', 'yes', 'on')
subtitle_download_enabled = os.getenv('SUBTITLE_DOWNLOAD_ENABLED', 'true').lower() in ('1', 'true', 'yes', 'on')

# Clean cookie data to remove Unicode characters that cause encoding issues
if cookie_data:
Expand Down Expand Up @@ -101,5 +103,6 @@ def from_env(cls):
download_delay=download_delay,
validate_downloads=validate_downloads,
resume_partial=resume_partial,
debug=debug
debug=debug,
subtitle_download_enabled=subtitle_download_enabled
)
29 changes: 27 additions & 2 deletions thinkific_downloader/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,9 @@ def download_file_chunked(src_url: str, dst_name: str, chunk_mb: int = 1):
def init_course(data: Dict[str, Any]):
"""Initialize course structure and collect ALL download tasks first."""
global COURSE_CONTENTS, ROOT_PROJECT_DIR, BASE_HOST, DOWNLOAD_TASKS

# Ensure settings/download manager are initialized so feature flags are available
init_settings()

# Initialize download tasks list
DOWNLOAD_TASKS = []
Expand Down Expand Up @@ -417,6 +420,21 @@ def init_course(data: Dict[str, Any]):
analyzed_chapters = set(cache_data.get('analyzed_chapters', []))
saved_tasks = cache_data.get('download_tasks', [])
print(f"📋 Found previous progress: {len(analyzed_chapters)} chapters analyzed, {len(saved_tasks)} tasks cached")
# If subtitle downloads are enabled but cached tasks do not contain subtitles,
# treat cache as outdated so we can regenerate tasks with captions.
if SETTINGS and SETTINGS.subtitle_download_enabled and saved_tasks:
has_subtitle_tasks = any(
(task.get('content_type') or '').lower() == 'subtitle'
for task in saved_tasks
)
if not has_subtitle_tasks:
print("🆕 Subtitle support enabled — refreshing cached analysis to include captions.")
analyzed_chapters = set()
saved_tasks = []
try:
cache_file.unlink()
except Exception:
pass
except:
analyzed_chapters = set()
saved_tasks = []
Expand Down Expand Up @@ -835,9 +853,16 @@ def collect_video_task_wistia(wistia_id: str, file_name: str, dest_dir: Path):
video_url = selected.get('url')
if video_url:
ext = '.mp4' # Default extension
resolved_name = filter_filename(file_name) + ext
resolved_name = filter_filename(file_name)
if not resolved_name.lower().endswith(ext):
resolved_name += ext
print(f" 📹 Found video: {resolved_name}")
add_download_task(video_url, dest_dir / resolved_name, "video")
try:
from .wistia_downloader import queue_wistia_subtitle_downloads
queue_wistia_subtitle_downloads(data.get('media') or {}, dest_dir, resolved_name)
except Exception as subtitle_error:
print(f" ⚠️ Unable to queue subtitles for {resolved_name}: {subtitle_error}")
except Exception as e:
print(f" ❌ Failed to collect Wistia video {wistia_id}: {e}")

Expand Down Expand Up @@ -1282,4 +1307,4 @@ def main(argv: List[str]):


if __name__ == '__main__':
main(sys.argv)
main(sys.argv)
226 changes: 221 additions & 5 deletions thinkific_downloader/wistia_downloader.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,233 @@
import json
import os
import re
import requests
import zlib
from typing import Optional, List
from pathlib import Path
import os
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse

import requests

from .file_utils import filter_filename
from .download_manager import DownloadManager
# Local imports inside functions to avoid circular dependency during module import

# Handles video proxy and wistia direct downloads

# URL template for a Wistia media JSON document; ``{id}`` is the placeholder
# to be filled in with the media id.
WISTIA_JSON_URL = "https://fast.wistia.com/embed/medias/{id}.json"

# Captures the run of word characters between "medias/" and ".jsonp".
VIDEO_PROXY_JSONP_ID_PATTERN = re.compile(r"medias/(\w+)\.jsonp")
# Extension (without the dot) used when a caption track does not declare one
# and none can be inferred from its URL.
DEFAULT_SUBTITLE_EXTENSION = "vtt"
# Matches runs of characters other than ASCII letters, digits and hyphens;
# used to replace such runs with "-" when building subtitle file names.
_LANGUAGE_SANITIZE_PATTERN = re.compile(r'[^A-Za-z0-9\-]+')


def _normalize_wistia_track_url(url: Optional[str]) -> Optional[str]:
"""Normalize Wistia caption track URLs to absolute HTTPS URLs."""
if not url or not isinstance(url, str):
return None

normalized = url.strip()
if not normalized:
return None

if normalized.startswith('//'):
normalized = f"https:{normalized}"
elif normalized.startswith('/'):
normalized = f"https://fast.wistia.com{normalized}"
elif not re.match(r'^https?://', normalized, re.IGNORECASE):
normalized = f"https://fast.wistia.com/{normalized.lstrip('/')}"

return normalized


def _build_caption_url(hashed_id: Optional[str], language: Optional[str], extension: Optional[str] = None) -> Optional[str]:
"""Construct a Wistia caption URL when only hashedId and language are available."""
if not hashed_id or not language:
return None

ext = (extension or DEFAULT_SUBTITLE_EXTENSION).lstrip('.') or DEFAULT_SUBTITLE_EXTENSION
return f"https://fast.wistia.com/embed/captions/{hashed_id}.{ext}?language={language}"


def _infer_track_extension(url: str, fallback: str = DEFAULT_SUBTITLE_EXTENSION) -> str:
    """Guess a caption file extension from the path component of ``url``.

    Args:
        url: Track URL to inspect.
        fallback: Value returned when no extension can be determined.

    Returns:
        The lower-cased suffix of the URL path without its leading dot, or
        ``fallback`` when the path has no suffix or the URL cannot be parsed.
    """
    try:
        path_suffix = Path(urlparse(url).path).suffix
    except Exception:
        # Best effort: any parsing problem simply yields the fallback.
        return fallback

    extension = path_suffix.lstrip('.').lower()
    return extension if extension else fallback


def extract_wistia_subtitle_tracks(media: Dict[str, Any]) -> List[Dict[str, Optional[str]]]:
    """Collect caption/subtitle track descriptors from a Wistia media payload.

    The following keys of ``media`` are inspected, in this order:
    ``captions``, ``text_tracks``, ``textTracks``, caption/subtitle typed
    entries of ``assets``, and ``availableTranscripts`` (the latter only when
    a hashed id is present and the transcript reports ``hasCaptions``).

    Args:
        media: Media dictionary from the Wistia JSON document.

    Returns:
        A list of dicts with the keys ``url``, ``language``, ``label`` and
        ``ext``, de-duplicated by URL in first-seen order. When a URL occurs
        more than once, empty ``language``/``label``/``ext`` fields of the
        first occurrence are filled in from later ones. Returns an empty list
        when ``media`` is not a dict.
    """
    if not isinstance(media, dict):
        return []

    hashed_id = media.get('hashedId') or media.get('hashed_id')
    collected: List[Dict[str, Optional[str]]] = []
    caption_kinds = ('caption', 'captions', 'subtitle', 'subtitles')

    def pick(record, *keys):
        # Equivalent to ``record.get(k1) or record.get(k2) or ...``: the first
        # truthy value wins, otherwise the last looked-up value is returned.
        value = None
        for key in keys:
            value = record.get(key)
            if value:
                break
        return value

    def register(url, language, label, ext):
        # Use the given URL when usable; otherwise try to synthesise one from
        # the media's hashed id and the track language.
        resolved = _normalize_wistia_track_url(url)
        if not resolved and hashed_id and language:
            resolved = _build_caption_url(hashed_id, language, ext)
        if resolved:
            collected.append({
                'url': resolved,
                'language': language,
                'label': label,
                'ext': (ext or '').lstrip('.') or None,
            })

    def register_text_tracks(key, label_keys):
        # A text track either lists per-source URLs or carries the URL itself.
        for entry in media.get(key) or []:
            if not isinstance(entry, dict):
                continue
            language = pick(entry, 'language', 'lang')
            label = pick(entry, *label_keys)
            sources = entry.get('sources') or []
            if not sources:
                register(pick(entry, 'url', 'src'), language, label, entry.get('ext'))
                continue
            for source in sources:
                if isinstance(source, dict):
                    register(
                        pick(source, 'url', 'src'),
                        language,
                        label,
                        source.get('ext') or entry.get('ext'),
                    )

    for entry in media.get('captions') or []:
        if isinstance(entry, dict):
            register(
                pick(entry, 'url', 'src'),
                pick(entry, 'language', 'lang'),
                pick(entry, 'languageName', 'label', 'name'),
                entry.get('ext'),
            )

    register_text_tracks('text_tracks', ('name', 'label'))
    register_text_tracks('textTracks', ('name', 'label', 'title'))

    for asset in media.get('assets') or []:
        if not isinstance(asset, dict):
            continue
        asset_type = (asset.get('type') or '').lower()
        asset_kind = (asset.get('kind') or '').lower()
        if asset_type in caption_kinds or asset_kind in caption_kinds:
            register(
                pick(asset, 'url', 'src'),
                pick(asset, 'language', 'lang'),
                pick(asset, 'display_name', 'name'),
                asset.get('ext'),
            )

    transcripts = media.get('availableTranscripts') or []
    if hashed_id and transcripts:
        for transcript in transcripts:
            if not (isinstance(transcript, dict) and transcript.get('hasCaptions')):
                continue
            language = pick(transcript, 'language', 'wistiaLanguageCode', 'bcp47LanguageTag')
            if not language:
                continue
            register(
                _build_caption_url(hashed_id, language, DEFAULT_SUBTITLE_EXTENSION),
                language,
                pick(transcript, 'name', 'familyName') or language,
                DEFAULT_SUBTITLE_EXTENSION,
            )

    # De-duplicate by URL, keeping the first occurrence and back-filling any
    # metadata it lacks from later duplicates.
    by_url: Dict[str, Dict[str, Optional[str]]] = {}
    for candidate in collected:
        track_url = candidate['url']
        if not track_url:
            continue
        kept = by_url.setdefault(track_url, candidate)
        if kept is candidate:
            continue
        for field in ('language', 'label', 'ext'):
            if not kept.get(field) and candidate.get(field):
                kept[field] = candidate[field]

    return list(by_url.values())


def queue_wistia_subtitle_downloads(media: Dict[str, Any], dest_dir: Path, video_base_name: str):
"""Queue subtitle download tasks for a Wistia media object."""
from .downloader import SETTINGS, add_download_task, init_settings

if not isinstance(dest_dir, Path):
dest_dir = Path(dest_dir)

init_settings()
settings = SETTINGS
if settings and hasattr(settings, 'subtitle_download_enabled') and not settings.subtitle_download_enabled:
return

tracks = extract_wistia_subtitle_tracks(media)
if not tracks:
return

base_name = Path(video_base_name).stem
if not base_name:
fallback_name = media.get('name') or media.get('hashedId') or 'captions'
base_name = filter_filename(str(fallback_name))
else:
base_name = filter_filename(base_name)

if not base_name:
base_name = "captions"

counter = 1
for track in tracks:
url = track.get('url')
if not url:
continue

ext = (track.get('ext') or _infer_track_extension(url)).lstrip('.').lower() or DEFAULT_SUBTITLE_EXTENSION
language_part = track.get('language') or track.get('label') or ''
if isinstance(language_part, (list, dict)):
language_part = ''
language_part = str(language_part or '')

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

This block of code to safely get and convert language_part to a string can be simplified. You can achieve the same goal more concisely by checking the type and using a conditional expression, which also handles edge cases like None or non-string types more cleanly.

        language_part_raw = track.get('language') or track.get('label')
        language_part = str(language_part_raw) if isinstance(language_part_raw, str) else ''

language_part = _LANGUAGE_SANITIZE_PATTERN.sub('-', language_part).strip('-')

if not language_part:
language_part = 'captions' if counter == 1 else f"captions-{counter}"

subtitle_filename = filter_filename(f"{base_name}.{language_part}.{ext}")
if not subtitle_filename:
subtitle_filename = filter_filename(f"{base_name}.captions-{counter}.{ext}")

print(f" [Subs] Queued subtitles: {subtitle_filename}")
add_download_task(url, dest_dir / subtitle_filename, "subtitle")
counter += 1

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

This function imports from downloader.py, which in turn imports from wistia_downloader.py, creating a circular dependency. While using local imports makes this work at runtime, it's a code smell that makes the modules tightly coupled and harder to maintain and test.

A better approach would be to refactor this function to not have side effects in another module. Instead of calling add_download_task directly, it could return a list of subtitle download tasks. The caller in downloader.py would then be responsible for adding them to the download queue. This is an example of Inversion of Control.

This would involve:

  1. Renaming queue_wistia_subtitle_downloads to something like get_wistia_subtitle_tasks and making it return a list of task dictionaries.
  2. Passing the settings object to it to check subtitle_download_enabled.
  3. Updating the call sites in downloader.py and wistia_downloader.py to handle the returned list of tasks.

This change would break the dependency from wistia_downloader.py to downloader.py, improving the overall module architecture.



def video_downloader_videoproxy(video_url: str, file_name: str, quality: str = "720p"):
Expand Down Expand Up @@ -143,6 +357,7 @@ def infer_ext(asset: dict) -> str:
return '.mp4'

resolved_base = filter_filename(file_name if file_name else media.get('name') or wistia_id)
current_dir = Path.cwd()

if all_formats_flag:
print(f"Downloading all available Wistia assets for {resolved_base}")
Expand Down Expand Up @@ -172,6 +387,7 @@ def infer_ext(asset: dict) -> str:
DOWNLOAD_MANAGER.download_file(a_url, Path(filter_filename(out_name)))
else:
print("Download manager not initialized")
queue_wistia_subtitle_downloads(media, current_dir, resolved_base)
return

# Single quality path
Expand Down Expand Up @@ -200,6 +416,6 @@ def infer_ext(asset: dict) -> str:

# Queue video for parallel download with absolute path to current directory
from .downloader import add_download_task
current_dir = Path.cwd() # Capture current working directory
full_path = current_dir / resolved_name # Create absolute path
add_download_task(video_url, full_path, "video")
queue_wistia_subtitle_downloads(media, current_dir, resolved_name)