44==================
55Automates transcription of speeches from video URLs.
66
7- Pipeline:
8- 1. Download audio, video, and captions (yt-dlp)
9- 2. Transcribe audio (mlx-whisper or openai-whisper)
10- 3. Extract slides via scene detection (ffmpeg)
11- 4. Optionally analyze slides with vision API (Claude)
12- 4b. Optionally merge YouTube captions + Whisper into "critical text" (wdiff + Claude)
13- 5. Generate markdown with slides interleaved at correct timestamps
7+ Pipeline steps:
8+ download - Download audio, video, and captions (yt-dlp)
9+ transcribe - Transcribe audio (mlx-whisper or openai-whisper)
10+ ensemble - Adjudicate multiple Whisper transcripts (wdiff + LLM)
11+ diarize - Speaker diarization (pyannote, optional)
12+ slides - Extract and optionally analyze slides (ffmpeg + vision LLM)
13+ merge - Merge transcript sources (Whisper, captions, external) into critical text (wdiff + LLM)
14+ markdown - Generate markdown with slides interleaved at timestamps
15+ analysis - Analyze how much of each source transcript survived into the merged output
1416
1517Usage:
1618 transcribe-critic <url> [options]
5254from transcribe_critic .shared import (
5355 tprint as print ,
5456 SpeechConfig , SpeechData , is_up_to_date ,
57+ MODEL_SIZES ,
58+ DEFAULT_CLAUDE_MODEL , DEFAULT_LOCAL_MODEL , DEFAULT_OLLAMA_URL ,
59+ DEFAULT_WHISPER_MODELS ,
5560 AUDIO_MP3 , AUDIO_WAV , CAPTIONS_VTT , WHISPER_MERGED_TXT ,
5661 DIARIZATION_JSON , DIARIZED_TXT , TRANSCRIPT_MERGED_TXT ,
5762 ANALYSIS_MD , SLIDE_TIMESTAMPS_JSON ,
5863 run_command , _print_reusing , _dry_run_skip , _should_skip ,
59- _collect_source_paths , check_dependencies ,
64+ _collect_source_paths , _is_url , check_dependencies ,
6065)
6166
6267SECTION_SEPARATOR = "=" * 50
@@ -198,7 +203,7 @@ def _load_external_transcript(config: SpeechConfig) -> tuple:
198203 """
199204 source = config .external_transcript
200205 source_label = source
201- if source . startswith (( "http://" , "https://" ) ):
206+ if _is_url ( source ):
202207 print (f" Fetching external transcript from URL..." )
203208 import urllib .request
204209 try :
@@ -306,7 +311,7 @@ def print_cost_estimate(config: SpeechConfig, num_slides: int = 45, transcript_w
306311def merge_transcript_sources (config : SpeechConfig , data : SpeechData ) -> None :
307312 """Merge transcript sources (Whisper, captions, external) using wdiff alignment and LLM adjudication."""
308313 print ()
309- print ("[4b ] Merging transcript sources..." )
314+ print ("[merge ] Merging transcript sources..." )
310315
311316 if not config .merge_sources :
312317 print (" Skipped (--no-merge flag set)" )
@@ -451,7 +456,7 @@ def _strip_structured_headers(text: str) -> str:
451456def analyze_source_survival (config : SpeechConfig , data : SpeechData ) -> None :
452457 """Analyze how much of each source transcript survived into the merged output."""
453458 print ()
454- print ("[6 ] Analyzing source survival..." )
459+ print ("[analysis ] Analyzing source survival..." )
455460
456461 merged_path = config .output_dir / TRANSCRIPT_MERGED_TXT
457462 analysis_path = config .output_dir / ANALYSIS_MD
@@ -595,9 +600,10 @@ def main():
595600
596601 # Whisper
597602 whisper_group = parser .add_argument_group ("whisper" )
598- whisper_group .add_argument ("--whisper-models" , default = "small,medium,distil-large-v3" ,
599- help = "Whisper model(s) to use, comma-separated (default: small,medium,distil-large-v3). "
600- "Options: tiny, base, small, medium, large, distil-large-v3. "
603+ _default_whisper = "," .join (DEFAULT_WHISPER_MODELS )
604+ whisper_group .add_argument ("--whisper-models" , default = _default_whisper ,
605+ help = f"Whisper model(s) to use, comma-separated (default: { _default_whisper } ). "
606+ f"Options: { ', ' .join (MODEL_SIZES )} . "
601607 "Multiple models enables ensembling for better accuracy" )
602608
603609 # Slides
@@ -617,12 +623,12 @@ def main():
617623 help = "Use Anthropic Claude API instead of local Ollama (requires API key)" )
618624 llm_group .add_argument ("--api-key" ,
619625 help = "Anthropic API key (or set ANTHROPIC_API_KEY env var; implies --api)" )
620- llm_group .add_argument ("--claude-model" , default = "claude-sonnet-4-20250514" ,
621- help = "Claude model for API calls (default: claude-sonnet-4-20250514 )" )
622- llm_group .add_argument ("--local-model" , default = "qwen2.5:14b" ,
623- help = "Ollama model for text tasks (default: qwen2.5:14b )" )
624- llm_group .add_argument ("--ollama-url" , default = "http://localhost:11434/v1/" ,
625- help = "Ollama server URL (default: http://localhost:11434/v1/ )" )
626+ llm_group .add_argument ("--claude-model" , default = DEFAULT_CLAUDE_MODEL ,
627+ help = f "Claude model for API calls (default: { DEFAULT_CLAUDE_MODEL } )" )
628+ llm_group .add_argument ("--local-model" , default = DEFAULT_LOCAL_MODEL ,
629+ help = f "Ollama model for text tasks (default: { DEFAULT_LOCAL_MODEL } )" )
630+ llm_group .add_argument ("--ollama-url" , default = DEFAULT_OLLAMA_URL ,
631+ help = f "Ollama server URL (default: { DEFAULT_OLLAMA_URL } )" )
626632 llm_group .add_argument ("--no-llm" , action = "store_true" ,
627633 help = "Skip all LLM-dependent features (merging, ensembling, slide analysis)" )
628634 llm_group .add_argument ("--no-merge" , action = "store_true" ,
@@ -696,11 +702,10 @@ def main():
696702
697703 # Parse whisper models (comma-separated)
698704 whisper_models = [m .strip () for m in args .whisper_models .split ("," )]
699- valid_models = ["tiny" , "base" , "small" , "medium" , "large" , "distil-large-v3" ]
700705 for m in whisper_models :
701- if m not in valid_models :
706+ if m not in MODEL_SIZES :
702707 print (f"Invalid Whisper model: { m } " )
703- print (f"Valid options: { ', ' .join (valid_models )} " )
708+ print (f"Valid options: { ', ' .join (MODEL_SIZES )} " )
704709 sys .exit (1 )
705710
706711 # Determine LLM backend: --api or --api-key switches to cloud API
@@ -749,7 +754,7 @@ def main():
749754
750755 # Validate external transcript source
751756 if config .external_transcript :
752- if config .external_transcript . startswith (( "http://" , "https://" ) ):
757+ if _is_url ( config .external_transcript ):
753758 import urllib .request
754759 try :
755760 req = urllib .request .Request (config .external_transcript , method = 'HEAD' )
0 commit comments