Skip to content

Commit 9b0cabe

Browse files
ringgerclaude authored and committed
DRY refactor and close test coverage gaps (60% → 80%)
Centralize constants (COMMON_WORDS, MLX_MODEL_MAP, DEFAULT_*) in shared.py, replace numbered pipeline stage labels with named ones, use _should_skip() consistently, and add 81 new tests covering transcriber, transcription, slides, and eval subpackage. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 8e82653 commit 9b0cabe

15 files changed

+1166
-164
lines changed

src/transcribe_critic/diarization.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def diarize_audio(config: SpeechConfig, data: SpeechData) -> None:
2929
return
3030

3131
print()
32-
print("[2b] Diarizing audio...")
32+
print("[diarize] Diarizing audio...")
3333

3434
diarization_json = config.output_dir / DIARIZATION_JSON
3535
diarized_txt = config.output_dir / DIARIZED_TXT

src/transcribe_critic/download.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,14 @@
1313
tprint as print,
1414
SpeechConfig, SpeechData,
1515
AUDIO_MP3, METADATA_JSON, CAPTIONS_VTT,
16-
run_command, _save_json, _print_reusing, _dry_run_skip,
16+
run_command, _save_json, _print_reusing, _dry_run_skip, _should_skip,
1717
)
1818

1919

2020
def download_media(config: SpeechConfig, data: SpeechData, info: dict = None) -> None:
2121
"""Download audio, video, and captions using yt-dlp."""
2222
print()
23-
print("[1] Downloading media...")
23+
print("[download] Downloading media...")
2424

2525
output_template = str(config.output_dir / "%(title)s.%(ext)s")
2626

@@ -67,9 +67,9 @@ def download_media(config: SpeechConfig, data: SpeechData, info: dict = None) ->
6767

6868
# Download audio
6969
audio_path = config.output_dir / AUDIO_MP3
70-
if config.skip_existing and audio_path.exists():
71-
_print_reusing(audio_path.name)
72-
elif not _dry_run_skip(config, "download audio", AUDIO_MP3):
70+
if _should_skip(config, audio_path, "download audio"):
71+
pass
72+
else:
7373
print(" Downloading audio...")
7474
run_command(
7575
["yt-dlp", "-x", "--audio-format", "mp3",
@@ -85,9 +85,9 @@ def download_media(config: SpeechConfig, data: SpeechData, info: dict = None) ->
8585
else " Skipping video download (--no-slides)")
8686
else:
8787
video_path = config.output_dir / "video.mp4"
88-
if config.skip_existing and video_path.exists():
89-
_print_reusing(video_path.name)
90-
elif not _dry_run_skip(config, "download video", "video.mp4"):
88+
if _should_skip(config, video_path, "download video"):
89+
pass
90+
else:
9191
print(" Downloading video...")
9292
run_command(
9393
["yt-dlp", "-f", "mp4",
@@ -101,9 +101,9 @@ def download_media(config: SpeechConfig, data: SpeechData, info: dict = None) ->
101101
captions_path = config.output_dir / CAPTIONS_VTT
102102
if config.podcast:
103103
print(" Skipping captions download (--podcast)")
104-
elif config.skip_existing and captions_path.exists():
105-
_print_reusing(captions_path.name)
106-
elif not _dry_run_skip(config, "download captions", CAPTIONS_VTT):
104+
elif _should_skip(config, captions_path, "download captions"):
105+
pass
106+
else:
107107
print(" Downloading captions (if available)...")
108108
try:
109109
run_command(

src/transcribe_critic/merge.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@
1313

1414
from transcribe_critic.shared import (
1515
tprint as print,
16-
SpeechConfig, create_llm_client, llm_call_with_retry, is_up_to_date, _save_json,
16+
SpeechConfig, COMMON_WORDS,
17+
create_llm_client, llm_call_with_retry, is_up_to_date, _save_json,
1718
)
1819

1920

@@ -182,21 +183,21 @@ def _analyze_differences_wdiff(text_a: str, text_b: str, config: SpeechConfig,
182183

183184
def _filter_meaningful_diffs(differences: list) -> list:
184185
"""Filter wdiff differences to only meaningful ones (skip common words)."""
185-
common_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'is', 'it'}
186186
meaningful_diffs = []
187187
for d in differences:
188188
if d["type"] == "changed":
189189
a_words = set(d["a_text"].lower().split())
190190
b_words = set(d["b_text"].lower().split())
191-
if not (a_words <= common_words and b_words <= common_words):
191+
if not (a_words <= COMMON_WORDS and b_words <= COMMON_WORDS):
192192
meaningful_diffs.append(d)
193193
else:
194194
text = d.get("text", "").lower()
195-
if text and text not in common_words:
195+
if text and text not in COMMON_WORDS:
196196
meaningful_diffs.append(d)
197197
return meaningful_diffs
198198

199199

200+
# Matches wdiff markup: [-deleted-], {+inserted+}, or common (unmarked) text.
200201
_WDIFF_TOKEN_PATTERN = re.compile(
201202
r'\[-(?P<deleted>.*?)-\]'
202203
r'|\{\+(?P<inserted>.*?)\+\}'

src/transcribe_critic/output.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
def generate_markdown(config: SpeechConfig, data: SpeechData) -> None:
2020
"""Generate markdown document with slides interleaved at correct timestamps."""
2121
print()
22-
print("[5] Generating markdown...")
22+
print("[markdown] Generating markdown...")
2323

2424
markdown_path = config.output_dir / TRANSCRIPT_MD
2525

src/transcribe_critic/shared.py

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -35,19 +35,40 @@ def tprint(*args, **kwargs):
3535
# Whisper model sizes in descending quality order (used for base-model selection)
3636
MODEL_SIZES = ["large", "distil-large-v3", "medium", "small", "base", "tiny"]
3737

38+
# Map model short names to mlx-community HuggingFace model IDs
39+
MLX_MODEL_MAP = {
40+
"distil-large-v3": "mlx-community/distil-whisper-large-v3",
41+
}
42+
43+
# Default LLM model names
44+
DEFAULT_CLAUDE_MODEL = "claude-sonnet-4-20250514"
45+
DEFAULT_LOCAL_MODEL = "qwen2.5:14b"
46+
DEFAULT_LOCAL_VISION_MODEL = "llava"
47+
DEFAULT_OLLAMA_URL = "http://localhost:11434/v1/"
48+
DEFAULT_WHISPER_MODELS = ["small", "medium", "distil-large-v3"]
49+
50+
# Common/stop words for filtering trivial diffs in ensembling and merging
51+
COMMON_WORDS = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at',
52+
'to', 'for', 'of', 'is', 'it', 'i', 'we', 'he', 'she',
53+
'they', 'you', 'my', 'your', 'his', 'her', 'its', 'our',
54+
'their', 'this', 'that', 'was', 'were', 'be', 'been',
55+
'has', 'have', 'had', 'do', 'does', 'did', 'will',
56+
'would', 'could', 'should', 'may', 'might', 'not', 'no',
57+
'so', 'if', 'then', 'than', 'just', 'also', 'very'}
58+
3859

3960
@dataclass
4061
class SpeechConfig:
4162
"""Configuration for speech transcription pipeline."""
4263
url: str
4364
output_dir: Path
44-
whisper_models: list = field(default_factory=lambda: ["small", "medium", "distil-large-v3"]) # Can be multiple models
65+
whisper_models: list = field(default_factory=lambda: list(DEFAULT_WHISPER_MODELS)) # Can be multiple models
4566
scene_threshold: float = 0.1
4667
analyze_slides: bool = False
4768
merge_sources: bool = True # Merge YouTube captions with Whisper (default: on)
4869
no_llm: bool = False # Skip all LLM-dependent features (merging, ensembling, slide analysis)
4970
api_key: Optional[str] = None
50-
claude_model: str = "claude-sonnet-4-20250514" # Anthropic API model; ignored when local=True (uses local_model)
71+
claude_model: str = DEFAULT_CLAUDE_MODEL # Anthropic API model; ignored when local=True (uses local_model)
5172
skip_existing: bool = True
5273
no_slides: bool = False # Skip slide extraction entirely
5374
podcast: bool = False # Podcast mode: audio-only, skip video + captions
@@ -67,9 +88,9 @@ class SpeechConfig:
6788
api_timeout: float = 120.0 # seconds per API attempt
6889
# Local LLM (default) vs cloud API
6990
local: bool = True # Use local Ollama by default
70-
local_model: str = "qwen2.5:14b" # Default Ollama model for text
71-
local_vision_model: str = "llava" # Default Ollama model for vision
72-
ollama_base_url: str = "http://localhost:11434/v1/"
91+
local_model: str = DEFAULT_LOCAL_MODEL # Default Ollama model for text
92+
local_vision_model: str = DEFAULT_LOCAL_VISION_MODEL # Default Ollama model for vision
93+
ollama_base_url: str = DEFAULT_OLLAMA_URL
7394

7495

7596
# Standard output filenames — single source of truth
@@ -109,6 +130,11 @@ class SpeechData:
109130
metadata: dict = field(default_factory=dict) # Source metadata (title, description, channel, etc.)
110131

111132

133+
def _is_url(s: str) -> bool:
134+
"""Check if a string looks like an HTTP(S) URL."""
135+
return s.startswith(("http://", "https://"))
136+
137+
112138
def is_up_to_date(output: Path, *inputs: Path) -> bool:
113139
"""Check if output file is newer than all input files (make-style)."""
114140
if not output.exists():
@@ -320,7 +346,7 @@ def _collect_source_paths(config: SpeechConfig, data: SpeechData,
320346
paths.append(data.transcript_path)
321347
if data.captions_path and data.captions_path.exists():
322348
paths.append(data.captions_path)
323-
if config.external_transcript and not config.external_transcript.startswith(("http://", "https://")):
349+
if config.external_transcript and not _is_url(config.external_transcript):
324350
ext_path = Path(config.external_transcript)
325351
if ext_path.exists():
326352
paths.append(ext_path)

src/transcribe_critic/slides.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
def extract_slides(config: SpeechConfig, data: SpeechData) -> None:
2424
"""Extract slides from video using ffmpeg scene detection, capturing timestamps."""
2525
print()
26-
print("[3] Extracting slides...")
26+
print("[slides] Extracting slides...")
2727

2828
if not data.video_path or not data.video_path.exists():
2929
print(" No video file available, skipping slide extraction")
@@ -105,7 +105,7 @@ def _load_slide_timestamps(data: SpeechData, timestamps_file: Path) -> None:
105105
def analyze_slides_with_vision(config: SpeechConfig, data: SpeechData) -> None:
106106
"""Analyze slides using Claude vision API."""
107107
print()
108-
print("[4] Analyzing slides with vision API...")
108+
print("[slides] Analyzing slides with vision API...")
109109

110110
if not config.analyze_slides:
111111
print(" Skipped (use --analyze-slides to enable)")

src/transcribe_critic/transcriber.py

Lines changed: 29 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,15 @@
44
==================
55
Automates transcription of speeches from video URLs.
66
7-
Pipeline:
8-
1. Download audio, video, and captions (yt-dlp)
9-
2. Transcribe audio (mlx-whisper or openai-whisper)
10-
3. Extract slides via scene detection (ffmpeg)
11-
4. Optionally analyze slides with vision API (Claude)
12-
4b. Optionally merge YouTube captions + Whisper into "critical text" (wdiff + Claude)
13-
5. Generate markdown with slides interleaved at correct timestamps
7+
Pipeline steps:
8+
download - Download audio, video, and captions (yt-dlp)
9+
transcribe - Transcribe audio (mlx-whisper or openai-whisper)
10+
ensemble - Adjudicate multiple Whisper transcripts (wdiff + LLM)
11+
diarize - Speaker diarization (pyannote, optional)
12+
slides - Extract and optionally analyze slides (ffmpeg + vision LLM)
13+
merge - Merge transcript sources into critical text (wdiff + LLM)
14+
markdown - Generate markdown with slides interleaved at timestamps
15+
analysis - Analyze source survival statistics
1416
1517
Usage:
1618
transcribe-critic <url> [options]
@@ -52,11 +54,14 @@
5254
from transcribe_critic.shared import (
5355
tprint as print,
5456
SpeechConfig, SpeechData, is_up_to_date,
57+
MODEL_SIZES,
58+
DEFAULT_CLAUDE_MODEL, DEFAULT_LOCAL_MODEL, DEFAULT_OLLAMA_URL,
59+
DEFAULT_WHISPER_MODELS,
5560
AUDIO_MP3, AUDIO_WAV, CAPTIONS_VTT, WHISPER_MERGED_TXT,
5661
DIARIZATION_JSON, DIARIZED_TXT, TRANSCRIPT_MERGED_TXT,
5762
ANALYSIS_MD, SLIDE_TIMESTAMPS_JSON,
5863
run_command, _print_reusing, _dry_run_skip, _should_skip,
59-
_collect_source_paths, check_dependencies,
64+
_collect_source_paths, _is_url, check_dependencies,
6065
)
6166

6267
SECTION_SEPARATOR = "=" * 50
@@ -198,7 +203,7 @@ def _load_external_transcript(config: SpeechConfig) -> tuple:
198203
"""
199204
source = config.external_transcript
200205
source_label = source
201-
if source.startswith(("http://", "https://")):
206+
if _is_url(source):
202207
print(f" Fetching external transcript from URL...")
203208
import urllib.request
204209
try:
@@ -306,7 +311,7 @@ def print_cost_estimate(config: SpeechConfig, num_slides: int = 45, transcript_w
306311
def merge_transcript_sources(config: SpeechConfig, data: SpeechData) -> None:
307312
"""Merge transcript sources (Whisper, captions, external) using wdiff alignment and LLM adjudication."""
308313
print()
309-
print("[4b] Merging transcript sources...")
314+
print("[merge] Merging transcript sources...")
310315

311316
if not config.merge_sources:
312317
print(" Skipped (--no-merge flag set)")
@@ -451,7 +456,7 @@ def _strip_structured_headers(text: str) -> str:
451456
def analyze_source_survival(config: SpeechConfig, data: SpeechData) -> None:
452457
"""Analyze how much of each source transcript survived into the merged output."""
453458
print()
454-
print("[6] Analyzing source survival...")
459+
print("[analysis] Analyzing source survival...")
455460

456461
merged_path = config.output_dir / TRANSCRIPT_MERGED_TXT
457462
analysis_path = config.output_dir / ANALYSIS_MD
@@ -595,9 +600,10 @@ def main():
595600

596601
# Whisper
597602
whisper_group = parser.add_argument_group("whisper")
598-
whisper_group.add_argument("--whisper-models", default="small,medium,distil-large-v3",
599-
help="Whisper model(s) to use, comma-separated (default: small,medium,distil-large-v3). "
600-
"Options: tiny, base, small, medium, large, distil-large-v3. "
603+
_default_whisper = ",".join(DEFAULT_WHISPER_MODELS)
604+
whisper_group.add_argument("--whisper-models", default=_default_whisper,
605+
help=f"Whisper model(s) to use, comma-separated (default: {_default_whisper}). "
606+
f"Options: {', '.join(MODEL_SIZES)}. "
601607
"Multiple models enables ensembling for better accuracy")
602608

603609
# Slides
@@ -617,12 +623,12 @@ def main():
617623
help="Use Anthropic Claude API instead of local Ollama (requires API key)")
618624
llm_group.add_argument("--api-key",
619625
help="Anthropic API key (or set ANTHROPIC_API_KEY env var; implies --api)")
620-
llm_group.add_argument("--claude-model", default="claude-sonnet-4-20250514",
621-
help="Claude model for API calls (default: claude-sonnet-4-20250514)")
622-
llm_group.add_argument("--local-model", default="qwen2.5:14b",
623-
help="Ollama model for text tasks (default: qwen2.5:14b)")
624-
llm_group.add_argument("--ollama-url", default="http://localhost:11434/v1/",
625-
help="Ollama server URL (default: http://localhost:11434/v1/)")
626+
llm_group.add_argument("--claude-model", default=DEFAULT_CLAUDE_MODEL,
627+
help=f"Claude model for API calls (default: {DEFAULT_CLAUDE_MODEL})")
628+
llm_group.add_argument("--local-model", default=DEFAULT_LOCAL_MODEL,
629+
help=f"Ollama model for text tasks (default: {DEFAULT_LOCAL_MODEL})")
630+
llm_group.add_argument("--ollama-url", default=DEFAULT_OLLAMA_URL,
631+
help=f"Ollama server URL (default: {DEFAULT_OLLAMA_URL})")
626632
llm_group.add_argument("--no-llm", action="store_true",
627633
help="Skip all LLM-dependent features (merging, ensembling, slide analysis)")
628634
llm_group.add_argument("--no-merge", action="store_true",
@@ -696,11 +702,10 @@ def main():
696702

697703
# Parse whisper models (comma-separated)
698704
whisper_models = [m.strip() for m in args.whisper_models.split(",")]
699-
valid_models = ["tiny", "base", "small", "medium", "large", "distil-large-v3"]
700705
for m in whisper_models:
701-
if m not in valid_models:
706+
if m not in MODEL_SIZES:
702707
print(f"Invalid Whisper model: {m}")
703-
print(f"Valid options: {', '.join(valid_models)}")
708+
print(f"Valid options: {', '.join(MODEL_SIZES)}")
704709
sys.exit(1)
705710

706711
# Determine LLM backend: --api or --api-key switches to cloud API
@@ -749,7 +754,7 @@ def main():
749754

750755
# Validate external transcript source
751756
if config.external_transcript:
752-
if config.external_transcript.startswith(("http://", "https://")):
757+
if _is_url(config.external_transcript):
753758
import urllib.request
754759
try:
755760
req = urllib.request.Request(config.external_transcript, method='HEAD')

0 commit comments

Comments (0)