Skip to content

Commit 9b0cabe

Browse files
ringgerclaude authored and committed
DRY refactor and close test coverage gaps (60% → 80%)
Centralize constants (COMMON_WORDS, MLX_MODEL_MAP, DEFAULT_*) in shared.py, replace numbered pipeline stage labels with named ones, use _should_skip() consistently, and add 81 new tests covering transcriber, transcription, slides, and eval subpackage. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 8e82653 commit 9b0cabe

15 files changed

+1166
-164
lines changed

src/transcribe_critic/diarization.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def diarize_audio(config: SpeechConfig, data: SpeechData) -> None:
2929
return
3030

3131
print()
32-
print("[2b] Diarizing audio...")
32+
print("[diarize] Diarizing audio...")
3333

3434
diarization_json = config.output_dir / DIARIZATION_JSON
3535
diarized_txt = config.output_dir / DIARIZED_TXT

src/transcribe_critic/download.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,14 @@
1313
tprint as print,
1414
SpeechConfig, SpeechData,
1515
AUDIO_MP3, METADATA_JSON, CAPTIONS_VTT,
16-
run_command, _save_json, _print_reusing, _dry_run_skip,
16+
run_command, _save_json, _print_reusing, _dry_run_skip, _should_skip,
1717
)
1818

1919

2020
def download_media(config: SpeechConfig, data: SpeechData, info: dict = None) -> None:
2121
"""Download audio, video, and captions using yt-dlp."""
2222
print()
23-
print("[1] Downloading media...")
23+
print("[download] Downloading media...")
2424

2525
output_template = str(config.output_dir / "%(title)s.%(ext)s")
2626

@@ -67,9 +67,9 @@ def download_media(config: SpeechConfig, data: SpeechData, info: dict = None) ->
6767

6868
# Download audio
6969
audio_path = config.output_dir / AUDIO_MP3
70-
if config.skip_existing and audio_path.exists():
71-
_print_reusing(audio_path.name)
72-
elif not _dry_run_skip(config, "download audio", AUDIO_MP3):
70+
if _should_skip(config, audio_path, "download audio"):
71+
pass
72+
else:
7373
print(" Downloading audio...")
7474
run_command(
7575
["yt-dlp", "-x", "--audio-format", "mp3",
@@ -85,9 +85,9 @@ def download_media(config: SpeechConfig, data: SpeechData, info: dict = None) ->
8585
else " Skipping video download (--no-slides)")
8686
else:
8787
video_path = config.output_dir / "video.mp4"
88-
if config.skip_existing and video_path.exists():
89-
_print_reusing(video_path.name)
90-
elif not _dry_run_skip(config, "download video", "video.mp4"):
88+
if _should_skip(config, video_path, "download video"):
89+
pass
90+
else:
9191
print(" Downloading video...")
9292
run_command(
9393
["yt-dlp", "-f", "mp4",
@@ -101,9 +101,9 @@ def download_media(config: SpeechConfig, data: SpeechData, info: dict = None) ->
101101
captions_path = config.output_dir / CAPTIONS_VTT
102102
if config.podcast:
103103
print(" Skipping captions download (--podcast)")
104-
elif config.skip_existing and captions_path.exists():
105-
_print_reusing(captions_path.name)
106-
elif not _dry_run_skip(config, "download captions", CAPTIONS_VTT):
104+
elif _should_skip(config, captions_path, "download captions"):
105+
pass
106+
else:
107107
print(" Downloading captions (if available)...")
108108
try:
109109
run_command(

src/transcribe_critic/merge.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@
1313

1414
from transcribe_critic.shared import (
1515
tprint as print,
16-
SpeechConfig, create_llm_client, llm_call_with_retry, is_up_to_date, _save_json,
16+
SpeechConfig, COMMON_WORDS,
17+
create_llm_client, llm_call_with_retry, is_up_to_date, _save_json,
1718
)
1819

1920

@@ -182,21 +183,21 @@ def _analyze_differences_wdiff(text_a: str, text_b: str, config: SpeechConfig,
182183

183184
def _filter_meaningful_diffs(differences: list) -> list:
184185
"""Filter wdiff differences to only meaningful ones (skip common words)."""
185-
common_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'is', 'it'}
186186
meaningful_diffs = []
187187
for d in differences:
188188
if d["type"] == "changed":
189189
a_words = set(d["a_text"].lower().split())
190190
b_words = set(d["b_text"].lower().split())
191-
if not (a_words <= common_words and b_words <= common_words):
191+
if not (a_words <= COMMON_WORDS and b_words <= COMMON_WORDS):
192192
meaningful_diffs.append(d)
193193
else:
194194
text = d.get("text", "").lower()
195-
if text and text not in common_words:
195+
if text and text not in COMMON_WORDS:
196196
meaningful_diffs.append(d)
197197
return meaningful_diffs
198198

199199

200+
# Matches wdiff markup: [-deleted-], {+inserted+}, or common (unmarked) text.
200201
_WDIFF_TOKEN_PATTERN = re.compile(
201202
r'\[-(?P<deleted>.*?)-\]'
202203
r'|\{\+(?P<inserted>.*?)\+\}'

src/transcribe_critic/output.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
def generate_markdown(config: SpeechConfig, data: SpeechData) -> None:
2020
"""Generate markdown document with slides interleaved at correct timestamps."""
2121
print()
22-
print("[5] Generating markdown...")
22+
print("[markdown] Generating markdown...")
2323

2424
markdown_path = config.output_dir / TRANSCRIPT_MD
2525

src/transcribe_critic/shared.py

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -35,19 +35,40 @@ def tprint(*args, **kwargs):
3535
# Whisper model sizes in descending quality order (used for base-model selection)
3636
MODEL_SIZES = ["large", "distil-large-v3", "medium", "small", "base", "tiny"]
3737

38+
# Map model short names to mlx-community HuggingFace model IDs
39+
MLX_MODEL_MAP = {
40+
"distil-large-v3": "mlx-community/distil-whisper-large-v3",
41+
}
42+
43+
# Default LLM model names
44+
DEFAULT_CLAUDE_MODEL = "claude-sonnet-4-20250514"
45+
DEFAULT_LOCAL_MODEL = "qwen2.5:14b"
46+
DEFAULT_LOCAL_VISION_MODEL = "llava"
47+
DEFAULT_OLLAMA_URL = "http://localhost:11434/v1/"
48+
DEFAULT_WHISPER_MODELS = ["small", "medium", "distil-large-v3"]
49+
50+
# Common/stop words for filtering trivial diffs in ensembling and merging
51+
COMMON_WORDS = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at',
52+
'to', 'for', 'of', 'is', 'it', 'i', 'we', 'he', 'she',
53+
'they', 'you', 'my', 'your', 'his', 'her', 'its', 'our',
54+
'their', 'this', 'that', 'was', 'were', 'be', 'been',
55+
'has', 'have', 'had', 'do', 'does', 'did', 'will',
56+
'would', 'could', 'should', 'may', 'might', 'not', 'no',
57+
'so', 'if', 'then', 'than', 'just', 'also', 'very'}
58+
3859

3960
@dataclass
4061
class SpeechConfig:
4162
"""Configuration for speech transcription pipeline."""
4263
url: str
4364
output_dir: Path
44-
whisper_models: list = field(default_factory=lambda: ["small", "medium", "distil-large-v3"]) # Can be multiple models
65+
whisper_models: list = field(default_factory=lambda: list(DEFAULT_WHISPER_MODELS)) # Can be multiple models
4566
scene_threshold: float = 0.1
4667
analyze_slides: bool = False
4768
merge_sources: bool = True # Merge YouTube captions with Whisper (default: on)
4869
no_llm: bool = False # Skip all LLM-dependent features (merging, ensembling, slide analysis)
4970
api_key: Optional[str] = None
50-
claude_model: str = "claude-sonnet-4-20250514" # Anthropic API model; ignored when local=True (uses local_model)
71+
claude_model: str = DEFAULT_CLAUDE_MODEL # Anthropic API model; ignored when local=True (uses local_model)
5172
skip_existing: bool = True
5273
no_slides: bool = False # Skip slide extraction entirely
5374
podcast: bool = False # Podcast mode: audio-only, skip video + captions
@@ -67,9 +88,9 @@ class SpeechConfig:
6788
api_timeout: float = 120.0 # seconds per API attempt
6889
# Local LLM (default) vs cloud API
6990
local: bool = True # Use local Ollama by default
70-
local_model: str = "qwen2.5:14b" # Default Ollama model for text
71-
local_vision_model: str = "llava" # Default Ollama model for vision
72-
ollama_base_url: str = "http://localhost:11434/v1/"
91+
local_model: str = DEFAULT_LOCAL_MODEL # Default Ollama model for text
92+
local_vision_model: str = DEFAULT_LOCAL_VISION_MODEL # Default Ollama model for vision
93+
ollama_base_url: str = DEFAULT_OLLAMA_URL
7394

7495

7596
# Standard output filenames — single source of truth
@@ -109,6 +130,11 @@ class SpeechData:
109130
metadata: dict = field(default_factory=dict) # Source metadata (title, description, channel, etc.)
110131

111132

133+
def _is_url(s: str) -> bool:
134+
"""Check if a string looks like an HTTP(S) URL."""
135+
return s.startswith(("http://", "https://"))
136+
137+
112138
def is_up_to_date(output: Path, *inputs: Path) -> bool:
113139
"""Check if output file is newer than all input files (make-style)."""
114140
if not output.exists():
@@ -320,7 +346,7 @@ def _collect_source_paths(config: SpeechConfig, data: SpeechData,
320346
paths.append(data.transcript_path)
321347
if data.captions_path and data.captions_path.exists():
322348
paths.append(data.captions_path)
323-
if config.external_transcript and not config.external_transcript.startswith(("http://", "https://")):
349+
if config.external_transcript and not _is_url(config.external_transcript):
324350
ext_path = Path(config.external_transcript)
325351
if ext_path.exists():
326352
paths.append(ext_path)

src/transcribe_critic/slides.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
def extract_slides(config: SpeechConfig, data: SpeechData) -> None:
2424
"""Extract slides from video using ffmpeg scene detection, capturing timestamps."""
2525
print()
26-
print("[3] Extracting slides...")
26+
print("[slides] Extracting slides...")
2727

2828
if not data.video_path or not data.video_path.exists():
2929
print(" No video file available, skipping slide extraction")
@@ -105,7 +105,7 @@ def _load_slide_timestamps(data: SpeechData, timestamps_file: Path) -> None:
105105
def analyze_slides_with_vision(config: SpeechConfig, data: SpeechData) -> None:
106106
"""Analyze slides using Claude vision API."""
107107
print()
108-
print("[4] Analyzing slides with vision API...")
108+
print("[slides] Analyzing slides with vision API...")
109109

110110
if not config.analyze_slides:
111111
print(" Skipped (use --analyze-slides to enable)")

src/transcribe_critic/transcriber.py

Lines changed: 29 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,15 @@
44
==================
55
Automates transcription of speeches from video URLs.
66
7-
Pipeline:
8-
1. Download audio, video, and captions (yt-dlp)
9-
2. Transcribe audio (mlx-whisper or openai-whisper)
10-
3. Extract slides via scene detection (ffmpeg)
11-
4. Optionally analyze slides with vision API (Claude)
12-
4b. Optionally merge YouTube captions + Whisper into "critical text" (wdiff + Claude)
13-
5. Generate markdown with slides interleaved at correct timestamps
7+
Pipeline steps:
8+
download - Download audio, video, and captions (yt-dlp)
9+
transcribe - Transcribe audio (mlx-whisper or openai-whisper)
10+
ensemble - Adjudicate multiple Whisper transcripts (wdiff + LLM)
11+
diarize - Speaker diarization (pyannote, optional)
12+
slides - Extract and optionally analyze slides (ffmpeg + vision LLM)
13+
merge - Merge transcript sources into critical text (wdiff + LLM)
14+
markdown - Generate markdown with slides interleaved at timestamps
15+
analysis - Analyze source survival statistics
1416
1517
Usage:
1618
transcribe-critic <url> [options]
@@ -52,11 +54,14 @@
5254
from transcribe_critic.shared import (
5355
tprint as print,
5456
SpeechConfig, SpeechData, is_up_to_date,
57+
MODEL_SIZES,
58+
DEFAULT_CLAUDE_MODEL, DEFAULT_LOCAL_MODEL, DEFAULT_OLLAMA_URL,
59+
DEFAULT_WHISPER_MODELS,
5560
AUDIO_MP3, AUDIO_WAV, CAPTIONS_VTT, WHISPER_MERGED_TXT,
5661
DIARIZATION_JSON, DIARIZED_TXT, TRANSCRIPT_MERGED_TXT,
5762
ANALYSIS_MD, SLIDE_TIMESTAMPS_JSON,
5863
run_command, _print_reusing, _dry_run_skip, _should_skip,
59-
_collect_source_paths, check_dependencies,
64+
_collect_source_paths, _is_url, check_dependencies,
6065
)
6166

6267
SECTION_SEPARATOR = "=" * 50
@@ -198,7 +203,7 @@ def _load_external_transcript(config: SpeechConfig) -> tuple:
198203
"""
199204
source = config.external_transcript
200205
source_label = source
201-
if source.startswith(("http://", "https://")):
206+
if _is_url(source):
202207
print(f" Fetching external transcript from URL...")
203208
import urllib.request
204209
try:
@@ -306,7 +311,7 @@ def print_cost_estimate(config: SpeechConfig, num_slides: int = 45, transcript_w
306311
def merge_transcript_sources(config: SpeechConfig, data: SpeechData) -> None:
307312
"""Merge transcript sources (Whisper, captions, external) using wdiff alignment and LLM adjudication."""
308313
print()
309-
print("[4b] Merging transcript sources...")
314+
print("[merge] Merging transcript sources...")
310315

311316
if not config.merge_sources:
312317
print(" Skipped (--no-merge flag set)")
@@ -451,7 +456,7 @@ def _strip_structured_headers(text: str) -> str:
451456
def analyze_source_survival(config: SpeechConfig, data: SpeechData) -> None:
452457
"""Analyze how much of each source transcript survived into the merged output."""
453458
print()
454-
print("[6] Analyzing source survival...")
459+
print("[analysis] Analyzing source survival...")
455460

456461
merged_path = config.output_dir / TRANSCRIPT_MERGED_TXT
457462
analysis_path = config.output_dir / ANALYSIS_MD
@@ -595,9 +600,10 @@ def main():
595600

596601
# Whisper
597602
whisper_group = parser.add_argument_group("whisper")
598-
whisper_group.add_argument("--whisper-models", default="small,medium,distil-large-v3",
599-
help="Whisper model(s) to use, comma-separated (default: small,medium,distil-large-v3). "
600-
"Options: tiny, base, small, medium, large, distil-large-v3. "
603+
_default_whisper = ",".join(DEFAULT_WHISPER_MODELS)
604+
whisper_group.add_argument("--whisper-models", default=_default_whisper,
605+
help=f"Whisper model(s) to use, comma-separated (default: {_default_whisper}). "
606+
f"Options: {', '.join(MODEL_SIZES)}. "
601607
"Multiple models enables ensembling for better accuracy")
602608

603609
# Slides
@@ -617,12 +623,12 @@ def main():
617623
help="Use Anthropic Claude API instead of local Ollama (requires API key)")
618624
llm_group.add_argument("--api-key",
619625
help="Anthropic API key (or set ANTHROPIC_API_KEY env var; implies --api)")
620-
llm_group.add_argument("--claude-model", default="claude-sonnet-4-20250514",
621-
help="Claude model for API calls (default: claude-sonnet-4-20250514)")
622-
llm_group.add_argument("--local-model", default="qwen2.5:14b",
623-
help="Ollama model for text tasks (default: qwen2.5:14b)")
624-
llm_group.add_argument("--ollama-url", default="http://localhost:11434/v1/",
625-
help="Ollama server URL (default: http://localhost:11434/v1/)")
626+
llm_group.add_argument("--claude-model", default=DEFAULT_CLAUDE_MODEL,
627+
help=f"Claude model for API calls (default: {DEFAULT_CLAUDE_MODEL})")
628+
llm_group.add_argument("--local-model", default=DEFAULT_LOCAL_MODEL,
629+
help=f"Ollama model for text tasks (default: {DEFAULT_LOCAL_MODEL})")
630+
llm_group.add_argument("--ollama-url", default=DEFAULT_OLLAMA_URL,
631+
help=f"Ollama server URL (default: {DEFAULT_OLLAMA_URL})")
626632
llm_group.add_argument("--no-llm", action="store_true",
627633
help="Skip all LLM-dependent features (merging, ensembling, slide analysis)")
628634
llm_group.add_argument("--no-merge", action="store_true",
@@ -696,11 +702,10 @@ def main():
696702

697703
# Parse whisper models (comma-separated)
698704
whisper_models = [m.strip() for m in args.whisper_models.split(",")]
699-
valid_models = ["tiny", "base", "small", "medium", "large", "distil-large-v3"]
700705
for m in whisper_models:
701-
if m not in valid_models:
706+
if m not in MODEL_SIZES:
702707
print(f"Invalid Whisper model: {m}")
703-
print(f"Valid options: {', '.join(valid_models)}")
708+
print(f"Valid options: {', '.join(MODEL_SIZES)}")
704709
sys.exit(1)
705710

706711
# Determine LLM backend: --api or --api-key switches to cloud API
@@ -749,7 +754,7 @@ def main():
749754

750755
# Validate external transcript source
751756
if config.external_transcript:
752-
if config.external_transcript.startswith(("http://", "https://")):
757+
if _is_url(config.external_transcript):
753758
import urllib.request
754759
try:
755760
req = urllib.request.Request(config.external_transcript, method='HEAD')

0 commit comments

Comments (0)