fix: harden transcription processing and guardrails

hammertoe · hammertoe · commit cd7e9d94bbc6 · 2026-03-12T12:47:05.000-04:00
diff --git a/scripts/cron_transcription.py b/scripts/cron_transcription.py
@@ -82,10 +82,14 @@ def get_videos_to_process(self, watchlist: dict[str, Any]) -> list[str]:
         for video_id, video_info in watchlist.get("videos", {}).items():
             last_processed = video_info.get("last_processed", None)
             auto_process = video_info.get("auto_process", False)
+            status = video_info.get("status", "pending")
 
             if not auto_process:
                 continue
 
+            if status == "processed":
+                continue
+
             if not last_processed:
                 videos.append(video_id)
                 continue
@@ -109,16 +113,14 @@ def process_video(
         print(f"Title: {video_title}")
         print(f"{'=' * 80}")
 
-        output_file = f"transcription_output_{video_id}.json"
         matched_order_paper_id = self._auto_match_order_paper(video_id, video_title)
 
         cmd = [
             "python",
             self.transcribe_script,
-            "--video",
-            video_id,
+            "--video=" + video_id,
             "--output-file",
-            output_file,
+            f"transcription_output_{video_id}.json",
             "--segment-minutes",
             str(segment_minutes),
         ]
diff --git a/tests/test_caption_context.py b/tests/test_caption_context.py
@@ -0,0 +1,38 @@
+from datetime import timedelta
+
+from transcribe import build_caption_context, parse_vtt_cues
+
+
+def test_parse_vtt_cues_extracts_entries() -> None:  # parse_vtt_cues on a small WebVTT document
+    vtt_text = """WEBVTT
+
+00:00:01.000 --> 00:00:03.000
+This session is resumed.
+
+00:00:05.000 --> 00:00:07.000
+We return to the appropriation bill.
+"""
+
+    cues = parse_vtt_cues(vtt_text)
+
+    assert len(cues) == 2  # the fixture holds exactly two timestamped cues
+    assert cues[0][2] == "This session is resumed."  # index 2 of a cue is its caption text
+
+
+def test_build_caption_context_within_segment() -> None:  # checks one included and one excluded cue
+    cues = [  # presumably (start_seconds, end_seconds, text) tuples; confirm against parse_vtt_cues
+        (1.0, 3.0, "This session is resumed."),  # not asserted either way by this test
+        (5.0, 7.0, "We return to the appropriation bill."),  # expected to appear in the context
+        (20.0, 22.0, "Later content."),  # expected to be left out of the context
+    ]
+
+    context = build_caption_context(
+        cues,
+        segment_start=timedelta(seconds=4),
+        segment_end=timedelta(seconds=10),
+        buffer_seconds=0,  # presumably no padding around the segment window; TODO confirm
+        max_chars=1000,  # larger than all fixture text combined
+    )
+
+    assert "appropriation bill" in context
+    assert "Later content" not in context
diff --git a/tests/test_caption_guardrail.py b/tests/test_caption_guardrail.py
@@ -0,0 +1,73 @@
+from transcribe import Transcript, validate_transcript_against_captions
+
+
+def test_guardrail_accepts_similar_text() -> None:  # near-identical transcript and captions give "ok"
+    transcripts = [
+        Transcript(start="0:00:01", text="This session is resumed prior to suspension.", voice=1)
+    ]
+    result = validate_transcript_against_captions(
+        transcripts,
+        "This session is resumed prior to the suspension.",  # caption text adds one word: "the"
+        min_similarity=45.0,
+        max_seconds=None,  # presumably disables any time cutoff; TODO confirm in transcribe.py
+    )
+
+    assert result.status == "ok"
+
+
+def test_guardrail_flags_mismatch() -> None:  # unrelated transcript text gives "mismatch"
+    transcripts = [
+        Transcript(
+            start="0:00:01",
+            text="Quantum entanglement drives photon decoherence in superconducting qubits.",
+            voice=1,
+        )
+    ]
+    result = validate_transcript_against_captions(
+        transcripts,
+        "This session is resumed. Prior to the suspension this chamber was debating.",
+        min_similarity=45.0,  # same threshold value as the passing case in this file
+        max_seconds=None,
+    )
+
+    assert result.status == "mismatch"
+
+
+def test_guardrail_handles_missing_captions() -> None:  # empty caption text gives "no_captions"
+    transcripts = [Transcript(start="0:00:01", text="Some transcript text", voice=1)]
+    result = validate_transcript_against_captions(
+        transcripts,
+        "",  # empty caption string is the condition under test
+        min_similarity=45.0,
+        max_seconds=None,
+    )
+
+    assert result.status == "no_captions"
+
+
+def test_guardrail_handles_empty_transcript() -> None:  # no transcript entries gives "empty_transcript"
+    transcripts: list[Transcript] = []  # empty list is the condition under test
+    result = validate_transcript_against_captions(
+        transcripts,
+        "This session is resumed.",  # captions are present, unlike the missing-captions case
+        min_similarity=45.0,
+        max_seconds=None,
+    )
+
+    assert result.status == "empty_transcript"
+
+
+def test_guardrail_respects_max_seconds() -> None:  # a late entry must not prevent an "ok" result
+    transcripts = [
+        Transcript(start="0:00:05", text="This session is resumed.", voice=1),  # identical to the captions
+        Transcript(start="0:15:00", text="Unrelated later text.", voice=1),  # 900 s, past max_seconds=600
+    ]
+
+    result = validate_transcript_against_captions(
+        transcripts,
+        "This session is resumed.",
+        min_similarity=45.0,
+        max_seconds=600,  # presumably entries starting after this are ignored; TODO confirm
+    )
+
+    assert result.status == "ok"
diff --git a/tests/test_timecode_parsing.py b/tests/test_timecode_parsing.py
@@ -0,0 +1,15 @@
+from datetime import timedelta
+
+from transcribe import parse_timecode_to_timedelta
+
+
+def test_parse_timecode_accepts_colon_milliseconds() -> None:  # four colon-separated fields
+    result = parse_timecode_to_timedelta("1:0:16:600")  # unpadded minutes; last field is milliseconds
+
+    assert result == timedelta(hours=1, minutes=0, seconds=16, milliseconds=600)
+
+
+def test_parse_timecode_accepts_dot_milliseconds() -> None:  # H:MM:SS.mmm form
+    result = parse_timecode_to_timedelta("1:00:16.600")  # same instant as the colon-separated case
+
+    assert result == timedelta(hours=1, minutes=0, seconds=16, milliseconds=600)
diff --git a/tests/test_transcribe_timecode_normalization.py b/tests/test_transcribe_timecode_normalization.py
@@ -0,0 +1,29 @@
+from datetime import timedelta
+
+from transcribe import Transcript, normalize_segment_transcript_timecodes
+
+
+def test_normalize_segment_timecodes_offsets_relative_times() -> None:  # starts before segment_start get shifted
+    transcripts = [
+        Transcript(start="0:00:05", text="hello", voice=1),
+        Transcript(start="0:08:26", text="world", voice=1),  # NOTE(review): result for this entry is never asserted
+    ]
+
+    normalized = normalize_segment_transcript_timecodes(
+        transcripts, segment_start=timedelta(minutes=29)
+    )
+
+    assert normalized[0].start == "0:29:05"  # 0:00:05 plus the 29-minute segment start
+
+
+def test_normalize_segment_timecodes_keeps_absolute_times() -> None:  # starts at/after segment_start stay put
+    transcripts = [
+        Transcript(start="0:29:05", text="hello", voice=1),  # already past the 29-minute segment start
+        Transcript(start="0:30:00", text="world", voice=1),  # NOTE(review): result for this entry is never asserted
+    ]
+
+    normalized = normalize_segment_transcript_timecodes(
+        transcripts, segment_start=timedelta(minutes=29)
+    )
+
+    assert normalized[0].start == "0:29:05"  # unchanged from the input value
diff --git a/transcribe.py b/transcribe.py