Skip to content

Commit cd7e9d9

Browse files
committed
fix: harden transcription processing and guardrails
1 parent fe15510 commit cd7e9d9

File tree

6 files changed

+540
-12
lines changed

6 files changed

+540
-12
lines changed

scripts/cron_transcription.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -82,10 +82,14 @@ def get_videos_to_process(self, watchlist: dict[str, Any]) -> list[str]:
8282
for video_id, video_info in watchlist.get("videos", {}).items():
8383
last_processed = video_info.get("last_processed", None)
8484
auto_process = video_info.get("auto_process", False)
85+
status = video_info.get("status", "pending")
8586

8687
if not auto_process:
8788
continue
8889

90+
if status == "processed":
91+
continue
92+
8993
if not last_processed:
9094
videos.append(video_id)
9195
continue
@@ -109,16 +113,14 @@ def process_video(
109113
print(f"Title: {video_title}")
110114
print(f"{'=' * 80}")
111115

112-
output_file = f"transcription_output_{video_id}.json"
113116
matched_order_paper_id = self._auto_match_order_paper(video_id, video_title)
114117

115118
cmd = [
116119
"python",
117120
self.transcribe_script,
118-
"--video",
119-
video_id,
121+
"--video=" + video_id,
120122
"--output-file",
121-
output_file,
123+
f"transcription_output_{video_id}.json",
122124
"--segment-minutes",
123125
str(segment_minutes),
124126
]

tests/test_caption_context.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
from datetime import timedelta
2+
3+
from transcribe import build_caption_context, parse_vtt_cues
4+
5+
6+
def test_parse_vtt_cues_extracts_entries() -> None:
7+
vtt_text = """WEBVTT
8+
9+
00:00:01.000 --> 00:00:03.000
10+
This session is resumed.
11+
12+
00:00:05.000 --> 00:00:07.000
13+
We return to the appropriation bill.
14+
"""
15+
16+
cues = parse_vtt_cues(vtt_text)
17+
18+
assert len(cues) == 2
19+
assert cues[0][2] == "This session is resumed."
20+
21+
22+
def test_build_caption_context_within_segment() -> None:
23+
cues = [
24+
(1.0, 3.0, "This session is resumed."),
25+
(5.0, 7.0, "We return to the appropriation bill."),
26+
(20.0, 22.0, "Later content."),
27+
]
28+
29+
context = build_caption_context(
30+
cues,
31+
segment_start=timedelta(seconds=4),
32+
segment_end=timedelta(seconds=10),
33+
buffer_seconds=0,
34+
max_chars=1000,
35+
)
36+
37+
assert "appropriation bill" in context
38+
assert "Later content" not in context

tests/test_caption_guardrail.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
from transcribe import Transcript, validate_transcript_against_captions
2+
3+
4+
def test_guardrail_accepts_similar_text() -> None:
5+
transcripts = [
6+
Transcript(start="0:00:01", text="This session is resumed prior to suspension.", voice=1)
7+
]
8+
result = validate_transcript_against_captions(
9+
transcripts,
10+
"This session is resumed prior to the suspension.",
11+
min_similarity=45.0,
12+
max_seconds=None,
13+
)
14+
15+
assert result.status == "ok"
16+
17+
18+
def test_guardrail_flags_mismatch() -> None:
19+
transcripts = [
20+
Transcript(
21+
start="0:00:01",
22+
text="Quantum entanglement drives photon decoherence in superconducting qubits.",
23+
voice=1,
24+
)
25+
]
26+
result = validate_transcript_against_captions(
27+
transcripts,
28+
"This session is resumed. Prior to the suspension this chamber was debating.",
29+
min_similarity=45.0,
30+
max_seconds=None,
31+
)
32+
33+
assert result.status == "mismatch"
34+
35+
36+
def test_guardrail_handles_missing_captions() -> None:
37+
transcripts = [Transcript(start="0:00:01", text="Some transcript text", voice=1)]
38+
result = validate_transcript_against_captions(
39+
transcripts,
40+
"",
41+
min_similarity=45.0,
42+
max_seconds=None,
43+
)
44+
45+
assert result.status == "no_captions"
46+
47+
48+
def test_guardrail_handles_empty_transcript() -> None:
49+
transcripts: list[Transcript] = []
50+
result = validate_transcript_against_captions(
51+
transcripts,
52+
"This session is resumed.",
53+
min_similarity=45.0,
54+
max_seconds=None,
55+
)
56+
57+
assert result.status == "empty_transcript"
58+
59+
60+
def test_guardrail_respects_max_seconds() -> None:
61+
transcripts = [
62+
Transcript(start="0:00:05", text="This session is resumed.", voice=1),
63+
Transcript(start="0:15:00", text="Unrelated later text.", voice=1),
64+
]
65+
66+
result = validate_transcript_against_captions(
67+
transcripts,
68+
"This session is resumed.",
69+
min_similarity=45.0,
70+
max_seconds=600,
71+
)
72+
73+
assert result.status == "ok"

tests/test_timecode_parsing.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
from datetime import timedelta
2+
3+
from transcribe import parse_timecode_to_timedelta
4+
5+
6+
def test_parse_timecode_accepts_colon_milliseconds() -> None:
7+
result = parse_timecode_to_timedelta("1:0:16:600")
8+
9+
assert result == timedelta(hours=1, minutes=0, seconds=16, milliseconds=600)
10+
11+
12+
def test_parse_timecode_accepts_dot_milliseconds() -> None:
13+
result = parse_timecode_to_timedelta("1:00:16.600")
14+
15+
assert result == timedelta(hours=1, minutes=0, seconds=16, milliseconds=600)

[file name not captured — separate new test file covering normalize_segment_transcript_timecodes]

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
from datetime import timedelta
2+
3+
from transcribe import Transcript, normalize_segment_transcript_timecodes
4+
5+
6+
def test_normalize_segment_timecodes_offsets_relative_times() -> None:
7+
transcripts = [
8+
Transcript(start="0:00:05", text="hello", voice=1),
9+
Transcript(start="0:08:26", text="world", voice=1),
10+
]
11+
12+
normalized = normalize_segment_transcript_timecodes(
13+
transcripts, segment_start=timedelta(minutes=29)
14+
)
15+
16+
assert normalized[0].start == "0:29:05"
17+
18+
19+
def test_normalize_segment_timecodes_keeps_absolute_times() -> None:
20+
transcripts = [
21+
Transcript(start="0:29:05", text="hello", voice=1),
22+
Transcript(start="0:30:00", text="world", voice=1),
23+
]
24+
25+
normalized = normalize_segment_transcript_timecodes(
26+
transcripts, segment_start=timedelta(minutes=29)
27+
)
28+
29+
assert normalized[0].start == "0:29:05"

0 commit comments

Comments
 (0)