Skip to content

Commit cfff898

Browse files
committed
use lists of urls for transcripts and subtitles
1 parent b000516 commit cfff898

File tree

8 files changed

+57
-133
lines changed

8 files changed

+57
-133
lines changed

db/queries.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,4 @@ def get_meetings(days: int = 7) -> Sequence[Meeting]:
1414
A.date >= target_date,
1515
)
1616

17-
return Sequence(meetings)
17+
return list(meetings)

flows/add_subtitles.py

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,49 @@
11
from prefect import flow
2+
from dyntastic import A
3+
from pydantic import HttpUrl
4+
5+
from src.models.subtitles import Transcript
26

37
from tasks.subtitles import create_vtt_track
48
from db.queries import get_meetings
5-
6-
from src.models.subtitles import SubtitleTrack, TrackFormat
79
from src.aws import save_content_to_s3
10+
from src.http_utils import async_get_json
811

912

1013
@flow(log_prints=True)
async def add_subtitles():
    """Create and upload a WebVTT subtitle track for each meeting transcript.

    For every meeting returned by ``get_meetings(days=90)`` that has at least
    one transcript URL, each transcript is downloaded, validated as a
    ``Transcript``, rendered to VTT, uploaded to the ``tgov-assets`` bucket
    and its URL recorded in ``meeting.subtitles``. A transcript whose
    language already has a ``<language>.vtt`` track recorded is skipped.
    """
    meetings = get_meetings(days=90)
    # Truthiness covers a missing attribute, ``None`` and an empty list alike.
    meetings_with_transcripts = [
        meeting for meeting in meetings if getattr(meeting, "transcripts", None)
    ]
    for meeting in meetings_with_transcripts:
        for transcript_url in meeting.transcripts:
            transcript_data = await async_get_json(transcript_url.encoded_string())
            transcript = Transcript.model_validate(transcript_data)
            language = transcript.language

            # ``subtitles`` may be None and holds URLs rather than bare file
            # names, so match on the tail of each stored URL.
            track_suffix = f"/{language}.vtt"
            existing_tracks = meeting.subtitles or []
            if any(str(url).endswith(track_suffix) for url in existing_tracks):
                continue

            track_content = await create_vtt_track(
                transcript,
                include_speaker_prefix=False,
            )
            result: HttpUrl = save_content_to_s3(
                track_content,
                "tgov-assets",
                f"{meeting.filename()}/subtitles/{language}.vtt",
                "text/vtt",
            )
            if not meeting.subtitles:
                meeting.subtitles = [result]
            elif result not in meeting.subtitles:
                meeting.subtitles.append(result)
            # Persist right after each upload so a failure on a later
            # transcript does not lose the tracks already stored.
            meeting.save()
3147

3248

3349
if __name__ == "__main__":

notebooks/srt_subtitles.ipynb

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
"sys.path.append(\"../\")\n",
2929
"\n",
3030
"# Import the necessary modules from the new subtitles package\n",
31-
"from src.subtitles import create_track"
31+
"from src.subtitles import create_subtitles"
3232
]
3333
},
3434
{
@@ -51,9 +51,9 @@
5151
" \"../data/transcripts/regular_council_meeting___2025_02_26.diarized.json\"\n",
5252
")\n",
5353
"# Create an SRT track from the transcript\n",
54-
"srt_track = create_track(\n",
54+
"srt_track = create_subtitles(\n",
5555
" transcript_data=transcript_file,\n",
56-
" format='srt',\n",
56+
" format=\"srt\",\n",
5757
" max_duration=5.0, # Maximum duration for each subtitle\n",
5858
" max_length=80, # Maximum length in characters\n",
5959
" include_speaker_prefix=True, # Include speaker labels\n",
@@ -120,7 +120,7 @@
120120
"outputs": [],
121121
"source": [
122122
"# Import the subtitles module\n",
123-
"from src.subtitles import create_track\n",
123+
"from src.subtitles import create_subtitles\n",
124124
"import subprocess\n",
125125
"\n",
126126
"# Path to the SRT file we saved earlier\n",
@@ -155,8 +155,7 @@
155155
" cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE\n",
156156
")\n",
157157
"print(\"Successfully created video with embedded subtitles!\")\n",
158-
"print(f\"Output video saved to: {output_video}\")\n",
159-
"\n"
158+
"print(f\"Output video saved to: {output_video}\")"
160159
]
161160
},
162161
{

src/aws.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import boto3
44
from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError
5+
from pydantic import HttpUrl
56

67

78
def is_aws_configured():
@@ -41,9 +42,12 @@ def upload_to_s3(file_path, bucket_name, s3_path):
4142

4243

4344
def save_content_to_s3(content, bucket_name, s3_key, content_type):
    """Upload text content to S3 and return the URL of the stored object.

    Args:
        content: Text to store; it is encoded as UTF-8 before upload.
        bucket_name: Name of the destination bucket.
        s3_key: Object key inside the bucket.
        content_type: Value sent as the object's ``ContentType``.

    Returns:
        HttpUrl: URL built from the bucket name, the client's region and
        the object key.
    """
    # The put_object response is not needed; failures surface as exceptions.
    s3_client.put_object(
        Bucket=bucket_name,
        Key=s3_key,
        Body=content.encode("utf-8"),
        ContentType=content_type,
    )
    region = s3_client.meta.region_name
    url = f"https://{bucket_name}.s3.{region}.amazonaws.com/{s3_key}"
    return HttpUrl(url)

src/models/meeting.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from dyntastic import Dyntastic
88
from pydantic import BaseModel, Field, HttpUrl
99
from datetime import datetime
10-
from typing import Sequence
10+
from typing import Sequence, List
1111

1212

1313
def clean_filename(meeting_name: str) -> str:
@@ -24,10 +24,10 @@ class Meeting(Dyntastic):
2424
duration: str = Field(description="Duration of the meeting")
2525
agenda: Optional[HttpUrl] = Field(None, description="URL to the meeting agenda")
2626
video: Optional[HttpUrl] = Field(None, description="URL to the meeting video")
27-
transcript: Optional[HttpUrl] = Field(
28-
None, description="URL to the meeting transcript"
27+
transcripts: Optional[List[HttpUrl]] = Field(
28+
None, description="URLs to the meeting transcripts"
2929
)
30-
subtitles: Optional[HttpUrl] = Field(
30+
subtitles: Optional[List[HttpUrl]] = Field(
3131
None, description="URLs to the meeting subtitle tracks"
3232
)
3333

@@ -36,7 +36,7 @@ def __str__(self) -> str:
3636
return f"{self.meeting} ({self.date})"
3737

3838
def filename(self) -> str:
39-
return f"{self.clip_id}/{clean_filename(self.meeting)} ({self.date})"
39+
return f"{self.clip_id}/{clean_filename(self.meeting)}/({self.date})"
4040

4141

4242
class GranicusPlayerPage(BaseModel):

src/subtitles.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,6 @@ def chunk_transcript(
189189
"""
190190
chunks = []
191191
current_chunk = {"text": "", "start": 0, "end": 0, "speaker": "", "words": []}
192-
193192
for segment in transcript.segments:
194193
# Skip very short segments
195194
if segment.end - segment.start < 0.1:
@@ -455,7 +454,6 @@ def create_subtitles(
455454
"""
456455
# Normalize track format to TrackFormat enum internally
457456
track_format = TrackFormat(format.lower())
458-
459457
# Chunk the transcript
460458
chunks = chunk_transcript(
461459
transcript,

tasks/subtitles.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,15 @@
22
from src.subtitles import create_subtitles
33
from src.models.subtitles import SubtitleTrack, TrackFormat, Transcript
44
from pydantic import HttpUrl
5-
from src.http_utils import async_get_json
5+
66
from src.models.subtitles import TrackFormat
77

88

99
@task
1010
async def create_vtt_track(
11-
transcript_url: HttpUrl, include_speaker_prefix: bool = False
11+
transcript: Transcript, include_speaker_prefix: bool = False
1212
) -> str:
13-
transcript_data = await async_get_json(transcript_url.encoded_string())
14-
transcript = Transcript.model_validate(transcript_data)
13+
1514
vtt_track: SubtitleTrack = create_subtitles(
1615
transcript,
1716
format=TrackFormat.VTT,

tests/test_subtitles.py

Lines changed: 12 additions & 104 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,8 @@
55
import pytest
66
from pathlib import Path
77
from src.subtitles import (
8-
create_track,
9-
format_time_for_srt,
8+
create_subtitles,
109
format_time_for_vtt,
11-
format_time_for_ass,
12-
load_transcript,
1310
chunk_transcript,
1411
)
1512
from src.models.subtitles import (
@@ -23,6 +20,14 @@
2320
)
2421

2522

23+
def load_transcript(fixture_transcript_path) -> Transcript:
    """Read the transcript fixture file and parse it into a ``Transcript``."""
    # An explicit encoding keeps the read independent of the platform's
    # locale default.
    with open(fixture_transcript_path, "r", encoding="utf-8") as f:
        return Transcript.model_validate_json(f.read())
29+
30+
2631
@pytest.fixture
2732
def fixture_transcript_path() -> Path:
2833
"""Fixture for the path to our mock transcript fixture."""
@@ -31,65 +36,13 @@ def fixture_transcript_path() -> Path:
3136

3237
@pytest.fixture
def fixture_transcript(fixture_transcript_path) -> Transcript:
    """Provide the ``Transcript`` parsed from the fixture path."""
    transcript = load_transcript(fixture_transcript_path)
    return transcript
3640

3741

38-
def test_format_time_for_srt():
39-
"""Test SRT timestamp formatting."""
40-
assert format_time_for_srt(0) == "00:00:00,000"
41-
assert format_time_for_srt(3661.5) == "01:01:01,500"
42-
assert format_time_for_srt(123.456) == "00:02:03,456"
43-
44-
45-
def test_format_time_for_vtt():
46-
"""Test VTT timestamp formatting."""
47-
assert format_time_for_vtt(0) == "00:00:00.000"
48-
assert format_time_for_vtt(3661.5) == "01:01:01.500"
49-
assert format_time_for_vtt(123.456) == "00:02:03.456"
50-
51-
52-
def test_format_time_for_ass():
53-
"""Test ASS timestamp formatting."""
54-
assert format_time_for_ass(0) == "0:00:00.00"
55-
assert format_time_for_ass(3661.5) == "1:01:01.50"
56-
assert format_time_for_ass(123.456) == "0:02:03.45"
57-
58-
59-
def test_create_srt_track(fixture_transcript_path):
60-
"""Test creating an SRT track from a transcript."""
61-
srt_track = create_track(
62-
fixture_transcript_path,
63-
format="srt",
64-
max_duration=5.0,
65-
max_words=14,
66-
include_speaker_prefix=True,
67-
)
68-
69-
# Verify track metadata
70-
assert srt_track.metadata.format == TrackFormat.SRT
71-
assert srt_track.metadata.language == "en"
72-
73-
# Verify that entries were created
74-
assert len(srt_track.entries) > 0
75-
assert all(isinstance(entry, SrtEntry) for entry in srt_track.entries)
76-
77-
# Check that speaker prefixes were added
78-
assert any("[Speaker" in entry.text for entry in srt_track.entries)
79-
80-
# Test content generation using unified method
81-
srt_content = srt_track.content()
82-
assert srt_content.startswith("1\n")
83-
assert "-->" in srt_content
84-
85-
# Verify old method still works for backward compatibility
86-
assert srt_track.to_srt_content() == srt_content
87-
88-
89-
def test_create_vtt_track(fixture_transcript_path):
42+
def test_create_vtt_track(fixture_transcript):
9043
"""Test creating a VTT track from a transcript."""
91-
vtt_track = create_track(
92-
fixture_transcript_path,
44+
vtt_track = create_subtitles(
45+
fixture_transcript,
9346
format="vtt",
9447
max_duration=4.0,
9548
max_words=12,
@@ -108,53 +61,8 @@ def test_create_vtt_track(fixture_transcript_path):
10861
assert vtt_content.startswith("WEBVTT")
10962
assert "-->" in vtt_content
11063

111-
# Verify old method still works for backward compatibility
112-
assert vtt_track.to_vtt_content() == vtt_content
113-
114-
115-
def test_create_ass_track(fixture_transcript_path):
116-
"""Test creating an ASS track from a transcript."""
117-
ass_track = create_track(
118-
fixture_transcript_path,
119-
format="ass",
120-
font_size=28,
121-
bg_opacity=0.3,
122-
)
123-
124-
# Verify track metadata
125-
assert ass_track.metadata.format == TrackFormat.ASS
126-
assert ass_track.metadata.language == "en"
127-
assert ass_track.metadata.style.font_size == 28
128-
129-
# Verify that entries were created
130-
assert len(ass_track.entries) > 0
131-
assert all(isinstance(entry, AssEntry) for entry in ass_track.entries)
132-
133-
# Test content generation using unified method
134-
ass_content = ass_track.content()
135-
assert "[Script Info]" in ass_content
136-
assert "Dialogue:" in ass_content
137-
138-
# Verify old method still works for backward compatibility
139-
assert ass_track.to_ass_content() == ass_content
140-
141-
142-
def test_load_transcript(fixture_transcript_path):
143-
"""Test that a transcript can be loaded from a JSON file."""
144-
transcript = load_transcript(fixture_transcript_path)
145-
assert transcript.language == "en"
146-
assert len(transcript.segments) > 0
147-
148-
# Test at least one speaker from our fixture
149-
speakers = set(
150-
segment.speaker for segment in transcript.segments if segment.speaker
151-
)
152-
assert "SPEAKER_01" in speakers
153-
assert "SPEAKER_02" in speakers
154-
15564

15665
def test_chunk_transcript(fixture_transcript):
157-
"""Test the transcript chunking functionality."""
15866
chunks = chunk_transcript(
15967
fixture_transcript,
16068
max_duration=5.0,

0 commit comments

Comments
 (0)