
Commit b000516

working transcript url to subtitle creation
1 parent cd49ca4 commit b000516


8 files changed (+111, -94 lines)


db/queries.py

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+from datetime import datetime, timedelta
+from typing import Sequence
+from dyntastic import A
+from src.models.meeting import Meeting
+
+
+def get_meetings(days: int = 7) -> Sequence[Meeting]:
+    """
+    Get meetings that occurred in the past number of days from now.
+    """
+    now = datetime.now()
+    target_date = now - timedelta(days=days)
+    meetings = Meeting.scan(
+        A.date >= target_date,
+    )
+
+    return list(meetings)
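
For a quick sanity check, the query helper can be exercised on its own; a minimal sketch, assuming AWS credentials and the tgov-meeting DynamoDB table are available (the 30-day window is illustrative):

from db.queries import get_meetings

# Meetings from the past 30 days; Meeting.__str__ renders as "<name> (<date>)".
for meeting in get_meetings(days=30):
    print(meeting)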

flows/add_subtitles.py

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
+from prefect import flow
+
+from tasks.subtitles import create_vtt_track
+from db.queries import get_meetings
+
+from src.models.subtitles import SubtitleTrack, TrackFormat
+from src.aws import save_content_to_s3
+
+
+@flow(log_prints=True)
+async def add_subtitles():
+    meetings = get_meetings(days=90)
+    meetings_with_transcript = [
+        meeting
+        for meeting in meetings
+        if hasattr(meeting, "transcript") and meeting.transcript
+    ]
+    for meeting in meetings_with_transcript:
+        if not meeting.subtitles:
+            track_content = await create_vtt_track(
+                meeting.transcript,
+                include_speaker_prefix=False,
+            )
+            save_content_to_s3(
+                track_content,
+                "tgov-assets",
+                f"{meeting.filename()}.subtitles.vtt",
+                "text/vtt",
+            )
+    return track_content
+
+
+if __name__ == "__main__":
+    import asyncio
+
+    asyncio.run(add_subtitles())
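
Beyond the ad-hoc __main__ entry point, the flow could also be served on a schedule through Prefect; a minimal sketch (the deployment name and daily interval are illustrative, not part of this commit, and a reachable Prefect API is assumed):

from flows.add_subtitles import add_subtitles

if __name__ == "__main__":
    # Registers the flow with the Prefect API and runs it once a day.
    add_subtitles.serve(name="add-subtitles-daily", interval=86400)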

flows/create_subtitles.py

Lines changed: 0 additions & 23 deletions
This file was deleted.

src/aws.py

Lines changed: 17 additions & 8 deletions
@@ -5,22 +5,23 @@
 
 
 def is_aws_configured():
-    required_vars = ['AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_DEFAULT_REGION']
+    required_vars = ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_DEFAULT_REGION"]
     return all(var in os.environ for var in required_vars)
 
 
+s3_client = boto3.client("s3")
+
+
 def create_bucket_if_not_exists(bucket_name):
-    s3 = boto3.client('s3')
-
     try:
         # Check if the bucket exists by listing its objects
-        s3.head_bucket(Bucket=bucket_name)
+        s3_client.head_bucket(Bucket=bucket_name)
         print(f"Bucket '{bucket_name}' already exists.")
     except ClientError as e:
-        if e.response['Error']['Code'] == '404':
+        if e.response["Error"]["Code"] == "404":
             # Bucket does not exist, create it
             try:
-                s3.create_bucket(Bucket=bucket_name)
+                s3_client.create_bucket(Bucket=bucket_name)
                 print(f"Bucket '{bucket_name}' created.")
             except ClientError as error:
                 print(f"Failed to create bucket '{bucket_name}': {error}")
@@ -30,11 +31,19 @@ def create_bucket_if_not_exists(bucket_name):
 
 
 def upload_to_s3(file_path, bucket_name, s3_path):
-    s3 = boto3.client('s3')
     try:
-        s3.upload_file(file_path, bucket_name, s3_path)
+        s3_client.upload_file(file_path, bucket_name, s3_path)
         print(f"Uploaded {file_path} to {bucket_name}/{s3_path}")
         return True
     except (NoCredentialsError, PartialCredentialsError) as e:
         print(f"Failed to upload to S3: {str(e)}")
         return False
+
+
+def save_content_to_s3(content, bucket_name, s3_key, content_type):
+    return s3_client.put_object(
+        Bucket=bucket_name,
+        Key=s3_key,
+        Body=content.encode("utf-8"),
+        ContentType=content_type,
+    )
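
For reference, a minimal usage sketch of the new save_content_to_s3 helper, assuming AWS credentials are configured and the bucket already exists (the object key below is hypothetical); it forwards to put_object and returns the raw boto3 response:

from src.aws import is_aws_configured, save_content_to_s3

if is_aws_configured():
    response = save_content_to_s3(
        "WEBVTT\n\n00:00:00.000 --> 00:00:02.000\nHello, council.\n",
        "tgov-assets",                        # bucket used by the flow above
        "smoke-test/example.subtitles.vtt",   # hypothetical key
        "text/vtt",
    )
    # A 200 status code in the response metadata indicates the upload succeeded.
    print(response["ResponseMetadata"]["HTTPStatusCode"])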

src/http_utils.py

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+import aiohttp
+
+
+async def async_get_json(url: str):
+    async with aiohttp.ClientSession() as session:
+        async with session.get(url) as response:
+            json = await response.json()
+            return json
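
The helper is intentionally generic; a minimal sketch of using it against any JSON endpoint (httpbin.org/json is just a convenient public example):

import asyncio
from src.http_utils import async_get_json


async def main():
    data = await async_get_json("https://httpbin.org/json")
    # The endpoint returns a small JSON document; print its top-level keys.
    print(sorted(data.keys()))


asyncio.run(main())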

src/models/meeting.py

Lines changed: 15 additions & 7 deletions
@@ -7,13 +7,14 @@
 from dyntastic import Dyntastic
 from pydantic import BaseModel, Field, HttpUrl
 from datetime import datetime
+from typing import Sequence
 
 
-class Meeting(Dyntastic):
-    """
-    Model representing a government meeting
-    """
+def clean_filename(meeting_name: str) -> str:
+    return meeting_name.replace(" ", "_").replace("/", "_").replace(":", "_")
+
 
+class Meeting(Dyntastic):
     __table_name__ = "tgov-meeting"
     __hash_key__ = "clip_id"
 
@@ -23,15 +24,22 @@ class Meeting(Dyntastic):
     duration: str = Field(description="Duration of the meeting")
     agenda: Optional[HttpUrl] = Field(None, description="URL to the meeting agenda")
     video: Optional[HttpUrl] = Field(None, description="URL to the meeting video")
+    transcript: Optional[HttpUrl] = Field(
+        None, description="URL to the meeting transcript"
+    )
+    subtitles: Optional[HttpUrl] = Field(
+        None, description="URLs to the meeting subtitle tracks"
+    )
 
     def __str__(self) -> str:
         """String representation of the meeting"""
-        return f"{self.meeting} - {self.date} ({self.duration})"
+        return f"{self.meeting} ({self.date})"
 
+    def filename(self) -> str:
+        return f"{self.clip_id}/{clean_filename(self.meeting)} ({self.date})"
 
-class GranicusPlayerPage(BaseModel):
-    """Model for Granicus video URLs"""
 
+class GranicusPlayerPage(BaseModel):
     url: HttpUrl = Field(description="Base URL of the Granicus player page")
     stream_url: Optional[HttpUrl] = None
     download_url: Optional[HttpUrl] = None
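
For context, a small sketch of what the new clean_filename helper produces (the input string is made up for illustration):

from src.models.meeting import clean_filename

# Spaces, slashes, and colons are all replaced with underscores.
print(clean_filename("Regular Council Meeting: 2024/05/01"))
# -> Regular_Council_Meeting__2024_05_01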

src/subtitles.py

Lines changed: 2 additions & 40 deletions
@@ -161,35 +161,6 @@ def get_color_code_for_ass(color_name: str) -> str:
     return color_map.get(color_name, "FFFFFF")  # Default to white if color not found
 
 
-async def request_transcript(transcript_url: HttpUrl) -> Transcript:
-    async with aiohttp.ClientSession() as session:
-        async with session.get(transcript_url) as response:
-            transcript_data = await response.json()
-            return Transcript.model_validate(transcript_data)
-
-
-async def load_transcript(
-    transcript_data: Union[Dict[str, Any], str, Path],
-) -> Transcript:
-    """
-    Load a transcript file or dictionary and return a validated Transcript model.
-
-    Args:
-        transcript_data: Either a transcript data dictionary or path to JSON file
-
-    Returns:
-        Transcript object
-    """
-    # Load transcript if a path was provided
-    if isinstance(transcript_data, (str, Path)):
-        with open(transcript_data, "r", encoding="utf-8") as f:
-            data = json.load(f)
-    else:
-        data = transcript_data
-
-    return Transcript.model_validate(data)
-
-
 def chunk_transcript(
     transcript: Transcript,
     max_duration: float = 5.0,
@@ -448,8 +419,8 @@ def add_speaker_prefixes(
     return chunks
 
 
-def create_track(
-    transcript_data: Union[Dict[str, Any], str, Path],
+def create_subtitles(
+    transcript: Transcript,
     format: str = "srt",
     max_duration: float = 5.0,
     max_length: int = 80,
@@ -485,14 +456,6 @@ def create_track(
     # Normalize track format to TrackFormat enum internally
     track_format = TrackFormat(format.lower())
 
-    # Load and validate transcript
-    transcript = load_transcript(transcript_data)
-
-    # Generate source file information if transcript_data is a path
-    source_file = None
-    if isinstance(transcript_data, (str, Path)):
-        source_file = str(transcript_data)
-
     # Chunk the transcript
     chunks = chunk_transcript(
         transcript,
@@ -545,7 +508,6 @@
         speakers=speakers,
        word_count=word_count,
         duration=track_duration,
-        source_file=source_file,
         style=AssStyle(
             font_name=font_name,
             font_size=font_size,

tasks/subtitles.py

Lines changed: 16 additions & 16 deletions
@@ -1,22 +1,22 @@
 from prefect import task
-from src.subtitles import create_track, request_transcript
-from src.models.subtitles import SubtitleTrack, TrackFormat
-from pydantic import BaseModel, HttpUrl
-import base64
-
-
-class CreateTrackParams(BaseModel):
-    transcript_url: HttpUrl
-    format: TrackFormat
-    include_speaker_prefix: bool = False
+from src.subtitles import create_subtitles
+from src.models.subtitles import SubtitleTrack, TrackFormat, Transcript
+from pydantic import HttpUrl
+from src.http_utils import async_get_json
+from src.models.subtitles import TrackFormat
 
 
 @task
-async def create_track(params: CreateTrackParams) -> str:
-    transcript = await request_transcript(params.transcript_url)
-    vtt_track: SubtitleTrack = create_track(
-        transcript, params.format, params.include_speaker_prefix
+async def create_vtt_track(
+    transcript_url: HttpUrl, include_speaker_prefix: bool = False
+) -> str:
+    transcript_data = await async_get_json(transcript_url.encoded_string())
+    transcript = Transcript.model_validate(transcript_data)
+    vtt_track: SubtitleTrack = create_subtitles(
+        transcript,
+        format=TrackFormat.VTT,
+        include_speaker_prefix=include_speaker_prefix,
     )
     vtt_content = vtt_track.content()
-    vtt_data = f"data:text/vtt;charset=utf-8;base64,{base64.b64encode(vtt_content.encode('utf-8')).decode('ascii')}"
-    return vtt_data
+    # vtt_data = f"data:text/vtt;charset=utf-8;base64,{base64.b64encode(vtt_content.encode('utf-8')).decode('ascii')}"
+    return vtt_content
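
A quick way to exercise the renamed task outside of a flow run is to call the underlying function through the task's .fn attribute; a minimal sketch, assuming the URL serves transcript JSON in the shape Transcript expects:

import asyncio
from pydantic import HttpUrl
from tasks.subtitles import create_vtt_track


async def main():
    # Hypothetical transcript URL; any endpoint returning Transcript-shaped JSON works.
    url = HttpUrl("https://example.com/meeting-transcript.json")
    # .fn is the undecorated coroutine behind the Prefect task.
    vtt = await create_vtt_track.fn(url, include_speaker_prefix=True)
    print(vtt.splitlines()[0])  # expected to be the "WEBVTT" header


asyncio.run(main())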
