Skip to content

Commit a95c8b8

Browse files
committed
Merge branch 'main' into deploy-lambda
2 parents a14c8de + 51c8bf5 commit a95c8b8

File tree

10 files changed

+157
-158
lines changed

10 files changed

+157
-158
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
launch.json
2+
13
# Environment variables
24
.env
35
.envrc

db/queries.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from datetime import datetime, timedelta
2+
from typing import Sequence
3+
from dyntastic import A
4+
from src.models.meeting import Meeting
5+
6+
7+
def get_meetings(days: int = 7) -> Sequence[Meeting]:
    """Return every meeting dated within the past ``days`` days (default 7)."""
    cutoff = datetime.now() - timedelta(days=days)
    return list(Meeting.scan(A.date >= cutoff))

flows/add_subtitles.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
from prefect import flow
2+
from dyntastic import A
3+
from pydantic import HttpUrl
4+
5+
from src.models.subtitles import Transcript
6+
7+
from tasks.subtitles import create_vtt_track
8+
from db.queries import get_meetings
9+
from src.aws import save_content_to_s3
10+
from src.http_utils import async_get_json
11+
12+
13+
@flow(log_prints=True)
async def add_subtitles():
    """Create and attach WebVTT subtitle tracks for recent meetings.

    Scans meetings from the last 90 days, fetches each transcript JSON,
    renders one VTT track per transcript language, uploads it to the
    ``tgov-assets`` bucket, and records the resulting URL on the meeting.
    """
    meetings = get_meetings(days=90)
    # Precedence bug in the original: `hasattr(...) and x is not None or []`
    # parses as `(... and ...) or []`, so the `or []` branch was dead code.
    # A truthiness check on the attribute is what was intended.
    meetings_with_transcripts = [
        meeting for meeting in meetings if getattr(meeting, "transcripts", None)
    ]
    for meeting in meetings_with_transcripts:
        for transcript_url in meeting.transcripts:
            transcript_data = await async_get_json(transcript_url.encoded_string())
            transcript = Transcript.model_validate(transcript_data)
            language = transcript.language
            # `subtitles` may be None (the original `in meeting.subtitles`
            # would raise TypeError), and its entries are full URLs, so a
            # bare-filename membership test could never match; compare by
            # filename suffix instead.
            existing_tracks = meeting.subtitles or []
            if any(str(url).endswith(f"/{language}.vtt") for url in existing_tracks):
                continue
            track_content = await create_vtt_track(
                transcript,
                include_speaker_prefix=False,
            )
            result: HttpUrl = save_content_to_s3(
                track_content,
                "tgov-assets",
                f"{meeting.filename()}/subtitles/{language}.vtt",
                "text/vtt",
            )
            # Plain statements replace a conditional expression that was used
            # only for its side effect.
            if not meeting.subtitles:
                meeting.subtitles = [result]
            elif result not in meeting.subtitles:
                meeting.subtitles.append(result)
            meeting.save()
47+
48+
49+
if __name__ == "__main__":
    # Run the flow directly when invoked as a script.
    import asyncio

    asyncio.run(add_subtitles())

notebooks/srt_subtitles.ipynb

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
"sys.path.append(\"../\")\n",
2929
"\n",
3030
"# Import the necessary modules from the new subtitles package\n",
31-
"from src.subtitles import create_track"
31+
"from src.subtitles import create_subtitles"
3232
]
3333
},
3434
{
@@ -51,9 +51,9 @@
5151
" \"../data/transcripts/regular_council_meeting___2025_02_26.diarized.json\"\n",
5252
")\n",
5353
"# Create an SRT track from the transcript\n",
54-
"srt_track = create_track(\n",
54+
"srt_track = create_subtitles(\n",
5555
" transcript_data=transcript_file,\n",
56-
" format='srt',\n",
56+
" format=\"srt\",\n",
5757
" max_duration=5.0, # Maximum duration for each subtitle\n",
5858
" max_length=80, # Maximum length in characters\n",
5959
" include_speaker_prefix=True, # Include speaker labels\n",
@@ -120,7 +120,7 @@
120120
"outputs": [],
121121
"source": [
122122
"# Import the subtitles module\n",
123-
"from src.subtitles import create_track\n",
123+
"from src.subtitles import create_subtitles\n",
124124
"import subprocess\n",
125125
"\n",
126126
"# Path to the SRT file we saved earlier\n",
@@ -155,8 +155,7 @@
155155
" cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE\n",
156156
")\n",
157157
"print(\"Successfully created video with embedded subtitles!\")\n",
158-
"print(f\"Output video saved to: {output_video}\")\n",
159-
"\n"
158+
"print(f\"Output video saved to: {output_video}\")"
160159
]
161160
},
162161
{

src/aws.py

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,25 +2,27 @@
22

33
import boto3
44
from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError
5+
from pydantic import HttpUrl
56

67

78
def is_aws_configured():
    """Return True when all AWS credential environment variables are set."""
    required = ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_DEFAULT_REGION")
    return all(name in os.environ for name in required)
1011

1112

13+
s3_client = boto3.client("s3")
14+
15+
1216
def create_bucket_if_not_exists(bucket_name):
13-
s3 = boto3.client('s3')
14-
1517
try:
1618
# Check if the bucket exists by listing its objects
17-
s3.head_bucket(Bucket=bucket_name)
19+
s3_client.head_bucket(Bucket=bucket_name)
1820
print(f"Bucket '{bucket_name}' already exists.")
1921
except ClientError as e:
20-
if e.response['Error']['Code'] == '404':
22+
if e.response["Error"]["Code"] == "404":
2123
# Bucket does not exist, create it
2224
try:
23-
s3.create_bucket(Bucket=bucket_name)
25+
s3_client.create_bucket(Bucket=bucket_name)
2426
print(f"Bucket '{bucket_name}' created.")
2527
except ClientError as error:
2628
print(f"Failed to create bucket '{bucket_name}': {error}")
@@ -30,11 +32,22 @@ def create_bucket_if_not_exists(bucket_name):
3032

3133

3234
def upload_to_s3(file_path, bucket_name, s3_path):
    """Upload a local file to S3.

    Returns True on success; prints and returns False when AWS credentials
    are missing or incomplete.
    """
    try:
        s3_client.upload_file(file_path, bucket_name, s3_path)
        print(f"Uploaded {file_path} to {bucket_name}/{s3_path}")
        return True
    except (NoCredentialsError, PartialCredentialsError) as exc:
        print(f"Failed to upload to S3: {str(exc)}")
        return False
42+
43+
44+
def save_content_to_s3(content, bucket_name, s3_key, content_type):
    """Write string content to an S3 object and return its public HTTPS URL.

    The content is UTF-8 encoded before upload; the returned URL follows the
    virtual-hosted-style ``https://<bucket>.s3.<region>.amazonaws.com/<key>``
    scheme.
    """
    s3_client.put_object(
        Bucket=bucket_name,
        Key=s3_key,
        Body=content.encode("utf-8"),
        ContentType=content_type,
    )
    bucket_region = s3_client.meta.region_name
    object_url = f"https://{bucket_name}.s3.{bucket_region}.amazonaws.com/{s3_key}"
    return HttpUrl(object_url)

src/http_utils.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
import aiohttp
2+
3+
4+
async def async_get_json(url: str):
    """GET ``url`` and return the response body parsed as JSON."""
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            return await response.json()

src/models/meeting.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,14 @@
77
from dyntastic import Dyntastic
88
from pydantic import BaseModel, Field, HttpUrl
99
from datetime import datetime
10+
from typing import Sequence, List
1011

1112

12-
class Meeting(Dyntastic):
13-
"""
14-
Model representing a government meeting
15-
"""
13+
def clean_filename(meeting_name: str) -> str:
    """Replace path-unsafe characters (space, slash, colon) with underscores."""
    translation = str.maketrans({ch: "_" for ch in " /:"})
    return meeting_name.translate(translation)
15+
1616

17+
class Meeting(Dyntastic):
1718
__table_name__ = "tgov-meeting"
1819
__hash_key__ = "clip_id"
1920

@@ -23,15 +24,22 @@ class Meeting(Dyntastic):
2324
duration: str = Field(description="Duration of the meeting")
2425
agenda: Optional[HttpUrl] = Field(None, description="URL to the meeting agenda")
2526
video: Optional[HttpUrl] = Field(None, description="URL to the meeting video")
27+
transcripts: Optional[List[HttpUrl]] = Field(
28+
None, description="URLs to the meeting transcripts"
29+
)
30+
subtitles: Optional[List[HttpUrl]] = Field(
31+
None, description="URLs to the meeting subtitle tracks"
32+
)
2633

2734
def __str__(self) -> str:
2835
"""String representation of the meeting"""
29-
return f"{self.meeting} - {self.date} ({self.duration})"
36+
return f"{self.meeting} ({self.date})"
3037

38+
def filename(self) -> str:
39+
return f"{self.clip_id}/{clean_filename(self.meeting)}/({self.date})"
3140

32-
class GranicusPlayerPage(BaseModel):
33-
"""Model for Granicus video URLs"""
3441

42+
class GranicusPlayerPage(BaseModel):
3543
url: HttpUrl = Field(description="Base URL of the Granicus player page")
3644
stream_url: Optional[HttpUrl] = None
3745
download_url: Optional[HttpUrl] = None

src/subtitles.py

Lines changed: 4 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
from pathlib import Path
1313
from typing import Dict, List, Optional, Union, Any
1414
from datetime import datetime, timedelta
15+
import aiohttp
16+
from pydantic import HttpUrl
1517

1618
# Import models from the models module
1719
from src.models.subtitles import (
@@ -159,26 +161,6 @@ def get_color_code_for_ass(color_name: str) -> str:
159161
return color_map.get(color_name, "FFFFFF") # Default to white if color not found
160162

161163

162-
def load_transcript(transcript_data: Union[Dict[str, Any], str, Path]) -> Transcript:
163-
"""
164-
Load a transcript file or dictionary and return a validated Transcript model.
165-
166-
Args:
167-
transcript_data: Either a transcript data dictionary or path to JSON file
168-
169-
Returns:
170-
Transcript object
171-
"""
172-
# Load transcript if a path was provided
173-
if isinstance(transcript_data, (str, Path)):
174-
with open(transcript_data, "r", encoding="utf-8") as f:
175-
data = json.load(f)
176-
else:
177-
data = transcript_data
178-
179-
return Transcript.model_validate(data)
180-
181-
182164
def chunk_transcript(
183165
transcript: Transcript,
184166
max_duration: float = 5.0,
@@ -207,7 +189,6 @@ def chunk_transcript(
207189
"""
208190
chunks = []
209191
current_chunk = {"text": "", "start": 0, "end": 0, "speaker": "", "words": []}
210-
211192
for segment in transcript.segments:
212193
# Skip very short segments
213194
if segment.end - segment.start < 0.1:
@@ -437,8 +418,8 @@ def add_speaker_prefixes(
437418
return chunks
438419

439420

440-
def create_track(
441-
transcript_data: Union[Dict[str, Any], str, Path],
421+
def create_subtitles(
422+
transcript: Transcript,
442423
format: str = "srt",
443424
max_duration: float = 5.0,
444425
max_length: int = 80,
@@ -473,15 +454,6 @@ def create_track(
473454
"""
474455
# Normalize track format to TrackFormat enum internally
475456
track_format = TrackFormat(format.lower())
476-
477-
# Load and validate transcript
478-
transcript = load_transcript(transcript_data)
479-
480-
# Generate source file information if transcript_data is a path
481-
source_file = None
482-
if isinstance(transcript_data, (str, Path)):
483-
source_file = str(transcript_data)
484-
485457
# Chunk the transcript
486458
chunks = chunk_transcript(
487459
transcript,
@@ -534,7 +506,6 @@ def create_track(
534506
speakers=speakers,
535507
word_count=word_count,
536508
duration=track_duration,
537-
source_file=source_file,
538509
style=AssStyle(
539510
font_name=font_name,
540511
font_size=font_size,

tasks/subtitles.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
from prefect import task
2+
from src.subtitles import create_subtitles
3+
from src.models.subtitles import SubtitleTrack, TrackFormat, Transcript
4+
from pydantic import HttpUrl
5+
6+
from src.models.subtitles import TrackFormat
7+
8+
9+
@task
async def create_vtt_track(
    transcript: Transcript, include_speaker_prefix: bool = False
) -> str:
    """Render a transcript as a WebVTT subtitle track.

    Args:
        transcript: Validated transcript model to convert.
        include_speaker_prefix: Prefix each cue with its speaker label.

    Returns:
        The VTT track body as a string, ready to upload as ``text/vtt``.
    """
    # NOTE(review): create_subtitles normalizes format via
    # TrackFormat(format.lower()); passing the enum member here assumes
    # TrackFormat is a str-based enum -- confirm.
    vtt_track: SubtitleTrack = create_subtitles(
        transcript,
        format=TrackFormat.VTT,
        include_speaker_prefix=include_speaker_prefix,
    )
    return vtt_track.content()

0 commit comments

Comments
 (0)