
Commit b000516

working transcript url to subtitle creation
1 parent cd49ca4 commit b000516


8 files changed (+111, -94 lines)


db/queries.py

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+from datetime import datetime, timedelta
+from typing import Sequence
+from dyntastic import A
+from src.models.meeting import Meeting
+
+
+def get_meetings(days: int = 7) -> Sequence[Meeting]:
+    """
+    Get meetings that occurred in the past number of days from now.
+    """
+    now = datetime.now()
+    target_date = now - timedelta(days=days)
+    meetings = Meeting.scan(
+        A.date >= target_date,
+    )
+
+    return list(meetings)
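
For a quick sanity check, the query helper can be exercised on its own; a minimal sketch, assuming AWS credentials and the tgov-meeting DynamoDB table are available (the 30-day window is illustrative):

from db.queries import get_meetings

# Meetings from the past 30 days; Meeting.__str__ renders as "<name> (<date>)".
for meeting in get_meetings(days=30):
    print(meeting)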

flows/add_subtitles.py

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
+from prefect import flow
+
+from tasks.subtitles import create_vtt_track
+from db.queries import get_meetings
+
+from src.models.subtitles import SubtitleTrack, TrackFormat
+from src.aws import save_content_to_s3
+
+
+@flow(log_prints=True)
+async def add_subtitles():
+    meetings = get_meetings(days=90)
+    meetings_with_transcript = [
+        meeting
+        for meeting in meetings
+        if hasattr(meeting, "transcript") and meeting.transcript
+    ]
+    for meeting in meetings_with_transcript:
+        if not meeting.subtitles:
+            track_content = await create_vtt_track(
+                meeting.transcript,
+                include_speaker_prefix=False,
+            )
+            save_content_to_s3(
+                track_content,
+                "tgov-assets",
+                f"{meeting.filename()}.subtitles.vtt",
+                "text/vtt",
+            )
+    return track_content
+
+
+if __name__ == "__main__":
+    import asyncio
+
+    asyncio.run(add_subtitles())
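
Beyond the ad-hoc __main__ entry point, the flow could also be served on a schedule through Prefect; a minimal sketch (the deployment name and daily interval are illustrative, not part of this commit, and a reachable Prefect API is assumed):

from flows.add_subtitles import add_subtitles

if __name__ == "__main__":
    # Registers the flow with the Prefect API and runs it once a day.
    add_subtitles.serve(name="add-subtitles-daily", interval=86400)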

flows/create_subtitles.py

Lines changed: 0 additions & 23 deletions
This file was deleted.

src/aws.py

Lines changed: 17 additions & 8 deletions
@@ -5,22 +5,23 @@
 
 
 def is_aws_configured():
-    required_vars = ['AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_DEFAULT_REGION']
+    required_vars = ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_DEFAULT_REGION"]
     return all(var in os.environ for var in required_vars)
 
 
+s3_client = boto3.client("s3")
+
+
 def create_bucket_if_not_exists(bucket_name):
-    s3 = boto3.client('s3')
-
     try:
         # Check if the bucket exists by listing its objects
-        s3.head_bucket(Bucket=bucket_name)
+        s3_client.head_bucket(Bucket=bucket_name)
         print(f"Bucket '{bucket_name}' already exists.")
     except ClientError as e:
-        if e.response['Error']['Code'] == '404':
+        if e.response["Error"]["Code"] == "404":
             # Bucket does not exist, create it
             try:
-                s3.create_bucket(Bucket=bucket_name)
+                s3_client.create_bucket(Bucket=bucket_name)
                 print(f"Bucket '{bucket_name}' created.")
             except ClientError as error:
                 print(f"Failed to create bucket '{bucket_name}': {error}")
@@ -30,11 +31,19 @@ def create_bucket_if_not_exists(bucket_name):
 
 
 def upload_to_s3(file_path, bucket_name, s3_path):
-    s3 = boto3.client('s3')
     try:
-        s3.upload_file(file_path, bucket_name, s3_path)
+        s3_client.upload_file(file_path, bucket_name, s3_path)
         print(f"Uploaded {file_path} to {bucket_name}/{s3_path}")
         return True
     except (NoCredentialsError, PartialCredentialsError) as e:
         print(f"Failed to upload to S3: {str(e)}")
         return False
+
+
+def save_content_to_s3(content, bucket_name, s3_key, content_type):
+    return s3_client.put_object(
+        Bucket=bucket_name,
+        Key=s3_key,
+        Body=content.encode("utf-8"),
+        ContentType=content_type,
+    )
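
For reference, a minimal usage sketch of the new save_content_to_s3 helper, assuming AWS credentials are configured and the bucket already exists (the object key below is hypothetical); it forwards to put_object and returns the raw boto3 response:

from src.aws import is_aws_configured, save_content_to_s3

if is_aws_configured():
    response = save_content_to_s3(
        "WEBVTT\n\n00:00:00.000 --> 00:00:02.000\nHello, council.\n",
        "tgov-assets",                        # bucket used by the flow above
        "smoke-test/example.subtitles.vtt",   # hypothetical key
        "text/vtt",
    )
    # A 200 status code in the response metadata indicates the upload succeeded.
    print(response["ResponseMetadata"]["HTTPStatusCode"])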

src/http_utils.py

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+import aiohttp
+
+
+async def async_get_json(url: str):
+    async with aiohttp.ClientSession() as session:
+        async with session.get(url) as response:
+            json = await response.json()
+            return json
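
The helper is intentionally generic; a minimal sketch of using it against any JSON endpoint (httpbin.org/json is just a convenient public example):

import asyncio
from src.http_utils import async_get_json


async def main():
    data = await async_get_json("https://httpbin.org/json")
    # The endpoint returns a small JSON document; print its top-level keys.
    print(sorted(data.keys()))


asyncio.run(main())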

src/models/meeting.py

Lines changed: 15 additions & 7 deletions
@@ -7,13 +7,14 @@
 from dyntastic import Dyntastic
 from pydantic import BaseModel, Field, HttpUrl
 from datetime import datetime
+from typing import Sequence
 
 
-class Meeting(Dyntastic):
-    """
-    Model representing a government meeting
-    """
+def clean_filename(meeting_name: str) -> str:
+    return meeting_name.replace(" ", "_").replace("/", "_").replace(":", "_")
+
 
+class Meeting(Dyntastic):
     __table_name__ = "tgov-meeting"
     __hash_key__ = "clip_id"
 
@@ -23,15 +24,22 @@ class Meeting(Dyntastic):
     duration: str = Field(description="Duration of the meeting")
     agenda: Optional[HttpUrl] = Field(None, description="URL to the meeting agenda")
     video: Optional[HttpUrl] = Field(None, description="URL to the meeting video")
+    transcript: Optional[HttpUrl] = Field(
+        None, description="URL to the meeting transcript"
+    )
+    subtitles: Optional[HttpUrl] = Field(
+        None, description="URLs to the meeting subtitle tracks"
+    )
 
     def __str__(self) -> str:
         """String representation of the meeting"""
-        return f"{self.meeting} - {self.date} ({self.duration})"
+        return f"{self.meeting} ({self.date})"
 
+    def filename(self) -> str:
+        return f"{self.clip_id}/{clean_filename(self.meeting)} ({self.date})"
 
-class GranicusPlayerPage(BaseModel):
-    """Model for Granicus video URLs"""
 
+class GranicusPlayerPage(BaseModel):
     url: HttpUrl = Field(description="Base URL of the Granicus player page")
     stream_url: Optional[HttpUrl] = None
     download_url: Optional[HttpUrl] = None
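
For context, a small sketch of what the new clean_filename helper produces (the input string is made up for illustration):

from src.models.meeting import clean_filename

# Spaces, slashes, and colons are all replaced with underscores.
print(clean_filename("Regular Council Meeting: 2024/05/01"))
# -> Regular_Council_Meeting__2024_05_01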

src/subtitles.py

Lines changed: 2 additions & 40 deletions
@@ -161,35 +161,6 @@ def get_color_code_for_ass(color_name: str) -> str:
     return color_map.get(color_name, "FFFFFF")  # Default to white if color not found
 
 
-async def request_transcript(transcript_url: HttpUrl) -> Transcript:
-    async with aiohttp.ClientSession() as session:
-        async with session.get(transcript_url) as response:
-            transcript_data = await response.json()
-            return Transcript.model_validate(transcript_data)
-
-
-async def load_transcript(
-    transcript_data: Union[Dict[str, Any], str, Path],
-) -> Transcript:
-    """
-    Load a transcript file or dictionary and return a validated Transcript model.
-
-    Args:
-        transcript_data: Either a transcript data dictionary or path to JSON file
-
-    Returns:
-        Transcript object
-    """
-    # Load transcript if a path was provided
-    if isinstance(transcript_data, (str, Path)):
-        with open(transcript_data, "r", encoding="utf-8") as f:
-            data = json.load(f)
-    else:
-        data = transcript_data
-
-    return Transcript.model_validate(data)
-
-
 def chunk_transcript(
     transcript: Transcript,
     max_duration: float = 5.0,
@@ -448,8 +419,8 @@ def add_speaker_prefixes(
     return chunks
 
 
-def create_track(
-    transcript_data: Union[Dict[str, Any], str, Path],
+def create_subtitles(
+    transcript: Transcript,
     format: str = "srt",
     max_duration: float = 5.0,
     max_length: int = 80,
@@ -485,14 +456,6 @@ def create_track(
     # Normalize track format to TrackFormat enum internally
     track_format = TrackFormat(format.lower())
 
-    # Load and validate transcript
-    transcript = load_transcript(transcript_data)
-
-    # Generate source file information if transcript_data is a path
-    source_file = None
-    if isinstance(transcript_data, (str, Path)):
-        source_file = str(transcript_data)
-
     # Chunk the transcript
     chunks = chunk_transcript(
         transcript,
@@ -545,7 +508,6 @@
         speakers=speakers,
        word_count=word_count,
         duration=track_duration,
-        source_file=source_file,
         style=AssStyle(
             font_name=font_name,
             font_size=font_size,

tasks/subtitles.py

Lines changed: 16 additions & 16 deletions
@@ -1,22 +1,22 @@
 from prefect import task
-from src.subtitles import create_track, request_transcript
-from src.models.subtitles import SubtitleTrack, TrackFormat
-from pydantic import BaseModel, HttpUrl
-import base64
-
-
-class CreateTrackParams(BaseModel):
-    transcript_url: HttpUrl
-    format: TrackFormat
-    include_speaker_prefix: bool = False
+from src.subtitles import create_subtitles
+from src.models.subtitles import SubtitleTrack, TrackFormat, Transcript
+from pydantic import HttpUrl
+from src.http_utils import async_get_json
+from src.models.subtitles import TrackFormat
 
 
 @task
-async def create_track(params: CreateTrackParams) -> str:
-    transcript = await request_transcript(params.transcript_url)
-    vtt_track: SubtitleTrack = create_track(
-        transcript, params.format, params.include_speaker_prefix
+async def create_vtt_track(
+    transcript_url: HttpUrl, include_speaker_prefix: bool = False
+) -> str:
+    transcript_data = await async_get_json(transcript_url.encoded_string())
+    transcript = Transcript.model_validate(transcript_data)
+    vtt_track: SubtitleTrack = create_subtitles(
+        transcript,
+        format=TrackFormat.VTT,
+        include_speaker_prefix=include_speaker_prefix,
     )
     vtt_content = vtt_track.content()
-    vtt_data = f"data:text/vtt;charset=utf-8;base64,{base64.b64encode(vtt_content.encode('utf-8')).decode('ascii')}"
-    return vtt_data
+    # vtt_data = f"data:text/vtt;charset=utf-8;base64,{base64.b64encode(vtt_content.encode('utf-8')).decode('ascii')}"
+    return vtt_content
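
A quick way to exercise the renamed task outside of a flow run is to call the underlying function through the task's .fn attribute; a minimal sketch, assuming the URL serves transcript JSON in the shape Transcript expects:

import asyncio
from pydantic import HttpUrl
from tasks.subtitles import create_vtt_track


async def main():
    # Hypothetical transcript URL; any endpoint returning Transcript-shaped JSON works.
    url = HttpUrl("https://example.com/meeting-transcript.json")
    # .fn is the undecorated coroutine behind the Prefect task.
    vtt = await create_vtt_track.fn(url, include_speaker_prefix=True)
    print(vtt.splitlines()[0])  # expected to be the "WEBVTT" header


asyncio.run(main())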
