Skip to content

Commit a95c8b8

Browse files
committed
Merge branch 'main' into deploy-lambda
2 parents a14c8de + 51c8bf5 commit a95c8b8

File tree

10 files changed

+157
-158
lines changed

10 files changed

+157
-158
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
launch.json
2+
13
# Environment variables
24
.env
35
.envrc

db/queries.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from datetime import datetime, timedelta
2+
from typing import Sequence
3+
from dyntastic import A
4+
from src.models.meeting import Meeting
5+
6+
7+
def get_meetings(days: int = 7) -> Sequence[Meeting]:
    """Return every meeting dated within the past ``days`` days (default 7)."""
    cutoff = datetime.now() - timedelta(days=days)
    return list(Meeting.scan(A.date >= cutoff))

flows/add_subtitles.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
from prefect import flow
2+
from dyntastic import A
3+
from pydantic import HttpUrl
4+
5+
from src.models.subtitles import Transcript
6+
7+
from tasks.subtitles import create_vtt_track
8+
from db.queries import get_meetings
9+
from src.aws import save_content_to_s3
10+
from src.http_utils import async_get_json
11+
12+
13+
@flow(log_prints=True)
async def add_subtitles():
    """Create and attach WebVTT subtitle tracks for recent meetings.

    Scans meetings from the last 90 days, fetches each transcript JSON,
    renders one VTT track per transcript language, uploads it to the
    ``tgov-assets`` bucket, and records the resulting URL on the meeting.
    """
    meetings = get_meetings(days=90)
    # Precedence bug in the original: `hasattr(...) and x is not None or []`
    # parses as `(... and ...) or []`, so the `or []` branch was dead code.
    # A truthiness check on the attribute is what was intended.
    meetings_with_transcripts = [
        meeting for meeting in meetings if getattr(meeting, "transcripts", None)
    ]
    for meeting in meetings_with_transcripts:
        for transcript_url in meeting.transcripts:
            transcript_data = await async_get_json(transcript_url.encoded_string())
            transcript = Transcript.model_validate(transcript_data)
            language = transcript.language
            # `subtitles` may be None (the original `in meeting.subtitles`
            # would raise TypeError), and its entries are full URLs, so a
            # bare-filename membership test could never match; compare by
            # filename suffix instead.
            existing_tracks = meeting.subtitles or []
            if any(str(url).endswith(f"/{language}.vtt") for url in existing_tracks):
                continue
            track_content = await create_vtt_track(
                transcript,
                include_speaker_prefix=False,
            )
            result: HttpUrl = save_content_to_s3(
                track_content,
                "tgov-assets",
                f"{meeting.filename()}/subtitles/{language}.vtt",
                "text/vtt",
            )
            # Plain statements replace a conditional expression that was used
            # only for its side effect.
            if not meeting.subtitles:
                meeting.subtitles = [result]
            elif result not in meeting.subtitles:
                meeting.subtitles.append(result)
            meeting.save()
47+
48+
49+
if __name__ == "__main__":
    # Run the flow directly when invoked as a script.
    import asyncio

    asyncio.run(add_subtitles())

notebooks/srt_subtitles.ipynb

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
"sys.path.append(\"../\")\n",
2929
"\n",
3030
"# Import the necessary modules from the new subtitles package\n",
31-
"from src.subtitles import create_track"
31+
"from src.subtitles import create_subtitles"
3232
]
3333
},
3434
{
@@ -51,9 +51,9 @@
5151
" \"../data/transcripts/regular_council_meeting___2025_02_26.diarized.json\"\n",
5252
")\n",
5353
"# Create an SRT track from the transcript\n",
54-
"srt_track = create_track(\n",
54+
"srt_track = create_subtitles(\n",
5555
" transcript_data=transcript_file,\n",
56-
" format='srt',\n",
56+
" format=\"srt\",\n",
5757
" max_duration=5.0, # Maximum duration for each subtitle\n",
5858
" max_length=80, # Maximum length in characters\n",
5959
" include_speaker_prefix=True, # Include speaker labels\n",
@@ -120,7 +120,7 @@
120120
"outputs": [],
121121
"source": [
122122
"# Import the subtitles module\n",
123-
"from src.subtitles import create_track\n",
123+
"from src.subtitles import create_subtitles\n",
124124
"import subprocess\n",
125125
"\n",
126126
"# Path to the SRT file we saved earlier\n",
@@ -155,8 +155,7 @@
155155
" cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE\n",
156156
")\n",
157157
"print(\"Successfully created video with embedded subtitles!\")\n",
158-
"print(f\"Output video saved to: {output_video}\")\n",
159-
"\n"
158+
"print(f\"Output video saved to: {output_video}\")"
160159
]
161160
},
162161
{

src/aws.py

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,25 +2,27 @@
22

33
import boto3
44
from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError
5+
from pydantic import HttpUrl
56

67

78
def is_aws_configured():
    """Return True when all AWS credential environment variables are set."""
    required = ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_DEFAULT_REGION")
    return all(name in os.environ for name in required)
1011

1112

13+
s3_client = boto3.client("s3")
14+
15+
1216
def create_bucket_if_not_exists(bucket_name):
13-
s3 = boto3.client('s3')
14-
1517
try:
1618
# Check if the bucket exists by listing its objects
17-
s3.head_bucket(Bucket=bucket_name)
19+
s3_client.head_bucket(Bucket=bucket_name)
1820
print(f"Bucket '{bucket_name}' already exists.")
1921
except ClientError as e:
20-
if e.response['Error']['Code'] == '404':
22+
if e.response["Error"]["Code"] == "404":
2123
# Bucket does not exist, create it
2224
try:
23-
s3.create_bucket(Bucket=bucket_name)
25+
s3_client.create_bucket(Bucket=bucket_name)
2426
print(f"Bucket '{bucket_name}' created.")
2527
except ClientError as error:
2628
print(f"Failed to create bucket '{bucket_name}': {error}")
@@ -30,11 +32,22 @@ def create_bucket_if_not_exists(bucket_name):
3032

3133

3234
def upload_to_s3(file_path, bucket_name, s3_path):
    """Upload a local file to S3.

    Returns True on success; prints and returns False when AWS credentials
    are missing or incomplete.
    """
    try:
        s3_client.upload_file(file_path, bucket_name, s3_path)
        print(f"Uploaded {file_path} to {bucket_name}/{s3_path}")
        return True
    except (NoCredentialsError, PartialCredentialsError) as exc:
        print(f"Failed to upload to S3: {str(exc)}")
        return False
42+
43+
44+
def save_content_to_s3(content, bucket_name, s3_key, content_type):
    """Write string content to an S3 object and return its public HTTPS URL.

    The content is UTF-8 encoded before upload; the returned URL follows the
    virtual-hosted-style ``https://<bucket>.s3.<region>.amazonaws.com/<key>``
    scheme.
    """
    s3_client.put_object(
        Bucket=bucket_name,
        Key=s3_key,
        Body=content.encode("utf-8"),
        ContentType=content_type,
    )
    bucket_region = s3_client.meta.region_name
    object_url = f"https://{bucket_name}.s3.{bucket_region}.amazonaws.com/{s3_key}"
    return HttpUrl(object_url)

src/http_utils.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
import aiohttp
2+
3+
4+
async def async_get_json(url: str):
    """GET ``url`` and return the response body parsed as JSON."""
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            return await response.json()

src/models/meeting.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,14 @@
77
from dyntastic import Dyntastic
88
from pydantic import BaseModel, Field, HttpUrl
99
from datetime import datetime
10+
from typing import Sequence, List
1011

1112

12-
class Meeting(Dyntastic):
13-
"""
14-
Model representing a government meeting
15-
"""
13+
def clean_filename(meeting_name: str) -> str:
    """Replace path-unsafe characters (space, slash, colon) with underscores."""
    translation = str.maketrans({ch: "_" for ch in " /:"})
    return meeting_name.translate(translation)
15+
1616

17+
class Meeting(Dyntastic):
1718
__table_name__ = "tgov-meeting"
1819
__hash_key__ = "clip_id"
1920

@@ -23,15 +24,22 @@ class Meeting(Dyntastic):
2324
duration: str = Field(description="Duration of the meeting")
2425
agenda: Optional[HttpUrl] = Field(None, description="URL to the meeting agenda")
2526
video: Optional[HttpUrl] = Field(None, description="URL to the meeting video")
27+
transcripts: Optional[List[HttpUrl]] = Field(
28+
None, description="URLs to the meeting transcripts"
29+
)
30+
subtitles: Optional[List[HttpUrl]] = Field(
31+
None, description="URLs to the meeting subtitle tracks"
32+
)
2633

2734
def __str__(self) -> str:
2835
"""String representation of the meeting"""
29-
return f"{self.meeting} - {self.date} ({self.duration})"
36+
return f"{self.meeting} ({self.date})"
3037

38+
def filename(self) -> str:
39+
return f"{self.clip_id}/{clean_filename(self.meeting)}/({self.date})"
3140

32-
class GranicusPlayerPage(BaseModel):
33-
"""Model for Granicus video URLs"""
3441

42+
class GranicusPlayerPage(BaseModel):
3543
url: HttpUrl = Field(description="Base URL of the Granicus player page")
3644
stream_url: Optional[HttpUrl] = None
3745
download_url: Optional[HttpUrl] = None

src/subtitles.py

Lines changed: 4 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
from pathlib import Path
1313
from typing import Dict, List, Optional, Union, Any
1414
from datetime import datetime, timedelta
15+
import aiohttp
16+
from pydantic import HttpUrl
1517

1618
# Import models from the models module
1719
from src.models.subtitles import (
@@ -159,26 +161,6 @@ def get_color_code_for_ass(color_name: str) -> str:
159161
return color_map.get(color_name, "FFFFFF") # Default to white if color not found
160162

161163

162-
def load_transcript(transcript_data: Union[Dict[str, Any], str, Path]) -> Transcript:
163-
"""
164-
Load a transcript file or dictionary and return a validated Transcript model.
165-
166-
Args:
167-
transcript_data: Either a transcript data dictionary or path to JSON file
168-
169-
Returns:
170-
Transcript object
171-
"""
172-
# Load transcript if a path was provided
173-
if isinstance(transcript_data, (str, Path)):
174-
with open(transcript_data, "r", encoding="utf-8") as f:
175-
data = json.load(f)
176-
else:
177-
data = transcript_data
178-
179-
return Transcript.model_validate(data)
180-
181-
182164
def chunk_transcript(
183165
transcript: Transcript,
184166
max_duration: float = 5.0,
@@ -207,7 +189,6 @@ def chunk_transcript(
207189
"""
208190
chunks = []
209191
current_chunk = {"text": "", "start": 0, "end": 0, "speaker": "", "words": []}
210-
211192
for segment in transcript.segments:
212193
# Skip very short segments
213194
if segment.end - segment.start < 0.1:
@@ -437,8 +418,8 @@ def add_speaker_prefixes(
437418
return chunks
438419

439420

440-
def create_track(
441-
transcript_data: Union[Dict[str, Any], str, Path],
421+
def create_subtitles(
422+
transcript: Transcript,
442423
format: str = "srt",
443424
max_duration: float = 5.0,
444425
max_length: int = 80,
@@ -473,15 +454,6 @@ def create_track(
473454
"""
474455
# Normalize track format to TrackFormat enum internally
475456
track_format = TrackFormat(format.lower())
476-
477-
# Load and validate transcript
478-
transcript = load_transcript(transcript_data)
479-
480-
# Generate source file information if transcript_data is a path
481-
source_file = None
482-
if isinstance(transcript_data, (str, Path)):
483-
source_file = str(transcript_data)
484-
485457
# Chunk the transcript
486458
chunks = chunk_transcript(
487459
transcript,
@@ -534,7 +506,6 @@ def create_track(
534506
speakers=speakers,
535507
word_count=word_count,
536508
duration=track_duration,
537-
source_file=source_file,
538509
style=AssStyle(
539510
font_name=font_name,
540511
font_size=font_size,

tasks/subtitles.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
from prefect import task
2+
from src.subtitles import create_subtitles
3+
from src.models.subtitles import SubtitleTrack, TrackFormat, Transcript
4+
from pydantic import HttpUrl
5+
6+
from src.models.subtitles import TrackFormat
7+
8+
9+
@task
async def create_vtt_track(
    transcript: Transcript, include_speaker_prefix: bool = False
) -> str:
    """Render a transcript as a WebVTT subtitle track.

    Args:
        transcript: Validated transcript model to convert.
        include_speaker_prefix: Prefix each cue with its speaker label.

    Returns:
        The VTT track body as a string, ready to upload as ``text/vtt``.
    """
    # NOTE(review): create_subtitles normalizes format via
    # TrackFormat(format.lower()); passing the enum member here assumes
    # TrackFormat is a str-based enum -- confirm.
    vtt_track: SubtitleTrack = create_subtitles(
        transcript,
        format=TrackFormat.VTT,
        include_speaker_prefix=include_speaker_prefix,
    )
    return vtt_track.content()

0 commit comments

Comments
 (0)